From e77a8be9a0a7f2c10151967e3c72c5afcbd41117 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel
Date: Fri, 16 Dec 2016 12:15:54 +0000
Subject: nl80211: better describe field in struct nl80211_bss_select_rssi_adjust

The two fields in struct nl80211_bss_select_rssi_adjust did not state
their type or unit. Add documentation.

Reported-by: Jouni Malinen
Signed-off-by: Arend van Spriel
Signed-off-by: Johannes Berg
---
 include/uapi/linux/nl80211.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 6b76e3b0c18e..d74e10b1246a 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4964,8 +4964,9 @@ enum nl80211_sched_scan_plan {
 /**
  * struct nl80211_bss_select_rssi_adjust - RSSI adjustment parameters.
  *
- * @band: band of BSS that must match for RSSI value adjustment.
- * @delta: value used to adjust the RSSI value of matching BSS.
+ * @band: band of BSS that must match for RSSI value adjustment. The value
+ *	of this field is according to &enum nl80211_band.
+ * @delta: value used to adjust the RSSI value of matching BSS in dB.
  */
 struct nl80211_bss_select_rssi_adjust {
 	__u8 band;
-- cgit v1.2.3

From 3289025aedc018f8fd9d0e37fb9efa0c6d531ffa Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar
Date: Mon, 4 Jul 2016 22:35:15 -0700
Subject: RDS: add receive message trace used by application

Socket option to tap receive path latency in various stages, in
nanoseconds. It can be enabled on selected sockets using the
SO_RDS_MSG_RXPATH_LATENCY socket option. RDS will return the data to
the application with RDS_CMSG_RXPATH_LATENCY in the defined format.
Scope is left to add more trace points in the future without needing
to change the interface.

Reviewed-by: Sowmini Varadhan
Signed-off-by: Santosh Shilimkar
---
 include/uapi/linux/rds.h | 33 +++++++++++++++++++++++++++++++++
 net/rds/af_rds.c         | 28 ++++++++++++++++++++++++++++
 net/rds/ib_recv.c        |  4 ++++
 net/rds/rds.h            | 10 ++++++++++
 net/rds/recv.c           | 32 +++++++++++++++++++++++++++++---
 net/rds/tcp_recv.c       |  5 +++++
 6 files changed, 109 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 0f9265cb2a96..3833113ab2c0 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -52,6 +52,13 @@
 #define RDS_GET_MR_FOR_DEST 7
 #define SO_RDS_TRANSPORT 8
 
+/* Socket option to tap receive path latency
+ *	SO_RDS: SO_RDS_MSG_RXPATH_LATENCY
+ *	Format used struct rds_rx_trace_so
+ */
+#define SO_RDS_MSG_RXPATH_LATENCY 10
+
+
 /* supported values for SO_RDS_TRANSPORT */
 #define RDS_TRANS_IB 0
 #define RDS_TRANS_IWARP 1
@@ -77,6 +84,12 @@
 *	the same as for the GET_MR setsockopt.
 * RDS_CMSG_RDMA_STATUS (recvmsg)
 *	Returns the status of a completed RDMA operation.
+ * RDS_CMSG_RXPATH_LATENCY(recvmsg)
+ *	Returns rds message latencies in various stages of receive
+ *	path in nS. Its set per socket using SO_RDS_MSG_RXPATH_LATENCY
+ *	socket option. Legitimate points are defined in
+ *	enum rds_message_rxpath_latency. More points can be added in
+ *	future. CMSG format is struct rds_cmsg_rx_trace.
*/ #define RDS_CMSG_RDMA_ARGS 1 #define RDS_CMSG_RDMA_DEST 2 @@ -87,6 +100,7 @@ #define RDS_CMSG_ATOMIC_CSWP 7 #define RDS_CMSG_MASKED_ATOMIC_FADD 8 #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 +#define RDS_CMSG_RXPATH_LATENCY 11 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 @@ -171,6 +185,25 @@ struct rds_info_rdma_connection { uint32_t rdma_mr_size; }; +/* RDS message Receive Path Latency points */ +enum rds_message_rxpath_latency { + RDS_MSG_RX_HDR_TO_DGRAM_START = 0, + RDS_MSG_RX_DGRAM_REASSEMBLE, + RDS_MSG_RX_DGRAM_DELIVERED, + RDS_MSG_RX_DGRAM_TRACE_MAX +}; + +struct rds_rx_trace_so { + u8 rx_traces; + u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; +}; + +struct rds_cmsg_rx_trace { + u8 rx_traces; + u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; + u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; +}; + /* * Congestion monitoring. * Congestion control in RDS happens at the host connection diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 2ac1e6194be3..fd8217404162 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -298,6 +298,30 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, return 0; } +static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, + int optlen) +{ + struct rds_rx_trace_so trace; + int i; + + if (optlen != sizeof(struct rds_rx_trace_so)) + return -EFAULT; + + if (copy_from_user(&trace, optval, sizeof(trace))) + return -EFAULT; + + rs->rs_rx_traces = trace.rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { + rs->rs_rx_traces = 0; + return -EFAULT; + } + rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; + } + + return 0; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -338,6 +362,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, ret = rds_enable_recvtstamp(sock->sk, optval, optlen); release_sock(sock->sk); break; + case SO_RDS_MSG_RXPATH_LATENCY: + ret = rds_recv_track_latency(rs, optval, optlen); + break; default: ret = -ENOPROTOOPT; } @@ -484,6 +511,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_cong_list); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; + rs->rs_rx_traces = 0; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 4b0f12679219..e10624aa6959 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -911,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, ic->i_ibinc = ibinc; hdr = &ibinc->ii_inc.i_hdr; + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); memcpy(hdr, ihdr, sizeof(*hdr)); ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, ic->i_recv_data_rem, hdr->h_flags); diff --git a/net/rds/rds.h b/net/rds/rds.h index f713194e4620..07fff73dd4f3 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -253,6 +253,11 @@ struct rds_ext_header_rdma_dest { #define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ +#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) +#define RDS_MSG_RX_HDR 0 +#define RDS_MSG_RX_START 1 +#define RDS_MSG_RX_END 2 +#define RDS_MSG_RX_CMSG 3 struct rds_incoming { atomic_t i_refcount; @@ -265,6 +270,7 @@ struct rds_incoming { rds_rdma_cookie_t i_rdma_cookie; struct timeval i_rx_tstamp; + u64 
i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; struct rds_mr { @@ -575,6 +581,10 @@ struct rds_sock { unsigned char rs_recverr, rs_cong_monitor; u32 rs_hash_initval; + + /* Socket receive path trace points*/ + u8 rs_rx_traces; + u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) diff --git a/net/rds/recv.c b/net/rds/recv.c index ba19eeeae85a..8b7e7b7f2c2d 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -43,6 +43,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr) { + int i; + atomic_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; @@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_rdma_cookie = 0; inc->i_rx_tstamp.tv_sec = 0; inc->i_rx_tstamp.tv_usec = 0; + + for (i = 0; i < RDS_RX_MAX_TRACES; i++) + inc->i_rx_lat_trace[i] = 0; } EXPORT_SYMBOL_GPL(rds_inc_init); @@ -373,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (sock_flag(sk, SOCK_RCVTSTAMP)) do_gettimeofday(&inc->i_rx_tstamp); rds_inc_addref(inc); + inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); } else { @@ -534,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); if (ret) - return ret; + goto out; } if ((inc->i_rx_tstamp.tv_sec != 0) && @@ -543,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, sizeof(struct timeval), &inc->i_rx_tstamp); if (ret) - return ret; + goto out; } - return 0; + if (rs->rs_rx_traces) { + struct rds_cmsg_rx_trace t; + int i, j; + + inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); + t.rx_traces = rs->rs_rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + j = rs->rs_rx_trace[i]; + t.rx_trace_pos[i] = j; + t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] - + inc->i_rx_lat_trace[j]; + } + + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY, + sizeof(t), &t); + if (ret) + goto out; + } + +out: + return ret; } int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index ad4892e97f91..e006ef8e6d40 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, rdsdebug("alloced tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, cp->cp_conn->c_faddr); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); + /* * XXX * we might be able to use the __ variants when * we've already serialized at a higher level. @@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, /* could be 0 for a 0 len message */ tc->t_tinc_data_rem = be32_to_cpu(tinc->ti_inc.i_hdr.h_len); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); } } -- cgit v1.2.3 From 237bab6611c607a9e63d50164609923feb8b83b3 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 25 Dec 2016 19:58:58 +0800 Subject: netfilter: nf_tables: add missing descriptions in nft_ct_keys We missed to add descriptions about NFT_CT_LABELS, NFT_CT_PKTS and NFT_CT_BYTES, now add it. 
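Returning to the RDS patch above, a minimal userspace sketch may help show how the new socket option and cmsg fit together. It assumes only the uapi definitions added in include/uapi/linux/rds.h by that patch (note the new structs use the kernel-internal u8/u64 spellings, so a real build may need them mapped to __u8/__u64); socket setup and error handling are elided, and the function names are invented:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rds.h>

/* Enable tracing of all three defined receive path segments on an
 * already-created PF_RDS socket. Error handling omitted for brevity.
 */
static void enable_rx_tracing(int fd)
{
	struct rds_rx_trace_so trace = {
		.rx_traces = RDS_MSG_RX_DGRAM_TRACE_MAX,
		.rx_trace_pos = {
			RDS_MSG_RX_HDR_TO_DGRAM_START,
			RDS_MSG_RX_DGRAM_REASSEMBLE,
			RDS_MSG_RX_DGRAM_DELIVERED,
		},
	};

	setsockopt(fd, SOL_RDS, SO_RDS_MSG_RXPATH_LATENCY,
		   &trace, sizeof(trace));
}

/* After recvmsg() on such a socket, walk the ancillary data and print
 * the per-segment latencies, which the kernel reports in nanoseconds.
 */
static void print_rx_latency(struct msghdr *msg)
{
	struct cmsghdr *cmsg;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
		struct rds_cmsg_rx_trace t;
		int i;

		if (cmsg->cmsg_level != SOL_RDS ||
		    cmsg->cmsg_type != RDS_CMSG_RXPATH_LATENCY)
			continue;

		memcpy(&t, CMSG_DATA(cmsg), sizeof(t));
		for (i = 0; i < t.rx_traces; i++)
			printf("segment %d: %llu ns\n", t.rx_trace_pos[i],
			       (unsigned long long)t.rx_trace[i]);
	}
}

Each reported value is the delta between two adjacent trace points, matching the i_rx_lat_trace[j + 1] - i_rx_lat_trace[j] computation in rds_cmsg_recv() above.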
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 881d49e94569..5726f90bfc2f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -860,6 +860,9 @@ enum nft_rt_attributes { * @NFT_CT_PROTOCOL: conntrack layer 4 protocol * @NFT_CT_PROTO_SRC: conntrack layer 4 protocol source * @NFT_CT_PROTO_DST: conntrack layer 4 protocol destination + * @NFT_CT_LABELS: conntrack labels + * @NFT_CT_PKTS: conntrack packets + * @NFT_CT_BYTES: conntrack bytes */ enum nft_ct_keys { NFT_CT_STATE, -- cgit v1.2.3 From 949a358418aae397d7cf1622aa6515eca766b9e7 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 25 Dec 2016 19:58:59 +0800 Subject: netfilter: nft_ct: add average bytes per packet support Similar to xt_connbytes, user can match how many average bytes per packet a connection has transferred so far. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_ct.c | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 5726f90bfc2f..b00a05d1ee56 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -863,6 +863,7 @@ enum nft_rt_attributes { * @NFT_CT_LABELS: conntrack labels * @NFT_CT_PKTS: conntrack packets * @NFT_CT_BYTES: conntrack bytes + * @NFT_CT_AVGPKT: conntrack average bytes per packet */ enum nft_ct_keys { NFT_CT_STATE, @@ -881,6 +882,7 @@ enum nft_ct_keys { NFT_CT_LABELS, NFT_CT_PKTS, NFT_CT_BYTES, + NFT_CT_AVGPKT, }; /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index e6baeaebe653..d774d7823688 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -129,6 +129,22 @@ static void nft_ct_get_eval(const struct nft_expr *expr, memcpy(dest, &count, sizeof(count)); return; } + case NFT_CT_AVGPKT: { + const struct nf_conn_acct *acct = nf_conn_acct_find(ct); + u64 avgcnt = 0, bcnt = 0, pcnt = 0; + + if (acct) { + pcnt = nft_ct_get_eval_counter(acct->counter, + NFT_CT_PKTS, priv->dir); + bcnt = nft_ct_get_eval_counter(acct->counter, + NFT_CT_BYTES, priv->dir); + if (pcnt != 0) + avgcnt = div64_u64(bcnt, pcnt); + } + + memcpy(dest, &avgcnt, sizeof(avgcnt)); + return; + } case NFT_CT_L3PROTOCOL: *dest = nf_ct_l3num(ct); return; @@ -316,6 +332,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, break; case NFT_CT_BYTES: case NFT_CT_PKTS: + case NFT_CT_AVGPKT: /* no direction? 
return sum of original + reply */ if (tb[NFTA_CT_DIRECTION] == NULL) priv->dir = IP_CT_DIR_MAX; @@ -346,7 +363,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; - if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS) + if (priv->key == NFT_CT_BYTES || + priv->key == NFT_CT_PKTS || + priv->key == NFT_CT_AVGPKT) nf_ct_set_acct(ctx->net, true); return 0; @@ -445,6 +464,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) break; case NFT_CT_BYTES: case NFT_CT_PKTS: + case NFT_CT_AVGPKT: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; -- cgit v1.2.3 From 1708ebc9636a249e104b83c6d105f15244825281 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 3 Jan 2017 12:13:39 +0100 Subject: ipmr, ip6mr: add RTNH_F_UNRESOLVED flag to unresolved cache entries While working with ipmr, we noticed that it is impossible to determine if an entry is actually unresolved or its IIF interface has disappeared (e.g. virtual interface got deleted). These entries look almost identical to user-space when dumping or receiving notifications. So in order to recognize them add a new RTNH_F_UNRESOLVED flag which is set when sending an unresolved cache entry to user-space. Suggested-by: Roopa Prabhu Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/ipv4/ipmr.c | 4 +++- net/ipv6/ip6mr.c | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index e14377f2ec27..8c93ad1ef9ab 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -350,6 +350,7 @@ struct rtnexthop { #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ #define RTNH_F_OFFLOAD 8 /* offloaded route */ #define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ +#define RTNH_F_UNRESOLVED 32 /* The entry is unresolved (ipmr) */ #define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | RTNH_F_OFFLOAD) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index efc1e76d4977..b35dda57586b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2091,8 +2091,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mfc_parent >= MAXVIFS) + if (c->mfc_parent >= MAXVIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; + } if (VIF_EXISTS(mrt, c->mfc_parent) && nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 604d8953c775..e275077e8af2 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2243,8 +2243,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mf6c_parent >= MAXMIFS) + if (c->mf6c_parent >= MAXMIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; + } if (MIF_EXISTS(mrt, c->mf6c_parent) && nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) -- cgit v1.2.3 From 571299d099dcce0ff32c76e70e32e0ba01e55adc Mon Sep 17 00:00:00 2001 From: Song Hongyan Date: Thu, 5 Jan 2017 18:24:03 +0800 Subject: iio: Add channel for Gravity Add new channel types support for gravity sensor. 
Gravity sensor provides an application-level or physical collection that
identifies a device that measures exclusively the force of Earth's
gravity along any number of axes.

More information can be found in:
http://www.usb.org/developers/hidpage/HUTRR59_-_Usages_for_Wearables.pdf

Signed-off-by: Song Hongyan
Signed-off-by: Jonathan Cameron
---
 Documentation/ABI/testing/sysfs-bus-iio | 10 ++++++++++
 drivers/iio/industrialio-core.c         |  1 +
 include/uapi/linux/iio/types.h          |  1 +
 tools/iio/iio_event_monitor.c           |  2 ++
 4 files changed, 14 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index cfd53dba24c4..8ec362bd5948 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -170,6 +170,16 @@ Description:
 		Has all of the equivalent parameters as per voltageY. Units
 		after application of scale and offset are m/s^2.
 
+What:		/sys/bus/iio/devices/iio:deviceX/in_gravity_x_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_gravity_y_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_gravity_z_raw
+KernelVersion:	4.11
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Gravity in direction x, y or z (may be arbitrarily assigned
+		but should match other such assignments on device).
+		Units after application of scale and offset are m/s^2.
+
 What:		/sys/bus/iio/devices/iio:deviceX/in_anglvel_x_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_anglvel_y_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_anglvel_z_raw
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index aaca42862389..c601698e0910 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -83,6 +83,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_ELECTRICALCONDUCTIVITY] = "electricalconductivity",
 	[IIO_COUNT] = "count",
 	[IIO_INDEX] = "index",
+	[IIO_GRAVITY] = "gravity",
 };
 
 static const char * const iio_modifier_names[] = {
diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
index e54d14a7f876..ffafd6c25a48 100644
--- a/include/uapi/linux/iio/types.h
+++ b/include/uapi/linux/iio/types.h
@@ -42,6 +42,7 @@ enum iio_chan_type {
 	IIO_ELECTRICALCONDUCTIVITY,
 	IIO_COUNT,
 	IIO_INDEX,
+	IIO_GRAVITY,
 };
 
 enum iio_modifier {
diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c
index d9b7e0f306c6..b61245e1181d 100644
--- a/tools/iio/iio_event_monitor.c
+++ b/tools/iio/iio_event_monitor.c
@@ -57,6 +57,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_RESISTANCE] = "resistance",
 	[IIO_PH] = "ph",
 	[IIO_UVINDEX] = "uvindex",
+	[IIO_GRAVITY] = "gravity",
 };
 
 static const char * const iio_ev_type_text[] = {
@@ -149,6 +150,7 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_RESISTANCE:
 	case IIO_PH:
 	case IIO_UVINDEX:
+	case IIO_GRAVITY:
 		break;
 	default:
 		return false;
-- cgit v1.2.3

From 2f5ff26478adaff5ed9b7ad4079d6a710b5f27e7 Mon Sep 17 00:00:00 2001
From: Eli Cohen
Date: Tue, 3 Jan 2017 23:55:21 +0200
Subject: mlx5: Fix naming convention with respect to UARs

This establishes a solid naming convention for UARs. A UAR (User Access
Region) can have a size identical to a system page, or can be a fixed
4KB, depending on a value queried from firmware. Each UAR always has 4
blue flame registers which are used to post doorbells to the send
queue. In addition, a UAR has a section used for posting doorbells to
CQs or EQs. In this patch we change names to reflect these conventions.
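To make the renamed objects concrete, here is a small editor's sketch (not part of the patch) of the arithmetic the new names describe. The constants are the renamed ones from include/linux/mlx5/device.h, and the helpers mirror bfregn_to_uar_index() and the fast-path bitmap setup in the diff below; they are illustrative only:

/* Sketch of the bfreg <-> UAR layout behind the renaming. Each group
 * of MLX5_BFREGS_PER_UAR bfregs lives in one UAR page, and within
 * each group the last two are fast-path registers.
 */
#define MLX5_BFREGS_PER_UAR		4
#define MLX5_NON_FP_BFREGS_PER_UAR	2

/* Which UAR page a given bfreg lives in. */
static int bfreg_to_uar(int bfregn)
{
	return bfregn / MLX5_BFREGS_PER_UAR;
}

/* Byte offset of a bfreg inside its UAR, given the blue flame register
 * size; the driver additionally adds the fixed MLX5_BF_OFFSET base.
 */
static int bfreg_offset(int bfregn, int log_bf_reg_size)
{
	return (bfregn % MLX5_BFREGS_PER_UAR) * (1 << log_bf_reg_size);
}

/* Within each UAR, bfregs 2 and 3 are the fast-path registers, which
 * is why the ucontext code below pre-sets bits 2 and 3 of every group
 * in the allocation bitmap.
 */
static int bfreg_is_fast_path(int bfregn)
{
	return (bfregn % MLX5_BFREGS_PER_UAR) >= MLX5_NON_FP_BFREGS_PER_UAR;
}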
Signed-off-by: Eli Cohen Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/cq.c | 6 +- drivers/infiniband/hw/mlx5/main.c | 80 +++++------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +- drivers/infiniband/hw/mlx5/qp.c | 176 ++++++++++++------------- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/uar.c | 90 ++++++------- include/linux/mlx5/device.h | 9 +- include/linux/mlx5/driver.h | 14 +- include/uapi/rdma/mlx5-abi.h | 12 +- 10 files changed, 206 insertions(+), 203 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index b3ef47c3ab73..bb7e91c55003 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -689,7 +689,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; struct mlx5_ib_cq *cq = to_mcq(ibcq); - void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + void __iomem *uar_page = mdev->priv.bfregi.uars[0].map; unsigned long irq_flags; int ret = 0; @@ -790,7 +790,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, MLX5_SET(cqc, cqc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); - *index = to_mucontext(context)->uuari.uars[0].index; + *index = to_mucontext(context)->bfregi.uars[0].index; if (ucmd.cqe_comp_en == 1) { if (unlikely((*cqe_size != 64) || @@ -886,7 +886,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, MLX5_SET(cqc, cqc, log_page_size, cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); - *index = dev->mdev->priv.uuari.uars[0].index; + *index = dev->mdev->priv.bfregi.uars[0].index; return 0; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 852b5b7b4897..d5cf82b387d3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -999,12 +999,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context; - struct mlx5_uuar_info *uuari; + struct mlx5_bfreg_info *bfregi; struct mlx5_uar *uars; - int gross_uuars; + int gross_bfregs; int num_uars; int ver; - int uuarn; + int bfregn; int err; int i; size_t reqlen; @@ -1032,10 +1032,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (req.flags) return ERR_PTR(-EINVAL); - if (req.total_num_uuars > MLX5_MAX_UUARS) + if (req.total_num_bfregs > MLX5_MAX_BFREGS) return ERR_PTR(-ENOMEM); - if (req.total_num_uuars == 0) + if (req.total_num_bfregs == 0) return ERR_PTR(-EINVAL); if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) @@ -1046,13 +1046,13 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, reqlen - sizeof(req))) return ERR_PTR(-EOPNOTSUPP); - req.total_num_uuars = ALIGN(req.total_num_uuars, - MLX5_NON_FP_BF_REGS_PER_PAGE); - if (req.num_low_latency_uuars > req.total_num_uuars - 1) + req.total_num_bfregs = ALIGN(req.total_num_bfregs, + MLX5_NON_FP_BFREGS_PER_UAR); + if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) return ERR_PTR(-EINVAL); - num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; - gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; + num_uars = req.total_num_bfregs / MLX5_NON_FP_BFREGS_PER_UAR; + gross_bfregs = 
num_uars * MLX5_BFREGS_PER_UAR; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); @@ -1072,32 +1072,33 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (!context) return ERR_PTR(-ENOMEM); - uuari = &context->uuari; - mutex_init(&uuari->lock); + bfregi = &context->bfregi; + mutex_init(&bfregi->lock); uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); if (!uars) { err = -ENOMEM; goto out_ctx; } - uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), - sizeof(*uuari->bitmap), + bfregi->bitmap = kcalloc(BITS_TO_LONGS(gross_bfregs), + sizeof(*bfregi->bitmap), GFP_KERNEL); - if (!uuari->bitmap) { + if (!bfregi->bitmap) { err = -ENOMEM; goto out_uar_ctx; } /* - * clear all fast path uuars + * clear all fast path bfregs */ - for (i = 0; i < gross_uuars; i++) { - uuarn = i & 3; - if (uuarn == 2 || uuarn == 3) - set_bit(i, uuari->bitmap); + for (i = 0; i < gross_bfregs; i++) { + bfregn = i & 3; + if (bfregn == 2 || bfregn == 3) + set_bit(i, bfregi->bitmap); } - uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); - if (!uuari->count) { + bfregi->count = kcalloc(gross_bfregs, + sizeof(*bfregi->count), GFP_KERNEL); + if (!bfregi->count) { err = -ENOMEM; goto out_bitmap; } @@ -1130,7 +1131,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); - resp.tot_uuars = req.total_num_uuars; + resp.tot_bfregs = req.total_num_bfregs; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); if (field_avail(typeof(resp), cqe_version, udata->outlen)) @@ -1163,10 +1164,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (err) goto out_td; - uuari->ver = ver; - uuari->num_low_latency_uuars = req.num_low_latency_uuars; - uuari->uars = uars; - uuari->num_uars = num_uars; + bfregi->ver = ver; + bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; + bfregi->uars = uars; + bfregi->num_uars = num_uars; context->cqe_version = resp.cqe_version; return &context->ibucontext; @@ -1182,10 +1183,10 @@ out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); out_count: - kfree(uuari->count); + kfree(bfregi->count); out_bitmap: - kfree(uuari->bitmap); + kfree(bfregi->bitmap); out_uar_ctx: kfree(uars); @@ -1199,7 +1200,7 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); - struct mlx5_uuar_info *uuari = &context->uuari; + struct mlx5_bfreg_info *bfregi = &context->bfregi; int i; if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) @@ -1207,14 +1208,15 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) free_page(context->upd_xlt_page); - for (i = 0; i < uuari->num_uars; i++) { - if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) - mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); + for (i = 0; i < bfregi->num_uars; i++) { + if (mlx5_cmd_free_uar(dev->mdev, bfregi->uars[i].index)) + mlx5_ib_warn(dev, "Failed to free UAR 0x%x\n", + bfregi->uars[i].index); } - kfree(uuari->count); - kfree(uuari->bitmap); - kfree(uuari->uars); + kfree(bfregi->count); + kfree(bfregi->bitmap); + kfree(bfregi->uars); kfree(context); return 0; @@ -1377,7 +1379,7 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, 
struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { - struct mlx5_uuar_info *uuari = &context->uuari; + struct mlx5_bfreg_info *bfregi = &context->bfregi; int err; unsigned long idx; phys_addr_t pfn, pa; @@ -1408,10 +1410,10 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, return -EINVAL; idx = get_index(vma->vm_pgoff); - if (idx >= uuari->num_uars) + if (idx >= bfregi->num_uars) return -EINVAL; - pfn = uar_index2pfn(dev, uuari->uars[idx].index); + pfn = uar_index2pfn(dev, bfregi->uars[idx].index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); vma->vm_page_prot = prot; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a51c8051aeb2..d4d1329df94a 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -100,7 +100,7 @@ enum mlx5_ib_mad_ifc_flags { }; enum { - MLX5_CROSS_CHANNEL_UUAR = 0, + MLX5_CROSS_CHANNEL_BFREG = 0, }; enum { @@ -120,7 +120,7 @@ struct mlx5_ib_ucontext { /* protect doorbell record alloc/free */ struct mutex db_page_mutex; - struct mlx5_uuar_info uuari; + struct mlx5_bfreg_info bfregi; u8 cqe_version; /* Transport Domain number */ u32 tdn; @@ -355,7 +355,7 @@ struct mlx5_ib_qp { /* only for user space QPs. For kernel * we have it from the bf object */ - int uuarn; + int bfregn; int create_type; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 42d021cdc6c5..fbea9bd63c8e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -475,12 +475,12 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) return 1; } -static int first_med_uuar(void) +static int first_med_bfreg(void) { return 1; } -static int next_uuar(int n) +static int next_bfreg(int n) { n++; @@ -490,45 +490,45 @@ static int next_uuar(int n) return n; } -static int num_med_uuar(struct mlx5_uuar_info *uuari) +static int num_med_bfreg(struct mlx5_bfreg_info *bfregi) { int n; - n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - - uuari->num_low_latency_uuars - 1; + n = bfregi->num_uars * MLX5_NON_FP_BFREGS_PER_UAR - + bfregi->num_low_latency_bfregs - 1; return n >= 0 ? 
n : 0; } -static int max_uuari(struct mlx5_uuar_info *uuari) +static int max_bfregi(struct mlx5_bfreg_info *bfregi) { - return uuari->num_uars * 4; + return bfregi->num_uars * 4; } -static int first_hi_uuar(struct mlx5_uuar_info *uuari) +static int first_hi_bfreg(struct mlx5_bfreg_info *bfregi) { int med; int i; int t; - med = num_med_uuar(uuari); - for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { + med = num_med_bfreg(bfregi); + for (t = 0, i = first_med_bfreg();; i = next_bfreg(i)) { t++; if (t == med) - return next_uuar(i); + return next_bfreg(i); } return 0; } -static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) +static int alloc_high_class_bfreg(struct mlx5_bfreg_info *bfregi) { int i; - for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { - if (!test_bit(i, uuari->bitmap)) { - set_bit(i, uuari->bitmap); - uuari->count[i]++; + for (i = first_hi_bfreg(bfregi); i < max_bfregi(bfregi); i = next_bfreg(i)) { + if (!test_bit(i, bfregi->bitmap)) { + set_bit(i, bfregi->bitmap); + bfregi->count[i]++; return i; } } @@ -536,87 +536,87 @@ static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) return -ENOMEM; } -static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) +static int alloc_med_class_bfreg(struct mlx5_bfreg_info *bfregi) { - int minidx = first_med_uuar(); + int minidx = first_med_bfreg(); int i; - for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { - if (uuari->count[i] < uuari->count[minidx]) + for (i = first_med_bfreg(); i < first_hi_bfreg(bfregi); i = next_bfreg(i)) { + if (bfregi->count[i] < bfregi->count[minidx]) minidx = i; } - uuari->count[minidx]++; + bfregi->count[minidx]++; return minidx; } -static int alloc_uuar(struct mlx5_uuar_info *uuari, - enum mlx5_ib_latency_class lat) +static int alloc_bfreg(struct mlx5_bfreg_info *bfregi, + enum mlx5_ib_latency_class lat) { - int uuarn = -EINVAL; + int bfregn = -EINVAL; - mutex_lock(&uuari->lock); + mutex_lock(&bfregi->lock); switch (lat) { case MLX5_IB_LATENCY_CLASS_LOW: - uuarn = 0; - uuari->count[uuarn]++; + bfregn = 0; + bfregi->count[bfregn]++; break; case MLX5_IB_LATENCY_CLASS_MEDIUM: - if (uuari->ver < 2) - uuarn = -ENOMEM; + if (bfregi->ver < 2) + bfregn = -ENOMEM; else - uuarn = alloc_med_class_uuar(uuari); + bfregn = alloc_med_class_bfreg(bfregi); break; case MLX5_IB_LATENCY_CLASS_HIGH: - if (uuari->ver < 2) - uuarn = -ENOMEM; + if (bfregi->ver < 2) + bfregn = -ENOMEM; else - uuarn = alloc_high_class_uuar(uuari); + bfregn = alloc_high_class_bfreg(bfregi); break; case MLX5_IB_LATENCY_CLASS_FAST_PATH: - uuarn = 2; + bfregn = 2; break; } - mutex_unlock(&uuari->lock); + mutex_unlock(&bfregi->lock); - return uuarn; + return bfregn; } -static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +static void free_med_class_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn) { - clear_bit(uuarn, uuari->bitmap); - --uuari->count[uuarn]; + clear_bit(bfregn, bfregi->bitmap); + --bfregi->count[bfregn]; } -static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +static void free_high_class_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn) { - clear_bit(uuarn, uuari->bitmap); - --uuari->count[uuarn]; + clear_bit(bfregn, bfregi->bitmap); + --bfregi->count[bfregn]; } -static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) +static void free_bfreg(struct mlx5_bfreg_info *bfregi, int bfregn) { - int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; - int high_uuar = nuuars - uuari->num_low_latency_uuars; + int nbfregs = bfregi->num_uars 
* MLX5_BFREGS_PER_UAR; + int high_bfreg = nbfregs - bfregi->num_low_latency_bfregs; - mutex_lock(&uuari->lock); - if (uuarn == 0) { - --uuari->count[uuarn]; + mutex_lock(&bfregi->lock); + if (bfregn == 0) { + --bfregi->count[bfregn]; goto out; } - if (uuarn < high_uuar) { - free_med_class_uuar(uuari, uuarn); + if (bfregn < high_bfreg) { + free_med_class_bfreg(bfregi, bfregn); goto out; } - free_high_class_uuar(uuari, uuarn); + free_high_class_bfreg(bfregi, bfregn); out: - mutex_unlock(&uuari->lock); + mutex_unlock(&bfregi->lock); } static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) @@ -657,9 +657,9 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); -static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) +static int bfregn_to_uar_index(struct mlx5_bfreg_info *bfregi, int bfregn) { - return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; + return bfregi->uars[bfregn / MLX5_BFREGS_PER_UAR].index; } static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, @@ -776,7 +776,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, int uar_index; int npages; u32 offset = 0; - int uuarn; + int bfregn; int ncont = 0; __be64 *pas; void *qpc; @@ -794,27 +794,27 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, */ if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) /* In CROSS_CHANNEL CQ and QP must use the same UAR */ - uuarn = MLX5_CROSS_CHANNEL_UUAR; + bfregn = MLX5_CROSS_CHANNEL_BFREG; else { - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); - if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH); + if (bfregn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency BFREG\n"); mlx5_ib_dbg(dev, "reverting to medium latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); - if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (bfregn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency BFREG\n"); mlx5_ib_dbg(dev, "reverting to high latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); - if (uuarn < 0) { - mlx5_ib_warn(dev, "uuar allocation failed\n"); - return uuarn; + bfregn = alloc_bfreg(&context->bfregi, MLX5_IB_LATENCY_CLASS_LOW); + if (bfregn < 0) { + mlx5_ib_warn(dev, "bfreg allocation failed\n"); + return bfregn; } } } } - uar_index = uuarn_to_uar_index(&context->uuari, uuarn); - mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + uar_index = bfregn_to_uar_index(&context->bfregi, bfregn); + mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); qp->rq.offset = 0; qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); @@ -822,7 +822,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = set_user_buf_size(dev, qp, &ucmd, base, attr); if (err) - goto err_uuar; + goto err_bfreg; if (ucmd.buf_addr && ubuffer->buf_size) { ubuffer->buf_addr = ucmd.buf_addr; @@ -831,7 +831,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, &ubuffer->umem, &npages, &page_shift, &ncont, &offset); if (err) - goto err_uuar; + goto err_bfreg; } else { ubuffer->umem = NULL; } @@ -854,8 +854,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, page_offset, offset); MLX5_SET(qpc, qpc, uar_page, 
uar_index); - resp->uuar_index = uuarn; - qp->uuarn = uuarn; + resp->bfreg_index = bfregn; + qp->bfregn = bfregn; err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); if (err) { @@ -882,8 +882,8 @@ err_umem: if (ubuffer->umem) ib_umem_release(ubuffer->umem); -err_uuar: - free_uuar(&context->uuari, uuarn); +err_bfreg: + free_bfreg(&context->bfregi, bfregn); return err; } @@ -896,7 +896,7 @@ static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp, mlx5_ib_db_unmap_user(context, &qp->db); if (base->ubuffer.umem) ib_umem_release(base->ubuffer.umem); - free_uuar(&context->uuari, qp->uuarn); + free_bfreg(&context->bfregi, qp->bfregn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, @@ -906,13 +906,13 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp_base *base) { enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; - struct mlx5_uuar_info *uuari; + struct mlx5_bfreg_info *bfregi; int uar_index; void *qpc; - int uuarn; + int bfregn; int err; - uuari = &dev->mdev->priv.uuari; + bfregi = &dev->mdev->priv.bfregi; if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_IPOIB_UD_LSO | @@ -922,19 +922,19 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; - uuarn = alloc_uuar(uuari, lc); - if (uuarn < 0) { + bfregn = alloc_bfreg(bfregi, lc); + if (bfregn < 0) { mlx5_ib_dbg(dev, "\n"); return -ENOMEM; } - qp->bf = &uuari->bfs[uuarn]; + qp->bf = &bfregi->bfs[bfregn]; uar_index = qp->bf->uar->index; err = calc_sq_size(dev, init_attr, qp); if (err < 0) { mlx5_ib_dbg(dev, "err %d\n", err); - goto err_uuar; + goto err_bfreg; } qp->rq.offset = 0; @@ -944,7 +944,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); - goto err_uuar; + goto err_bfreg; } qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); @@ -1007,8 +1007,8 @@ err_free: err_buf: mlx5_buf_free(dev->mdev, &qp->buf); -err_uuar: - free_uuar(&dev->mdev->priv.uuari, uuarn); +err_bfreg: + free_bfreg(&dev->mdev->priv.bfregi, bfregn); return err; } @@ -1021,7 +1021,7 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) kfree(qp->rq.wrid); mlx5_db_free(dev->mdev, &qp->db); mlx5_buf_free(dev->mdev, &qp->buf); - free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); + free_bfreg(&dev->mdev->priv.bfregi, qp->bf->bfregn); } static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) @@ -1353,7 +1353,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (init_attr->create_flags || init_attr->send_cq) return -EINVAL; - min_resp_len = offsetof(typeof(resp), uuar_index) + sizeof(resp.uuar_index); + min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); if (udata->outlen < min_resp_len) return -EINVAL; @@ -4132,7 +4132,7 @@ out: __acquire(&bf->lock); /* TBD enable WC */ - if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) { + if (0 && nreq == 1 && bf->bfregn && inl && size > 1 && size <= bf->buf_size / 16) { mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); /* wc_wmb(); */ } else { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 4aff8ac68e14..11a8d638bcd0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -686,7 +686,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, - "mlx5_cmd_eq", &dev->priv.uuari.uars[0], + "mlx5_cmd_eq", &dev->priv.bfregi.uars[0], MLX5_EQ_TYPE_ASYNC); if (err) { mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err); @@ -697,7 +697,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, MLX5_NUM_ASYNC_EQE, async_event_mask, - "mlx5_async_eq", &dev->priv.uuari.uars[0], + "mlx5_async_eq", &dev->priv.bfregi.uars[0], MLX5_EQ_TYPE_ASYNC); if (err) { mlx5_core_warn(dev, "failed to create async EQ %d\n", err); @@ -708,7 +708,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) MLX5_EQ_VEC_PAGES, /* TODO: sriov max_vf + */ 1, 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq", - &dev->priv.uuari.uars[0], + &dev->priv.bfregi.uars[0], MLX5_EQ_TYPE_ASYNC); if (err) { mlx5_core_warn(dev, "failed to create pages EQ %d\n", err); @@ -722,7 +722,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) MLX5_NUM_ASYNC_EQE, 1 << MLX5_EVENT_TYPE_PAGE_FAULT, "mlx5_page_fault_eq", - &dev->priv.uuari.uars[0], + &dev->priv.bfregi.uars[0], MLX5_EQ_TYPE_PF); if (err) { mlx5_core_warn(dev, "failed to create page fault EQ %d\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f4115135e30b..634e96a02516 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -753,7 +753,7 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev) snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i); err = mlx5_create_map_eq(dev, eq, i + MLX5_EQ_VEC_COMP_BASE, nent, 0, - name, &dev->priv.uuari.uars[0], + name, &dev->priv.bfregi.uars[0], MLX5_EQ_TYPE_COMP); if (err) { kfree(eq); @@ -1094,7 +1094,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_cleanup_once; } - err = mlx5_alloc_uuars(dev, &priv->uuari); + err = mlx5_alloc_bfregs(dev, &priv->bfregi); if (err) { dev_err(&pdev->dev, "Failed allocating uar, aborting\n"); goto err_disable_msix; @@ -1170,7 +1170,7 @@ err_stop_eqs: mlx5_stop_eqs(dev); err_free_uar: - mlx5_free_uuars(dev, &priv->uuari); + mlx5_free_bfregs(dev, &priv->bfregi); err_disable_msix: mlx5_disable_msix(dev); @@ -1230,7 +1230,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_irq_clear_affinity_hints(dev); free_comp_eqs(dev); mlx5_stop_eqs(dev); - mlx5_free_uuars(dev, &priv->uuari); + mlx5_free_bfregs(dev, &priv->bfregi); mlx5_disable_msix(dev); if (cleanup) mlx5_cleanup_once(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index ab0b896621a0..ce7fcebb81a3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -39,7 +39,7 @@ enum { NUM_DRIVER_UARS = 4, - NUM_LOW_LAT_UUARS = 4, + NUM_LOW_LAT_BFREGS = 4, }; int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn) @@ -67,116 +67,116 @@ int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn) } EXPORT_SYMBOL(mlx5_cmd_free_uar); -static int need_uuar_lock(int uuarn) +static int need_bfreg_lock(int bfregn) { - int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE; + int tot_bfregs = NUM_DRIVER_UARS * MLX5_BFREGS_PER_UAR; - if (uuarn == 0 || tot_uuars - NUM_LOW_LAT_UUARS) + if (bfregn == 0 || tot_bfregs - 
NUM_LOW_LAT_BFREGS) return 0; return 1; } -int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari) +int mlx5_alloc_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi) { - int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE; + int tot_bfregs = NUM_DRIVER_UARS * MLX5_BFREGS_PER_UAR; struct mlx5_bf *bf; phys_addr_t addr; int err; int i; - uuari->num_uars = NUM_DRIVER_UARS; - uuari->num_low_latency_uuars = NUM_LOW_LAT_UUARS; + bfregi->num_uars = NUM_DRIVER_UARS; + bfregi->num_low_latency_bfregs = NUM_LOW_LAT_BFREGS; - mutex_init(&uuari->lock); - uuari->uars = kcalloc(uuari->num_uars, sizeof(*uuari->uars), GFP_KERNEL); - if (!uuari->uars) + mutex_init(&bfregi->lock); + bfregi->uars = kcalloc(bfregi->num_uars, sizeof(*bfregi->uars), GFP_KERNEL); + if (!bfregi->uars) return -ENOMEM; - uuari->bfs = kcalloc(tot_uuars, sizeof(*uuari->bfs), GFP_KERNEL); - if (!uuari->bfs) { + bfregi->bfs = kcalloc(tot_bfregs, sizeof(*bfregi->bfs), GFP_KERNEL); + if (!bfregi->bfs) { err = -ENOMEM; goto out_uars; } - uuari->bitmap = kcalloc(BITS_TO_LONGS(tot_uuars), sizeof(*uuari->bitmap), + bfregi->bitmap = kcalloc(BITS_TO_LONGS(tot_bfregs), sizeof(*bfregi->bitmap), GFP_KERNEL); - if (!uuari->bitmap) { + if (!bfregi->bitmap) { err = -ENOMEM; goto out_bfs; } - uuari->count = kcalloc(tot_uuars, sizeof(*uuari->count), GFP_KERNEL); - if (!uuari->count) { + bfregi->count = kcalloc(tot_bfregs, sizeof(*bfregi->count), GFP_KERNEL); + if (!bfregi->count) { err = -ENOMEM; goto out_bitmap; } - for (i = 0; i < uuari->num_uars; i++) { - err = mlx5_cmd_alloc_uar(dev, &uuari->uars[i].index); + for (i = 0; i < bfregi->num_uars; i++) { + err = mlx5_cmd_alloc_uar(dev, &bfregi->uars[i].index); if (err) goto out_count; - addr = dev->iseg_base + ((phys_addr_t)(uuari->uars[i].index) << PAGE_SHIFT); - uuari->uars[i].map = ioremap(addr, PAGE_SIZE); - if (!uuari->uars[i].map) { - mlx5_cmd_free_uar(dev, uuari->uars[i].index); + addr = dev->iseg_base + ((phys_addr_t)(bfregi->uars[i].index) << PAGE_SHIFT); + bfregi->uars[i].map = ioremap(addr, PAGE_SIZE); + if (!bfregi->uars[i].map) { + mlx5_cmd_free_uar(dev, bfregi->uars[i].index); err = -ENOMEM; goto out_count; } mlx5_core_dbg(dev, "allocated uar index 0x%x, mmaped at %p\n", - uuari->uars[i].index, uuari->uars[i].map); + bfregi->uars[i].index, bfregi->uars[i].map); } - for (i = 0; i < tot_uuars; i++) { - bf = &uuari->bfs[i]; + for (i = 0; i < tot_bfregs; i++) { + bf = &bfregi->bfs[i]; bf->buf_size = (1 << MLX5_CAP_GEN(dev, log_bf_reg_size)) / 2; - bf->uar = &uuari->uars[i / MLX5_BF_REGS_PER_PAGE]; - bf->regreg = uuari->uars[i / MLX5_BF_REGS_PER_PAGE].map; + bf->uar = &bfregi->uars[i / MLX5_BFREGS_PER_UAR]; + bf->regreg = bfregi->uars[i / MLX5_BFREGS_PER_UAR].map; bf->reg = NULL; /* Add WC support */ - bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * + bf->offset = (i % MLX5_BFREGS_PER_UAR) * (1 << MLX5_CAP_GEN(dev, log_bf_reg_size)) + MLX5_BF_OFFSET; - bf->need_lock = need_uuar_lock(i); + bf->need_lock = need_bfreg_lock(i); spin_lock_init(&bf->lock); spin_lock_init(&bf->lock32); - bf->uuarn = i; + bf->bfregn = i; } return 0; out_count: for (i--; i >= 0; i--) { - iounmap(uuari->uars[i].map); - mlx5_cmd_free_uar(dev, uuari->uars[i].index); + iounmap(bfregi->uars[i].map); + mlx5_cmd_free_uar(dev, bfregi->uars[i].index); } - kfree(uuari->count); + kfree(bfregi->count); out_bitmap: - kfree(uuari->bitmap); + kfree(bfregi->bitmap); out_bfs: - kfree(uuari->bfs); + kfree(bfregi->bfs); out_uars: - kfree(uuari->uars); + kfree(bfregi->uars); return err; } -int 
mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari) +int mlx5_free_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi) { - int i = uuari->num_uars; + int i = bfregi->num_uars; for (i--; i >= 0; i--) { - iounmap(uuari->uars[i].map); - mlx5_cmd_free_uar(dev, uuari->uars[i].index); + iounmap(bfregi->uars[i].map); + mlx5_cmd_free_uar(dev, bfregi->uars[i].index); } - kfree(uuari->count); - kfree(uuari->bitmap); - kfree(uuari->bfs); - kfree(uuari->uars); + kfree(bfregi->count); + kfree(bfregi->bitmap); + kfree(bfregi->bfs); + kfree(bfregi->uars); return 0; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 3ccaeff15a80..aa851c51ab59 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -212,10 +212,11 @@ enum { }; enum { - MLX5_BF_REGS_PER_PAGE = 4, - MLX5_MAX_UAR_PAGES = 1 << 8, - MLX5_NON_FP_BF_REGS_PER_PAGE = 2, - MLX5_MAX_UUARS = MLX5_MAX_UAR_PAGES * MLX5_NON_FP_BF_REGS_PER_PAGE, + MLX5_BFREGS_PER_UAR = 4, + MLX5_MAX_UARS = 1 << 8, + MLX5_NON_FP_BFREGS_PER_UAR = 2, + MLX5_MAX_BFREGS = MLX5_MAX_UARS * + MLX5_NON_FP_BFREGS_PER_UAR, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index cfa49bca009c..3d07e25b3bf1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -188,16 +188,16 @@ enum mlx5_eq_type { #endif }; -struct mlx5_uuar_info { +struct mlx5_bfreg_info { struct mlx5_uar *uars; int num_uars; - int num_low_latency_uuars; + int num_low_latency_bfregs; unsigned long *bitmap; unsigned int *count; struct mlx5_bf *bfs; /* - * protect uuar allocation data structs + * protect bfreg allocation data structs */ struct mutex lock; u32 ver; @@ -217,7 +217,7 @@ struct mlx5_bf { /* serialize 64 bit writes when done as two 32 bit accesses */ spinlock_t lock32; - int uuarn; + int bfregn; }; struct mlx5_cmd_first { @@ -579,7 +579,7 @@ struct mlx5_priv { struct mlx5_eq_table eq_table; struct msix_entry *msix_arr; struct mlx5_irq_info *irq_info; - struct mlx5_uuar_info uuari; + struct mlx5_bfreg_info bfregi; MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock); /* pages stuff */ @@ -903,8 +903,8 @@ void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); -int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); -int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); +int mlx5_alloc_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi); +int mlx5_free_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi); int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar, bool map_wc); void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar); diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index fae6cdaeb56d..86a8f30060f3 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -61,13 +61,13 @@ enum { */ struct mlx5_ib_alloc_ucontext_req { - __u32 total_num_uuars; - __u32 num_low_latency_uuars; + __u32 total_num_bfregs; + __u32 num_low_latency_bfregs; }; struct mlx5_ib_alloc_ucontext_req_v2 { - __u32 total_num_uuars; - __u32 num_low_latency_uuars; + __u32 total_num_bfregs; + __u32 num_low_latency_bfregs; __u32 flags; __u32 comp_mask; __u8 max_cqe_version; @@ -88,7 +88,7 @@ enum mlx5_user_cmds_supp_uhw { struct 
mlx5_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 bf_reg_size; - __u32 tot_uuars; + __u32 tot_bfregs; __u32 cache_line_size; __u16 max_sq_desc_sz; __u16 max_rq_desc_sz; @@ -241,7 +241,7 @@ struct mlx5_ib_create_qp_rss { }; struct mlx5_ib_create_qp_resp { - __u32 uuar_index; + __u32 bfreg_index; }; struct mlx5_ib_alloc_mw { -- cgit v1.2.3 From aec745e2c520bf2d046684a284dac11c25d8e717 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 7 Jan 2017 17:06:33 -0500 Subject: net-tc: remove unused tc_verd fields Remove the last reference to tc_verd's munge and redirect ttl bits. These fields are no longer used. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index cb4bcdc58543..c769f71972f5 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -17,10 +17,6 @@ /* verdict bit breakdown * -bit 0: when set -> this packet has been munged already - -bit 1: when set -> It is ok to munge this packet - bit 2,3,4,5: Reclassify counter - sort of reverse TTL - if exceeded assume loop @@ -31,8 +27,6 @@ bit 6,7: Where this packet was last seen bit 8: when set --> Request not to classify on ingress. -bits 9,10,11: redirect counter - redirect TTL. Loop avoidance - * * */ @@ -56,7 +50,6 @@ bits 9,10,11: redirect counter - redirect TTL. Loop avoidance #define SET_TC_AT(v,n) ((V_TC_AT(n)) | (v & ~M_TC_AT)) #define MAX_REC_LOOP 4 -#define MAX_RED_LOOP 4 #endif /* Action attributes */ -- cgit v1.2.3 From d6264071ce7d100a2b7c1f295167796ab5178caf Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 7 Jan 2017 17:06:34 -0500 Subject: net-tc: make MAX_RECLASSIFY_LOOP local This field is no longer kept in tc_verd. Remove it from the global definition of that struct. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 5 ----- net/sched/sch_api.c | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c769f71972f5..bba23dbb3ab6 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -17,9 +17,6 @@ /* verdict bit breakdown * -bit 2,3,4,5: Reclassify counter - sort of reverse TTL - if exceeded -assume loop - bit 6,7: Where this packet was last seen 0: Above the transmit example at the socket level 1: on the Ingress @@ -48,8 +45,6 @@ bit 8: when set --> Request not to classify on ingress. 
#define G_TC_AT(x) _TC_GETVALUE(x,S_TC_AT,M_TC_AT) #define V_TC_AT(x) _TC_MAKEVALUE(x,S_TC_AT) #define SET_TC_AT(v,n) ((V_TC_AT(n)) | (v & ~M_TC_AT)) - -#define MAX_REC_LOOP 4 #endif /* Action attributes */ diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d7b93429f0cc..ef53ede11590 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1861,6 +1861,7 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, { __be16 protocol = tc_skb_protocol(skb); #ifdef CONFIG_NET_CLS_ACT + const int max_reclassify_loop = 4; const struct tcf_proto *old_tp = tp; int limit = 0; @@ -1885,7 +1886,7 @@ reclassify: return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: - if (unlikely(limit++ >= MAX_REC_LOOP)) { + if (unlikely(limit++ >= max_reclassify_loop)) { net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", tp->q->ops->id, tp->prio & 0xffff, ntohs(tp->protocol)); -- cgit v1.2.3 From e7246e122aaa99ebbb8ad7da80f35a20577bd8af Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 7 Jan 2017 17:06:35 -0500 Subject: net-tc: extract skip classify bit from tc_verd Packets sent by the IFB device skip subsequent tc classification. A single bit governs this state. Move it out of tc_verd in anticipation of removing that __u16 completely. The new bitfield tc_skip_classify temporarily uses one bit of a hole, until tc_verd is removed completely in a follow-up patch. Remove the bit hole comment. It could be 2, 3, 4 or 5 bits long. With that many options, little value in documenting it. Introduce a helper function to deduplicate the logic in the two sites that check this bit. The field tc_skip_classify is set only in IFB on skbs cloned in act_mirred, so original packet sources do not have to clear the bit when reusing packets (notably, pktgen and octeon). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ifb.c | 2 +- include/linux/skbuff.h | 5 ++++- include/net/sch_generic.h | 11 +++++++++++ include/uapi/linux/pkt_cls.h | 6 ------ net/core/dev.c | 10 +++------- net/sched/act_api.c | 11 ++++------- 6 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 082534e187fc..442c4c4a9606 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -81,7 +81,7 @@ static void ifb_ri_tasklet(unsigned long _txp) u32 from = G_TC_FROM(skb->tc_verd); skb->tc_verd = 0; - skb->tc_verd = SET_TC_NCLS(skb->tc_verd); + skb->tc_skip_classify = 1; u64_stats_update_begin(&txp->tsync); txp->tx_packets++; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b53c0cfd417e..570f60ec6cb4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -589,6 +589,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs + * @tc_skip_classify: do not classify packet. 
set by IFB device * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag @@ -749,7 +750,9 @@ struct sk_buff { #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; #endif - /* 2, 4 or 5 bit hole */ +#ifdef CONFIG_NET_CLS_ACT + __u8 tc_skip_classify:1; +#endif #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 498f81b229a4..857356f2d74b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -418,6 +418,17 @@ static inline bool skb_at_tc_ingress(const struct sk_buff *skb) #endif } +static inline bool skb_skip_tc_classify(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_CLS_ACT + if (skb->tc_skip_classify) { + skb->tc_skip_classify = 0; + return true; + } +#endif + return false; +} + /* Reset all TX qdiscs greater then index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index bba23dbb3ab6..1eed5d7509bc 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -22,8 +22,6 @@ bit 6,7: Where this packet was last seen 1: on the Ingress 2: on the Egress -bit 8: when set --> Request not to classify on ingress. - * * */ @@ -36,10 +34,6 @@ bit 8: when set --> Request not to classify on ingress. #define AT_INGRESS 0x1 #define AT_EGRESS 0x2 -#define TC_NCLS _TC_MAKEMASK1(8) -#define SET_TC_NCLS(v) ( TC_NCLS | (v & ~TC_NCLS)) -#define CLR_TC_NCLS(v) ( v & ~TC_NCLS) - #define S_TC_AT _TC_MAKE32(12) #define M_TC_AT _TC_MAKEMASK(2,S_TC_AT) #define G_TC_AT(x) _TC_GETVALUE(x,S_TC_AT,M_TC_AT) diff --git a/net/core/dev.c b/net/core/dev.c index 56818f7eab2b..e39e35d2e082 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4093,12 +4093,8 @@ another_round: goto out; } -#ifdef CONFIG_NET_CLS_ACT - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - goto ncls; - } -#endif + if (skb_skip_tc_classify(skb)) + goto skip_classify; if (pfmemalloc) goto skip_taps; @@ -4128,8 +4124,8 @@ skip_taps: #endif #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; -ncls: #endif +skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2095c83ce773..f04715a57300 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -426,11 +426,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, { int ret = -1, i; - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - ret = TC_ACT_OK; - goto exec_done; - } + if (skb_skip_tc_classify(skb)) + return TC_ACT_OK; + for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; @@ -439,9 +437,8 @@ repeat: if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) - goto exec_done; + break; } -exec_done: return ret; } EXPORT_SYMBOL(tcf_action_exec); -- cgit v1.2.3 From a5135bcfba7345031df45e02cd150a45add47cf8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 7 Jan 2017 17:06:36 -0500 Subject: net-tc: convert tc_verd to integer bitfields Extract the remaining two fields from tc_verd and remove the __u16 completely. TC_AT and TC_FROM are converted to equivalent two-bit integer fields tc_at and tc_from. Where possible, use existing helper skb_at_tc_ingress when reading tc_at. Introduce helper skb_reset_tc to clear fields. 
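In sketch form, the conversion looks as follows (editor's illustration with invented function names; only the fields and helpers come from this patch, and CONFIG_NET_CLS_ACT is assumed):

#include <linux/skbuff.h>
#include <net/sch_generic.h>

/* Old code packed both values into skb->tc_verd with shift/mask
 * macros; new code reads and writes plain two-bit fields.
 */
static void mark_from_ingress(struct sk_buff *skb)
{
	/* was: skb->tc_verd = SET_TC_FROM(skb->tc_verd, AT_INGRESS); */
	skb->tc_from = AT_INGRESS;
}

static bool seen_at_ingress(const struct sk_buff *skb)
{
	/* was: return G_TC_AT(skb->tc_verd) & AT_INGRESS; */
	return skb_at_tc_ingress(skb);
}

static void forget_tc_state(struct sk_buff *skb)
{
	/* was: skb->tc_verd = 0; */
	skb_reset_tc(skb);	/* clears both tc_at and tc_from */
}
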
Not documenting tc_from and tc_at, because they will be replaced with single bit fields in follow-on patches. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ifb.c | 7 +++---- drivers/staging/octeon/ethernet-tx.c | 5 ++--- include/linux/skbuff.h | 6 ++---- include/net/sch_generic.h | 10 +++++++++- include/uapi/linux/pkt_cls.h | 31 ------------------------------- net/core/dev.c | 10 ++++------ net/core/pktgen.c | 4 +--- net/core/skbuff.c | 3 --- net/sched/act_ife.c | 7 +++---- net/sched/act_mirred.c | 9 ++++----- net/sched/sch_netem.c | 2 +- 11 files changed, 29 insertions(+), 65 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 442c4c4a9606..b73b6b6c066b 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -78,9 +78,9 @@ static void ifb_ri_tasklet(unsigned long _txp) } while ((skb = __skb_dequeue(&txp->tq)) != NULL) { - u32 from = G_TC_FROM(skb->tc_verd); + u32 from = skb->tc_from; - skb->tc_verd = 0; + skb_reset_tc(skb); skb->tc_skip_classify = 1; u64_stats_update_begin(&txp->tsync); @@ -239,7 +239,6 @@ static void ifb_setup(struct net_device *dev) static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ifb_dev_private *dp = netdev_priv(dev); - u32 from = G_TC_FROM(skb->tc_verd); struct ifb_q_private *txp = dp->tx_private + skb_get_queue_mapping(skb); u64_stats_update_begin(&txp->rsync); @@ -247,7 +246,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) txp->rx_bytes += skb->len; u64_stats_update_end(&txp->rsync); - if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->skb_iif) { + if (skb->tc_from == AT_STACK || !skb->skb_iif) { dev_kfree_skb(skb); dev->stats.rx_dropped++; return NETDEV_TX_OK; diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c index 6b4c20872323..0b8053205091 100644 --- a/drivers/staging/octeon/ethernet-tx.c +++ b/drivers/staging/octeon/ethernet-tx.c @@ -23,6 +23,7 @@ #endif /* CONFIG_XFRM */ #include +#include #include @@ -369,9 +370,7 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev) #ifdef CONFIG_NET_SCHED skb->tc_index = 0; -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; -#endif /* CONFIG_NET_CLS_ACT */ + skb_reset_tc(skb); #endif /* CONFIG_NET_SCHED */ #endif /* REUSE_SKBUFFS_WITHOUT_FREE */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 570f60ec6cb4..f738d09947b2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -599,7 +599,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index - * @tc_verd: traffic control verdict * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue @@ -752,13 +751,12 @@ struct sk_buff { #endif #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; + __u8 tc_at:2; + __u8 tc_from:2; #endif #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ -#ifdef CONFIG_NET_CLS_ACT - __u16 tc_verd; /* traffic control verdict */ -#endif #endif union { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 857356f2d74b..f80dba516964 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -409,10 +409,18 @@ bool tcf_destroy(struct tcf_proto *tp, bool force); void tcf_destroy_chain(struct tcf_proto __rcu **fl); int skb_do_redirect(struct sk_buff *); +static inline void 
skb_reset_tc(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_CLS_ACT + skb->tc_at = 0; + skb->tc_from = 0; +#endif +} + static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT - return G_TC_AT(skb->tc_verd) & AT_INGRESS; + return skb->tc_at & AT_INGRESS; #else return false; #endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 1eed5d7509bc..cee753a7a40c 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -5,40 +5,9 @@ #include #ifdef __KERNEL__ -/* I think i could have done better macros ; for now this is stolen from - * some arch/mips code - jhs -*/ -#define _TC_MAKE32(x) ((x)) - -#define _TC_MAKEMASK1(n) (_TC_MAKE32(1) << _TC_MAKE32(n)) -#define _TC_MAKEMASK(v,n) (_TC_MAKE32((_TC_MAKE32(1)<<(v))-1) << _TC_MAKE32(n)) -#define _TC_MAKEVALUE(v,n) (_TC_MAKE32(v) << _TC_MAKE32(n)) -#define _TC_GETVALUE(v,n,m) ((_TC_MAKE32(v) & _TC_MAKE32(m)) >> _TC_MAKE32(n)) - -/* verdict bit breakdown - * -bit 6,7: Where this packet was last seen -0: Above the transmit example at the socket level -1: on the Ingress -2: on the Egress - - * - * */ - -#define S_TC_FROM _TC_MAKE32(6) -#define M_TC_FROM _TC_MAKEMASK(2,S_TC_FROM) -#define G_TC_FROM(x) _TC_GETVALUE(x,S_TC_FROM,M_TC_FROM) -#define V_TC_FROM(x) _TC_MAKEVALUE(x,S_TC_FROM) -#define SET_TC_FROM(v,n) ((V_TC_FROM(n)) | (v & ~M_TC_FROM)) #define AT_STACK 0x0 #define AT_INGRESS 0x1 #define AT_EGRESS 0x2 - -#define S_TC_AT _TC_MAKE32(12) -#define M_TC_AT _TC_MAKEMASK(2,S_TC_AT) -#define G_TC_AT(x) _TC_GETVALUE(x,S_TC_AT,M_TC_AT) -#define V_TC_AT(x) _TC_MAKEVALUE(x,S_TC_AT) -#define SET_TC_AT(v,n) ((V_TC_AT(n)) | (v & ~M_TC_AT)) #endif /* Action attributes */ diff --git a/net/core/dev.c b/net/core/dev.c index e39e35d2e082..8b5d6d033473 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3153,7 +3153,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) if (!cl) return skb; - /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set + /* skb->tc_at and qdisc_skb_cb(skb)->pkt_len were already set * earlier by the caller. 
*/ qdisc_bstats_cpu_update(cl->q, skb); @@ -3320,7 +3320,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); + skb->tc_at = AT_EGRESS; # ifdef CONFIG_NET_EGRESS if (static_key_false(&egress_needed)) { skb = sch_handle_egress(skb, &rc, dev); @@ -3920,7 +3920,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + skb->tc_at = AT_INGRESS; qdisc_bstats_cpu_update(cl->q, skb); switch (tc_classify(skb, cl, &cl_res, false)) { @@ -4122,9 +4122,7 @@ skip_taps: goto out; } #endif -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; -#endif + skb_reset_tc(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8e69ce472236..96947f5d41e4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; /* reset reclass/redir ttl */ -#endif + skb_reset_tc(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a03730fbc1a..adec4bf807d8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -878,9 +878,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); -#ifdef CONFIG_NET_CLS_ACT - CHECK_SKB_FIELD(tc_verd); -#endif #endif } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 80b848d3f096..921fb20eaa7c 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -736,12 +736,11 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, u16 metalen = ife_get_sz(skb, ife); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; unsigned int skboff = skb->dev->hard_header_len; - u32 at = G_TC_AT(skb->tc_verd); int new_len = skb->len + hdrm; bool exceed_mtu = false; int err; - if (at & AT_EGRESS) { + if (!skb_at_tc_ingress(skb)) { if (new_len > skb->dev->mtu) exceed_mtu = true; } @@ -773,7 +772,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - if (!(at & AT_EGRESS)) + if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); iethh = (struct ethhdr *)skb->data; @@ -816,7 +815,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(oethh->h_dest, iethh->h_dest); oethh->h_proto = htons(ife->eth_type); - if (!(at & AT_EGRESS)) + if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); spin_unlock(&ife->tcf_lock); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 2d9fa6e0a1b4..8543279bba49 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -170,7 +170,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, int retval, err = 0; int m_eaction; int mac_len; - u32 at; tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -191,7 +190,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, goto out; } - at = G_TC_AT(skb->tc_verd); skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out; @@ -200,8 +198,9 @@ static int tcf_mirred(struct sk_buff *skb, 
const struct tc_action *a, * and devices expect a mac header on xmit, then mac push/pull is * needed. */ - if (at != tcf_mirred_act_direction(m_eaction) && m_mac_header_xmit) { - if (at & AT_EGRESS) { + if (skb->tc_at != tcf_mirred_act_direction(m_eaction) && + m_mac_header_xmit) { + if (!skb_at_tc_ingress(skb)) { /* caught at egress, act ingress: pull mac */ mac_len = skb_network_header(skb) - skb_mac_header(skb); skb_pull_rcsum(skb2, mac_len); @@ -213,7 +212,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, /* mirror is always swallowed */ if (tcf_mirred_is_act_redirect(m_eaction)) - skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); + skb2->tc_from = skb2->tc_at; skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bcfadfdea8e0..bb5c638b6852 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -626,7 +626,7 @@ deliver: * If it's at ingress let's pretend the delay is * from the network (tstamp will be updated). */ - if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) + if (skb->tc_from & AT_INGRESS) skb->tstamp = 0; #endif -- cgit v1.2.3 From bc31c905e946b5c55df5d2938335e78ffb3157ca Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 7 Jan 2017 17:06:38 -0500 Subject: net-tc: convert tc_from to tc_from_ingress and tc_redirected The tc_from field fulfills two roles. It encodes whether a packet was redirected by an act_mirred device and, if so, whether act_mirred was called on ingress or egress. Split it into separate fields. The information is needed by the special IFB loop, where packets are taken out of the normal path by act_mirred, forwarded to IFB, then reinjected at their original location (ingress or egress) by IFB. The IFB device cannot use skb->tc_at_ingress, because that may have been overwritten as the packet travels from act_mirred to ifb_xmit, when it passes through tc_classify on the IFB egress path. Cache this value in skb->tc_from_ingress. That field is valid only if a packet arriving at ifb_xmit came from act_mirred. Other packets can be crafted to reach ifb_xmit. These must be dropped. Set tc_redirected on redirection and drop all packets that do not have this bit set. Both fields are set only on cloned skbs in tc actions, so original packet sources do not have to clear the bit when reusing packets (notably, pktgen and octeon). Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- drivers/net/ifb.c | 13 +++++-------- include/linux/skbuff.h | 5 ++++- include/net/sch_generic.h | 2 +- include/uapi/linux/pkt_cls.h | 6 ------ net/sched/act_mirred.c | 6 ++++-- net/sched/sch_netem.c | 2 +- 6 files changed, 15 insertions(+), 19 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index b73b6b6c066b..312fce7302d3 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -78,9 +78,7 @@ static void ifb_ri_tasklet(unsigned long _txp) } while ((skb = __skb_dequeue(&txp->tq)) != NULL) { - u32 from = skb->tc_from; - - skb_reset_tc(skb); + skb->tc_redirected = 0; skb->tc_skip_classify = 1; u64_stats_update_begin(&txp->tsync); @@ -101,13 +99,12 @@ static void ifb_ri_tasklet(unsigned long _txp) rcu_read_unlock(); skb->skb_iif = txp->dev->ifindex; - if (from & AT_EGRESS) { + if (!skb->tc_from_ingress) { dev_queue_xmit(skb); - } else if (from & AT_INGRESS) { + } else { skb_pull(skb, skb->mac_len); netif_receive_skb(skb); - } else - BUG(); + } } if (__netif_tx_trylock(txq)) { @@ -246,7 +243,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) txp->rx_bytes += skb->len; u64_stats_update_end(&txp->rsync); - if (skb->tc_from == AT_STACK || !skb->skb_iif) { + if (!skb->tc_redirected || !skb->skb_iif) { dev_kfree_skb(skb); dev->stats.rx_dropped++; return NETDEV_TX_OK; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fab3f87e9bd1..3149a88de548 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -591,6 +591,8 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @ipvs_property: skbuff is owned by ipvs * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress + * @tc_redirected: packet was redirected by a tc action + * @tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag @@ -753,7 +755,8 @@ struct sk_buff { #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; - __u8 tc_from:2; + __u8 tc_redirected:1; + __u8 tc_from_ingress:1; #endif #ifdef CONFIG_NET_SCHED diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4bd6d5387209..e2f426f6d62f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -412,7 +412,7 @@ int skb_do_redirect(struct sk_buff *); static inline void skb_reset_tc(struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT - skb->tc_from = 0; + skb->tc_redirected = 0; #endif } diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index cee753a7a40c..a081efbd61a2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -4,12 +4,6 @@ #include #include -#ifdef __KERNEL__ -#define AT_STACK 0x0 -#define AT_INGRESS 0x1 -#define AT_EGRESS 0x2 -#endif - /* Action attributes */ enum { TCA_ACT_UNSPEC, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e832c62fd705..84682f02b611 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -211,8 +211,10 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } /* mirror is always swallowed */ - if (tcf_mirred_is_act_redirect(m_eaction)) - skb2->tc_from = skb_at_tc_ingress(skb) ? 
AT_INGRESS : AT_EGRESS; + if (tcf_mirred_is_act_redirect(m_eaction)) { + skb2->tc_redirected = 1; + skb2->tc_from_ingress = skb2->tc_at_ingress; + } skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bb5c638b6852..c8bb62a1e744 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -626,7 +626,7 @@ deliver: * If it's at ingress let's pretend the delay is * from the network (tstamp will be updated). */ - if (skb->tc_from & AT_INGRESS) + if (skb->tc_redirected && skb->tc_from_ingress) skb->tstamp = 0; #endif -- cgit v1.2.3 From bd2522b168847106c1885f0319a2833bdf88bf9a Mon Sep 17 00:00:00 2001 From: Andrzej Zaborowski Date: Fri, 6 Jan 2017 16:33:43 -0500 Subject: cfg80211: NL80211_ATTR_SOCKET_OWNER support for CMD_CONNECT Disconnect or deauthenticate when the owning socket is closed if this flag is supplied to CMD_CONNECT or CMD_ASSOCIATE. This may be used to ensure a userspace daemon doesn't leave an unmanaged connection behind. In some situations it would be possible to account for that, to some degree, in the daemon restart code or in the up/down scripts without the use of this attribute. But there will be systems where the daemon can go away for varying periods without a warning due to local resource management. Signed-off-by: Andrew Zaborowski Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++++ include/uapi/linux/nl80211.h | 2 ++ net/wireless/core.c | 3 +++ net/wireless/core.h | 1 + net/wireless/mlme.c | 5 +++++ net/wireless/nl80211.c | 26 +++++++++++++++++++++++++- net/wireless/sme.c | 33 +++++++++++++++++++++++++++++++++ 7 files changed, 76 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 41a9ecd82ca0..cb13789ebaef 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3865,6 +3865,9 @@ struct cfg80211_cached_keys; * @conn: (private) cfg80211 software SME connection state machine data * @connect_keys: (private) keys to set after connection is established * @conn_bss_type: connecting/connected BSS type + * @conn_owner_nlportid: (private) connection owner socket port ID + * @disconnect_wk: (private) auto-disconnect work + * @disconnect_bssid: (private) the BSSID to use for auto-disconnect * @ibss_fixed: (private) IBSS is using fixed BSSID * @ibss_dfs_possible: (private) IBSS may change to a DFS channel * @event_list: (private) list for internal event processing @@ -3896,6 +3899,10 @@ struct wireless_dev { struct cfg80211_conn *conn; struct cfg80211_cached_keys *connect_keys; enum ieee80211_bss_type conn_bss_type; + u32 conn_owner_nlportid; + + struct work_struct disconnect_wk; + u8 disconnect_bssid[ETH_ALEN]; struct list_head event_list; spinlock_t event_lock; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d74e10b1246a..174f4b30e804 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1820,6 +1820,8 @@ enum nl80211_commands { * and remove functions. NAN notifications will be sent in unicast to that * socket. Without this attribute, any socket can add functions and the * notifications will be sent to the %NL80211_MCGRP_NAN multicast group. + * If set during %NL80211_CMD_ASSOCIATE or %NL80211_CMD_CONNECT the + * station will deauthenticate when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator.
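From userspace the new behavior is opt-in: supply the flag attribute on the connect request and keep the netlink socket open for as long as the connection should live. A minimal sketch using libnl-3 (an assumption; any generic netlink library works, and error handling, message freeing and BSSID/frequency attributes are elided):

/* Userspace sketch (libnl-3 assumed): request a connection owned by
 * this netlink socket. Closing the socket -- e.g. when the daemon
 * exits -- makes cfg80211 tear the connection down automatically.
 */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nl80211.h>

static int owned_connect(struct nl_sock *sk, int ifindex,
			 const void *ssid, int ssid_len)
{
	int family = genl_ctrl_resolve(sk, "nl80211");
	struct nl_msg *msg = nlmsg_alloc();

	genlmsg_put(msg, 0, 0, family, 0, 0, NL80211_CMD_CONNECT, 0);
	nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);
	nla_put(msg, NL80211_ATTR_SSID, ssid_len, ssid);
	nla_put_flag(msg, NL80211_ATTR_SOCKET_OWNER);	/* own the connection */
	return nl_send_auto(sk, msg);			/* keep sk open */
}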
diff --git a/net/wireless/core.c b/net/wireless/core.c index 158c59ecf90a..903fc419217a 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1142,6 +1142,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) dev->priv_flags |= IFF_DONT_BRIDGE; + INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); break; case NETDEV_GOING_DOWN: @@ -1230,6 +1232,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, #ifdef CONFIG_CFG80211_WEXT kzfree(wdev->wext.keys); #endif + flush_work(&wdev->disconnect_wk); } /* * synchronise (so that we won't find this netdev diff --git a/net/wireless/core.h b/net/wireless/core.h index bc8ba6e57519..ba42055a036d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -400,6 +400,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, const u8 *resp_ie, size_t resp_ie_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +void cfg80211_autodisconnect_wk(struct work_struct *work); /* SME implementation */ void cfg80211_conn_work(struct work_struct *work); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 4646cf5695b9..1c63a77aea34 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -345,6 +345,11 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) return 0; + if (ether_addr_equal(wdev->disconnect_bssid, bssid) || + (wdev->current_bss && + ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + wdev->conn_owner_nlportid = 0; + return rdev_deauth(rdev, dev, &req); } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fed33ec20a71..b378d0a04003 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8050,8 +8050,17 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); + err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, ssid, ssid_len, &req); + + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + dev->ieee80211_ptr->conn_owner_nlportid = + info->snd_portid; + memcpy(dev->ieee80211_ptr->disconnect_bssid, + bssid, ETH_ALEN); + } + wdev_unlock(dev->ieee80211_ptr); } @@ -8770,11 +8779,24 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } wdev_lock(dev->ieee80211_ptr); + err = cfg80211_connect(rdev, dev, &connect, connkeys, connect.prev_bssid); - wdev_unlock(dev->ieee80211_ptr); if (err) kzfree(connkeys); + + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + if (connect.bssid) + memcpy(dev->ieee80211_ptr->disconnect_bssid, + connect.bssid, ETH_ALEN); + else + memset(dev->ieee80211_ptr->disconnect_bssid, + 0, ETH_ALEN); + } + + wdev_unlock(dev->ieee80211_ptr); + return err; } @@ -14491,6 +14513,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb, if (wdev->owner_nlportid == notify->portid) schedule_destroy_work = true; + else if (wdev->conn_owner_nlportid == notify->portid) + schedule_work(&wdev->disconnect_wk); } spin_lock_bh(&rdev->beacon_registrations_lock); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 5e0d19380302..46693913fcea 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -727,6 +727,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, 
kzfree(wdev->connect_keys); wdev->connect_keys = NULL; wdev->ssid_len = 0; + wdev->conn_owner_nlportid = 0; if (bss) { cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wdev->wiphy, bss); @@ -955,6 +956,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, wdev->current_bss = NULL; wdev->ssid_len = 0; + wdev->conn_owner_nlportid = 0; nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); @@ -1098,6 +1100,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, kzfree(wdev->connect_keys); wdev->connect_keys = NULL; + wdev->conn_owner_nlportid = 0; + if (wdev->conn) err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) @@ -1107,3 +1111,32 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, return err; } + +/* + * Used to clean up after the connection / connection attempt owner socket + * disconnects + */ +void cfg80211_autodisconnect_wk(struct work_struct *work) +{ + struct wireless_dev *wdev = + container_of(work, struct wireless_dev, disconnect_wk); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + + wdev_lock(wdev); + + if (wdev->conn_owner_nlportid) { + /* + * Use disconnect_bssid if still connecting and ops->disconnect + * not implemented. Otherwise we can use cfg80211_disconnect. + */ + if (rdev->ops->disconnect || wdev->current_bss) + cfg80211_disconnect(rdev, wdev->netdev, + WLAN_REASON_DEAUTH_LEAVING, true); + else + cfg80211_mlme_deauth(rdev, wdev->netdev, + wdev->disconnect_bssid, NULL, 0, + WLAN_REASON_DEAUTH_LEAVING, false); + } + + wdev_unlock(wdev); +} -- cgit v1.2.3 From 30aa60b3bd12bd79b5324b7b595bd3446ab24b52 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 3 Jan 2017 23:55:27 +0200 Subject: IB/mlx5: Support 4k UAR for libmlx5 Add fields to structs to convey to the kernel an indication of whether the library supports multiple UARs per page, and return to the library the size of a UAR based on the queried value. Signed-off-by: Eli Cohen Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 21 +++++++- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 2 + drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 ++-- .../net/ethernet/mellanox/mlx5/core/en_common.c | 12 +---- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 21 ++++---- drivers/net/ethernet/mellanox/mlx5/core/uar.c | 56 ---------------------- include/linux/mlx5/cq.h | 2 +- include/linux/mlx5/driver.h | 12 ----- include/uapi/rdma/mlx5-abi.h | 7 +++ 9 files changed, 42 insertions(+), 100 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 664067239de5..a191b9327b0c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -992,6 +992,12 @@ out: return err; } +static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps) +{ + mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n", + caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n"); +} + static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, struct mlx5_ib_alloc_ucontext_req_v2 *req, u32 *num_sys_pages) @@ -1122,6 +1128,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.cqe_version = min_t(__u8, (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), req.max_cqe_version); + resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; + resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+ MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1; resp.response_length = min(offsetof(typeof(resp), response_length) + sizeof(resp.response_length), udata->outlen); @@ -1129,7 +1139,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (!context) return ERR_PTR(-ENOMEM); - lib_uar_4k = false; + lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; bfregi = &context->bfregi; /* updates req->total_num_bfregs */ @@ -1209,6 +1219,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, sizeof(resp.reserved2); } + if (field_avail(typeof(resp), log_uar_size, udata->outlen)) + resp.response_length += sizeof(resp.log_uar_size); + + if (field_avail(typeof(resp), num_uars_per_page, udata->outlen)) + resp.response_length += sizeof(resp.num_uars_per_page); + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_td; @@ -1216,7 +1232,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, bfregi->ver = ver; bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; context->cqe_version = resp.cqe_version; - context->lib_caps = false; + context->lib_caps = req.lib_caps; + print_lib_caps(dev, context->lib_caps); return &context->ibucontext; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 32d4af9b594d..336d4738b807 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -179,6 +179,8 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", cq->cqn); + cq->uar = dev->priv.uar; + return 0; err_cmd: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3037631570b1..a473cea10c16 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -465,7 +465,6 @@ struct mlx5e_sq { /* read only */ struct mlx5_wq_cyc wq; u32 dma_fifo_mask; - void __iomem *uar_map; struct netdev_queue *txq; u32 sqn; u16 bf_buf_size; @@ -479,7 +478,7 @@ struct mlx5e_sq { /* control path */ struct mlx5_wq_ctrl wq_ctrl; - struct mlx5_uar uar; + struct mlx5_sq_bfreg bfreg; struct mlx5e_channel *channel; int tc; u32 rate_limit; @@ -806,7 +805,7 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq, struct mlx5_wqe_ctrl_seg *ctrl, int bf_sz) { - u16 ofst = MLX5_BF_OFFSET + sq->bf_offset; + u16 ofst = sq->bf_offset; /* ensure wqe is visible to device before updating doorbell record */ dma_wmb(); @@ -818,9 +817,9 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq, */ wmb(); if (bf_sz) - __iowrite64_copy(sq->uar_map + ofst, ctrl, bf_sz); + __iowrite64_copy(sq->bfreg.map + ofst, ctrl, bf_sz); else - mlx5_write64((__be32 *)ctrl, sq->uar_map + ofst, NULL); + mlx5_write64((__be32 *)ctrl, sq->bfreg.map + ofst, NULL); /* flush the write-combining mapped buffer */ wmb(); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index f175518ff07a..bd898d8deda0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -89,16 +89,10 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) struct mlx5e_resources *res = &mdev->mlx5e_res; int err; - err = mlx5_alloc_map_uar(mdev, &res->cq_uar, false); - if (err) { - mlx5_core_err(mdev, 
"alloc_map uar failed, %d\n", err); - return err; - } - err = mlx5_core_alloc_pd(mdev, &res->pdn); if (err) { mlx5_core_err(mdev, "alloc pd failed, %d\n", err); - goto err_unmap_free_uar; + return err; } err = mlx5_core_alloc_transport_domain(mdev, &res->td.tdn); @@ -121,9 +115,6 @@ err_dealloc_transport_domain: mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); err_dealloc_pd: mlx5_core_dealloc_pd(mdev, res->pdn); -err_unmap_free_uar: - mlx5_unmap_free_uar(mdev, &res->cq_uar); - return err; } @@ -134,7 +125,6 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) mlx5_core_destroy_mkey(mdev, &res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); mlx5_core_dealloc_pd(mdev, res->pdn); - mlx5_unmap_free_uar(mdev, &res->cq_uar); } int mlx5e_refresh_tirs_self_loopback(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5ff86f0ecb7b..c32754b1598e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -991,7 +991,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c, sq->channel = c; sq->tc = tc; - err = mlx5_alloc_map_uar(mdev, &sq->uar, !!MLX5_CAP_GEN(mdev, bf)); + err = mlx5_alloc_bfreg(mdev, &sq->bfreg, MLX5_CAP_GEN(mdev, bf), false); if (err) return err; @@ -1003,12 +1003,9 @@ static int mlx5e_create_sq(struct mlx5e_channel *c, goto err_unmap_free_uar; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - if (sq->uar.bf_map) { + if (sq->bfreg.wc) set_bit(MLX5E_SQ_STATE_BF_ENABLE, &sq->state); - sq->uar_map = sq->uar.bf_map; - } else { - sq->uar_map = sq->uar.map; - } + sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2; sq->max_inline = param->max_inline; sq->min_inline_mode = @@ -1036,7 +1033,7 @@ err_sq_wq_destroy: mlx5_wq_destroy(&sq->wq_ctrl); err_unmap_free_uar: - mlx5_unmap_free_uar(mdev, &sq->uar); + mlx5_free_bfreg(mdev, &sq->bfreg); return err; } @@ -1048,7 +1045,7 @@ static void mlx5e_destroy_sq(struct mlx5e_sq *sq) mlx5e_free_sq_db(sq); mlx5_wq_destroy(&sq->wq_ctrl); - mlx5_unmap_free_uar(priv->mdev, &sq->uar); + mlx5_free_bfreg(priv->mdev, &sq->bfreg); } static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param) @@ -1082,7 +1079,7 @@ static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param) MLX5_SET(sqc, sqc, tis_lst_sz, param->type == MLX5E_SQ_ICO ? 
0 : 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, sq->uar.index); + MLX5_SET(wq, wq, uar_page, sq->bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); @@ -1240,7 +1237,6 @@ static int mlx5e_create_cq(struct mlx5e_channel *c, mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; mcq->irqn = irqn; - mcq->uar = &mdev->mlx5e_res.cq_uar; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); @@ -1289,7 +1285,7 @@ static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) MLX5_SET(cqc, cqc, cq_period_mode, param->cq_period_mode); MLX5_SET(cqc, cqc, c_eqn, eqn); - MLX5_SET(cqc, cqc, uar_page, mcq->uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -1701,7 +1697,7 @@ static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv, { void *cqc = param->cqc; - MLX5_SET(cqc, cqc, uar_page, priv->mdev->mlx5e_res.cq_uar.index); + MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index); } static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv, @@ -2320,7 +2316,6 @@ static int mlx5e_create_drop_cq(struct mlx5e_priv *priv, mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; mcq->irqn = irqn; - mcq->uar = &mdev->mlx5e_res.cq_uar; cq->priv = priv; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index 07b273cccc26..2e6b0f290ddc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -37,11 +37,6 @@ #include #include "mlx5_core.h" -enum { - NUM_DRIVER_UARS = 4, - NUM_LOW_LAT_BFREGS = 4, -}; - int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn) { u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {0}; @@ -67,57 +62,6 @@ int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn) } EXPORT_SYMBOL(mlx5_cmd_free_uar); -int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar, - bool map_wc) -{ - phys_addr_t pfn; - phys_addr_t uar_bar_start; - int err; - - err = mlx5_cmd_alloc_uar(mdev, &uar->index); - if (err) { - mlx5_core_warn(mdev, "mlx5_cmd_alloc_uar() failed, %d\n", err); - return err; - } - - uar_bar_start = pci_resource_start(mdev->pdev, 0); - pfn = (uar_bar_start >> PAGE_SHIFT) + uar->index; - - if (map_wc) { - uar->bf_map = ioremap_wc(pfn << PAGE_SHIFT, PAGE_SIZE); - if (!uar->bf_map) { - mlx5_core_warn(mdev, "ioremap_wc() failed\n"); - uar->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); - if (!uar->map) - goto err_free_uar; - } - } else { - uar->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); - if (!uar->map) - goto err_free_uar; - } - - return 0; - -err_free_uar: - mlx5_core_warn(mdev, "ioremap() failed\n"); - err = -ENOMEM; - mlx5_cmd_free_uar(mdev, uar->index); - - return err; -} -EXPORT_SYMBOL(mlx5_alloc_map_uar); - -void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar) -{ - if (uar->map) - iounmap(uar->map); - else - iounmap(uar->bf_map); - mlx5_cmd_free_uar(mdev, uar->index); -} -EXPORT_SYMBOL(mlx5_unmap_free_uar); - static int uars_per_sys_page(struct mlx5_core_dev *mdev) { if (MLX5_CAP_GEN(mdev, uar_4k)) diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 996863381bc8..95898847c7d4 100644 --- a/include/linux/mlx5/cq.h +++ 
b/include/linux/mlx5/cq.h @@ -42,13 +42,13 @@ struct mlx5_core_cq { int cqe_sz; __be32 *set_ci_db; __be32 *arm_db; + struct mlx5_uars_page *uar; atomic_t refcount; struct completion free; unsigned vector; unsigned int irqn; void (*comp) (struct mlx5_core_cq *); void (*event) (struct mlx5_core_cq *, enum mlx5_event); - struct mlx5_uar *uar; u32 cons_index; unsigned arm_sn; struct mlx5_rsc_debug *dbg; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7e7394fef835..10e632588cd5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -467,12 +467,6 @@ struct mlx5_sq_bfreg { unsigned int offset; }; -struct mlx5_uar { - u32 index; - void __iomem *map; - void __iomem *bf_map; -}; - struct mlx5_core_health { struct health_buffer __iomem *health; __be32 __iomem *health_counter; @@ -725,7 +719,6 @@ struct mlx5_td { }; struct mlx5e_resources { - struct mlx5_uar cq_uar; u32 pdn; struct mlx5_td td; struct mlx5_core_mkey mkey; @@ -915,11 +908,6 @@ void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); -int mlx5_alloc_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi); -int mlx5_free_bfregs(struct mlx5_core_dev *dev, struct mlx5_bfreg_info *bfregi); -int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar, - bool map_wc); -void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 86a8f30060f3..85dc966ea70b 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -65,6 +65,10 @@ struct mlx5_ib_alloc_ucontext_req { __u32 num_low_latency_bfregs; }; +enum mlx5_lib_caps { + MLX5_LIB_CAP_4K_UAR = (u64)1 << 0, +}; + struct mlx5_ib_alloc_ucontext_req_v2 { __u32 total_num_bfregs; __u32 num_low_latency_bfregs; @@ -74,6 +78,7 @@ struct mlx5_ib_alloc_ucontext_req_v2 { __u8 reserved0; __u16 reserved1; __u32 reserved2; + __u64 lib_caps; }; enum mlx5_ib_alloc_ucontext_resp_mask { @@ -103,6 +108,8 @@ struct mlx5_ib_alloc_ucontext_resp { __u8 cmds_supp_uhw; __u16 reserved2; __u64 hca_core_clock_offset; + __u32 log_uar_size; + __u32 num_uars_per_page; }; struct mlx5_ib_alloc_pd_resp { -- cgit v1.2.3 From c008b33f3ef0915dfb57432dba1fa0ce34fdcc29 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 9 Jan 2017 11:24:21 +0100 Subject: net/sched: act_csum: compute crc32c on SCTP packets modify act_csum to compute crc32c on IPv4/IPv6 packets having SCTP in their payload, and extend UAPI definitions accordingly. Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_csum.h | 3 ++- net/sched/act_csum.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/tc_csum.h b/include/uapi/linux/tc_act/tc_csum.h index 8ac8041ab5f1..a11bb355dbfb 100644 --- a/include/uapi/linux/tc_act/tc_csum.h +++ b/include/uapi/linux/tc_act/tc_csum.h @@ -21,7 +21,8 @@ enum { TCA_CSUM_UPDATE_FLAG_IGMP = 4, TCA_CSUM_UPDATE_FLAG_TCP = 8, TCA_CSUM_UPDATE_FLAG_UDP = 16, - TCA_CSUM_UPDATE_FLAG_UDPLITE = 32 + TCA_CSUM_UPDATE_FLAG_UDPLITE = 32, + TCA_CSUM_UPDATE_FLAG_SCTP = 64, }; struct tc_csum { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index a0edd80a44db..e978ccd4402c 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -322,6 +323,25 @@ ignore_obscure_skb: return 1; } +static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) +{ + struct sctphdr *sctph; + + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) + return 1; + + sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph)); + if (!sctph) + return 0; + + sctph->checksum = sctp_compute_cksum(skb, + skb_network_offset(skb) + ihl); + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) { const struct iphdr *iph; @@ -365,6 +385,11 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) ntohs(iph->tot_len), 1)) goto fail; break; + case IPPROTO_SCTP: + if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) && + !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len))) + goto fail; + break; } if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { @@ -481,6 +506,11 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) pl + sizeof(*ip6h), 1)) goto fail; goto done; + case IPPROTO_SCTP: + if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) && + !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h))) + goto fail; + goto done; default: goto ignore_skb; } -- cgit v1.2.3 From 6812baabf24d5c299c13223366a23c269408f4d0 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 9 Jan 2017 16:55:15 +0100 Subject: smc: establish pnet table management Connection creation with SMC-R starts through an internal TCP-connection. The Ethernet interface for this TCP-connection is not restricted to the Ethernet interface of a RoCE device. Any existing Ethernet interface belonging to the same physical net can be used, as long as there is a defined relation between the Ethernet interface and some RoCE devices. This relation is defined with the help of an identification string called "Physical Net ID" or short "pnet ID". Information about defined pnet IDs and their related Ethernet interfaces and RoCE devices is stored in the SMC-R pnet table. A pnet table entry consists of the identifying pnet ID and the associated network and IB device. This patch adds pnet table configuration support using the generic netlink message interface referring to network and IB device by their names. Commands exist to add, delete, and display pnet table entries, and to flush or display the entire pnet table. There are cross-checks to verify whether the ethernet interfaces or infiniband devices really exist in the system. If either device is not available, the pnet ID entry is not created. Loss of network devices and IB devices is also monitored; a pnet ID entry is removed when an associated network or IB device is removed. 
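For example, a management tool running with CAP_NET_ADMIN (the ops below are registered with GENL_ADMIN_PERM) could add an entry roughly as follows; this is a libnl-3 sketch with hypothetical device names, using the family name, version, commands and attributes defined in this patch:

/* Userspace sketch (libnl-3 assumed; device names are hypothetical):
 * bind Ethernet interface eth0 and RoCE device mlx4_0 port 1 to the
 * pnet ID "PNET1" via the SMC_PNETID_ADD command.
 */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/smc.h>

static int pnetid_add(struct nl_sock *sk)
{
	int family = genl_ctrl_resolve(sk, SMCR_GENL_FAMILY_NAME);
	struct nl_msg *msg = nlmsg_alloc();

	genlmsg_put(msg, 0, 0, family, 0, 0, SMC_PNETID_ADD,
		    SMCR_GENL_FAMILY_VERSION);
	nla_put_string(msg, SMC_PNETID_NAME, "PNET1");
	nla_put_string(msg, SMC_PNETID_ETHNAME, "eth0");
	nla_put_string(msg, SMC_PNETID_IBNAME, "mlx4_0");
	nla_put_u8(msg, SMC_PNETID_IBPORT, 1);
	return nl_send_auto(sk, msg);
}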
Signed-off-by: Thomas Richter Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 35 ++++ net/smc/Makefile | 2 +- net/smc/af_smc.c | 11 +- net/smc/smc_ib.c | 2 + net/smc/smc_pnet.c | 534 +++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_pnet.h | 23 ++ 6 files changed, 604 insertions(+), 3 deletions(-) create mode 100644 include/uapi/linux/smc.h create mode 100644 net/smc/smc_pnet.c create mode 100644 net/smc/smc_pnet.h (limited to 'include/uapi') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h new file mode 100644 index 000000000000..ab1dea8e53ee --- /dev/null +++ b/include/uapi/linux/smc.h @@ -0,0 +1,35 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for generic netlink based configuration of an SMC-R PNET table + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter + */ + +#ifndef _UAPI_LINUX_SMC_H_ +#define _UAPI_LINUX_SMC_H_ + +/* Netlink SMC_PNETID attributes */ +enum { + SMC_PNETID_UNSPEC, + SMC_PNETID_NAME, + SMC_PNETID_ETHNAME, + SMC_PNETID_IBNAME, + SMC_PNETID_IBPORT, + __SMC_PNETID_MAX, + SMC_PNETID_MAX = __SMC_PNETID_MAX - 1 +}; + +enum { /* SMC PNET Table commands */ + SMC_PNETID_GET = 1, + SMC_PNETID_ADD, + SMC_PNETID_DEL, + SMC_PNETID_FLUSH +}; + +#define SMCR_GENL_FAMILY_NAME "SMC_PNETID" +#define SMCR_GENL_FAMILY_VERSION 1 + +#endif /* _UAPI_LINUX_SMC_H */ diff --git a/net/smc/Makefile b/net/smc/Makefile index 56f2170e6f64..50f39ffec8c9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_SMC) += smc.o -smc-y := af_smc.o smc_ib.o +smc-y := af_smc.o smc_pnet.o smc_ib.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 50492ee495ce..8b059b2fc34d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -21,6 +21,7 @@ #include "smc.h" #include "smc_ib.h" +#include "smc_pnet.h" static void smc_set_keepalive(struct sock *sk, int val) { @@ -586,10 +587,14 @@ static int __init smc_init(void) { int rc; + rc = smc_pnet_init(); + if (rc) + return rc; + rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register fails with %d\n", __func__, rc); - goto out; + goto out_pnet; } rc = sock_register(&smc_sock_family_ops); @@ -610,7 +615,8 @@ out_sock: sock_unregister(PF_SMC); out_proto: proto_unregister(&smc_proto); -out: +out_pnet: + smc_pnet_exit(); return rc; } @@ -619,6 +625,7 @@ static void __exit smc_exit(void) smc_ib_unregister_client(); sock_unregister(PF_SMC); proto_unregister(&smc_proto); + smc_pnet_exit(); } module_init(smc_init); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 7222b7ede900..5b037f435bc1 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -14,6 +14,7 @@ #include #include +#include "smc_pnet.h" #include "smc_ib.h" #include "smc.h" @@ -123,6 +124,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + smc_pnet_remove_by_ibdev(smcibdev); kfree(smcibdev); } diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c new file mode 100644 index 000000000000..9d3e7fb8348d --- /dev/null +++ b/net/smc/smc_pnet.c @@ -0,0 +1,534 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic netlink support functions to configure an SMC-R PNET table + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Thomas Richter + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "smc_pnet.h" +#include "smc_ib.h" + +#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */ + +static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { + [SMC_PNETID_NAME] = { + .type = NLA_NUL_STRING, + .len = SMC_MAX_PNET_ID_LEN - 1 + }, + [SMC_PNETID_ETHNAME] = { + .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 + }, + [SMC_PNETID_IBNAME] = { + .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1 + }, + [SMC_PNETID_IBPORT] = { .type = NLA_U8 } +}; + +static struct genl_family smc_pnet_nl_family; + +/** + * struct smc_pnettable - SMC PNET table anchor + * @lock: Lock for list action + * @pnetlist: List of PNETIDs + */ +static struct smc_pnettable { + rwlock_t lock; + struct list_head pnetlist; +} smc_pnettable = { + .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist), + .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock) +}; + +/** + * struct smc_pnetentry - pnet identifier name entry + * @list: List node. + * @pnet_name: Pnet identifier name + * @ndev: pointer to network device. + * @smcibdev: Pointer to IB device. + */ +struct smc_pnetentry { + struct list_head list; + char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; + struct net_device *ndev; + struct smc_ib_device *smcibdev; + u8 ib_port; +}; + +/* Check if two RDMA device entries are identical. Use device name and port + * number for comparison. + */ +static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname, + u8 ibport) +{ + return pnetelem->ib_port == ibport && + !strncmp(pnetelem->smcibdev->ibdev->name, ibname, + sizeof(pnetelem->smcibdev->ibdev->name)); +} + +/* Find a pnetid in the pnet table. + */ +static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *found_pnetelem = NULL; + + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (!strncmp(pnetelem->pnet_name, pnet_name, + sizeof(pnetelem->pnet_name))) { + found_pnetelem = pnetelem; + break; + } + } + read_unlock(&smc_pnettable.lock); + return found_pnetelem; +} + +/* Remove a pnetid from the pnet table. + */ +static int smc_pnet_remove_by_pnetid(char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (!strncmp(pnetelem->pnet_name, pnet_name, + sizeof(pnetelem->pnet_name))) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Remove a pnet entry mentioning a given network device from the pnet table. + */ +static int smc_pnet_remove_by_ndev(struct net_device *ndev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (pnetelem->ndev == ndev) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Remove a pnet entry mentioning a given ib device from the pnet table. 
+ */ +int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (pnetelem->smcibdev == ibdev) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. + */ +static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) +{ + struct smc_pnetentry *pnetelem; + int rc = -EEXIST; + + write_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name, + sizeof(new_pnetelem->pnet_name)) || + !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name, + sizeof(new_pnetelem->ndev->name)) || + smc_pnet_same_ibname(pnetelem, + new_pnetelem->smcibdev->ibdev->name, + new_pnetelem->ib_port)) + goto found; + } + list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); + rc = 0; +found: + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* The limit for pnetid is 16 characters. + * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. + * Lower case letters are converted to upper case. + * Interior blanks should not be used. + */ +static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) +{ + char *bf = skip_spaces(pnet_name); + size_t len = strlen(bf); + char *end = bf + len; + + if (!len) + return false; + while (--end >= bf && isspace(*end)) + ; + if (end - bf >= SMC_MAX_PNET_ID_LEN) + return false; + while (bf <= end) { + if (!isalnum(*bf)) + return false; + *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; + bf++; + } + *pnetid = '\0'; + return true; +} + +/* Find an infiniband device by a given name. The device might not exist. */ +struct smc_ib_device *smc_pnet_find_ib(char *ib_name) +{ + struct smc_ib_device *ibdev; + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (!strncmp(ibdev->ibdev->name, ib_name, + sizeof(ibdev->ibdev->name))) { + goto out; + } + } + ibdev = NULL; +out: + spin_unlock(&smc_ib_devices.lock); + return ibdev; +} + +/* Parse the supplied netlink attributes and fill a pnetentry structure. + * For ethernet and infiniband device names verify that the devices exist. 
+ */ +static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem, + struct nlattr *tb[]) +{ + char *string, *ibname = NULL; + int rc = 0; + + memset(pnetelem, 0, sizeof(*pnetelem)); + INIT_LIST_HEAD(&pnetelem->list); + if (tb[SMC_PNETID_NAME]) { + string = (char *)nla_data(tb[SMC_PNETID_NAME]); + if (!smc_pnetid_valid(string, pnetelem->pnet_name)) { + rc = -EINVAL; + goto error; + } + } + if (tb[SMC_PNETID_ETHNAME]) { + string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); + pnetelem->ndev = dev_get_by_name(net, string); + if (!pnetelem->ndev) + return -ENOENT; + } + if (tb[SMC_PNETID_IBNAME]) { + ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + ibname = strim(ibname); + pnetelem->smcibdev = smc_pnet_find_ib(ibname); + if (!pnetelem->smcibdev) { + rc = -ENOENT; + goto error; + } + } + if (tb[SMC_PNETID_IBPORT]) { + pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (pnetelem->ib_port > SMC_MAX_PORTS) { + rc = -EINVAL; + goto error; + } + } + return 0; + +error: + if (pnetelem->ndev) + dev_put(pnetelem->ndev); + return rc; +} + +/* Convert an smc_pnetentry to a netlink attribute sequence */ +static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) +{ + if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) || + nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) || + nla_put_string(msg, SMC_PNETID_IBNAME, + pnetelem->smcibdev->ibdev->name) || + nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) + return -1; + return 0; +} + +/* Retrieve one PNETID entry */ +static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) +{ + struct smc_pnetentry *pnetelem; + struct sk_buff *msg; + void *hdr; + int rc; + + pnetelem = smc_pnet_find_pnetid( + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); + if (!pnetelem) + return -ENOENT; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &smc_pnet_nl_family, 0, SMC_PNETID_GET); + if (!hdr) { + rc = -EMSGSIZE; + goto err_out; + } + + if (smc_pnet_set_nla(msg, pnetelem)) { + rc = -ENOBUFS; + goto err_out; + } + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +err_out: + nlmsg_free(msg); + return rc; +} + +static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct smc_pnetentry *pnetelem; + int rc; + + pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); + if (!pnetelem) + return -ENOMEM; + rc = smc_pnet_fill_entry(net, pnetelem, info->attrs); + if (!rc) + rc = smc_pnet_enter(pnetelem); + if (rc) { + kfree(pnetelem); + return rc; + } + rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port); + if (rc) + smc_pnet_remove_by_pnetid(pnetelem->pnet_name); + return rc; +} + +static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) +{ + return smc_pnet_remove_by_pnetid( + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); +} + +static int smc_pnet_dump_start(struct netlink_callback *cb) +{ + cb->args[0] = 0; + return 0; +} + +static int smc_pnet_dumpinfo(struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, + struct smc_pnetentry *pnetelem) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, + flags, SMC_PNETID_GET); + if (!hdr) + return -ENOMEM; + if (smc_pnet_set_nla(skb, pnetelem) < 0) { + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; + } + genlmsg_end(skb, hdr); + return 0; +} + +static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct 
smc_pnetentry *pnetelem; + int idx = 0; + + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (idx++ < cb->args[0]) + continue; + if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + pnetelem)) { + --idx; + break; + } + } + cb->args[0] = idx; + read_unlock(&smc_pnettable.lock); + return skb->len; +} + +/* Remove and delete all pnetids from pnet table. + */ +static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + } + write_unlock(&smc_pnettable.lock); + return 0; +} + +/* SMC_PNETID generic netlink operation definition */ +static const struct genl_ops smc_pnet_ops[] = { + { + .cmd = SMC_PNETID_GET, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_get, + .dumpit = smc_pnet_dump, + .start = smc_pnet_dump_start + }, + { + .cmd = SMC_PNETID_ADD, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_add + }, + { + .cmd = SMC_PNETID_DEL, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_del + }, + { + .cmd = SMC_PNETID_FLUSH, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_flush + } +}; + +/* SMC_PNETID family definition */ +static struct genl_family smc_pnet_nl_family = { + .hdrsize = 0, + .name = SMCR_GENL_FAMILY_NAME, + .version = SMCR_GENL_FAMILY_VERSION, + .maxattr = SMC_PNETID_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_pnet_ops, + .n_ops = ARRAY_SIZE(smc_pnet_ops) +}; + +static int smc_pnet_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REBOOT: + case NETDEV_UNREGISTER: + smc_pnet_remove_by_ndev(event_dev); + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block smc_netdev_notifier = { + .notifier_call = smc_pnet_netdev_event +}; + +int __init smc_pnet_init(void) +{ + int rc; + + rc = genl_register_family(&smc_pnet_nl_family); + if (rc) + return rc; + rc = register_netdevice_notifier(&smc_netdev_notifier); + if (rc) + genl_unregister_family(&smc_pnet_nl_family); + return rc; +} + +void smc_pnet_exit(void) +{ + smc_pnet_flush(NULL, NULL); + unregister_netdevice_notifier(&smc_netdev_notifier); + genl_unregister_family(&smc_pnet_nl_family); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. 
+ */ +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport) +{ + struct dst_entry *dst = sk_dst_get(sk); + struct smc_pnetentry *pnetelem; + + *smcibdev = NULL; + *ibport = 0; + + if (!dst) + return; + if (!dst->dev) + goto out_rel; + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (dst->dev == pnetelem->ndev) { + *smcibdev = pnetelem->smcibdev; + *ibport = pnetelem->ib_port; + break; + } + } + read_unlock(&smc_pnettable.lock); +out_rel: + dst_release(dst); +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h new file mode 100644 index 000000000000..32ab3df928ca --- /dev/null +++ b/net/smc/smc_pnet.h @@ -0,0 +1,23 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * PNET table queries + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter + */ + +#ifndef _SMC_PNET_H +#define _SMC_PNET_H + +struct smc_ib_device; + +int smc_pnet_init(void) __init; +void smc_pnet_exit(void); +int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); +struct smc_ib_device *smc_pnet_find_ib(char *ib_name); +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport); + +#endif -- cgit v1.2.3 From f16a7dd5cf27eeda187425c9c7d96802a549f9c4 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Jan 2017 16:55:26 +0100 Subject: smc: netlink interface for SMC sockets Support for SMC socket monitoring via netlink sockets of protocol NETLINK_SOCK_DIAG. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/net/smc.h | 20 ++++ include/net/sock.h | 3 + include/uapi/linux/netlink.h | 1 + include/uapi/linux/smc_diag.h | 85 +++++++++++++++++ net/smc/Kconfig | 9 ++ net/smc/Makefile | 1 + net/smc/af_smc.c | 43 ++++++++- net/smc/smc.h | 2 + net/smc/smc_close.c | 1 + net/smc/smc_diag.c | 215 ++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 include/net/smc.h create mode 100644 include/uapi/linux/smc_diag.h create mode 100644 net/smc/smc_diag.c (limited to 'include/uapi') diff --git a/include/net/smc.h b/include/net/smc.h new file mode 100644 index 000000000000..12d26358ad9f --- /dev/null +++ b/include/net/smc.h @@ -0,0 +1,20 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the SMC module (socket related) + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ +#ifndef _SMC_H +#define _SMC_H + +struct smc_hashinfo { + rwlock_t lock; + struct hlist_head ht; +}; + +int smc_hash_sk(struct sock *sk); +void smc_unhash_sk(struct sock *sk); +#endif /* _SMC_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 99deda67eba0..389a0a619b45 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -70,6 +70,7 @@ #include #include #include +#include /* * This structure really needs to be cleaned up. 
@@ -986,6 +987,7 @@ struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; +struct smc_hashinfo; struct module; /* @@ -1094,6 +1096,7 @@ struct proto { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; + struct smc_hashinfo *smc_hash; } h; struct module *owner; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 0dba4e4ed2be..f3946a27bd07 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -27,6 +27,7 @@ #define NETLINK_ECRYPTFS 19 #define NETLINK_RDMA 20 #define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ #define NETLINK_INET_DIAG NETLINK_SOCK_DIAG diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h new file mode 100644 index 000000000000..0063919fea34 --- /dev/null +++ b/include/uapi/linux/smc_diag.h @@ -0,0 +1,85 @@ +#ifndef _UAPI_SMC_DIAG_H_ +#define _UAPI_SMC_DIAG_H_ + +#include +#include +#include + +/* Request structure */ +struct smc_diag_req { + __u8 diag_family; + __u8 pad[2]; + __u8 diag_ext; /* Query extended information */ + struct inet_diag_sockid id; +}; + +/* Base info structure. It contains socket identity (addrs/ports/cookie) based + * on the internal clcsock, and more SMC-related socket data + */ +struct smc_diag_msg { + __u8 diag_family; + __u8 diag_state; + __u8 diag_fallback; + __u8 diag_shutdown; + struct inet_diag_sockid id; + + __u32 diag_uid; + __u64 diag_inode; +}; + +/* Extensions */ + +enum { + SMC_DIAG_NONE, + SMC_DIAG_CONNINFO, + SMC_DIAG_LGRINFO, + SMC_DIAG_SHUTDOWN, + __SMC_DIAG_MAX, +}; + +#define SMC_DIAG_MAX (__SMC_DIAG_MAX - 1) + +/* SMC_DIAG_CONNINFO */ + +struct smc_diag_cursor { + __u16 reserved; + __u16 wrap; + __u32 count; +}; + +struct smc_diag_conninfo { + __u32 token; /* unique connection id */ + __u32 sndbuf_size; /* size of send buffer */ + __u32 rmbe_size; /* size of RMB element */ + __u32 peer_rmbe_size; /* size of peer RMB element */ + /* local RMB element cursors */ + struct smc_diag_cursor rx_prod; /* received producer cursor */ + struct smc_diag_cursor rx_cons; /* received consumer cursor */ + /* peer RMB element cursors */ + struct smc_diag_cursor tx_prod; /* sent producer cursor */ + struct smc_diag_cursor tx_cons; /* sent consumer cursor */ + __u8 rx_prod_flags; /* received producer flags */ + __u8 rx_conn_state_flags; /* recvd connection flags*/ + __u8 tx_prod_flags; /* sent producer flags */ + __u8 tx_conn_state_flags; /* sent connection flags*/ + /* send buffer cursors */ + struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ + struct smc_diag_cursor tx_sent; /* sent cursor */ + struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ +}; + +/* SMC_DIAG_LINKINFO */ + +struct smc_diag_linkinfo { + __u8 link_id; /* link identifier */ + __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ + __u8 ibport; /* RDMA device port number */ + __u8 gid[40]; /* local GID */ + __u8 peer_gid[40]; /* peer GID */ +}; + +struct smc_diag_lgrinfo { + struct smc_diag_linkinfo lnk[1]; + __u8 role; +}; +#endif /* _UAPI_SMC_DIAG_H_ */ diff --git a/net/smc/Kconfig b/net/smc/Kconfig index bc029803e728..c717ef0896aa 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -9,3 +9,12 @@ config SMC a separate socket family SMC. 
Select this option if you want to run SMC socket applications + +config SMC_DIAG + tristate "SMC: socket monitoring interface" + depends on SMC + ---help--- + Support for SMC socket monitoring interface used by tools such as + smcss. + + if unsure, say Y. diff --git a/net/smc/Makefile b/net/smc/Makefile index 5cf0cafaa208..188104654b54 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_SMC) += smc.o +obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3f543d58bc5c..5d4208ad029e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "smc.h" #include "smc_clc.h" @@ -59,13 +60,48 @@ static void smc_set_keepalive(struct sock *sk, int val) smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } -static struct proto smc_proto = { +static struct smc_hashinfo smc_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), +}; + +int smc_hash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + struct hlist_head *head; + + head = &h->ht; + + write_lock_bh(&h->lock); + sk_add_node(sk, head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(smc_hash_sk); + +void smc_unhash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + + write_lock_bh(&h->lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(smc_unhash_sk); + +struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_DESTROY_BY_RCU, }; +EXPORT_SYMBOL_GPL(smc_proto); static int smc_release(struct socket *sock) { @@ -109,6 +145,7 @@ static int smc_release(struct socket *sock) schedule_delayed_work(&smc->sock_put_work, SMC_CLOSE_SOCK_PUT_DELAY); } + sk->sk_prot->unhash(sk); release_sock(sk); sock_put(sk); @@ -144,6 +181,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); + sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); return sk; @@ -536,6 +574,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) lsmc->sk.sk_err = -rc; new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); sock_put(new_sk); *new_smc = NULL; goto out; @@ -545,6 +584,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); sock_put(new_sk); *new_smc = NULL; goto out; @@ -1320,6 +1360,7 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto; } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { diff --git a/net/smc/smc.h b/net/smc/smc.h index 959a5d2014ab..ee5fbea24549 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,6 +21,8 @@ #define SMC_MAX_PORTS 2 /* Max # of ports */ +extern struct proto smc_proto; + #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif diff --git 
a/net/smc/smc_close.c b/net/smc/smc_close.c index d70c05b57021..03dfcc6b7661 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -384,6 +384,7 @@ void smc_close_sock_put_work(struct work_struct *work) struct smc_sock, sock_put_work); + smc->sk.sk_prot->unhash(&smc->sk); sock_put(&smc->sk); } diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c new file mode 100644 index 000000000000..d2d01cf70224 --- /dev/null +++ b/net/smc/smc_diag.c @@ -0,0 +1,215 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Monitoring SMC transport protocol sockets + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "smc.h" +#include "smc_core.h" + +static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + r->diag_family = sk->sk_family; + if (!smc->clcsock) + return; + r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); + r->id.idiag_dport = smc->clcsock->sk->sk_dport; + r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; + r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; +} + +static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, + struct smc_diag_msg *r, + struct user_namespace *user_ns) +{ + if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) + return 1; + + r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_inode = sock_i_ino(sk); + return 0; +} + +static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct smc_diag_req *req, + struct nlattr *bc) +{ + struct smc_sock *smc = smc_sk(sk); + struct user_namespace *user_ns; + struct smc_diag_msg *r; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + smc_diag_msg_common_fill(r, sk); + r->diag_state = sk->sk_state; + r->diag_fallback = smc->use_fallback; + user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); + if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) + goto errout; + + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) { + struct smc_connection *conn = &smc->conn; + struct smc_diag_conninfo cinfo = { + .token = conn->alert_token_local, + .sndbuf_size = conn->sndbuf_size, + .rmbe_size = conn->rmbe_size, + .peer_rmbe_size = conn->peer_rmbe_size, + + .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, + .rx_prod.count = conn->local_rx_ctrl.prod.count, + .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, + .rx_cons.count = conn->local_rx_ctrl.cons.count, + + .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, + .tx_prod.count = conn->local_tx_ctrl.prod.count, + .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, + .tx_cons.count = conn->local_tx_ctrl.cons.count, + + 
.tx_prod_flags = + *(u8 *)&conn->local_tx_ctrl.prod_flags, + .tx_conn_state_flags = + *(u8 *)&conn->local_tx_ctrl.conn_state_flags, + .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, + .rx_conn_state_flags = + *(u8 *)&conn->local_rx_ctrl.conn_state_flags, + + .tx_prep.wrap = conn->tx_curs_prep.wrap, + .tx_prep.count = conn->tx_curs_prep.count, + .tx_sent.wrap = conn->tx_curs_sent.wrap, + .tx_sent.count = conn->tx_curs_sent.count, + .tx_fin.wrap = conn->tx_curs_fin.wrap, + .tx_fin.count = conn->tx_curs_fin.count, + }; + + if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) + goto errout; + } + + if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) { + struct smc_diag_lgrinfo linfo = { + .role = smc->conn.lgr->role, + .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, + .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id, + }; + + memcpy(linfo.lnk[0].ibname, + smc->conn.lgr->lnk[0].smcibdev->ibdev->name, + sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, + smc->conn.lgr->lnk[0].gid.raw); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, + smc->conn.lgr->lnk[0].peer_gid); + + if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) + goto errout; + } + + nlmsg_end(skb, nlh); + return 0; + +errout: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *bc = NULL; + struct hlist_head *head; + struct sock *sk; + int rc = 0; + + read_lock(&smc_proto.h.smc_hash->lock); + head = &smc_proto.h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc) + break; + } + +out: + read_unlock(&smc_proto.h.smc_hash->lock); + return rc; +} + +static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + struct net *net = sock_net(skb->sk); + + if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && + h->nlmsg_flags & NLM_F_DUMP) { + { + struct netlink_dump_control c = { + .dump = smc_diag_dump, + .min_dump_alloc = SKB_WITH_OVERHEAD(32768), + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + } + return 0; +} + +static const struct sock_diag_handler smc_diag_handler = { + .family = AF_SMC, + .dump = smc_diag_handler_dump, +}; + +static int __init smc_diag_init(void) +{ + return sock_diag_register(&smc_diag_handler); +} + +static void __exit smc_diag_exit(void) +{ + sock_diag_unregister(&smc_diag_handler); +} + +module_init(smc_diag_init); +module_exit(smc_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); -- cgit v1.2.3 From 843debb889c7a95c7f591acaed185734694b0ff7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Sep 2016 14:30:30 +0300 Subject: RDMA/core: Commonize RDMA IOCTL declarations location This patch provides one common file (rdma_user_ioctl.h) for all RDMA UAPI IOCTLs. 
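To make the intent concrete: with the shared header in place, a new RDMA component can allocate its ioctl numbers against the common magic instead of redefining 0x1b locally. A minimal sketch follows; the mydrv name, its payload struct, and command number 0x20 are invented for illustration and are not part of this series:

#include <linux/types.h>
#include <linux/ioctl.h>
#include <rdma/rdma_user_ioctl.h>	/* RDMA_IOCTL_MAGIC (0x1b) lives here now */

struct mydrv_info {			/* hypothetical payload */
	__u32 version;
	__u32 pad;
};

/* hypothetical read command, sharing the common magic with MAD and hfi1 */
#define MYDRV_IOCTL_GET_INFO	_IOR(RDMA_IOCTL_MAGIC, 0x20, struct mydrv_info)
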
Signed-off-by: Matan Barak Signed-off-by: Haggai Eran Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- include/uapi/rdma/Kbuild | 1 + include/uapi/rdma/hfi/hfi1_user.h | 2 +- include/uapi/rdma/ib_user_mad.h | 4 +--- include/uapi/rdma/rdma_user_ioctl.h | 43 +++++++++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 include/uapi/rdma/rdma_user_ioctl.h (limited to 'include/uapi') diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild index 82bdf5626859..541311115505 100644 --- a/include/uapi/rdma/Kbuild +++ b/include/uapi/rdma/Kbuild @@ -1,5 +1,6 @@ # UAPI Header export list header-y += ib_user_cm.h +header-y += rdma_user_ioctl.h header-y += ib_user_mad.h header-y += ib_user_sa.h header-y += ib_user_verbs.h diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 587b7360e820..57d3613f2ff8 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -57,6 +57,7 @@ #define _LINUX__HFI1_USER_H #include +#include /* * This version number is given to the driver by the user code during @@ -132,7 +133,6 @@ * User IOCTLs can not go above 128 if they do then see common.h and change the * base for the snoop ioctl */ -#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl/ioctl-number.txt */ /* * Make the ioctls occupy the last 0xf0-0xff portion of the IB range diff --git a/include/uapi/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h index 09f809f323ea..9de7a2b273d7 100644 --- a/include/uapi/rdma/ib_user_mad.h +++ b/include/uapi/rdma/ib_user_mad.h @@ -35,7 +35,7 @@ #define IB_USER_MAD_H #include -#include +#include /* * Increment this value if any changes that break userspace ABI @@ -230,8 +230,6 @@ struct ib_user_mad_reg_req2 { __u8 reserved[3]; }; -#define IB_IOCTL_MAGIC 0x1b - #define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ struct ib_user_mad_reg_req) diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h new file mode 100644 index 000000000000..ba1bcdd7178d --- /dev/null +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2016 Mellanox Technologies, LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_IOCTL_H +#define RDMA_USER_IOCTL_H + +#include +#include + +/* Documentation/ioctl/ioctl-number.txt */ +#define RDMA_IOCTL_MAGIC 0x1b +#define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC + +#endif /* RDMA_USER_IOCTL_H */ -- cgit v1.2.3 From 06393bc39e2fdcd9ca281401f25fa3b834bca799 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Sep 2016 14:30:31 +0300 Subject: RDMA/core: Move legacy MAD IOCTL declarations to common file Move legacy MAD IOCTL declarations to rdma_user_ioctl.h file. Signed-off-by: Matan Barak Signed-off-by: Haggai Eran Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- include/uapi/rdma/ib_user_mad.h | 10 ---------- include/uapi/rdma/rdma_user_ioctl.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h index 9de7a2b273d7..5c7abd859e0f 100644 --- a/include/uapi/rdma/ib_user_mad.h +++ b/include/uapi/rdma/ib_user_mad.h @@ -230,14 +230,4 @@ struct ib_user_mad_reg_req2 { __u8 reserved[3]; }; -#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ - struct ib_user_mad_reg_req) - -#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32) - -#define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) - -#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ - struct ib_user_mad_reg_req2) - #endif /* IB_USER_MAD_H */ diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index ba1bcdd7178d..820bf3485608 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -35,9 +35,20 @@ #include #include +#include /* Documentation/ioctl/ioctl-number.txt */ #define RDMA_IOCTL_MAGIC 0x1b #define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC +#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ + struct ib_user_mad_reg_req) + +#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32) + +#define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) + +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ + struct ib_user_mad_reg_req2) + #endif /* RDMA_USER_IOCTL_H */ -- cgit v1.2.3 From 38e8b671bf04af09fc8de74ea857f920285bd211 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Sep 2016 14:30:32 +0300 Subject: RDMA/hfi1: Avoid redeclaration error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move hfi1 ioctl definitions to a new header which can be included by both the hfi1 and qib drivers to avoid a duplicate enum definition as shown in this build error for qib: CC [M] drivers/infiniband/hw/qib/qib_sysfs.o In file included from ./include/uapi/rdma/rdma_user_ioctl.h:39:0, from include/uapi/rdma/ib_user_mad.h:38, from include/rdma/ib_mad.h:43, from include/rdma/ib_pma.h:38, from drivers/infiniband/hw/qib/qib_mad.h:37, from drivers/infiniband/hw/qib/qib_init.c:49: ./include/uapi/rdma/hfi/hfi1_user.h:370:2: error: redeclaration of enumerator ‘ur_rcvhdrtail’ ur_rcvhdrtail = 0, Move hfi1 structures to separate file to avoid this failure. The actual move of the ioctl definitions comes in a follow on patch. 
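For reference, the error reduces to two unrelated enums declaring the same enumerator within one translation unit, which C rejects because enumerators share a single ordinary identifier namespace. A deliberately non-compiling sketch, with the surrounding enums shortened for illustration:

/* reached via rdma_user_ioctl.h -> rdma/hfi/hfi1_user.h */
enum hfi1_ur { ur_rcvhdrtail = 0, ur_rcvhdrhead };

/* qib's own declaration, now visible in the same translation unit */
enum qib_ur { ur_rcvhdrtail = 0 };	/* error: redeclaration of 'ur_rcvhdrtail' */

Splitting the structures into hfi1_ioctl.h lets the common header eventually pull in only those structures, so qib never sees the hfi1 enumerators.
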
Signed-off-by: Matan Barak Signed-off-by: Haggai Eran Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- include/uapi/rdma/hfi/Kbuild | 1 + include/uapi/rdma/hfi/hfi1_ioctl.h | 173 +++++++++++++++++++++++++++++++++++++ include/uapi/rdma/hfi/hfi1_user.h | 120 +------------------------ 3 files changed, 175 insertions(+), 119 deletions(-) create mode 100644 include/uapi/rdma/hfi/hfi1_ioctl.h (limited to 'include/uapi') diff --git a/include/uapi/rdma/hfi/Kbuild b/include/uapi/rdma/hfi/Kbuild index ef23c294fc71..b65b0b3a5f63 100644 --- a/include/uapi/rdma/hfi/Kbuild +++ b/include/uapi/rdma/hfi/Kbuild @@ -1,2 +1,3 @@ # UAPI Header export list header-y += hfi1_user.h +header-y += hfi1_ioctl.h diff --git a/include/uapi/rdma/hfi/hfi1_ioctl.h b/include/uapi/rdma/hfi/hfi1_ioctl.h new file mode 100644 index 000000000000..4791cc8cb09b --- /dev/null +++ b/include/uapi/rdma/hfi/hfi1_ioctl.h @@ -0,0 +1,173 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _LINUX__HFI1_IOCTL_H +#define _LINUX__HFI1_IOCTL_H +#include + +/* + * This structure is passed to the driver to tell it where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. 
It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct hfi1_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to HFI1_USER_SWVERSION. + */ + __u32 userversion; + __u32 pad; + /* + * If two or more processes wish to share a context, each process + * must set the subcontext_cnt and subcontext_id to the same + * values. The only restriction on the subcontext_id is that + * it be unique for a given node. + */ + __u16 subctxt_cnt; + __u16 subctxt_id; + /* 128bit UUID passed in by PSM. */ + __u8 uuid[16]; +}; + +struct hfi1_ctxt_info { + __u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */ + __u32 rcvegr_size; /* size of each eager buffer */ + __u16 num_active; /* number of active units */ + __u16 unit; /* unit (chip) assigned to caller */ + __u16 ctxt; /* ctxt on unit assigned to caller */ + __u16 subctxt; /* subctxt on unit assigned to caller */ + __u16 rcvtids; /* number of Rcv TIDs for this context */ + __u16 credits; /* number of PIO credits for this context */ + __u16 numa_node; /* NUMA node of the assigned device */ + __u16 rec_cpu; /* cpu # for affinity (0xffff if none) */ + __u16 send_ctxt; /* send context in use by this user context */ + __u16 egrtids; /* number of RcvArray entries for Eager Rcvs */ + __u16 rcvhdrq_cnt; /* number of RcvHdrQ entries */ + __u16 rcvhdrq_entsize; /* size (in bytes) for each RcvHdrQ entry */ + __u16 sdma_ring_size; /* number of entries in SDMA request ring */ +}; + +struct hfi1_tid_info { + /* virtual address of first page in transfer */ + __u64 vaddr; + /* pointer to tid array. this array is big enough */ + __u64 tidlist; + /* number of tids programmed by this request */ + __u32 tidcnt; + /* length of transfer buffer programmed by this request */ + __u32 length; +}; + +/* + * This structure is returned by the driver immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explicit pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct hfi1_base_info { + /* version of hardware, for feature checking. */ + __u32 hw_version; + /* version of software, for feature checking. */ + __u32 sw_version; + /* Job key */ + __u16 jkey; + __u16 padding1; + /* + * The special QP (queue pair) value that identifies PSM + * protocol packet from standard IB packets. + */ + __u32 bthqp; + /* PIO credit return address, */ + __u64 sc_credits_addr; + /* + * Base address of write-only pio buffers for this process. + * Each buffer has sendpio_credits*64 bytes. + */ + __u64 pio_bufbase_sop; + /* + * Base address of write-only pio buffers for this process. + * Each buffer has sendpio_credits*64 bytes. + */ + __u64 pio_bufbase; + /* address where receive buffer queue is mapped into */ + __u64 rcvhdr_bufbase; + /* base address of Eager receive buffers. */ + __u64 rcvegr_bufbase; + /* base address of SDMA completion ring */ + __u64 sdma_comp_bufbase; + /* + * User register base for init code, not to be used directly by + * protocol or applications. Always maps real chip register space. 
+ * the register addresses are: + * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail, + * ur_rcvtidflow + */ + __u64 user_regbase; + /* notification events */ + __u64 events_bufbase; + /* status page */ + __u64 status_bufbase; + /* rcvhdrtail update */ + __u64 rcvhdrtail_base; + /* + * shared memory pages for subctxts if ctxt is shared; these cover + * all the processes in the group sharing a single context. + * all have enough space for the num_subcontexts value on this job. + */ + __u64 subctxt_uregbase; + __u64 subctxt_rcvegrbuf; + __u64 subctxt_rcvhdrbuf; +}; +#endif /* _LINIUX__HFI1_IOCTL_H */ diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 57d3613f2ff8..220a020d606d 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -58,6 +58,7 @@ #include #include +#include /* * This version number is given to the driver by the user code during @@ -211,60 +212,6 @@ struct hfi1_cmd; #define HFI1_POLL_TYPE_ANYRCV 0x0 #define HFI1_POLL_TYPE_URGENT 0x1 -/* - * This structure is passed to the driver to tell it where - * user code buffers are, sizes, etc. The offsets and sizes of the - * fields must remain unchanged, for binary compatibility. It can - * be extended, if userversion is changed so user code can tell, if needed - */ -struct hfi1_user_info { - /* - * version of user software, to detect compatibility issues. - * Should be set to HFI1_USER_SWVERSION. - */ - __u32 userversion; - __u32 pad; - /* - * If two or more processes wish to share a context, each process - * must set the subcontext_cnt and subcontext_id to the same - * values. The only restriction on the subcontext_id is that - * it be unique for a given node. - */ - __u16 subctxt_cnt; - __u16 subctxt_id; - /* 128bit UUID passed in by PSM. */ - __u8 uuid[16]; -}; - -struct hfi1_ctxt_info { - __u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */ - __u32 rcvegr_size; /* size of each eager buffer */ - __u16 num_active; /* number of active units */ - __u16 unit; /* unit (chip) assigned to caller */ - __u16 ctxt; /* ctxt on unit assigned to caller */ - __u16 subctxt; /* subctxt on unit assigned to caller */ - __u16 rcvtids; /* number of Rcv TIDs for this context */ - __u16 credits; /* number of PIO credits for this context */ - __u16 numa_node; /* NUMA node of the assigned device */ - __u16 rec_cpu; /* cpu # for affinity (0xffff if none) */ - __u16 send_ctxt; /* send context in use by this user context */ - __u16 egrtids; /* number of RcvArray entries for Eager Rcvs */ - __u16 rcvhdrq_cnt; /* number of RcvHdrQ entries */ - __u16 rcvhdrq_entsize; /* size (in bytes) for each RcvHdrQ entry */ - __u16 sdma_ring_size; /* number of entries in SDMA request ring */ -}; - -struct hfi1_tid_info { - /* virtual address of first page in transfer */ - __u64 vaddr; - /* pointer to tid array. this array is big enough */ - __u64 tidlist; - /* number of tids programmed by this request */ - __u32 tidcnt; - /* length of transfer buffer programmed by this request */ - __u32 length; -}; - enum hfi1_sdma_comp_state { FREE = 0, QUEUED, @@ -289,71 +236,6 @@ struct hfi1_status { char freezemsg[0]; }; -/* - * This structure is returned by the driver immediately after - * open to get implementation-specific info, and info specific to this - * instance. 
- * - * This struct must have explicit pad fields where type sizes - * may result in different alignments between 32 and 64 bit - * programs, since the 64 bit * bit kernel requires the user code - * to have matching offsets - */ -struct hfi1_base_info { - /* version of hardware, for feature checking. */ - __u32 hw_version; - /* version of software, for feature checking. */ - __u32 sw_version; - /* Job key */ - __u16 jkey; - __u16 padding1; - /* - * The special QP (queue pair) value that identifies PSM - * protocol packet from standard IB packets. - */ - __u32 bthqp; - /* PIO credit return address, */ - __u64 sc_credits_addr; - /* - * Base address of write-only pio buffers for this process. - * Each buffer has sendpio_credits*64 bytes. - */ - __u64 pio_bufbase_sop; - /* - * Base address of write-only pio buffers for this process. - * Each buffer has sendpio_credits*64 bytes. - */ - __u64 pio_bufbase; - /* address where receive buffer queue is mapped into */ - __u64 rcvhdr_bufbase; - /* base address of Eager receive buffers. */ - __u64 rcvegr_bufbase; - /* base address of SDMA completion ring */ - __u64 sdma_comp_bufbase; - /* - * User register base for init code, not to be used directly by - * protocol or applications. Always maps real chip register space. - * the register addresses are: - * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail, - * ur_rcvtidflow - */ - __u64 user_regbase; - /* notification events */ - __u64 events_bufbase; - /* status page */ - __u64 status_bufbase; - /* rcvhdrtail update */ - __u64 rcvhdrtail_base; - /* - * shared memory pages for subctxts if ctxt is shared; these cover - * all the processes in the group sharing a single context. - * all have enough space for the num_subcontexts value on this job. - */ - __u64 subctxt_uregbase; - __u64 subctxt_rcvegrbuf; - __u64 subctxt_rcvhdrbuf; -}; - enum sdma_req_opcode { EXPECTED = 0, EAGER -- cgit v1.2.3 From 8edec0b55a47d667031d078f29f7a23f0a11122f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Sep 2016 14:30:33 +0300 Subject: RDMA/core: Move HFI1 IOCTL declarations to common file Move HFI1 IOCTL declarations to rdma_user_ioctl.h file. Signed-off-by: Matan Barak Signed-off-by: Haggai Eran Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- include/uapi/rdma/hfi/hfi1_user.h | 55 ------------------------------------- include/uapi/rdma/rdma_user_ioctl.h | 54 ++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 55 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 220a020d606d..3f4ee93ae5eb 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -58,7 +58,6 @@ #include #include -#include /* * This version number is given to the driver by the user code during @@ -114,60 +113,6 @@ #define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1) #define HFI1_RCVDHR_ENTSIZE_32 (1UL << 2) -/* User commands. 
*/ -#define HFI1_CMD_ASSIGN_CTXT 1 /* allocate HFI and context */ -#define HFI1_CMD_CTXT_INFO 2 /* find out what resources we got */ -#define HFI1_CMD_USER_INFO 3 /* set up userspace */ -#define HFI1_CMD_TID_UPDATE 4 /* update expected TID entries */ -#define HFI1_CMD_TID_FREE 5 /* free expected TID entries */ -#define HFI1_CMD_CREDIT_UPD 6 /* force an update of PIO credit */ - -#define HFI1_CMD_RECV_CTRL 8 /* control receipt of packets */ -#define HFI1_CMD_POLL_TYPE 9 /* set the kind of polling we want */ -#define HFI1_CMD_ACK_EVENT 10 /* ack & clear user status bits */ -#define HFI1_CMD_SET_PKEY 11 /* set context's pkey */ -#define HFI1_CMD_CTXT_RESET 12 /* reset context's HW send context */ -#define HFI1_CMD_TID_INVAL_READ 13 /* read TID cache invalidations */ -#define HFI1_CMD_GET_VERS 14 /* get the version of the user cdev */ - -/* - * User IOCTLs can not go above 128 if they do then see common.h and change the - * base for the snoop ioctl - */ - -/* - * Make the ioctls occupy the last 0xf0-0xff portion of the IB range - */ -#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0) - -struct hfi1_cmd; -#define HFI1_IOCTL_ASSIGN_CTXT \ - _IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info) -#define HFI1_IOCTL_CTXT_INFO \ - _IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info) -#define HFI1_IOCTL_USER_INFO \ - _IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info) -#define HFI1_IOCTL_TID_UPDATE \ - _IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info) -#define HFI1_IOCTL_TID_FREE \ - _IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info) -#define HFI1_IOCTL_CREDIT_UPD \ - _IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD)) -#define HFI1_IOCTL_RECV_CTRL \ - _IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int) -#define HFI1_IOCTL_POLL_TYPE \ - _IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int) -#define HFI1_IOCTL_ACK_EVENT \ - _IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long) -#define HFI1_IOCTL_SET_PKEY \ - _IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16) -#define HFI1_IOCTL_CTXT_RESET \ - _IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET)) -#define HFI1_IOCTL_TID_INVAL_READ \ - _IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info) -#define HFI1_IOCTL_GET_VERS \ - _IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int) - #define _HFI1_EVENT_FROZEN_BIT 0 #define _HFI1_EVENT_LINKDOWN_BIT 1 #define _HFI1_EVENT_LID_CHANGE_BIT 2 diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index 820bf3485608..e9a69f0b18a2 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -36,6 +36,7 @@ #include #include #include +#include /* Documentation/ioctl/ioctl-number.txt */ #define RDMA_IOCTL_MAGIC 0x1b @@ -51,4 +52,57 @@ #define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ struct ib_user_mad_reg_req2) +/* User commands. 
*/ +#define HFI1_CMD_ASSIGN_CTXT 1 /* allocate HFI and context */ +#define HFI1_CMD_CTXT_INFO 2 /* find out what resources we got */ +#define HFI1_CMD_USER_INFO 3 /* set up userspace */ +#define HFI1_CMD_TID_UPDATE 4 /* update expected TID entries */ +#define HFI1_CMD_TID_FREE 5 /* free expected TID entries */ +#define HFI1_CMD_CREDIT_UPD 6 /* force an update of PIO credit */ + +#define HFI1_CMD_RECV_CTRL 8 /* control receipt of packets */ +#define HFI1_CMD_POLL_TYPE 9 /* set the kind of polling we want */ +#define HFI1_CMD_ACK_EVENT 10 /* ack & clear user status bits */ +#define HFI1_CMD_SET_PKEY 11 /* set context's pkey */ +#define HFI1_CMD_CTXT_RESET 12 /* reset context's HW send context */ +#define HFI1_CMD_TID_INVAL_READ 13 /* read TID cache invalidations */ +#define HFI1_CMD_GET_VERS 14 /* get the version of the user cdev */ + +/* + * User IOCTLs can not go above 128 if they do then see common.h and change the + * base for the snoop ioctl + */ + +/* + * Make the ioctls occupy the last 0xf0-0xff portion of the IB range + */ +#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0) + +#define HFI1_IOCTL_ASSIGN_CTXT \ + _IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info) +#define HFI1_IOCTL_CTXT_INFO \ + _IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info) +#define HFI1_IOCTL_USER_INFO \ + _IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info) +#define HFI1_IOCTL_TID_UPDATE \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info) +#define HFI1_IOCTL_TID_FREE \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info) +#define HFI1_IOCTL_CREDIT_UPD \ + _IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD)) +#define HFI1_IOCTL_RECV_CTRL \ + _IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int) +#define HFI1_IOCTL_POLL_TYPE \ + _IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int) +#define HFI1_IOCTL_ACK_EVENT \ + _IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long) +#define HFI1_IOCTL_SET_PKEY \ + _IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16) +#define HFI1_IOCTL_CTXT_RESET \ + _IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET)) +#define HFI1_IOCTL_TID_INVAL_READ \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info) +#define HFI1_IOCTL_GET_VERS \ + _IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int) + #endif /* RDMA_USER_IOCTL_H */ -- cgit v1.2.3 From 10b31e792e5862ad9e232d7aa2c65e7b7e46bc39 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Sep 2016 14:30:34 +0300 Subject: RDMA/core: Rename RDMA magic number Rename RDMA magic number to better describe IOCTLs. 
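Because an ioctl command value is simply _IO*(magic, nr[, type]) expanded to an integer constant, keeping the old spelling as an alias preserves every existing command value bit for bit. A one-line compile-time check, offered as a sketch rather than as part of the patch:

#include <rdma/rdma_user_ioctl.h>

_Static_assert(IB_IOCTL_MAGIC == RDMA_IOCTL_MAGIC,
	       "legacy name must expand to identical command values");
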
Signed-off-by: Matan Barak Signed-off-by: Haggai Eran Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- include/uapi/rdma/rdma_user_ioctl.h | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index e9a69f0b18a2..7ecf8cd17f58 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -40,16 +40,17 @@ /* Documentation/ioctl/ioctl-number.txt */ #define RDMA_IOCTL_MAGIC 0x1b +/* Legacy name, for user space application which already use it */ #define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC -#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ +#define IB_USER_MAD_REGISTER_AGENT _IOWR(RDMA_IOCTL_MAGIC, 1, \ struct ib_user_mad_reg_req) -#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32) +#define IB_USER_MAD_UNREGISTER_AGENT _IOW(RDMA_IOCTL_MAGIC, 2, __u32) -#define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) +#define IB_USER_MAD_ENABLE_PKEY _IO(RDMA_IOCTL_MAGIC, 3) -#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(RDMA_IOCTL_MAGIC, 4, \ struct ib_user_mad_reg_req2) /* User commands. */ @@ -79,30 +80,30 @@ #define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0) #define HFI1_IOCTL_ASSIGN_CTXT \ - _IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info) + _IOWR(RDMA_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info) #define HFI1_IOCTL_CTXT_INFO \ - _IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info) + _IOW(RDMA_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info) #define HFI1_IOCTL_USER_INFO \ - _IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info) + _IOW(RDMA_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info) #define HFI1_IOCTL_TID_UPDATE \ - _IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info) + _IOWR(RDMA_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info) #define HFI1_IOCTL_TID_FREE \ - _IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info) + _IOWR(RDMA_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info) #define HFI1_IOCTL_CREDIT_UPD \ - _IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD)) + _IO(RDMA_IOCTL_MAGIC, __NUM(CREDIT_UPD)) #define HFI1_IOCTL_RECV_CTRL \ - _IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int) + _IOW(RDMA_IOCTL_MAGIC, __NUM(RECV_CTRL), int) #define HFI1_IOCTL_POLL_TYPE \ - _IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int) + _IOW(RDMA_IOCTL_MAGIC, __NUM(POLL_TYPE), int) #define HFI1_IOCTL_ACK_EVENT \ - _IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long) + _IOW(RDMA_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long) #define HFI1_IOCTL_SET_PKEY \ - _IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16) + _IOW(RDMA_IOCTL_MAGIC, __NUM(SET_PKEY), __u16) #define HFI1_IOCTL_CTXT_RESET \ - _IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET)) + _IO(RDMA_IOCTL_MAGIC, __NUM(CTXT_RESET)) #define HFI1_IOCTL_TID_INVAL_READ \ - _IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info) + _IOWR(RDMA_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info) #define HFI1_IOCTL_GET_VERS \ - _IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int) + _IOR(RDMA_IOCTL_MAGIC, __NUM(GET_VERS), int) #endif /* RDMA_USER_IOCTL_H */ -- cgit v1.2.3 From fa83b7936e9799d9cd96a26fbcba22d2baa7ae11 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Sep 2016 14:30:35 +0300 Subject: RDMA/core: Unify style of IOCTL commands MAD and HFI1 have different naming convention, this patch simplifies and unifies their defines and 
names. As part of cleanup, the HFI1 _NUM() macro and command indexes were removed (controversial). This will cause intentional (and arguably unnecessary) breakage to the PSM user space library. Signed-off-by: Matan Barak Signed-off-by: Haggai Eran Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- include/uapi/rdma/rdma_user_ioctl.h | 98 ++++++++++++++----------------------- 1 file changed, 38 insertions(+), 60 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index 7ecf8cd17f58..9388125ad51b 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -39,71 +39,49 @@ #include /* Documentation/ioctl/ioctl-number.txt */ -#define RDMA_IOCTL_MAGIC 0x1b +#define RDMA_IOCTL_MAGIC 0x1b /* Legacy name, for user space application which already use it */ -#define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC - -#define IB_USER_MAD_REGISTER_AGENT _IOWR(RDMA_IOCTL_MAGIC, 1, \ - struct ib_user_mad_reg_req) - -#define IB_USER_MAD_UNREGISTER_AGENT _IOW(RDMA_IOCTL_MAGIC, 2, __u32) - -#define IB_USER_MAD_ENABLE_PKEY _IO(RDMA_IOCTL_MAGIC, 3) - -#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(RDMA_IOCTL_MAGIC, 4, \ - struct ib_user_mad_reg_req2) - -/* User commands. */ -#define HFI1_CMD_ASSIGN_CTXT 1 /* allocate HFI and context */ -#define HFI1_CMD_CTXT_INFO 2 /* find out what resources we got */ -#define HFI1_CMD_USER_INFO 3 /* set up userspace */ -#define HFI1_CMD_TID_UPDATE 4 /* update expected TID entries */ -#define HFI1_CMD_TID_FREE 5 /* free expected TID entries */ -#define HFI1_CMD_CREDIT_UPD 6 /* force an update of PIO credit */ - -#define HFI1_CMD_RECV_CTRL 8 /* control receipt of packets */ -#define HFI1_CMD_POLL_TYPE 9 /* set the kind of polling we want */ -#define HFI1_CMD_ACK_EVENT 10 /* ack & clear user status bits */ -#define HFI1_CMD_SET_PKEY 11 /* set context's pkey */ -#define HFI1_CMD_CTXT_RESET 12 /* reset context's HW send context */ -#define HFI1_CMD_TID_INVAL_READ 13 /* read TID cache invalidations */ -#define HFI1_CMD_GET_VERS 14 /* get the version of the user cdev */ +#define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC /* - * User IOCTLs can not go above 128 if they do then see common.h and change the - * base for the snoop ioctl + * General blocks assignments + * It is closed on purpose do not expose it it user space + * #define MAD_CMD_BASE 0x00 + * #define HFI1_CMD_BAS 0xE0 */ -/* - * Make the ioctls occupy the last 0xf0-0xff portion of the IB range - */ -#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0) +/* MAD specific section */ +#define IB_USER_MAD_REGISTER_AGENT _IOWR(RDMA_IOCTL_MAGIC, 0x01, struct ib_user_mad_reg_req) +#define IB_USER_MAD_UNREGISTER_AGENT _IOW(RDMA_IOCTL_MAGIC, 0x02, __u32) +#define IB_USER_MAD_ENABLE_PKEY _IO(RDMA_IOCTL_MAGIC, 0x03) +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(RDMA_IOCTL_MAGIC, 0x04, struct ib_user_mad_reg_req2) -#define HFI1_IOCTL_ASSIGN_CTXT \ - _IOWR(RDMA_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info) -#define HFI1_IOCTL_CTXT_INFO \ - _IOW(RDMA_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info) -#define HFI1_IOCTL_USER_INFO \ - _IOW(RDMA_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info) -#define HFI1_IOCTL_TID_UPDATE \ - _IOWR(RDMA_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info) -#define HFI1_IOCTL_TID_FREE \ - _IOWR(RDMA_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info) -#define HFI1_IOCTL_CREDIT_UPD \ - _IO(RDMA_IOCTL_MAGIC, __NUM(CREDIT_UPD)) -#define HFI1_IOCTL_RECV_CTRL \ - _IOW(RDMA_IOCTL_MAGIC, 
__NUM(RECV_CTRL), int) -#define HFI1_IOCTL_POLL_TYPE \ - _IOW(RDMA_IOCTL_MAGIC, __NUM(POLL_TYPE), int) -#define HFI1_IOCTL_ACK_EVENT \ - _IOW(RDMA_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long) -#define HFI1_IOCTL_SET_PKEY \ - _IOW(RDMA_IOCTL_MAGIC, __NUM(SET_PKEY), __u16) -#define HFI1_IOCTL_CTXT_RESET \ - _IO(RDMA_IOCTL_MAGIC, __NUM(CTXT_RESET)) -#define HFI1_IOCTL_TID_INVAL_READ \ - _IOWR(RDMA_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info) -#define HFI1_IOCTL_GET_VERS \ - _IOR(RDMA_IOCTL_MAGIC, __NUM(GET_VERS), int) +/* HFI specific section */ +/* allocate HFI and context */ +#define HFI1_IOCTL_ASSIGN_CTXT _IOWR(RDMA_IOCTL_MAGIC, 0xE1, struct hfi1_user_info) +/* find out what resources we got */ +#define HFI1_IOCTL_CTXT_INFO _IOW(RDMA_IOCTL_MAGIC, 0xE2, struct hfi1_ctxt_info) +/* set up userspace */ +#define HFI1_IOCTL_USER_INFO _IOW(RDMA_IOCTL_MAGIC, 0xE3, struct hfi1_base_info) +/* update expected TID entries */ +#define HFI1_IOCTL_TID_UPDATE _IOWR(RDMA_IOCTL_MAGIC, 0xE4, struct hfi1_tid_info) +/* free expected TID entries */ +#define HFI1_IOCTL_TID_FREE _IOWR(RDMA_IOCTL_MAGIC, 0xE5, struct hfi1_tid_info) +/* force an update of PIO credit */ +#define HFI1_IOCTL_CREDIT_UPD _IO(RDMA_IOCTL_MAGIC, 0xE6) +/* control receipt of packets */ +#define HFI1_IOCTL_RECV_CTRL _IOW(RDMA_IOCTL_MAGIC, 0xE8, int) +/* set the kind of polling we want */ +#define HFI1_IOCTL_POLL_TYPE _IOW(RDMA_IOCTL_MAGIC, 0xE9, int) +/* ack & clear user status bits */ +#define HFI1_IOCTL_ACK_EVENT _IOW(RDMA_IOCTL_MAGIC, 0xEA, unsigned long) +/* set context's pkey */ +#define HFI1_IOCTL_SET_PKEY _IOW(RDMA_IOCTL_MAGIC, 0xEB, __u16) +/* reset context's HW send context */ +#define HFI1_IOCTL_CTXT_RESET _IO(RDMA_IOCTL_MAGIC, 0xEC) +/* read TID cache invalidations */ +#define HFI1_IOCTL_TID_INVAL_READ _IOWR(RDMA_IOCTL_MAGIC, 0xED, struct hfi1_tid_info) +/* get the version of the user cdev */ +#define HFI1_IOCTL_GET_VERS _IOR(RDMA_IOCTL_MAGIC, 0xEE, int) #endif /* RDMA_USER_IOCTL_H */ -- cgit v1.2.3 From 69ae543969abeba48e04dd93277684c8c0895f3b Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Mon, 19 Dec 2016 11:28:46 -0800 Subject: RDMA: Adding ethertype ETH_P_IBOE Update the if_ether.h with the ethertype for Infiniband over Ethernet packets. Also, removing the occurances of 0x8915 from infiniband vendor drivers. Signed-off-by: Selvin Xavier Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/qp.c | 6 +----- drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 4 ++-- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3 ++- drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 5 ----- drivers/infiniband/hw/qedr/qedr_cm.c | 2 +- drivers/infiniband/hw/qedr/qedr_cm.h | 1 - drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h | 1 - drivers/infiniband/hw/usnic/usnic_fwd.h | 3 ++- include/uapi/linux/if_ether.h | 1 + 9 files changed, 9 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index c068add8838b..7d76f769233c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -76,10 +76,6 @@ enum { MLX4_IB_LSO_HEADER_SPARE = 128, }; -enum { - MLX4_IB_IBOE_ETHERTYPE = 0x8915 -}; - struct mlx4_ib_sqp { struct mlx4_ib_qp qp; int pkey_index; @@ -2588,7 +2584,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, u16 ether_type; u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; - ether_type = (!is_udp) ? MLX4_IB_IBOE_ETHERTYPE : + ether_type = (!is_udp) ? 
ETH_P_IBOE: (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6); mlx->sched_prio = cpu_to_be16(pcp); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 14d33b0f3950..cd66e1e45dd7 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -59,7 +59,7 @@ static u16 ocrdma_hdr_type_to_proto_num(int devid, u8 hdr_type) { switch (hdr_type) { case OCRDMA_L3_TYPE_IB_GRH: - return (u16)0x8915; + return (u16)ETH_P_IBOE; case OCRDMA_L3_TYPE_IPV4: return (u16)0x0800; case OCRDMA_L3_TYPE_IPV6: @@ -94,7 +94,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, proto_num = ocrdma_hdr_type_to_proto_num(dev->id, ah->hdr_type); if (!proto_num) return -EINVAL; - nxthdr = (proto_num == 0x8915) ? 0x1b : 0x11; + nxthdr = (proto_num == ETH_P_IBOE) ? 0x1b : 0x11; /* VLAN */ if (!vlan_tag || (vlan_tag > 0xFFF)) vlan_tag = dev->pvid; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 9a305201545e..aa6967197620 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -2984,7 +2985,7 @@ static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype, OCRDMA_APP_PARAM_APP_PROTO_MASK; if ( - valid && proto == OCRDMA_APP_PROTO_ROCE && + valid && proto == ETH_P_IBOE && proto_sel == OCRDMA_PROTO_SELECT_L2) { for (slindx = 0; slindx < OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 37df4481bb8f..6ef89c226ad8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -1901,7 +1901,6 @@ struct ocrdma_eth_vlan { u8 smac[6]; __be16 eth_type; __be16 vlan_tag; -#define OCRDMA_ROCE_ETH_TYPE 0x8915 __be16 roce_eth_type; } __packed; @@ -2179,10 +2178,6 @@ enum OCRDMA_DCBX_PARAM_TYPE { OCRDMA_PARAMETER_TYPE_PEER = 0x02 }; -enum OCRDMA_DCBX_APP_PROTO { - OCRDMA_APP_PROTO_ROCE = 0x8915 -}; - enum OCRDMA_DCBX_PROTO { OCRDMA_PROTO_SELECT_L2 = 0x00, OCRDMA_PROTO_SELECT_L4 = 0x01 diff --git a/drivers/infiniband/hw/qedr/qedr_cm.c b/drivers/infiniband/hw/qedr/qedr_cm.c index 63890ebb72bd..c6aee0333f30 100644 --- a/drivers/infiniband/hw/qedr/qedr_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_cm.c @@ -293,7 +293,7 @@ static inline int qedr_gsi_build_header(struct qedr_dev *dev, has_udp = (sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP); if (!has_udp) { /* RoCE v1 */ - ether_type = ETH_P_ROCE; + ether_type = ETH_P_IBOE; *roce_mode = ROCE_V1; } else if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) { /* RoCE v2 IPv4 */ diff --git a/drivers/infiniband/hw/qedr/qedr_cm.h b/drivers/infiniband/hw/qedr/qedr_cm.h index 9ba6e15cd93f..78efb1b056d1 100644 --- a/drivers/infiniband/hw/qedr/qedr_cm.h +++ b/drivers/infiniband/hw/qedr/qedr_cm.h @@ -37,7 +37,6 @@ #define QEDR_GSI_MAX_RECV_SGE (1) /* LL2 FW limitation */ -#define ETH_P_ROCE (0x8915) #define QEDR_ROCE_V2_UDP_SPORT (0000) static inline u32 qedr_get_ipv4_from_gid(u8 *gid) diff --git a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h index 596e0ed49a8e..bf7d197a9f42 100644 --- a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h +++ b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h @@ -34,7 +34,6 @@ #ifndef USNIC_CMN_PKT_HDR_H #define USNIC_CMN_PKT_HDR_H -#define USNIC_ROCE_ETHERTYPE (0x8915) #define 
USNIC_ROCE_GRH_VER (8) #define USNIC_PROTO_VER (1) #define USNIC_ROCE_GRH_VER_SHIFT (4) diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h index 3a8add9ddf46..b2ac22be0731 100644 --- a/drivers/infiniband/hw/usnic/usnic_fwd.h +++ b/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -36,6 +36,7 @@ #include #include +#include #include #include @@ -97,7 +98,7 @@ static inline void usnic_fwd_init_usnic_filter(struct filter *filter, uint32_t usnic_id) { filter->type = FILTER_USNIC_ID; - filter->u.usnic.ethtype = USNIC_ROCE_ETHERTYPE; + filter->u.usnic.ethtype = ETH_P_IBOE; filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE | FILTER_FIELD_USNIC_ID | FILTER_FIELD_USNIC_PROTO; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 3e5185e9ef03..5bc9bfd816b7 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -93,6 +93,7 @@ #define ETH_P_NCSI 0x88F8 /* NCSI protocol */ #define ETH_P_PRP 0x88FB /* IEC 62439-3 PRP/HSRv0 */ #define ETH_P_FCOE 0x8906 /* Fibre Channel over Ethernet */ +#define ETH_P_IBOE 0x8915 /* Infiniband over Ethernet */ #define ETH_P_TDLS 0x890D /* TDLS */ #define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */ #define ETH_P_80221 0x8917 /* IEEE 802.21 Media Independent Handover Protocol */ -- cgit v1.2.3 From 99d31326cbe6951872af5c8a6bc2679388a4d9ef Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 11 Jan 2017 14:05:43 +0100 Subject: net/sched: cls_flower: Support matching on ARP Support matching on ARP operation, and hardware and protocol addresses for Ethernet hardware and IPv4 protocol addresses. Example usage: tc qdisc add dev eth0 ingress tc filter add dev eth0 protocol arp parent ffff: flower indev eth0 \ arp_op request arp_sip 10.0.0.1 action drop tc filter add dev eth0 protocol rarp parent ffff: flower indev eth0 \ arp_op reply arp_tha 52:54:3f:00:00:00/24 action drop Signed-off-by: Simon Horman Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 11 ++++++++++ net/sched/cls_flower.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index a081efbd61a2..1e5e1ddfdaca 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -416,6 +416,17 @@ enum { TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ARP_SIP, /* be32 */ + TCA_FLOWER_KEY_ARP_SIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_OP, /* u8 */ + TCA_FLOWER_KEY_ARP_OP_MASK, /* u8 */ + TCA_FLOWER_KEY_ARP_SHA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_SHA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 970db7a41684..a3bfda3091a4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -40,6 +40,7 @@ struct fl_flow_key { }; struct flow_dissector_key_ports tp; struct flow_dissector_key_icmp icmp; + struct flow_dissector_key_arp arp; struct flow_dissector_key_keyid enc_key_id; union { struct flow_dissector_key_ipv4_addrs enc_ipv4; @@ -401,6 +402,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_SIP] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_SIP_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_TIP] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_TIP_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_OP] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_OP_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_SHA] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN }, }; static void fl_set_key_val(struct nlattr **tb, @@ -572,6 +583,23 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE_MASK, sizeof(key->icmp.code)); + } else if (key->basic.n_proto == htons(ETH_P_ARP) || + key->basic.n_proto == htons(ETH_P_RARP)) { + fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP, + &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK, + sizeof(key->arp.sip)); + fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP, + &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK, + sizeof(key->arp.tip)); + fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP, + &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK, + sizeof(key->arp.op)); + fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA, + mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK, + sizeof(key->arp.sha)); + fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, + mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, + sizeof(key->arp.tha)); } if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || @@ -688,6 +716,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_PORTS, tp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ARP, arp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1112,6 +1142,27 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, 
unsigned long fh, TCA_FLOWER_KEY_ICMPV6_CODE_MASK, sizeof(key->icmp.code)))) goto nla_put_failure; + else if ((key->basic.n_proto == htons(ETH_P_ARP) || + key->basic.n_proto == htons(ETH_P_RARP)) && + (fl_dump_key_val(skb, &key->arp.sip, + TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip, + TCA_FLOWER_KEY_ARP_SIP_MASK, + sizeof(key->arp.sip)) || + fl_dump_key_val(skb, &key->arp.tip, + TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip, + TCA_FLOWER_KEY_ARP_TIP_MASK, + sizeof(key->arp.tip)) || + fl_dump_key_val(skb, &key->arp.op, + TCA_FLOWER_KEY_ARP_OP, &mask->arp.op, + TCA_FLOWER_KEY_ARP_OP_MASK, + sizeof(key->arp.op)) || + fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA, + mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK, + sizeof(key->arp.sha)) || + fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, + mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, + sizeof(key->arp.tha)))) + goto nla_put_failure; if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, -- cgit v1.2.3 From 696d0b0c715360ce28fedd3c8b009d3771a5ddeb Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Wed, 11 Jan 2017 19:19:33 -0600 Subject: scsi: cxlflash: Support SQ Command Mode The SISLite specification outlines a new queuing model to improve over the MMIO-based IOARRIN model that exists today. This new model uses a submission queue that exists in host memory and is shared with the device. Each entry in the queue is an IOARCB that describes a transfer request. When requests are submitted, IOARCBs ('current' position tracked in host software) are populated and the submission queue tail pointer is then updated via MMIO to make the device aware of the requests. Signed-off-by: Matthew R. Ochs Signed-off-by: Uma Krishnan Signed-off-by: Martin K. Petersen --- drivers/scsi/cxlflash/common.h | 30 +++++++++++- drivers/scsi/cxlflash/main.c | 98 ++++++++++++++++++++++++++++++++++++-- drivers/scsi/cxlflash/sislite.h | 19 +++++++- drivers/scsi/cxlflash/superpipe.c | 18 +++++-- include/uapi/scsi/cxlflash_ioctl.h | 1 + 5 files changed, 153 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h index 0e9de5d62da2..dee865735ac0 100644 --- a/drivers/scsi/cxlflash/common.h +++ b/drivers/scsi/cxlflash/common.h @@ -54,6 +54,9 @@ extern const struct file_operations cxlflash_cxl_fops; /* RRQ for master issued cmds */ #define NUM_RRQ_ENTRY CXLFLASH_MAX_CMDS +/* SQ for master issued cmds */ +#define NUM_SQ_ENTRY CXLFLASH_MAX_CMDS + static inline void check_sizes(void) { @@ -155,8 +158,8 @@ static inline struct afu_cmd *sc_to_afucz(struct scsi_cmnd *sc) struct afu { /* Stuff requiring alignment go first. */ - - u64 rrq_entry[NUM_RRQ_ENTRY]; /* 2K RRQ */ + struct sisl_ioarcb sq[NUM_SQ_ENTRY]; /* 16K SQ */ + u64 rrq_entry[NUM_RRQ_ENTRY]; /* 2K RRQ */ /* Beware of alignment till here. 
Preferably introduce new * fields after this point @@ -174,6 +177,12 @@ struct afu { struct kref mapcount; ctx_hndl_t ctx_hndl; /* master's context handle */ + + atomic_t hsq_credits; + spinlock_t hsq_slock; + struct sisl_ioarcb *hsq_start; + struct sisl_ioarcb *hsq_end; + struct sisl_ioarcb *hsq_curr; u64 *hrrq_start; u64 *hrrq_end; u64 *hrrq_curr; @@ -191,6 +200,23 @@ struct afu { }; +static inline bool afu_is_cmd_mode(struct afu *afu, u64 cmd_mode) +{ + u64 afu_cap = afu->interface_version >> SISL_INTVER_CAP_SHIFT; + + return afu_cap & cmd_mode; +} + +static inline bool afu_is_sq_cmd_mode(struct afu *afu) +{ + return afu_is_cmd_mode(afu, SISL_INTVER_CAP_SQ_CMD_MODE); +} + +static inline bool afu_is_ioarrin_cmd_mode(struct afu *afu) +{ + return afu_is_cmd_mode(afu, SISL_INTVER_CAP_IOARRIN_CMD_MODE); +} + static inline u64 lun_to_lunid(u64 lun) { __be64 lun_id; diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index a990efb27197..d2bac4b7b85f 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -226,6 +226,17 @@ static void context_reset_ioarrin(struct afu_cmd *cmd) context_reset(cmd, &afu->host_map->ioarrin); } +/** + * context_reset_sq() - reset command owner context w/ SQ Context Reset register + * @cmd: AFU command that timed out. + */ +static void context_reset_sq(struct afu_cmd *cmd) +{ + struct afu *afu = cmd->parent; + + context_reset(cmd, &afu->host_map->sq_ctx_reset); +} + /** * send_cmd_ioarrin() - sends an AFU command via IOARRIN register * @afu: AFU associated with the host. @@ -268,6 +279,49 @@ out: return rc; } +/** + * send_cmd_sq() - sends an AFU command via SQ ring + * @afu: AFU associated with the host. + * @cmd: AFU command to send. + * + * Return: + * 0 on success, SCSI_MLQUEUE_HOST_BUSY on failure + */ +static int send_cmd_sq(struct afu *afu, struct afu_cmd *cmd) +{ + struct cxlflash_cfg *cfg = afu->parent; + struct device *dev = &cfg->dev->dev; + int rc = 0; + int newval; + ulong lock_flags; + + newval = atomic_dec_if_positive(&afu->hsq_credits); + if (newval <= 0) { + rc = SCSI_MLQUEUE_HOST_BUSY; + goto out; + } + + cmd->rcb.ioasa = &cmd->sa; + + spin_lock_irqsave(&afu->hsq_slock, lock_flags); + + *afu->hsq_curr = cmd->rcb; + if (afu->hsq_curr < afu->hsq_end) + afu->hsq_curr++; + else + afu->hsq_curr = afu->hsq_start; + writeq_be((u64)afu->hsq_curr, &afu->host_map->sq_tail); + + spin_unlock_irqrestore(&afu->hsq_slock, lock_flags); +out: + dev_dbg(dev, "%s: cmd=%p len=%d ea=%p ioasa=%p rc=%d curr=%p " + "head=%016llX tail=%016llX\n", __func__, cmd, cmd->rcb.data_len, + (void *)cmd->rcb.data_ea, cmd->rcb.ioasa, rc, afu->hsq_curr, + readq_be(&afu->host_map->sq_head), + readq_be(&afu->host_map->sq_tail)); + return rc; +} + /** * wait_resp() - polls for a response or timeout to a sent AFU command * @afu: AFU associated with the host. @@ -739,7 +793,7 @@ static int alloc_mem(struct cxlflash_cfg *cfg) int rc = 0; struct device *dev = &cfg->dev->dev; - /* AFU is ~12k, i.e. only one 64k page or up to four 4k pages */ + /* AFU is ~28k, i.e. 
only one 64k page or up to seven 4k pages */ cfg->afu = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(sizeof(struct afu))); if (unlikely(!cfg->afu)) { @@ -1127,6 +1181,8 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data) { struct afu *afu = (struct afu *)data; struct afu_cmd *cmd; + struct sisl_ioasa *ioasa; + struct sisl_ioarcb *ioarcb; bool toggle = afu->toggle; u64 entry, *hrrq_start = afu->hrrq_start, @@ -1140,7 +1196,16 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data) if ((entry & SISL_RESP_HANDLE_T_BIT) != toggle) break; - cmd = (struct afu_cmd *)(entry & ~SISL_RESP_HANDLE_T_BIT); + entry &= ~SISL_RESP_HANDLE_T_BIT; + + if (afu_is_sq_cmd_mode(afu)) { + ioasa = (struct sisl_ioasa *)entry; + cmd = container_of(ioasa, struct afu_cmd, sa); + } else { + ioarcb = (struct sisl_ioarcb *)entry; + cmd = container_of(ioarcb, struct afu_cmd, rcb); + } + cmd_complete(cmd); /* Advance to next entry or wrap and flip the toggle bit */ @@ -1150,6 +1215,8 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data) hrrq_curr = hrrq_start; toggle ^= SISL_RESP_HANDLE_T_BIT; } + + atomic_inc(&afu->hsq_credits); } afu->hrrq_curr = hrrq_curr; @@ -1402,10 +1469,15 @@ static int init_global(struct cxlflash_cfg *cfg) pr_debug("%s: wwpn0=0x%llX wwpn1=0x%llX\n", __func__, wwpn[0], wwpn[1]); - /* Set up RRQ in AFU for master issued cmds */ + /* Set up RRQ and SQ in AFU for master issued cmds */ writeq_be((u64) afu->hrrq_start, &afu->host_map->rrq_start); writeq_be((u64) afu->hrrq_end, &afu->host_map->rrq_end); + if (afu_is_sq_cmd_mode(afu)) { + writeq_be((u64)afu->hsq_start, &afu->host_map->sq_start); + writeq_be((u64)afu->hsq_end, &afu->host_map->sq_end); + } + /* AFU configuration */ reg = readq_be(&afu->afu_map->global.regs.afu_config); reg |= SISL_AFUCONF_AR_ALL|SISL_AFUCONF_ENDIAN; @@ -1480,6 +1552,17 @@ static int start_afu(struct cxlflash_cfg *cfg) afu->hrrq_curr = afu->hrrq_start; afu->toggle = 1; + /* Initialize SQ */ + if (afu_is_sq_cmd_mode(afu)) { + memset(&afu->sq, 0, sizeof(afu->sq)); + afu->hsq_start = &afu->sq[0]; + afu->hsq_end = &afu->sq[NUM_SQ_ENTRY - 1]; + afu->hsq_curr = afu->hsq_start; + + spin_lock_init(&afu->hsq_slock); + atomic_set(&afu->hsq_credits, NUM_SQ_ENTRY - 1); + } + rc = init_global(cfg); pr_debug("%s: returning rc=%d\n", __func__, rc); @@ -1641,8 +1724,13 @@ static int init_afu(struct cxlflash_cfg *cfg) goto err2; } - afu->send_cmd = send_cmd_ioarrin; - afu->context_reset = context_reset_ioarrin; + if (afu_is_sq_cmd_mode(afu)) { + afu->send_cmd = send_cmd_sq; + afu->context_reset = context_reset_sq; + } else { + afu->send_cmd = send_cmd_ioarrin; + afu->context_reset = context_reset_ioarrin; + } pr_debug("%s: afu version %s, interface version 0x%llX\n", __func__, afu->version, afu->interface_version); diff --git a/drivers/scsi/cxlflash/sislite.h b/drivers/scsi/cxlflash/sislite.h index 1a2d09c148b3..a6e48a893fef 100644 --- a/drivers/scsi/cxlflash/sislite.h +++ b/drivers/scsi/cxlflash/sislite.h @@ -72,7 +72,10 @@ struct sisl_ioarcb { u16 timeout; /* in units specified by req_flags */ u32 rsvd1; u8 cdb[16]; /* must be in big endian */ - u64 reserved; /* Reserved area */ + union { + u64 reserved; /* Reserved for IOARRIN mode */ + struct sisl_ioasa *ioasa; /* IOASA EA for SQ Mode */ + }; } __packed; struct sisl_rc { @@ -260,6 +263,11 @@ struct sisl_host_map { __be64 cmd_room; __be64 ctx_ctrl; /* least significant byte or b56:63 is LISN# */ __be64 mbox_w; /* restricted use */ + __be64 sq_start; /* Submission Queue (R/W): write sequence and */ + __be64 
sq_end; /* inclusion semantics are the same as RRQ */ + __be64 sq_head; /* Submission Queue Head (R): for debugging */ + __be64 sq_tail; /* Submission Queue TAIL (R/W): next IOARCB */ + __be64 sq_ctx_reset; /* Submission Queue Context Reset (R/W) */ }; /* per context provisioning & control MMIO */ @@ -348,6 +356,15 @@ struct sisl_global_regs { __be64 rsvd[0xf8]; __le64 afu_version; __be64 interface_version; +#define SISL_INTVER_CAP_SHIFT 16 +#define SISL_INTVER_MAJ_SHIFT 8 +#define SISL_INTVER_CAP_MASK 0xFFFFFFFF00000000ULL +#define SISL_INTVER_MAJ_MASK 0x00000000FFFF0000ULL +#define SISL_INTVER_MIN_MASK 0x000000000000FFFFULL +#define SISL_INTVER_CAP_IOARRIN_CMD_MODE 0x800000000000ULL +#define SISL_INTVER_CAP_SQ_CMD_MODE 0x400000000000ULL +#define SISL_INTVER_CAP_RESERVED_CMD_MODE_A 0x200000000000ULL +#define SISL_INTVER_CAP_RESERVED_CMD_MODE_B 0x100000000000ULL }; #define CXLFLASH_NUM_FC_PORTS 2 diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index 9636970d9611..42674ae6f4dd 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -1287,6 +1287,7 @@ static int cxlflash_disk_attach(struct scsi_device *sdev, int rc = 0; u32 perms; int ctxid = -1; + u64 flags = 0UL; u64 rctxid = 0UL; struct file *file = NULL; @@ -1426,10 +1427,11 @@ static int cxlflash_disk_attach(struct scsi_device *sdev, out_attach: if (fd != -1) - attach->hdr.return_flags = DK_CXLFLASH_APP_CLOSE_ADAP_FD; - else - attach->hdr.return_flags = 0; + flags |= DK_CXLFLASH_APP_CLOSE_ADAP_FD; + if (afu_is_sq_cmd_mode(afu)) + flags |= DK_CXLFLASH_CONTEXT_SQ_CMD_MODE; + attach->hdr.return_flags = flags; attach->context_id = ctxi->ctxid; attach->block_size = gli->blk_len; attach->mmio_size = sizeof(afu->afu_map->hosts[0].harea); @@ -1617,6 +1619,7 @@ static int cxlflash_afu_recover(struct scsi_device *sdev, struct afu *afu = cfg->afu; struct ctx_info *ctxi = NULL; struct mutex *mutex = &cfg->ctx_recovery_mutex; + u64 flags; u64 ctxid = DECODE_CTXID(recover->context_id), rctxid = recover->context_id; long reg; @@ -1672,11 +1675,16 @@ retry_recover: } ctxi->err_recovery_active = false; + + flags = DK_CXLFLASH_APP_CLOSE_ADAP_FD | + DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET; + if (afu_is_sq_cmd_mode(afu)) + flags |= DK_CXLFLASH_CONTEXT_SQ_CMD_MODE; + + recover->hdr.return_flags = flags; recover->context_id = ctxi->ctxid; recover->adap_fd = new_adap_fd; recover->mmio_size = sizeof(afu->afu_map->hosts[0].harea); - recover->hdr.return_flags = DK_CXLFLASH_APP_CLOSE_ADAP_FD | - DK_CXLFLASH_RECOVER_AFU_CONTEXT_RESET; goto out; } diff --git a/include/uapi/scsi/cxlflash_ioctl.h b/include/uapi/scsi/cxlflash_ioctl.h index 6bf1f8a022b1..e9fdc12ad984 100644 --- a/include/uapi/scsi/cxlflash_ioctl.h +++ b/include/uapi/scsi/cxlflash_ioctl.h @@ -40,6 +40,7 @@ struct dk_cxlflash_hdr { */ #define DK_CXLFLASH_ALL_PORTS_ACTIVE 0x0000000000000001ULL #define DK_CXLFLASH_APP_CLOSE_ADAP_FD 0x0000000000000002ULL +#define DK_CXLFLASH_CONTEXT_SQ_CMD_MODE 0x0000000000000004ULL /* * General Notes: -- cgit v1.2.3 From a2d6a987bfe4a2e344fae9d255200072eb082427 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Thu, 5 Jan 2017 12:54:18 -0600 Subject: serial: 8250: Add new port type for TI DA8xx/66AK2x This adds a new UART port type for TI DA8xx/OMAPL13x/AM17xx/AM18xx/66AK2x. These SoCs have standard 8250 registers plus some extra non-standard registers. The UART will not function unless the non-standard Power and Emulation Management Register (PWREMU_MGMT) is configured correctly. 
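For orientation, the required configuration is small; here is a hedged sketch of the enable sequence, mirroring what serial8250_do_startup() gains later in this patch and assuming the 8250 driver context (serial_port_out(), mdelay()), with the register offset and bit names this patch adds to include/uapi/linux/serial_reg.h:

	/* Not the driver itself, just the PWREMU_MGMT bring-up sequence. */
	static void da8xx_uart_enable(struct uart_port *port)
	{
		serial_port_out(port, UART_IER, 0);		  /* mask interrupts */
		serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); /* hold in reset */
		mdelay(10);
		serial_port_out(port, UART_DA830_PWREMU_MGMT,
				UART_DA830_PWREMU_MGMT_UTRST |	  /* release Tx reset */
				UART_DA830_PWREMU_MGMT_URRST |	  /* release Rx reset */
				UART_DA830_PWREMU_MGMT_FREE);	  /* free-running mode */
	}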
This is currently handled in arch/arm/mach-davinci/serial.c for non-device-tree boards. Making this part of the UART driver will allow UART to work on device-tree boards as well and the mach code can eventually be removed. Signed-off-by: David Lechner Acked-by: Sekhar Nori Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_of.c | 1 + drivers/tty/serial/8250/8250_port.c | 22 ++++++++++++++++++++++ include/uapi/linux/serial_core.h | 3 ++- include/uapi/linux/serial_reg.h | 8 ++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c index d25ab1cd4295..52812524abfb 100644 --- a/drivers/tty/serial/8250/8250_of.c +++ b/drivers/tty/serial/8250/8250_of.c @@ -332,6 +332,7 @@ static const struct of_device_id of_platform_serial_table[] = { .data = (void *)PORT_ALTR_16550_F128, }, { .compatible = "mrvl,mmp-uart", .data = (void *)PORT_XSCALE, }, + { .compatible = "ti,da830-uart", .data = (void *)PORT_DA830, }, { /* end of list */ }, }; MODULE_DEVICE_TABLE(of, of_platform_serial_table); diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 3cfdd745a97a..f88028a62f23 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -273,6 +273,15 @@ static const struct serial8250_config uart_config[] = { .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, + [PORT_DA830] = { + .name = "TI DA8xx/66AK2x", + .fifo_size = 16, + .tx_loadsz = 16, + .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | + UART_FCR_R_TRIG_10, + .rxtrig_bytes = {1, 4, 8, 14}, + .flags = UART_CAP_FIFO | UART_CAP_AFE, + }, }; /* Uart divisor latch read */ @@ -2114,6 +2123,19 @@ int serial8250_do_startup(struct uart_port *port) serial_port_out(port, UART_LCR, 0); } + if (port->type == PORT_DA830) { + /* Reset the port */ + serial_port_out(port, UART_IER, 0); + serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); + mdelay(10); + + /* Enable Tx, Rx and free run mode */ + serial_port_out(port, UART_DA830_PWREMU_MGMT, + UART_DA830_PWREMU_MGMT_UTRST | + UART_DA830_PWREMU_MGMT_URRST | + UART_DA830_PWREMU_MGMT_FREE); + } + #ifdef CONFIG_SERIAL_8250_RSA /* * If this is an RSA port, see if we can kick it up to the diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 99dbed8a8874..9ec741b133fe 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -56,7 +56,8 @@ #define PORT_ALTR_16550_F128 28 /* Altera 16550 UART with 128 FIFOs */ #define PORT_RT2880 29 /* Ralink RT2880 internal UART */ #define PORT_16550A_FSL64 30 /* Freescale 16550 UART with 64 FIFOs */ -#define PORT_MAX_8250 30 /* max port ID */ +#define PORT_DA830 31 /* TI DA8xx/66AK2x */ +#define PORT_MAX_8250 31 /* max port ID */ /* * ARM specific type numbers. 
These are not currently guaranteed diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index b4c04842a8c0..274d8fc206e3 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -327,6 +327,14 @@ #define SERIAL_RSA_BAUD_BASE (921600) #define SERIAL_RSA_BAUD_BASE_LO (SERIAL_RSA_BAUD_BASE / 8) +/* Extra registers for TI DA8xx/66AK2x */ +#define UART_DA830_PWREMU_MGMT 12 + +/* PWREMU_MGMT register bits */ +#define UART_DA830_PWREMU_MGMT_FREE (1 << 0) /* Free-running mode */ +#define UART_DA830_PWREMU_MGMT_URRST (1 << 13) /* Receiver reset/enable */ +#define UART_DA830_PWREMU_MGMT_UTRST (1 << 14) /* Transmitter reset/enable */ + /* * Extra serial register definitions for the internal UARTs * in TI OMAP processors. -- cgit v1.2.3 From ab5bb2d51ba8da4add4612b529968446cdb504b1 Mon Sep 17 00:00:00 2001 From: vamsi krishna Date: Fri, 13 Jan 2017 01:12:19 +0200 Subject: cfg80211: Add support for randomizing TA of Public Action frames Add support to use a random local address (Address 2 = TA in transmit and the same address in receive functionality) for Public Action frames in order to improve privacy of WLAN clients. Applications fill the random transmit address in the frame buffer in the NL80211_CMD_FRAME command. This can be used only with the drivers that indicate support for random local address by setting the new NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA and/or NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED in ext_features. The driver needs to configure receive behavior to accept frames to the specified random address during the time the frame exchange is pending and such frames need to be acknowledged similarly to frames sent to the local permanent address when this random address functionality is not used. Signed-off-by: vamsi krishna Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 ++++++ net/wireless/mlme.c | 21 +++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 174f4b30e804..908886c83894 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4699,6 +4699,10 @@ enum nl80211_feature_flags { * configuration (AP/mesh) with VHT rates. * @NL80211_EXT_FEATURE_FILS_STA: This driver supports Fast Initial Link Setup * with user space SME (NL80211_CMD_AUTHENTICATE) in station mode. + * @NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA: This driver supports randomized TA + * in @NL80211_CMD_FRAME while not associated. + * @NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED: This driver supports + * randomized TA in @NL80211_CMD_FRAME while associated. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
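Usage aside, not part of the patch: the application generates the random TA itself and writes it into Address 2 (the SA field) of the management frame it passes with NL80211_CMD_FRAME. A hedged userspace sketch follows; pick_random_ta() is a hypothetical helper (seeding omitted), and the one firm constraint illustrated is that the address must be unicast, with locally administered addresses being the conventional choice:

	#include <stdint.h>
	#include <stdlib.h>

	/* Hypothetical helper: pick a random unicast TA. */
	static void pick_random_ta(uint8_t ta[6])
	{
		for (int i = 0; i < 6; i++)
			ta[i] = rand() & 0xff;
		ta[0] &= ~0x01;	/* clear I/G bit: unicast, not group */
		ta[0] |= 0x02;	/* set U/L bit: locally administered */
	}

The same address is then matched on receive for the duration of the pending frame exchange, as described above.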
@@ -4714,6 +4718,8 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BEACON_RATE_HT, NL80211_EXT_FEATURE_BEACON_RATE_VHT, NL80211_EXT_FEATURE_FILS_STA, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 1c63a77aea34..b876f40c9dad 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -662,8 +662,25 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return err; } - if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) - return -EINVAL; + if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) { + /* Allow random TA to be used with Public Action frames if the + * driver has indicated support for this. Otherwise, only allow + * the local address to be used. + */ + if (!ieee80211_is_action(mgmt->frame_control) || + mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) + return -EINVAL; + if (!wdev->current_bss && + !wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) + return -EINVAL; + if (wdev->current_bss && + !wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) + return -EINVAL; + } /* Transmit the Action frame as requested by user space */ return rdev_mgmt_tx(rdev, wdev, params, cookie); -- cgit v1.2.3 From bf95ecdba93b98d27ac219e79f773f2074b4ca47 Mon Sep 17 00:00:00 2001 From: vamsi krishna Date: Fri, 13 Jan 2017 01:12:20 +0200 Subject: cfg80211: Add support to sched scan to report better BSSs Enhance sched scan to support option of finding a better BSS while in connected state. Firmware scans the medium and reports when it finds a known BSS which has better RSSI than the current connected BSS. New attributes to specify the relative RSSI (compared to the current BSS) are added to the sched scan to implement this. Signed-off-by: vamsi krishna Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 36 +++++++++++++++++++++++++----------- include/uapi/linux/nl80211.h | 30 ++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb13789ebaef..4456491132cd 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1619,6 +1619,17 @@ struct cfg80211_sched_scan_plan { u32 iterations; }; +/** + * struct cfg80211_bss_select_adjust - BSS selection with RSSI adjustment. + * + * @band: band of BSS which should match for RSSI level adjustment. + * @delta: value of RSSI level adjustment. + */ +struct cfg80211_bss_select_adjust { + enum nl80211_band band; + s8 delta; +}; + /** * struct cfg80211_sched_scan_request - scheduled scan request description * @@ -1654,6 +1665,16 @@ struct cfg80211_sched_scan_plan { * cycle. The driver may ignore this parameter and start * immediately (or at any other time), if this feature is not * supported. + * @relative_rssi_set: Indicates whether @relative_rssi is set or not. + * @relative_rssi: Relative RSSI threshold in dB to restrict scan result + * reporting in connected state to cases where a matching BSS is determined + * to have better or slightly worse RSSI than the current connected BSS. + * The relative RSSI threshold values are ignored in disconnected state. 
+ * @rssi_adjust: delta dB of RSSI preference to be given to the BSSs that belong + * to the specified band while deciding whether a better BSS is reported + * using @relative_rssi. If delta is a negative number, the BSSs that + * belong to the specified band will be penalized by delta dB in relative + * comparisons. */ struct cfg80211_sched_scan_request { struct cfg80211_ssid *ssids; @@ -1673,6 +1694,10 @@ struct cfg80211_sched_scan_request { u8 mac_addr[ETH_ALEN] __aligned(2); u8 mac_addr_mask[ETH_ALEN] __aligned(2); + bool relative_rssi_set; + s8 relative_rssi; + struct cfg80211_bss_select_adjust rssi_adjust; + /* internal */ struct wiphy *wiphy; struct net_device *dev; @@ -1980,17 +2005,6 @@ struct cfg80211_ibss_params { struct ieee80211_ht_cap ht_capa_mask; }; -/** - * struct cfg80211_bss_select_adjust - BSS selection with RSSI adjustment. - * - * @band: band of BSS which should match for RSSI level adjustment. - * @delta: value of RSSI level adjustment. - */ -struct cfg80211_bss_select_adjust { - enum nl80211_band band; - s8 delta; -}; - /** * struct cfg80211_bss_selection - connection parameters for BSS selection. * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 908886c83894..6b17feb5e839 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1982,6 +1982,20 @@ enum nl80211_commands { * @NL80211_ATTR_BSSID: The BSSID of the AP. Note that %NL80211_ATTR_MAC is also * used in various commands/events for specifying the BSSID. * + * @NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI: Relative RSSI threshold by which + * other BSSs have to be better or slightly worse than the current + * connected BSS so that they get reported to user space. + * This will give an opportunity to userspace to consider connecting to + * other matching BSSs which have better or slightly worse RSSI than + * the current connected BSS by using an offloaded operation to avoid + * unnecessary wakeups. + * + * @NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST: When present the RSSI level for BSSs in + * the specified band is to be adjusted before doing + * %NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI based comparison to figure out + * better BSSs. The attribute value is a packed structure + * value as specified by &struct nl80211_bss_select_rssi_adjust. + * + * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2388,6 +2402,9 @@ enum nl80211_attrs { NL80211_ATTR_BSSID, + NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI, + NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3080,6 +3097,13 @@ enum nl80211_reg_rule_attr { * how this API was implemented in the past. Also, due to the same problem, * the only way to create a matchset with only an RSSI filter (with this * attribute) is if there's only a single matchset with the RSSI attribute. + * @NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI: Flag indicating whether + * %NL80211_SCHED_SCAN_MATCH_ATTR_RSSI is to be used as absolute RSSI or + * relative to current BSS's RSSI. + * @NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST: When present the RSSI level for + * BSSes in the specified band is to be adjusted before doing + * RSSI-based BSS selection. The attribute value is a packed structure + * value as specified by &struct nl80211_bss_select_rssi_adjust.
* @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter * attribute number currently defined * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use @@ -3089,6 +3113,8 @@ enum nl80211_sched_scan_match_attr { NL80211_SCHED_SCAN_MATCH_ATTR_SSID, NL80211_SCHED_SCAN_MATCH_ATTR_RSSI, + NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI, + NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST, /* keep last */ __NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST, @@ -4703,6 +4729,9 @@ enum nl80211_feature_flags { * in @NL80211_CMD_FRAME while not associated. * @NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED: This driver supports * randomized TA in @NL80211_CMD_FRAME while associated. + * @NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI: The driver supports sched_scan + * for reporting BSSs with better RSSI than the current connected BSS + * (%NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI). * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4720,6 +4749,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_FILS_STA, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED, + NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b378d0a04003..71c66ff9a702 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -405,6 +405,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN }, [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, [NL80211_ATTR_BSSID] = { .len = ETH_ALEN }, + [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 }, + [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = { + .len = sizeof(struct nl80211_bss_select_rssi_adjust) + }, }; /* policy for the key attributes */ @@ -6950,6 +6954,12 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (!n_plans || n_plans > wiphy->max_sched_scan_plans) return ERR_PTR(-EINVAL); + if (!wiphy_ext_feature_isset( + wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) && + (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] || + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST])) + return ERR_PTR(-EINVAL); + request = kzalloc(sizeof(*request) + sizeof(*request->ssids) * n_ssids + sizeof(*request->match_sets) * n_match_sets @@ -7156,6 +7166,26 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request->delay = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]); + if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) { + request->relative_rssi = nla_get_s8( + attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]); + request->relative_rssi_set = true; + } + + if (request->relative_rssi_set && + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) { + struct nl80211_bss_select_rssi_adjust *rssi_adjust; + + rssi_adjust = nla_data( + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]); + request->rssi_adjust.band = rssi_adjust->band; + request->rssi_adjust.delta = rssi_adjust->delta; + if (!is_band_valid(wiphy, request->rssi_adjust.band)) { + err = -EINVAL; + goto out_free; + } + } + err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs); if (err) goto out_free; @@ -9692,6 +9722,20 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay)) return -ENOBUFS; + if (req->relative_rssi_set) { + struct nl80211_bss_select_rssi_adjust rssi_adjust; + + if 
(nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI, + req->relative_rssi)) + return -ENOBUFS; + + rssi_adjust.band = req->rssi_adjust.band; + rssi_adjust.delta = req->rssi_adjust.delta; + if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST, + sizeof(rssi_adjust), &rssi_adjust)) + return -ENOBUFS; + } + freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); if (!freqs) return -ENOBUFS; -- cgit v1.2.3 From 3093ebbeabcdddc9a982950052f2151df43c7aa2 Mon Sep 17 00:00:00 2001 From: Purushottam Kushwaha Date: Fri, 13 Jan 2017 01:12:21 +0200 Subject: cfg80211: Specify the reason for connect timeout This enhances the connect timeout API to also carry the reason for the timeout. These reason codes for the connect time out are represented by enum nl80211_timeout_reason and are passed to user space through a new attribute NL80211_ATTR_TIMEOUT_REASON (u32). Signed-off-by: Purushottam Kushwaha Signed-off-by: Jouni Malinen [keep gfp_t argument last] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 18 ++++++++++++++---- include/uapi/linux/nl80211.h | 21 +++++++++++++++++++++ net/wireless/core.h | 4 +++- net/wireless/mlme.c | 3 ++- net/wireless/nl80211.c | 9 +++++++-- net/wireless/nl80211.h | 4 +++- net/wireless/sme.c | 39 +++++++++++++++++++++++++++------------ net/wireless/util.c | 2 +- 8 files changed, 78 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4456491132cd..9b3427c8d1db 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5090,6 +5090,12 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) * %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you * the real status code for failures. * @gfp: allocation flags + * @timeout_reason: reason for connection timeout. This is used when the + * connection fails due to a timeout instead of an explicit rejection from + * the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is + * not known. This value is used only if @status < 0 to indicate that the + * failure is due to a timeout and not due to explicit rejection by the AP. + * This value is ignored in other cases (@status >= 0). * * It should be called by the underlying driver whenever connect() has * succeeded. This is similar to cfg80211_connect_result(), but with the @@ -5099,7 +5105,8 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, struct cfg80211_bss *bss, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, int status, gfp_t gfp); + size_t resp_ie_len, int status, gfp_t gfp, + enum nl80211_timeout_reason timeout_reason); /** * cfg80211_connect_result - notify cfg80211 of connection result @@ -5125,7 +5132,8 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid, u16 status, gfp_t gfp) { cfg80211_connect_bss(dev, bssid, NULL, req_ie, req_ie_len, resp_ie, - resp_ie_len, status, gfp); + resp_ie_len, status, gfp, + NL80211_TIMEOUT_UNSPECIFIED); } /** @@ -5136,6 +5144,7 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid, * @req_ie: association request IEs (maybe be %NULL) * @req_ie_len: association request IEs length * @gfp: allocation flags + * @timeout_reason: reason for connection timeout. 
* * It should be called by the underlying driver whenever connect() has failed * in a sequence where no explicit authentication/association rejection was @@ -5145,10 +5154,11 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid, */ static inline void cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, gfp_t gfp) + const u8 *req_ie, size_t req_ie_len, gfp_t gfp, + enum nl80211_timeout_reason timeout_reason) { cfg80211_connect_bss(dev, bssid, NULL, req_ie, req_ie_len, NULL, 0, -1, - gfp); + gfp, timeout_reason); } /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6b17feb5e839..c51b40cc0645 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1996,6 +1996,10 @@ enum nl80211_commands { * better BSSs. The attribute value is a packed structure * value as specified by &struct nl80211_bss_select_rssi_adjust. * + * @NL80211_ATTR_TIMEOUT_REASON: The reason for which an operation timed out. + * u32 attribute with an &enum nl80211_timeout_reason value. This is used, + * e.g., with %NL80211_CMD_CONNECT event. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2405,6 +2409,8 @@ enum nl80211_attrs { NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST, + NL80211_ATTR_TIMEOUT_REASON, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4788,6 +4794,21 @@ enum nl80211_connect_failed_reason { NL80211_CONN_FAIL_BLOCKED_CLIENT, }; +/** + * enum nl80211_timeout_reason - timeout reasons + * + * @NL80211_TIMEOUT_UNSPECIFIED: Timeout reason unspecified. + * @NL80211_TIMEOUT_SCAN: Scan (AP discovery) timed out. + * @NL80211_TIMEOUT_AUTH: Authentication timed out. + * @NL80211_TIMEOUT_ASSOC: Association timed out. 
+ */ +enum nl80211_timeout_reason { + NL80211_TIMEOUT_UNSPECIFIED, + NL80211_TIMEOUT_SCAN, + NL80211_TIMEOUT_AUTH, + NL80211_TIMEOUT_ASSOC, +}; + /** * enum nl80211_scan_flags - scan request control flags * diff --git a/net/wireless/core.h b/net/wireless/core.h index ba42055a036d..58ca206982fe 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -228,6 +228,7 @@ struct cfg80211_event { size_t resp_ie_len; struct cfg80211_bss *bss; int status; /* -1 = failed; 0..65535 = status code */ + enum nl80211_timeout_reason timeout_reason; } cr; struct { const u8 *req_ie; @@ -388,7 +389,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, int status, bool wextev, - struct cfg80211_bss *bss); + struct cfg80211_bss *bss, + enum nl80211_timeout_reason timeout_reason); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); int cfg80211_disconnect(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index b876f40c9dad..22b3d9990065 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -48,7 +48,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, status_code, - status_code == WLAN_STATUS_SUCCESS, bss); + status_code == WLAN_STATUS_SUCCESS, bss, + NL80211_TIMEOUT_UNSPECIFIED); } EXPORT_SYMBOL(cfg80211_rx_assoc_resp); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 71c66ff9a702..b4e7bdd673e0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -409,6 +409,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = { .len = sizeof(struct nl80211_bss_select_rssi_adjust) }, + [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 }, }; /* policy for the key attributes */ @@ -13231,7 +13232,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - int status, gfp_t gfp) + int status, + enum nl80211_timeout_reason timeout_reason, + gfp_t gfp) { struct sk_buff *msg; void *hdr; @@ -13252,7 +13255,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, status < 0 ? 
WLAN_STATUS_UNSPECIFIED_FAILURE : status) || - (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) || + (status < 0 && + (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) || + nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) || (req_ie && nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || (resp_ie && diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 75f82520211d..e488dca87423 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -56,7 +56,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - int status, gfp_t gfp); + int status, + enum nl80211_timeout_reason timeout_reason, + gfp_t gfp); void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 46693913fcea..b347e63d7aaa 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -34,10 +34,11 @@ struct cfg80211_conn { CFG80211_CONN_SCAN_AGAIN, CFG80211_CONN_AUTHENTICATE_NEXT, CFG80211_CONN_AUTHENTICATING, - CFG80211_CONN_AUTH_FAILED, + CFG80211_CONN_AUTH_FAILED_TIMEOUT, CFG80211_CONN_ASSOCIATE_NEXT, CFG80211_CONN_ASSOCIATING, CFG80211_CONN_ASSOC_FAILED, + CFG80211_CONN_ASSOC_FAILED_TIMEOUT, CFG80211_CONN_DEAUTH, CFG80211_CONN_ABANDON, CFG80211_CONN_CONNECTED, @@ -140,7 +141,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) return err; } -static int cfg80211_conn_do_work(struct wireless_dev *wdev) +static int cfg80211_conn_do_work(struct wireless_dev *wdev, + enum nl80211_timeout_reason *treason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; @@ -171,7 +173,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) NULL, 0, params->key, params->key_len, params->key_idx, NULL, 0); - case CFG80211_CONN_AUTH_FAILED: + case CFG80211_CONN_AUTH_FAILED_TIMEOUT: + *treason = NL80211_TIMEOUT_AUTH; return -ENOTCONN; case CFG80211_CONN_ASSOCIATE_NEXT: if (WARN_ON(!rdev->ops->assoc)) @@ -198,6 +201,9 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) WLAN_REASON_DEAUTH_LEAVING, false); return err; + case CFG80211_CONN_ASSOC_FAILED_TIMEOUT: + *treason = NL80211_TIMEOUT_ASSOC; + /* fall through */ case CFG80211_CONN_ASSOC_FAILED: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, @@ -223,6 +229,7 @@ void cfg80211_conn_work(struct work_struct *work) container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; u8 bssid_buf[ETH_ALEN], *bssid = NULL; + enum nl80211_timeout_reason treason; rtnl_lock(); @@ -244,10 +251,12 @@ void cfg80211_conn_work(struct work_struct *work) memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); bssid = bssid_buf; } - if (cfg80211_conn_do_work(wdev)) { + treason = NL80211_TIMEOUT_UNSPECIFIED; + if (cfg80211_conn_do_work(wdev, &treason)) { __cfg80211_connect_result( wdev->netdev, bssid, - NULL, 0, NULL, 0, -1, false, NULL); + NULL, 0, NULL, 0, -1, false, NULL, + treason); } wdev_unlock(wdev); } @@ -352,7 +361,8 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) } else if (status_code != WLAN_STATUS_SUCCESS) { __cfg80211_connect_result(wdev->netdev, mgmt->bssid, NULL, 0, NULL, 0, - status_code, false, NULL); + status_code, false, NULL, + NL80211_TIMEOUT_UNSPECIFIED); } else if (wdev->conn->state == 
CFG80211_CONN_AUTHENTICATING) { wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); @@ -400,7 +410,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) if (!wdev->conn) return; - wdev->conn->state = CFG80211_CONN_AUTH_FAILED; + wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } @@ -422,7 +432,7 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) if (!wdev->conn) return; - wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; + wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } @@ -564,7 +574,9 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, /* we're good if we have a matching bss struct */ if (bss) { - err = cfg80211_conn_do_work(wdev); + enum nl80211_timeout_reason treason; + + err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { /* otherwise we'll need to scan for the AP first */ @@ -661,7 +673,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, int status, bool wextev, - struct cfg80211_bss *bss) + struct cfg80211_bss *bss, + enum nl80211_timeout_reason timeout_reason) { struct wireless_dev *wdev = dev->ieee80211_ptr; const u8 *country_ie; @@ -680,7 +693,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, bssid, req_ie, req_ie_len, resp_ie, resp_ie_len, - status, GFP_KERNEL); + status, timeout_reason, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (wextev) { @@ -771,7 +784,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, struct cfg80211_bss *bss, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, int status, gfp_t gfp) + size_t resp_ie_len, int status, gfp_t gfp, + enum nl80211_timeout_reason timeout_reason) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -811,6 +825,7 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, cfg80211_hold_bss(bss_from_pub(bss)); ev->cr.bss = bss; ev->cr.status = status; + ev->cr.timeout_reason = timeout_reason; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); diff --git a/net/wireless/util.c b/net/wireless/util.c index cd8a7ae55e7d..1b9296882dcd 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -951,7 +951,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) ev->cr.resp_ie, ev->cr.resp_ie_len, ev->cr.status, ev->cr.status == WLAN_STATUS_SUCCESS, - ev->cr.bss); + ev->cr.bss, ev->cr.timeout_reason); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie, -- cgit v1.2.3 From a50a05f497a2a6e772900ffe93246fb7243d86d8 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Sun, 15 Jan 2017 15:26:16 +0100 Subject: ipv6: sr: add missing Kbuild export for header files Add missing IPv6-SR header files in include/uapi/linux/Kbuild. Also, prevent seg6_lwt_headroom() from being exported and add missing linux/types.h include. Signed-off-by: David Lebrun Signed-off-by: David S. 
Miller --- include/uapi/linux/Kbuild | 4 ++++ include/uapi/linux/seg6.h | 2 ++ include/uapi/linux/seg6_hmac.h | 1 + include/uapi/linux/seg6_iptunnel.h | 4 ++++ 4 files changed, 11 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index f330ba4547cf..e600b50be77e 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -379,6 +379,10 @@ header-y += sctp.h header-y += sdla.h header-y += seccomp.h header-y += securebits.h +header-y += seg6_genl.h +header-y += seg6.h +header-y += seg6_hmac.h +header-y += seg6_iptunnel.h header-y += selinux_netlink.h header-y += sem.h header-y += serial_core.h diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h index c396a8052f73..33496595064c 100644 --- a/include/uapi/linux/seg6.h +++ b/include/uapi/linux/seg6.h @@ -14,6 +14,8 @@ #ifndef _UAPI_LINUX_SEG6_H #define _UAPI_LINUX_SEG6_H +#include + /* * SRH */ diff --git a/include/uapi/linux/seg6_hmac.h b/include/uapi/linux/seg6_hmac.h index b652dfd51bc5..e691c753fc3f 100644 --- a/include/uapi/linux/seg6_hmac.h +++ b/include/uapi/linux/seg6_hmac.h @@ -1,6 +1,7 @@ #ifndef _UAPI_LINUX_SEG6_HMAC_H #define _UAPI_LINUX_SEG6_HMAC_H +#include #include #define SEG6_HMAC_SECRET_LEN 64 diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index 0f7dbd280a9c..7a7183d4062a 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -33,6 +33,8 @@ enum { SEG6_IPTUN_MODE_ENCAP, }; +#ifdef __KERNEL__ + static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP); @@ -42,3 +44,5 @@ static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) } #endif + +#endif -- cgit v1.2.3 From aefb4d4ad83b608cb8e0cab8d3cd8e57d3f91feb Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Mon, 16 Jan 2017 14:16:36 +0000 Subject: net: AF-specific RTM_GETSTATS attributes Add the functionality for including address-family-specific per-link stats in RTM_GETSTATS messages. This is done through adding a new IFLA_STATS_AF_SPEC attribute under which address family attributes are nested and then the AF-specific attributes can be further nested. This follows the model of IFLA_AF_SPEC on RTM_*LINK messages and it has the advantage of presenting an easily extended hierarchy. The rtnl_af_ops structure is extended to provide AFs with the opportunity to fill and provide the size of their stats attributes. One alternative would have been to provide AFs with the ability to add attributes directly into the RTM_GETSTATS message without a nested hierarchy. I discounted this approach as it increases the rate at which the 32 attribute number space is used up and it makes implementation a little more tricky for stats dump resuming (at the moment the order in which attributes are added to the message has to match the numeric order of the attributes). Another alternative would have been to register per-AF RTM_GETSTATS handlers. I discounted this approach as I perceived a common use-case to be getting all the stats for an interface and this approach would necessitate multiple requests/dumps to retrieve them all. Signed-off-by: Robert Shearman Acked-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- include/net/rtnetlink.h | 4 ++++ include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) (limited to 'include/uapi') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 4113916cc1bb..106de5f7bf06 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -139,6 +139,10 @@ struct rtnl_af_ops { const struct nlattr *attr); int (*set_link_af)(struct net_device *dev, const struct nlattr *attr); + + int (*fill_stats_af)(struct sk_buff *skb, + const struct net_device *dev); + size_t (*get_stats_af_size)(const struct net_device *dev); }; void __rtnl_af_unregister(struct rtnl_af_ops *ops); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6b13e591abc9..184b16ed2b84 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -847,6 +847,7 @@ enum { IFLA_STATS_LINK_XSTATS, IFLA_STATS_LINK_XSTATS_SLAVE, IFLA_STATS_LINK_OFFLOAD_XSTATS, + IFLA_STATS_AF_SPEC, __IFLA_STATS_MAX, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 75e3ea7bda08..f538f764fca6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3829,6 +3829,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = 0; } + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { + struct rtnl_af_ops *af_ops; + + *idxattr = IFLA_STATS_AF_SPEC; + attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC); + if (!attr) + goto nla_put_failure; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_stats_af) { + struct nlattr *af; + int err; + + af = nla_nest_start(skb, af_ops->family); + if (!af) + goto nla_put_failure; + + err = af_ops->fill_stats_af(skb, dev); + + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, af); + } + } + + nla_nest_end(skb, attr); + + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3885,6 +3918,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) size += rtnl_get_offload_stats_size(dev); + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { + struct rtnl_af_ops *af_ops; + + /* for IFLA_STATS_AF_SPEC */ + size += nla_total_size(0); + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_stats_af_size) { + size += nla_total_size( + af_ops->get_stats_af_size(dev)); + + /* for AF_* */ + size += nla_total_size(0); + } + } + } + return size; } -- cgit v1.2.3 From 27d691056bde4a6feca5e83fd92b787332c46302 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Mon, 16 Jan 2017 14:16:37 +0000 Subject: mpls: Packet stats Having MPLS packet stats is useful for observing network operation and for diagnosing network problems. In the absence of anything better, RFC2863 and RFC3813 are used for guidance for which stats to expose and the semantics of them. In particular rx_noroutes maps to in unknown protos in RFC2863. The stats are exposed to userspace via AF_MPLS attributes embedded in the IFLA_STATS_AF_SPEC attribute of RTM_GETSTATS messages. All the introduced fields are 64-bit, even error ones, to ensure no overflow with long uptimes. Per-CPU counters are used to avoid cache-line contention on the commonly used fields. 
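A hedged sketch of the per-CPU update pattern this implies, assuming kernel context; the real macros land in net/mpls/internal.h, which is not shown in full here, so the name and exact shape are assumptions:

	#define MPLS_INC_STATS(mdev, field)				\
		do {							\
			struct mpls_pcpu_stats *mcs;			\
									\
			local_bh_disable();				\
			mcs = this_cpu_ptr((mdev)->stats);		\
			u64_stats_update_begin(&mcs->syncp);		\
			mcs->stats.field++;				\
			u64_stats_update_end(&mcs->syncp);		\
			local_bh_enable();				\
		} while (0)

The u64_stats_sync seqcount keeps the 64-bit counters consistent on 32-bit hosts, with readers retrying via the u64_stats_fetch_begin()/u64_stats_fetch_retry() pair visible in mpls_get_stats() below.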
The other fields have also been made per-CPU for code to avoid performance problems in error conditions on the assumption that on some platforms the cost of atomic operations could be more expensive than sending the packet (which is what would be done in the success case). If that's not the case, we could instead not use per-CPU counters for these fields. Only unicast and non-fragment are exposed at the moment, but other counters can be exposed in the future either by adding to the end of struct mpls_link_stats or by additional netlink attributes in the AF_MPLS IFLA_STATS_AF_SPEC nested attribute. Signed-off-by: Robert Shearman Signed-off-by: David S. Miller --- include/uapi/linux/mpls.h | 30 ++++++++ net/mpls/af_mpls.c | 181 ++++++++++++++++++++++++++++++++++++++++------ net/mpls/internal.h | 58 ++++++++++++++- net/mpls/mpls_iptunnel.c | 11 ++- 4 files changed, 252 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h index 24a6cb1aec86..77a19dfe3990 100644 --- a/include/uapi/linux/mpls.h +++ b/include/uapi/linux/mpls.h @@ -43,4 +43,34 @@ struct mpls_label { #define MPLS_LABEL_FIRST_UNRESERVED 16 /* RFC3032 */ +/* These are embedded into IFLA_STATS_AF_SPEC: + * [IFLA_STATS_AF_SPEC] + * -> [AF_MPLS] + * -> [MPLS_STATS_xxx] + * + * Attributes: + * [MPLS_STATS_LINK] = { + * struct mpls_link_stats + * } + */ +enum { + MPLS_STATS_UNSPEC, /* also used as 64bit pad attribute */ + MPLS_STATS_LINK, + __MPLS_STATS_MAX, +}; + +#define MPLS_STATS_MAX (__MPLS_STATS_MAX - 1) + +struct mpls_link_stats { + __u64 rx_packets; /* total packets received */ + __u64 tx_packets; /* total packets transmitted */ + __u64 rx_bytes; /* total bytes received */ + __u64 tx_bytes; /* total bytes transmitted */ + __u64 rx_errors; /* bad packets received */ + __u64 tx_errors; /* packet transmit problems */ + __u64 rx_dropped; /* packet dropped on receive */ + __u64 tx_dropped; /* packet dropped on transmit */ + __u64 rx_noroute; /* no route for packet dest */ +}; + #endif /* _UAPI_MPLS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 15fe97644ffe..4dc81963af8f 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -17,8 +18,8 @@ #include #if IS_ENABLED(CONFIG_IPV6) #include -#include #endif +#include #include #include "internal.h" @@ -48,11 +49,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) return rt; } -static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) -{ - return rcu_dereference_rtnl(dev->mpls_ptr); -} - bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); @@ -98,6 +94,31 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); +void mpls_stats_inc_outucastpkts(struct net_device *dev, + const struct sk_buff *skb) +{ + struct mpls_dev *mdev; + + if (skb->protocol == htons(ETH_P_MPLS_UC)) { + mdev = mpls_dev_get(dev); + if (mdev) + MPLS_INC_STATS_LEN(mdev, skb->len, + tx_packets, + tx_bytes); + } else if (skb->protocol == htons(ETH_P_IP)) { + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + struct inet6_dev *in6dev = __in6_dev_get(dev); + + if (in6dev) + IP6_UPD_PO_STATS(dev_net(dev), in6dev, + IPSTATS_MIB_OUT, skb->len); +#endif + } +} +EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts); + 
static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb, bool bos) { @@ -253,6 +274,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct mpls_nh *nh; struct mpls_entry_decoded dec; struct net_device *out_dev; + struct mpls_dev *out_mdev; struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; @@ -262,17 +284,25 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ mdev = mpls_dev_get(dev); - if (!mdev || !mdev->input_enabled) + if (!mdev) goto drop; - if (skb->pkt_type != PACKET_HOST) + MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets, + rx_bytes); + + if (!mdev->input_enabled) { + MPLS_INC_STATS(mdev, rx_dropped); goto drop; + } + + if (skb->pkt_type != PACKET_HOST) + goto err; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) - goto drop; + goto err; if (!pskb_may_pull(skb, sizeof(*hdr))) - goto drop; + goto err; /* Read and decode the label */ hdr = mpls_hdr(skb); @@ -285,33 +315,35 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, skb_orphan(skb); rt = mpls_route_input_rcu(net, dec.label); - if (!rt) + if (!rt) { + MPLS_INC_STATS(mdev, rx_noroute); goto drop; + } nh = mpls_select_multipath(rt, skb, dec.bos); if (!nh) - goto drop; - - /* Find the output device */ - out_dev = rcu_dereference(nh->nh_dev); - if (!mpls_output_possible(out_dev)) - goto drop; + goto err; if (skb_warn_if_lro(skb)) - goto drop; + goto err; skb_forward_csum(skb); /* Verify ttl is valid */ if (dec.ttl <= 1) - goto drop; + goto err; dec.ttl -= 1; + /* Find the output device */ + out_dev = rcu_dereference(nh->nh_dev); + if (!mpls_output_possible(out_dev)) + goto tx_err; + /* Verify the destination can hold the packet */ new_header_size = mpls_nh_header_size(nh); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) - goto drop; + goto tx_err; hh_len = LL_RESERVED_SPACE(out_dev); if (!out_dev->header_ops) @@ -319,7 +351,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Ensure there is enough space for the headers in the skb */ if (skb_cow(skb, hh_len + new_header_size)) - goto drop; + goto tx_err; skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); @@ -327,7 +359,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(rt, skb, dec)) - goto drop; + goto err; } else { bool bos; int i; @@ -343,6 +375,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, } } + mpls_stats_inc_outucastpkts(out_dev, skb); + /* If via wasn't specified then send out using device address */ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, @@ -355,6 +389,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, __func__, err); return 0; +tx_err: + out_mdev = out_dev ? 
mpls_dev_get(out_dev) : NULL; + if (out_mdev) + MPLS_INC_STATS(out_mdev, tx_errors); + goto drop; +err: + MPLS_INC_STATS(mdev, rx_errors); drop: kfree_skb(skb); return NET_RX_DROP; @@ -853,6 +894,70 @@ errout: return err; } +static void mpls_get_stats(struct mpls_dev *mdev, + struct mpls_link_stats *stats) +{ + struct mpls_pcpu_stats *p; + int i; + + memset(stats, 0, sizeof(*stats)); + + for_each_possible_cpu(i) { + struct mpls_link_stats local; + unsigned int start; + + p = per_cpu_ptr(mdev->stats, i); + do { + start = u64_stats_fetch_begin(&p->syncp); + local = p->stats; + } while (u64_stats_fetch_retry(&p->syncp, start)); + + stats->rx_packets += local.rx_packets; + stats->rx_bytes += local.rx_bytes; + stats->tx_packets += local.tx_packets; + stats->tx_bytes += local.tx_bytes; + stats->rx_errors += local.rx_errors; + stats->tx_errors += local.tx_errors; + stats->rx_dropped += local.rx_dropped; + stats->tx_dropped += local.tx_dropped; + stats->rx_noroute += local.rx_noroute; + } +} + +static int mpls_fill_stats_af(struct sk_buff *skb, + const struct net_device *dev) +{ + struct mpls_link_stats *stats; + struct mpls_dev *mdev; + struct nlattr *nla; + + mdev = mpls_dev_get(dev); + if (!mdev) + return -ENODATA; + + nla = nla_reserve_64bit(skb, MPLS_STATS_LINK, + sizeof(struct mpls_link_stats), + MPLS_STATS_UNSPEC); + if (!nla) + return -EMSGSIZE; + + stats = nla_data(nla); + mpls_get_stats(mdev, stats); + + return 0; +} + +static size_t mpls_get_stats_af_size(const struct net_device *dev) +{ + struct mpls_dev *mdev; + + mdev = mpls_dev_get(dev); + if (!mdev) + return 0; + + return nla_total_size_64bit(sizeof(struct mpls_link_stats)); +} + #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ (&((struct mpls_dev *)0)->field) @@ -911,6 +1016,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) { struct mpls_dev *mdev; int err = -ENOMEM; + int i; ASSERT_RTNL(); @@ -918,6 +1024,17 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) if (!mdev) return ERR_PTR(err); + mdev->stats = alloc_percpu(struct mpls_pcpu_stats); + if (!mdev->stats) + goto free; + + for_each_possible_cpu(i) { + struct mpls_pcpu_stats *mpls_stats; + + mpls_stats = per_cpu_ptr(mdev->stats, i); + u64_stats_init(&mpls_stats->syncp); + } + err = mpls_dev_sysctl_register(dev, mdev); if (err) goto free; @@ -927,10 +1044,19 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) return mdev; free: + free_percpu(mdev->stats); kfree(mdev); return ERR_PTR(err); } +static void mpls_dev_destroy_rcu(struct rcu_head *head) +{ + struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu); + + free_percpu(mdev->stats); + kfree(mdev); +} + static void mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; @@ -1045,7 +1171,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, if (mdev) { mpls_dev_sysctl_unregister(mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); - kfree_rcu(mdev, rcu); + call_rcu(&mdev->rcu, mpls_dev_destroy_rcu); } break; case NETDEV_CHANGENAME: @@ -1706,6 +1832,12 @@ static struct pernet_operations mpls_net_ops = { .exit = mpls_net_exit, }; +static struct rtnl_af_ops mpls_af_ops __read_mostly = { + .family = AF_MPLS, + .fill_stats_af = mpls_fill_stats_af, + .get_stats_af_size = mpls_get_stats_af_size, +}; + static int __init mpls_init(void) { int err; @@ -1722,6 +1854,8 @@ static int __init mpls_init(void) dev_add_pack(&mpls_packet_type); + rtnl_af_register(&mpls_af_ops); + rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 
NULL); rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); @@ -1738,6 +1872,7 @@ module_init(mpls_init); static void __exit mpls_exit(void) { rtnl_unregister_all(PF_MPLS); + rtnl_af_unregister(&mpls_af_ops); dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); diff --git a/net/mpls/internal.h b/net/mpls/internal.h index bdfef6c3271a..d97243034605 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -9,13 +9,58 @@ struct mpls_entry_decoded { u8 bos; }; +struct mpls_pcpu_stats { + struct mpls_link_stats stats; + struct u64_stats_sync syncp; +}; + struct mpls_dev { - int input_enabled; + int input_enabled; - struct ctl_table_header *sysctl; - struct rcu_head rcu; + struct mpls_pcpu_stats __percpu *stats; + + struct ctl_table_header *sysctl; + struct rcu_head rcu; }; +#if BITS_PER_LONG == 32 + +#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ + do { \ + __typeof__(*(mdev)->stats) *ptr = \ + raw_cpu_ptr((mdev)->stats); \ + local_bh_disable(); \ + u64_stats_update_begin(&ptr->syncp); \ + ptr->stats.pkts_field++; \ + ptr->stats.bytes_field += (len); \ + u64_stats_update_end(&ptr->syncp); \ + local_bh_enable(); \ + } while (0) + +#define MPLS_INC_STATS(mdev, field) \ + do { \ + __typeof__(*(mdev)->stats) *ptr = \ + raw_cpu_ptr((mdev)->stats); \ + local_bh_disable(); \ + u64_stats_update_begin(&ptr->syncp); \ + ptr->stats.field++; \ + u64_stats_update_end(&ptr->syncp); \ + local_bh_enable(); \ + } while (0) + +#else + +#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ + do { \ + this_cpu_inc((mdev)->stats->stats.pkts_field); \ + this_cpu_add((mdev)->stats->stats.bytes_field, (len)); \ + } while (0) + +#define MPLS_INC_STATS(mdev, field) \ + this_cpu_inc((mdev)->stats->stats.field) + +#endif + struct sk_buff; #define LABEL_NOT_SPECIFIED (1 << 20) @@ -114,6 +159,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } +static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->mpls_ptr); +} + int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels, @@ -123,5 +173,7 @@ int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); +void mpls_stats_inc_outucastpkts(struct net_device *dev, + const struct sk_buff *skb); #endif /* MPLS_INTERNAL_H */ diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 2f7ccd934416..02531284bc49 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -48,11 +48,15 @@ static int mpls_xmit(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; + struct mpls_dev *out_mdev; int err = 0; bool bos; int i; unsigned int ttl; + /* Find the output device */ + out_dev = dst->dev; + /* Obtain the ttl */ if (dst->ops->family == AF_INET) { ttl = ip_hdr(skb)->ttl; @@ -66,8 +70,6 @@ static int mpls_xmit(struct sk_buff *skb) skb_orphan(skb); - /* Find the output device */ - out_dev = dst->dev; if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; @@ -109,6 +111,8 @@ static int 
mpls_xmit(struct sk_buff *skb) bos = false; } + mpls_stats_inc_outucastpkts(out_dev, skb); + if (rt) err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, skb); @@ -122,6 +126,9 @@ static int mpls_xmit(struct sk_buff *skb) return LWTUNNEL_XMIT_DONE; drop: + out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; + if (out_mdev) + MPLS_INC_STATS(out_mdev, tx_errors); kfree_skb(skb); return -EINVAL; } -- cgit v1.2.3 From 53631a5f9c6669264adb7b4e92fd95d1d6ffa7d3 Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Mon, 16 Jan 2017 18:11:35 -0500 Subject: bridge: sparse fixes in br_ip6_multicast_alloc_query() Changed the type of the csum field in struct igmpv3_query from __be16 to __sum16 to eliminate a type warning, and made the same change in struct igmpv3_report for consistency. Fixed up an ntohs() where htons() should have been used instead. Signed-off-by: Lance Richardson Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/igmp.h | 4 ++-- net/bridge/br_multicast.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/igmp.h b/include/uapi/linux/igmp.h index ccbb32aa6704..a97f9a7568cf 100644 --- a/include/uapi/linux/igmp.h +++ b/include/uapi/linux/igmp.h @@ -53,7 +53,7 @@ struct igmpv3_grec { struct igmpv3_report { __u8 type; __u8 resv1; - __be16 csum; + __sum16 csum; __be16 resv2; __be16 ngrec; struct igmpv3_grec grec[0]; @@ -62,7 +62,7 @@ struct igmpv3_report { struct igmpv3_query { __u8 type; __u8 code; - __be16 csum; + __sum16 csum; __be32 group; #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 qrv:3, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b30e77e8427c..f66346122dc4 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -540,7 +540,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, break; case 2: mld2q = (struct mld2_query *)icmp6_hdr(skb); - mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval)); + mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval)); mld2q->mld2q_type = ICMPV6_MGM_QUERY; mld2q->mld2q_code = 0; mld2q->mld2q_cksum = 0; -- cgit v1.2.3 From c0cdc19f84a4712cf74888f83af286e3c2e14efd Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 11 Jan 2017 06:35:12 -0800 Subject: rpmsg: Driver for user space endpoint interface This driver allows rpmsg instances to expose access to rpmsg endpoints to user space processes. It provides a control interface, allowing userspace to export endpoints, and an endpoint interface for each exposed endpoint. The implementation is based on prior art by Texas Instruments, Google, PetaLogix and was derived from a FreeRTOS performance statistics driver written by Michal Simek. The control interface provides a "create endpoint" ioctl, which is fed a name, source and destination address. The three values are used to create the endpoint, in a backend-specific way, and an rpmsg endpoint device is created - with the three parameters available in sysfs for udev usage. E.g. to create an endpoint device for one of the Qualcomm SMD channels related to DIAG one would issue: struct rpmsg_endpoint_info info = { "DIAG_CNTL", 0, 0 }; int fd = open("/dev/rpmsg_ctrl0", O_RDWR); ioctl(fd, RPMSG_CREATE_EPT_IOCTL, &info); Each created endpoint device shows up as an individual character device in /dev, allowing permission to be controlled on a per-endpoint basis.
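To make the full flow concrete, a user-space session might look like the minimal sketch below; the "DIAG_CNTL" channel, the /dev/rpmsg_ctrl0 control node and the /dev/rpmsg0 name that udev assigns to the new endpoint are assumptions of this example, not fixed by the interface, and error handling is kept minimal:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/rpmsg.h>

    int main(void)
    {
            /* hypothetical channel; name/src/dst depend on the remote side */
            struct rpmsg_endpoint_info info = { "DIAG_CNTL", 0, 0 };
            char buf[256];
            int ctrl, ept, n;

            ctrl = open("/dev/rpmsg_ctrl0", O_RDWR);
            if (ctrl < 0 || ioctl(ctrl, RPMSG_CREATE_EPT_IOCTL, &info) < 0)
                    return 1;

            /* udev names the endpoint device; rpmsg0 is assumed here */
            ept = open("/dev/rpmsg0", O_RDWR);
            if (ept < 0)
                    return 1;

            write(ept, "hello", 5);          /* message sent over the endpoint */
            n = read(ept, buf, sizeof(buf)); /* blocks until a message queues */
            if (n > 0)
                    printf("received %d bytes\n", n);

            ioctl(ept, RPMSG_DESTROY_EPT_IOCTL); /* tear the endpoint down */
            close(ept);
            close(ctrl);
            return 0;
    }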
The rpmsg endpoint will be created and destroyed following the opening and closing of the endpoint device, allowing rpmsg backends to open and close the physical channel, if supported by the wire protocol. Cc: Marek Novak Cc: Matteo Sartori Cc: Michal Simek Signed-off-by: Bjorn Andersson --- Documentation/ioctl/ioctl-number.txt | 1 + drivers/rpmsg/Kconfig | 8 + drivers/rpmsg/Makefile | 1 + drivers/rpmsg/rpmsg_char.c | 585 +++++++++++++++++++++++++++++++++++ drivers/rpmsg/rpmsg_internal.h | 15 + include/uapi/linux/rpmsg.h | 35 +++ 6 files changed, 645 insertions(+) create mode 100644 drivers/rpmsg/rpmsg_char.c create mode 100644 include/uapi/linux/rpmsg.h (limited to 'include/uapi') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 81c7f2bb7daf..08244bea5048 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -321,6 +321,7 @@ Code Seq#(hex) Include File Comments 0xB1 00-1F PPPoX 0xB3 00 linux/mmc/ioctl.h 0xB4 00-0F linux/gpio.h +0xB5 00-0F uapi/linux/rpmsg.h 0xC0 00-0F linux/usb/iowarrior.h 0xCA 00-0F uapi/misc/cxl.h 0xCA 80-8F uapi/scsi/cxlflash_ioctl.h diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig index de31c5f14dd9..fa0d582efb3d 100644 --- a/drivers/rpmsg/Kconfig +++ b/drivers/rpmsg/Kconfig @@ -4,6 +4,14 @@ menu "Rpmsg drivers" config RPMSG tristate +config RPMSG_CHAR + tristate "RPMSG device interface" + depends on RPMSG + help + Say Y here to export rpmsg endpoints as device files, usually found + in /dev. They make it possible for user-space programs to send and + receive rpmsg packets. + config RPMSG_QCOM_SMD tristate "Qualcomm Shared Memory Driver (SMD)" depends on QCOM_SMEM diff --git a/drivers/rpmsg/Makefile b/drivers/rpmsg/Makefile index ae9c9132cf76..fae9a6d548fb 100644 --- a/drivers/rpmsg/Makefile +++ b/drivers/rpmsg/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_RPMSG) += rpmsg_core.o +obj-$(CONFIG_RPMSG_CHAR) += rpmsg_char.o obj-$(CONFIG_RPMSG_QCOM_SMD) += qcom_smd.o obj-$(CONFIG_RPMSG_VIRTIO) += virtio_rpmsg_bus.o diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c new file mode 100644 index 000000000000..a78b6b79cea4 --- /dev/null +++ b/drivers/rpmsg/rpmsg_char.c @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2016, Linaro Ltd. + * Copyright (c) 2012, Michal Simek + * Copyright (c) 2012, PetaLogix + * Copyright (c) 2011, Texas Instruments, Inc. + * Copyright (c) 2011, Google, Inc. + * + * Based on rpmsg performance statistics driver by Michal Simek, which in turn + * was based on TI & Google OMX rpmsg driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rpmsg_internal.h" + +#define RPMSG_DEV_MAX (MINORMASK + 1) + +static dev_t rpmsg_major; +static struct class *rpmsg_class; + +static DEFINE_IDA(rpmsg_ctrl_ida); +static DEFINE_IDA(rpmsg_ept_ida); +static DEFINE_IDA(rpmsg_minor_ida); + +#define dev_to_eptdev(dev) container_of(dev, struct rpmsg_eptdev, dev) +#define cdev_to_eptdev(i_cdev) container_of(i_cdev, struct rpmsg_eptdev, cdev) + +#define dev_to_ctrldev(dev) container_of(dev, struct rpmsg_ctrldev, dev) +#define cdev_to_ctrldev(i_cdev) container_of(i_cdev, struct rpmsg_ctrldev, cdev) + +/** + * struct rpmsg_ctrldev - control device for instantiating endpoint devices + * @rpdev: underlaying rpmsg device + * @cdev: cdev for the ctrl device + * @dev: device for the ctrl device + */ +struct rpmsg_ctrldev { + struct rpmsg_device *rpdev; + struct cdev cdev; + struct device dev; +}; + +/** + * struct rpmsg_eptdev - endpoint device context + * @dev: endpoint device + * @cdev: cdev for the endpoint device + * @rpdev: underlaying rpmsg device + * @chinfo: info used to open the endpoint + * @ept_lock: synchronization of @ept modifications + * @ept: rpmsg endpoint reference, when open + * @queue_lock: synchronization of @queue operations + * @queue: incoming message queue + * @readq: wait object for incoming queue + */ +struct rpmsg_eptdev { + struct device dev; + struct cdev cdev; + + struct rpmsg_device *rpdev; + struct rpmsg_channel_info chinfo; + + struct mutex ept_lock; + struct rpmsg_endpoint *ept; + + spinlock_t queue_lock; + struct sk_buff_head queue; + wait_queue_head_t readq; +}; + +static int rpmsg_eptdev_destroy(struct device *dev, void *data) +{ + struct rpmsg_eptdev *eptdev = dev_to_eptdev(dev); + + mutex_lock(&eptdev->ept_lock); + if (eptdev->ept) { + rpmsg_destroy_ept(eptdev->ept); + eptdev->ept = NULL; + } + mutex_unlock(&eptdev->ept_lock); + + /* wake up any blocked readers */ + wake_up_interruptible(&eptdev->readq); + + device_del(&eptdev->dev); + put_device(&eptdev->dev); + + return 0; +} + +static int rpmsg_ept_cb(struct rpmsg_device *rpdev, void *buf, int len, + void *priv, u32 addr) +{ + struct rpmsg_eptdev *eptdev = priv; + struct sk_buff *skb; + + skb = alloc_skb(len, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + memcpy(skb_put(skb, len), buf, len); + + spin_lock(&eptdev->queue_lock); + skb_queue_tail(&eptdev->queue, skb); + spin_unlock(&eptdev->queue_lock); + + /* wake up any blocking processes, waiting for new data */ + wake_up_interruptible(&eptdev->readq); + + return 0; +} + +static int rpmsg_eptdev_open(struct inode *inode, struct file *filp) +{ + struct rpmsg_eptdev *eptdev = cdev_to_eptdev(inode->i_cdev); + struct rpmsg_endpoint *ept; + struct rpmsg_device *rpdev = eptdev->rpdev; + struct device *dev = &eptdev->dev; + + get_device(dev); + + ept = rpmsg_create_ept(rpdev, rpmsg_ept_cb, eptdev, eptdev->chinfo); + if (!ept) { + dev_err(dev, "failed to open %s\n", eptdev->chinfo.name); + put_device(dev); + return -EINVAL; + } + + eptdev->ept = ept; + filp->private_data = eptdev; + + return 0; +} + +static int rpmsg_eptdev_release(struct inode *inode, struct file *filp) +{ + struct rpmsg_eptdev *eptdev = cdev_to_eptdev(inode->i_cdev); + struct device *dev = &eptdev->dev; + struct sk_buff *skb; + + /* Close the endpoint, if it's not already destroyed by the parent */ + mutex_lock(&eptdev->ept_lock); + if (eptdev->ept) { + rpmsg_destroy_ept(eptdev->ept); + eptdev->ept = NULL; + } + 
mutex_unlock(&eptdev->ept_lock); + + /* Discard all SKBs */ + while (!skb_queue_empty(&eptdev->queue)) { + skb = skb_dequeue(&eptdev->queue); + kfree_skb(skb); + } + + put_device(dev); + + return 0; +} + +static ssize_t rpmsg_eptdev_read(struct file *filp, char __user *buf, + size_t len, loff_t *f_pos) +{ + struct rpmsg_eptdev *eptdev = filp->private_data; + unsigned long flags; + struct sk_buff *skb; + int use; + + if (!eptdev->ept) + return -EPIPE; + + spin_lock_irqsave(&eptdev->queue_lock, flags); + + /* Wait for data in the queue */ + if (skb_queue_empty(&eptdev->queue)) { + spin_unlock_irqrestore(&eptdev->queue_lock, flags); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + /* Wait until we get data or the endpoint goes away */ + if (wait_event_interruptible(eptdev->readq, + !skb_queue_empty(&eptdev->queue) || + !eptdev->ept)) + return -ERESTARTSYS; + + /* We lost the endpoint while waiting */ + if (!eptdev->ept) + return -EPIPE; + + spin_lock_irqsave(&eptdev->queue_lock, flags); + } + + skb = skb_dequeue(&eptdev->queue); + if (!skb) + return -EFAULT; + + spin_unlock_irqrestore(&eptdev->queue_lock, flags); + + use = min_t(size_t, len, skb->len); + if (copy_to_user(buf, skb->data, use)) + use = -EFAULT; + + kfree_skb(skb); + + return use; +} + +static ssize_t rpmsg_eptdev_write(struct file *filp, const char __user *buf, + size_t len, loff_t *f_pos) +{ + struct rpmsg_eptdev *eptdev = filp->private_data; + void *kbuf; + int ret; + + kbuf = memdup_user(buf, len); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (mutex_lock_interruptible(&eptdev->ept_lock)) { + ret = -ERESTARTSYS; + goto free_kbuf; + } + + if (!eptdev->ept) { + ret = -EPIPE; + goto unlock_eptdev; + } + + if (filp->f_flags & O_NONBLOCK) + ret = rpmsg_trysend(eptdev->ept, kbuf, len); + else + ret = rpmsg_send(eptdev->ept, kbuf, len); + +unlock_eptdev: + mutex_unlock(&eptdev->ept_lock); + +free_kbuf: + kfree(kbuf); + return ret < 0 ? 
ret : len; +} + +static unsigned int rpmsg_eptdev_poll(struct file *filp, poll_table *wait) +{ + struct rpmsg_eptdev *eptdev = filp->private_data; + unsigned int mask = 0; + + if (!eptdev->ept) + return POLLERR; + + poll_wait(filp, &eptdev->readq, wait); + + if (!skb_queue_empty(&eptdev->queue)) + mask |= POLLIN | POLLRDNORM; + + mask |= rpmsg_poll(eptdev->ept, filp, wait); + + return mask; +} + +static long rpmsg_eptdev_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + struct rpmsg_eptdev *eptdev = fp->private_data; + + if (cmd != RPMSG_DESTROY_EPT_IOCTL) + return -EINVAL; + + return rpmsg_eptdev_destroy(&eptdev->dev, NULL); +} + +static const struct file_operations rpmsg_eptdev_fops = { + .owner = THIS_MODULE, + .open = rpmsg_eptdev_open, + .release = rpmsg_eptdev_release, + .read = rpmsg_eptdev_read, + .write = rpmsg_eptdev_write, + .poll = rpmsg_eptdev_poll, + .unlocked_ioctl = rpmsg_eptdev_ioctl, +}; + +static ssize_t name_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct rpmsg_eptdev *eptdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", eptdev->chinfo.name); +} +static DEVICE_ATTR_RO(name); + +static ssize_t src_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct rpmsg_eptdev *eptdev = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", eptdev->chinfo.src); +} +static DEVICE_ATTR_RO(src); + +static ssize_t dst_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct rpmsg_eptdev *eptdev = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", eptdev->chinfo.dst); +} +static DEVICE_ATTR_RO(dst); + +static struct attribute *rpmsg_eptdev_attrs[] = { + &dev_attr_name.attr, + &dev_attr_src.attr, + &dev_attr_dst.attr, + NULL +}; +ATTRIBUTE_GROUPS(rpmsg_eptdev); + +static void rpmsg_eptdev_release_device(struct device *dev) +{ + struct rpmsg_eptdev *eptdev = dev_to_eptdev(dev); + + ida_simple_remove(&rpmsg_ept_ida, dev->id); + ida_simple_remove(&rpmsg_minor_ida, MINOR(eptdev->dev.devt)); + cdev_del(&eptdev->cdev); + kfree(eptdev); +} + +static int rpmsg_eptdev_create(struct rpmsg_ctrldev *ctrldev, + struct rpmsg_channel_info chinfo) +{ + struct rpmsg_device *rpdev = ctrldev->rpdev; + struct rpmsg_eptdev *eptdev; + struct device *dev; + int ret; + + eptdev = kzalloc(sizeof(*eptdev), GFP_KERNEL); + if (!eptdev) + return -ENOMEM; + + dev = &eptdev->dev; + eptdev->rpdev = rpdev; + eptdev->chinfo = chinfo; + + mutex_init(&eptdev->ept_lock); + spin_lock_init(&eptdev->queue_lock); + skb_queue_head_init(&eptdev->queue); + init_waitqueue_head(&eptdev->readq); + + device_initialize(dev); + dev->class = rpmsg_class; + dev->parent = &ctrldev->dev; + dev->groups = rpmsg_eptdev_groups; + dev_set_drvdata(dev, eptdev); + + cdev_init(&eptdev->cdev, &rpmsg_eptdev_fops); + eptdev->cdev.owner = THIS_MODULE; + + ret = ida_simple_get(&rpmsg_minor_ida, 0, RPMSG_DEV_MAX, GFP_KERNEL); + if (ret < 0) + goto free_eptdev; + dev->devt = MKDEV(MAJOR(rpmsg_major), ret); + + ret = ida_simple_get(&rpmsg_ept_ida, 0, 0, GFP_KERNEL); + if (ret < 0) + goto free_minor_ida; + dev->id = ret; + dev_set_name(dev, "rpmsg%d", ret); + + ret = cdev_add(&eptdev->cdev, dev->devt, 1); + if (ret) + goto free_ept_ida; + + /* We can now rely on the release function for cleanup */ + dev->release = rpmsg_eptdev_release_device; + + ret = device_add(dev); + if (ret) { + dev_err(dev, "device_register failed: %d\n", ret); + put_device(dev); + } + + return ret; + +free_ept_ida: + ida_simple_remove(&rpmsg_ept_ida, dev->id); 
+free_minor_ida: + ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); +free_eptdev: + put_device(dev); + kfree(eptdev); + + return ret; +} + +static int rpmsg_ctrldev_open(struct inode *inode, struct file *filp) +{ + struct rpmsg_ctrldev *ctrldev = cdev_to_ctrldev(inode->i_cdev); + + get_device(&ctrldev->dev); + filp->private_data = ctrldev; + + return 0; +} + +static int rpmsg_ctrldev_release(struct inode *inode, struct file *filp) +{ + struct rpmsg_ctrldev *ctrldev = cdev_to_ctrldev(inode->i_cdev); + + put_device(&ctrldev->dev); + + return 0; +} + +static long rpmsg_ctrldev_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + struct rpmsg_ctrldev *ctrldev = fp->private_data; + void __user *argp = (void __user *)arg; + struct rpmsg_endpoint_info eptinfo; + struct rpmsg_channel_info chinfo; + + if (cmd != RPMSG_CREATE_EPT_IOCTL) + return -EINVAL; + + if (copy_from_user(&eptinfo, argp, sizeof(eptinfo))) + return -EFAULT; + + memcpy(chinfo.name, eptinfo.name, RPMSG_NAME_SIZE); + chinfo.name[RPMSG_NAME_SIZE-1] = '\0'; + chinfo.src = eptinfo.src; + chinfo.dst = eptinfo.dst; + + return rpmsg_eptdev_create(ctrldev, chinfo); +}; + +static const struct file_operations rpmsg_ctrldev_fops = { + .owner = THIS_MODULE, + .open = rpmsg_ctrldev_open, + .release = rpmsg_ctrldev_release, + .unlocked_ioctl = rpmsg_ctrldev_ioctl, +}; + +static void rpmsg_ctrldev_release_device(struct device *dev) +{ + struct rpmsg_ctrldev *ctrldev = dev_to_ctrldev(dev); + + ida_simple_remove(&rpmsg_ctrl_ida, dev->id); + ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); + cdev_del(&ctrldev->cdev); + kfree(ctrldev); +} + +static int rpmsg_chrdev_probe(struct rpmsg_device *rpdev) +{ + struct rpmsg_ctrldev *ctrldev; + struct device *dev; + int ret; + + ctrldev = kzalloc(sizeof(*ctrldev), GFP_KERNEL); + if (!ctrldev) + return -ENOMEM; + + ctrldev->rpdev = rpdev; + + dev = &ctrldev->dev; + device_initialize(dev); + dev->parent = &rpdev->dev; + dev->class = rpmsg_class; + + cdev_init(&ctrldev->cdev, &rpmsg_ctrldev_fops); + ctrldev->cdev.owner = THIS_MODULE; + + ret = ida_simple_get(&rpmsg_minor_ida, 0, RPMSG_DEV_MAX, GFP_KERNEL); + if (ret < 0) + goto free_ctrldev; + dev->devt = MKDEV(MAJOR(rpmsg_major), ret); + + ret = ida_simple_get(&rpmsg_ctrl_ida, 0, 0, GFP_KERNEL); + if (ret < 0) + goto free_minor_ida; + dev->id = ret; + dev_set_name(&ctrldev->dev, "rpmsg_ctrl%d", ret); + + ret = cdev_add(&ctrldev->cdev, dev->devt, 1); + if (ret) + goto free_ctrl_ida; + + /* We can now rely on the release function for cleanup */ + dev->release = rpmsg_ctrldev_release_device; + + ret = device_add(dev); + if (ret) { + dev_err(&rpdev->dev, "device_register failed: %d\n", ret); + put_device(dev); + } + + dev_set_drvdata(&rpdev->dev, ctrldev); + + return ret; + +free_ctrl_ida: + ida_simple_remove(&rpmsg_ctrl_ida, dev->id); +free_minor_ida: + ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); +free_ctrldev: + put_device(dev); + kfree(ctrldev); + + return ret; +} + +static void rpmsg_chrdev_remove(struct rpmsg_device *rpdev) +{ + struct rpmsg_ctrldev *ctrldev = dev_get_drvdata(&rpdev->dev); + int ret; + + /* Destroy all endpoints */ + ret = device_for_each_child(&ctrldev->dev, NULL, rpmsg_eptdev_destroy); + if (ret) + dev_warn(&rpdev->dev, "failed to nuke endpoints: %d\n", ret); + + device_del(&ctrldev->dev); + put_device(&ctrldev->dev); +} + +static struct rpmsg_driver rpmsg_chrdev_driver = { + .probe = rpmsg_chrdev_probe, + .remove = rpmsg_chrdev_remove, + .drv = { + .name = "rpmsg_chrdev", + }, +}; + +static int 
rpmsg_char_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&rpmsg_major, 0, RPMSG_DEV_MAX, "rpmsg"); + if (ret < 0) { + pr_err("rpmsg: failed to allocate char dev region\n"); + return ret; + } + + rpmsg_class = class_create(THIS_MODULE, "rpmsg"); + if (IS_ERR(rpmsg_class)) { + pr_err("failed to create rpmsg class\n"); + unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); + return PTR_ERR(rpmsg_class); + } + + ret = register_rpmsg_driver(&rpmsg_chrdev_driver); + if (ret < 0) { + pr_err("rpmsgchr: failed to register rpmsg driver\n"); + class_destroy(rpmsg_class); + unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); + } + + return ret; +} +postcore_initcall(rpmsg_char_init); + +static void rpmsg_chrdev_exit(void) +{ + unregister_rpmsg_driver(&rpmsg_chrdev_driver); + class_destroy(rpmsg_class); + unregister_chrdev_region(rpmsg_major, RPMSG_DEV_MAX); +} +module_exit(rpmsg_chrdev_exit); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/rpmsg/rpmsg_internal.h b/drivers/rpmsg/rpmsg_internal.h index 6176f2457b6b..0cf9c7e2ee83 100644 --- a/drivers/rpmsg/rpmsg_internal.h +++ b/drivers/rpmsg/rpmsg_internal.h @@ -82,4 +82,19 @@ int rpmsg_unregister_device(struct device *parent, struct device *rpmsg_find_device(struct device *parent, struct rpmsg_channel_info *chinfo); +/** + * rpmsg_chrdev_register_device() - register chrdev device based on rpdev + * @rpdev: prepared rpdev to be used for creating endpoints + * + * This function wraps rpmsg_register_device() preparing the rpdev for use as + * basis for the rpmsg chrdev. + */ +static inline int rpmsg_chrdev_register_device(struct rpmsg_device *rpdev) +{ + strcpy(rpdev->id.name, "rpmsg_chrdev"); + rpdev->driver_override = "rpmsg_chrdev"; + + return rpmsg_register_device(rpdev); +} + #endif diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h new file mode 100644 index 000000000000..dedc226e0d3f --- /dev/null +++ b/include/uapi/linux/rpmsg.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016, Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _UAPI_RPMSG_H_ +#define _UAPI_RPMSG_H_ + +#include +#include + +/** + * struct rpmsg_endpoint_info - endpoint info representation + * @name: name of service + * @src: local address + * @dst: destination address + */ +struct rpmsg_endpoint_info { + char name[32]; + __u32 src; + __u32 dst; +}; + +#define RPMSG_CREATE_EPT_IOCTL _IOW(0xb5, 0x1, struct rpmsg_endpoint_info) +#define RPMSG_DESTROY_EPT_IOCTL _IO(0xb5, 0x2) + +#endif -- cgit v1.2.3 From 92c82e8a322b32a6cabe7d8800dc10401157a623 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 13 Jan 2017 03:26:29 -0500 Subject: audit: add feature audit_lost reset Add a method to reset the audit_lost value. An AUDIT_SET message with the AUDIT_STATUS_LOST flag set by itself will return a positive value representing the current audit_lost value and reset the counter to zero. If AUDIT_STATUS_LOST is not the only flag set, the reset command will be ignored. The value sent with the command is ignored. The return value will be the positive lost value at reset time.
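For illustration, such a reset request could be issued from user space roughly as follows (a minimal sketch using the usual audit netlink conventions; CAP_AUDIT_CONTROL is required, and reading back the NLMSG_ERROR acknowledgement, whose error field should carry the positive pre-reset count, is elided for brevity):

    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/audit.h>

    int main(void)
    {
            struct {
                    struct nlmsghdr nlh;
                    struct audit_status s;
            } req;
            struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

            if (fd < 0)
                    return 1;

            memset(&req, 0, sizeof(req));
            req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.s));
            req.nlh.nlmsg_type = AUDIT_SET;
            req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
            req.s.mask = AUDIT_STATUS_LOST; /* no other flags: reset request */

            /* destination pid 0 addresses the kernel */
            if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
                       (struct sockaddr *)&addr, sizeof(addr)) < 0)
                    return 1;
            close(fd);
            return 0;
    }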
An AUDIT_CONFIG_CHANGE message will be queued to the listening audit daemon. The message will be a standard CONFIG_CHANGE message with the fields "lost=0" and "old=" with the latter containing the value of audit_lost at reset time. See: https://github.com/linux-audit/audit-kernel/issues/3 Signed-off-by: Richard Guy Briggs Acked-by: Steve Grubb Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 6 +++++- kernel/audit.c | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index c8dc97bc2c1b..3f24110ae63c 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -326,15 +326,19 @@ enum { #define AUDIT_STATUS_RATE_LIMIT 0x0008 #define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 #define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 +#define AUDIT_STATUS_LOST 0x0040 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 #define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH 0x00000004 #define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER 0x00000010 +#define AUDIT_FEATURE_BITMAP_LOST_RESET 0x00000020 + #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \ AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \ AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH | \ - AUDIT_FEATURE_BITMAP_SESSIONID_FILTER) + AUDIT_FEATURE_BITMAP_SESSIONID_FILTER | \ + AUDIT_FEATURE_BITMAP_LOST_RESET) /* deprecated: AUDIT_VERSION_* */ #define AUDIT_VERSION_LATEST AUDIT_FEATURE_BITMAP_ALL diff --git a/kernel/audit.c b/kernel/audit.c index 57acf2541fdd..25dd70a588b2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -121,7 +121,7 @@ u32 audit_sig_sid = 0; 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ -static atomic_t audit_lost = ATOMIC_INIT(0); +static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; @@ -1052,6 +1052,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) return err; } + if (s.mask == AUDIT_STATUS_LOST) { + u32 lost = atomic_xchg(&audit_lost, 0); + + audit_log_config_change("lost", 0, lost, 1); + return lost; + } break; } case AUDIT_GET_FEATURE: -- cgit v1.2.3 From 9fb657aec0e20b4ed4401c44a4140f8d7b7a9ca0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 18 Jan 2017 00:44:46 +0800 Subject: sctp: add sockopt SCTP_ENABLE_STREAM_RESET This patch is to add sockopt SCTP_ENABLE_STREAM_RESET to get/set strreset_enable to indicate which reconf request type it supports, which is described in rfc6525 section 6.3.1. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 +++ include/uapi/linux/sctp.h | 7 ++++ net/sctp/associola.c | 1 + net/sctp/socket.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+) (limited to 'include/uapi') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ee037ef15d65..d99b76e33b2e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1257,6 +1257,8 @@ struct sctp_endpoint { __u8 auth_enable:1, prsctp_enable:1, reconf_enable:1; + + __u8 strreset_enable; }; /* Recover the outter endpoint structure. 
*/ @@ -1872,6 +1874,8 @@ struct sctp_association { prsctp_enable:1, reconf_enable:1; + + __u8 strreset_enable; + __u32 strreset_outseq; /* Update after receiving response */ __u32 strreset_inseq; /* Update after receiving request */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a406adcc0793..867be0f32fd7 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -115,6 +115,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_PR_SUPPORTED 113 #define SCTP_DEFAULT_PRINFO 114 #define SCTP_PR_ASSOC_STATUS 115 +#define SCTP_ENABLE_STREAM_RESET 118 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -138,6 +139,12 @@ typedef __s32 sctp_assoc_t; #define SCTP_PR_RTX_ENABLED(x) (SCTP_PR_POLICY(x) == SCTP_PR_SCTP_RTX) #define SCTP_PR_PRIO_ENABLED(x) (SCTP_PR_POLICY(x) == SCTP_PR_SCTP_PRIO) +/* For enable stream reset */ +#define SCTP_ENABLE_RESET_STREAM_REQ 0x01 +#define SCTP_ENABLE_RESET_ASSOC_REQ 0x02 +#define SCTP_ENABLE_CHANGE_ASSOC_REQ 0x04 +#define SCTP_ENABLE_STRRESET_MASK 0x07 + /* These are bit fields for msghdr->msg_flags. See section 5.1. */ /* On user space Linux, these live in as an enum. */ enum sctp_msg_flags { diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 68b99adc21a3..e50dc6d7543f 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -271,6 +271,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->active_key_id = ep->active_key_id; asoc->prsctp_enable = ep->prsctp_enable; asoc->reconf_enable = ep->reconf_enable; + asoc->strreset_enable = ep->strreset_enable; /* Save the hmacs and chunks list into this association */ if (ep->auth_hmacs_list) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 635e03412693..0a9bc984b6c8 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3750,6 +3750,42 @@ out: return retval; } +static int sctp_setsockopt_enable_strreset(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + asoc->strreset_enable = params.assoc_value; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + sp->ep->strreset_enable = params.assoc_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3916,6 +3952,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_DEFAULT_PRINFO: retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); break; + case SCTP_ENABLE_STREAM_RESET: + retval = sctp_setsockopt_enable_strreset(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6400,6 +6439,47 @@ out: return retval; } +static int sctp_getsockopt_enable_strreset(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->strreset_enable; + } else if (!params.assoc_id) { + struct
sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->ep->strreset_enable; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &params, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6567,6 +6647,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, optlen); break; + case SCTP_ENABLE_STREAM_RESET: + retval = sctp_getsockopt_enable_strreset(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From 7f9d68ac944e24ee5f9ac8d059ca00b1c1d34137 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 18 Jan 2017 00:44:47 +0800 Subject: sctp: implement sender-side procedures for SSN Reset Request Parameter This patch is to implement sender-side procedures for the Outgoing and Incoming SSN Reset Request Parameter described in rfc6525 section 5.1.2 and 5.1.3. It also adds the sockopt SCTP_RESET_STREAMS from rfc6525 section 6.3.2 for users. Note that the new asoc member strreset_outstanding is to make sure only one reconf request chunk is on the fly, as rfc6525 section 5.1.1 demands. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 6 ++++ include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 11 +++++++ net/sctp/outqueue.c | 33 +++++++++++++------ net/sctp/socket.c | 29 +++++++++++++++++ net/sctp/stream.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 149 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index bc0e049b1474..3cfd365bcfbc 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -193,6 +193,12 @@ void sctp_remaddr_proc_exit(struct net *net); */ int sctp_offload_init(void); +/* + * sctp/stream.c + */ +int sctp_send_reset_streams(struct sctp_association *asoc, + struct sctp_reset_streams *params); + /* * Module global variables */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d99b76e33b2e..231fa9ac50bd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1875,6 +1875,7 @@ struct sctp_association { reconf_enable:1; __u8 strreset_enable; + __u8 strreset_outstanding; /* request param count on the fly */ __u32 strreset_outseq; /* Update after receiving response */ __u32 strreset_inseq; /* Update after receiving request */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 867be0f32fd7..03c27cefffb1 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -116,6 +116,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_DEFAULT_PRINFO 114 #define SCTP_PR_ASSOC_STATUS 115 #define SCTP_ENABLE_STREAM_RESET 118 +#define SCTP_RESET_STREAMS 119 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -145,6 +146,9 @@ typedef __s32 sctp_assoc_t; #define SCTP_ENABLE_CHANGE_ASSOC_REQ 0x04 #define SCTP_ENABLE_STRRESET_MASK 0x07 +#define SCTP_STREAM_RESET_INCOMING 0x01 +#define SCTP_STREAM_RESET_OUTGOING 0x02 + /* These are bit fields for msghdr->msg_flags. See section 5.1. */ /* On user space Linux, these live in as an enum.
*/ enum sctp_msg_flags { @@ -1015,4 +1019,11 @@ struct sctp_info { __u32 __reserved3; }; +struct sctp_reset_streams { + sctp_assoc_t srs_assoc_id; + uint16_t srs_flags; + uint16_t srs_number_streams; /* 0 == ALL */ + uint16_t srs_stream_list[]; /* list if srs_num_streams is not 0 */ +}; + #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 34efaa4ef2f6..65abe22d8691 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -915,22 +915,28 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: + case SCTP_CID_RECONF: status = sctp_packet_transmit_chunk(packet, chunk, one_packet, gfp); if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &q->control_chunk_list); - } else { - asoc->stats.octrlchunks++; - /* PR-SCTP C5) If a FORWARD TSN is sent, the - * sender MUST assure that at least one T3-rtx - * timer is running. - */ - if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { - sctp_transport_reset_t3_rtx(transport); - transport->last_time_sent = jiffies; - } + break; + } + + asoc->stats.octrlchunks++; + /* PR-SCTP C5) If a FORWARD TSN is sent, the + * sender MUST assure that at least one T3-rtx + * timer is running. + */ + if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { + sctp_transport_reset_t3_rtx(transport); + transport->last_time_sent = jiffies; } + + if (chunk == asoc->strreset_chunk) + sctp_transport_reset_reconf_timer(transport); + break; default: @@ -1016,6 +1022,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) /* Finally, transmit new packets. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + __u32 sid = ntohs(chunk->subh.data_hdr->stream); + /* RFC 2960 6.5 Every DATA chunk MUST carry a valid * stream identifier. */ @@ -1038,6 +1046,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) continue; } + if (asoc->stream->out[sid].state == SCTP_STREAM_CLOSED) { + sctp_outq_head_data(q, chunk); + goto sctp_flush_out; + } + /* If there is a specified transport, use it. * Otherwise, we want to use the active path. 
*/ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0a9bc984b6c8..bee4dd3feabb 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3786,6 +3786,32 @@ out: return retval; } +static int sctp_setsockopt_reset_streams(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_reset_streams *params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen < sizeof(struct sctp_reset_streams)) + return -EINVAL; + + params = memdup_user(optval, optlen); + if (IS_ERR(params)) + return PTR_ERR(params); + + asoc = sctp_id2assoc(sk, params->srs_assoc_id); + if (!asoc) + goto out; + + retval = sctp_send_reset_streams(asoc, params); + +out: + kfree(params); + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3955,6 +3981,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ENABLE_STREAM_RESET: retval = sctp_setsockopt_enable_strreset(sk, optval, optlen); break; + case SCTP_RESET_STREAMS: + retval = sctp_setsockopt_reset_streams(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index f86de43cbbe5..13d5e07dcd7d 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -33,6 +33,7 @@ */ #include +#include struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp) { @@ -83,3 +84,81 @@ void sctp_stream_clear(struct sctp_stream *stream) for (i = 0; i < stream->incnt; i++) stream->in[i].ssn = 0; } + +static int sctp_send_reconf(struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + struct net *net = sock_net(asoc->base.sk); + int retval = 0; + + retval = sctp_primitive_RECONF(net, asoc, chunk); + if (retval) + sctp_chunk_free(chunk); + + return retval; +} + +int sctp_send_reset_streams(struct sctp_association *asoc, + struct sctp_reset_streams *params) +{ + struct sctp_stream *stream = asoc->stream; + __u16 i, str_nums, *str_list; + struct sctp_chunk *chunk; + int retval = -EINVAL; + bool out, in; + + if (!asoc->peer.reconf_capable || + !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) { + retval = -ENOPROTOOPT; + goto out; + } + + if (asoc->strreset_outstanding) { + retval = -EINPROGRESS; + goto out; + } + + out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING; + in = params->srs_flags & SCTP_STREAM_RESET_INCOMING; + if (!out && !in) + goto out; + + str_nums = params->srs_number_streams; + str_list = params->srs_stream_list; + if (out && str_nums) + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->outcnt) + goto out; + + if (in && str_nums) + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->incnt) + goto out; + + chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in); + if (!chunk) + goto out; + + if (out) { + if (str_nums) + for (i = 0; i < str_nums; i++) + stream->out[str_list[i]].state = + SCTP_STREAM_CLOSED; + else + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_CLOSED; + } + + asoc->strreset_outstanding = out + in; + asoc->strreset_chunk = chunk; + sctp_chunk_hold(asoc->strreset_chunk); + + retval = sctp_send_reconf(asoc, chunk); + if (retval) { + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + } + +out: + return retval; +} -- cgit v1.2.3 From a5e8c07059d0f0b31737408711d44794928ac218 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 18 Jan 2017 17:55:49 +0000 Subject: bpf: add bpf_probe_read_str helper Provide a simple helper with the same semantics of 
strncpy_from_unsafe(): int bpf_probe_read_str(void *dst, int size, const void *unsafe_addr) This gives more flexibility to a bpf program. A typical use case is intercepting a file name during sys_open(). The current approach is: SEC("kprobe/sys_open") void bpf_sys_open(struct pt_regs *ctx) { char buf[PATHLEN]; // PATHLEN is defined to 256 bpf_probe_read(buf, sizeof(buf), ctx->di); /* consume buf */ } This is suboptimal because the size of the string needs to be estimated at compile time, causing more memory to be copied than often necessary, and can become more problematic if further processing on buf is done, for example by pushing it to userspace via bpf_perf_event_output(), since the real length of the string is unknown and the entire buffer must be copied (and defining an unrolled strnlen() inside the bpf program is a very inefficient and unfeasible approach). With the new helper, the code can easily operate on the actual string length rather than the buffer size: SEC("kprobe/sys_open") void bpf_sys_open(struct pt_regs *ctx) { char buf[PATHLEN]; // PATHLEN is defined to 256 int res = bpf_probe_read_str(buf, sizeof(buf), ctx->di); /* consume buf, for example push it to userspace via * bpf_perf_event_output(), but this time we can use * res (the string length) as event size, after checking * its boundaries. */ } Another useful use case is when parsing individual process arguments or individual environment variables navigating current->mm->arg_start and current->mm->env_start: using this helper and the return value, one can quickly iterate at the right offset of the memory area. The code changes simply leverage the already existent strncpy_from_unsafe() kernel function, which is safe to be called from a bpf program as it is used in bpf_trace_printk(). Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 15 ++++++++++++++- kernel/trace/bpf_trace.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eb0e87dbe9f..54a5894bb4ea 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -430,6 +430,18 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: An positive/negative integer to be added to xdp_md.data * Return: 0 on success or negative on error + * + * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * Copy a NUL terminated string from unsafe address. In case the string + * length is smaller than size, the target is not padded with further NUL + * bytes. In case the string length is larger than size, just count-1 + * bytes are copied and the last byte is set to NUL. 
+ @dst: destination address + @size: maximum number of bytes to copy, including the trailing NUL + @unsafe_ptr: unsafe address + Return: + > 0 length of the string including the trailing NUL on success + < 0 error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -476,7 +488,8 @@ union bpf_attr { FN(set_hash_invalid), \ FN(get_numa_node_id), \ FN(skb_change_head), \ - FN(xdp_adjust_head), + FN(xdp_adjust_head), \ + FN(probe_read_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c22a961d1a42..424daa4586d1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + int ret; + + /* + * The strncpy_from_unsafe() call will likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = strncpy_from_unsafe(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_str_proto = { + .func = bpf_probe_read_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_str: + return &bpf_probe_read_str_proto; default: return NULL; } -- cgit v1.2.3 From 01fd12bb189a0772301dd37e9b31e53761269a1b Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 18 Jan 2017 13:50:53 -0500 Subject: tipc: make replicast a user selectable option If the bearer carrying multicast messages supports broadcast, those messages will be sent to all cluster nodes, irrespective of whether these nodes host any actual destination sockets or not. This is clearly wasteful if the cluster is large and there are only a few real destinations for the message being sent. In this commit we extend the eligibility of the newly introduced "replicast" transmit option. We now make it possible for a user to select which method he wants to be used, either as a mandatory setting via setsockopt(), or as a relative setting where we let the broadcast layer decide which method to use based on the ratio between cluster size and the message's actual number of destination nodes. In the latter case, a sending socket must stick to a previously selected method until it enters an idle period of at least 5 seconds. This eliminates the risk of message reordering caused by method change, i.e., when changes to cluster size or number of destinations would otherwise mandate a new method to be used. Reviewed-by: Parthasarathy Bhuvaragan Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S.
Miller --- include/uapi/linux/tipc.h | 6 +++-- net/tipc/bcast.c | 62 ++++++++++++++++++++++++++++++++++++++++++----- net/tipc/bcast.h | 17 ++++++++++++- net/tipc/link.c | 4 +++ net/tipc/node.h | 4 ++- net/tipc/socket.c | 36 +++++++++++++++++++++------ 6 files changed, 112 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index bf049e8fe31b..5351b08c897a 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -1,7 +1,7 @@ /* * include/uapi/linux/tipc.h: Header for TIPC socket interface * - * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2003-2006, 2015-2016 Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -220,7 +220,7 @@ struct sockaddr_tipc { #define TIPC_DESTNAME 3 /* destination name */ /* - * TIPC-specific socket option values + * TIPC-specific socket option names */ #define TIPC_IMPORTANCE 127 /* Default: TIPC_LOW_IMPORTANCE */ @@ -229,6 +229,8 @@ struct sockaddr_tipc { #define TIPC_CONN_TIMEOUT 130 /* Default: 8000 (ms) */ #define TIPC_NODE_RECVQ_DEPTH 131 /* Default: none (read only) */ #define TIPC_SOCK_RECVQ_DEPTH 132 /* Default: none (read only) */ +#define TIPC_MCAST_BROADCAST 133 /* Default: TIPC selects. No arg */ +#define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ /* * Maximum sizes of TIPC bearer-related names (including terminating NULL) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 672e6ef93cab..7d99029df342 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -54,6 +54,9 @@ const char tipc_bclink_name[] = "broadcast-link"; * @dest: array keeping number of reachable destinations per bearer * @primary_bearer: a bearer having links to all broadcast destinations, if any * @bcast_support: indicates if primary bearer, if any, supports broadcast + * @rcast_support: indicates if all peer nodes support replicast + * @rc_ratio: dest count as percentage of cluster size where send method changes + * @bc_threshold: calculated drom rc_ratio; if dests > threshold use broadcast */ struct tipc_bc_base { struct tipc_link *link; @@ -61,6 +64,9 @@ struct tipc_bc_base { int dests[MAX_BEARERS]; int primary_bearer; bool bcast_support; + bool rcast_support; + int rc_ratio; + int bc_threshold; }; static struct tipc_bc_base *tipc_bc_base(struct net *net) @@ -73,6 +79,19 @@ int tipc_bcast_get_mtu(struct net *net) return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE; } +void tipc_bcast_disable_rcast(struct net *net) +{ + tipc_bc_base(net)->rcast_support = false; +} + +static void tipc_bcbase_calc_bc_threshold(struct net *net) +{ + struct tipc_bc_base *bb = tipc_bc_base(net); + int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net)); + + bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100); +} + /* tipc_bcbase_select_primary(): find a bearer with links to all destinations, * if any, and make it primary bearer */ @@ -175,6 +194,31 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq) __skb_queue_purge(&_xmitq); } +static void tipc_bcast_select_xmit_method(struct net *net, int dests, + struct tipc_mc_method *method) +{ + struct tipc_bc_base *bb = tipc_bc_base(net); + unsigned long exp = method->expires; + + /* Broadcast supported by used bearer/bearers? */ + if (!bb->bcast_support) { + method->rcast = true; + return; + } + /* Any destinations which don't support replicast ? */ + if (!bb->rcast_support) { + method->rcast = false; + return; + } + /* Can current method be changed ? 
*/ + method->expires = jiffies + TIPC_METHOD_EXPIRE; + if (method->mandatory || time_before(jiffies, exp)) + return; + + /* Determine method to use now */ + method->rcast = dests <= bb->bc_threshold; +} + /* tipc_bcast_xmit - broadcast the buffer chain to all external nodes * @net: the applicable net namespace * @pkts: chain of buffers containing message @@ -237,16 +281,16 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, * and to identified node local sockets * @net: the applicable net namespace * @pkts: chain of buffers containing message - * @dests: destination nodes for message. Not consumed. + * @method: send method to be used + * @dests: destination nodes for message. * @cong_link_cnt: returns number of encountered congested destination links - * @cong_links: returns identities of congested links * Consumes buffer chain. * Returns 0 if success, otherwise errno */ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, - struct tipc_nlist *dests, u16 *cong_link_cnt) + struct tipc_mc_method *method, struct tipc_nlist *dests, + u16 *cong_link_cnt) { - struct tipc_bc_base *bb = tipc_bc_base(net); struct sk_buff_head inputq, localq; int rc = 0; @@ -258,9 +302,10 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, rc = -ENOMEM; goto exit; } - + /* Send according to determined transmit method */ if (dests->remote) { - if (!bb->bcast_support) + tipc_bcast_select_xmit_method(net, dests->remote, method); + if (method->rcast) rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); else rc = tipc_bcast_xmit(net, pkts, cong_link_cnt); @@ -269,6 +314,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, if (dests->local) tipc_sk_mcast_rcv(net, &localq, &inputq); exit: + /* This queue should normally be empty by now */ __skb_queue_purge(pkts); return rc; } @@ -377,6 +423,7 @@ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l, tipc_bcast_lock(net); tipc_link_add_bc_peer(snd_l, uc_l, xmitq); tipc_bcbase_select_primary(net); + tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); } @@ -395,6 +442,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) tipc_bcast_lock(net); tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq); tipc_bcbase_select_primary(net); + tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); @@ -477,6 +525,8 @@ int tipc_bcast_init(struct net *net) goto enomem; bb->link = l; tn->bcl = l; + bb->rc_ratio = 25; + bb->rcast_support = true; return 0; enomem: kfree(bb); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index dd772e6f6fa4..751530ab0c49 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -46,6 +46,8 @@ struct tipc_nlist; struct tipc_nitem; extern const char tipc_bclink_name[]; +#define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000) + struct tipc_nlist { struct list_head list; u32 self; @@ -58,6 +60,17 @@ void tipc_nlist_purge(struct tipc_nlist *nl); void tipc_nlist_add(struct tipc_nlist *nl, u32 node); void tipc_nlist_del(struct tipc_nlist *nl, u32 node); +/* Cookie to be used between socket and broadcast layer + * @rcast: replicast (instead of broadcast) was used at previous xmit + * @mandatory: broadcast/replicast indication was set by user + * @expires: re-evaluate non-mandatory transmit method if we are past this + */ +struct tipc_mc_method { + bool rcast; + bool mandatory; + unsigned long expires; +}; + int tipc_bcast_init(struct net *net); void tipc_bcast_stop(struct net *net); void tipc_bcast_add_peer(struct net *net, struct tipc_link 
*l, @@ -66,8 +79,10 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); int tipc_bcast_get_mtu(struct net *net); +void tipc_bcast_disable_rcast(struct net *net); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, - struct tipc_nlist *dests, u16 *cong_link_cnt); + struct tipc_mc_method *method, struct tipc_nlist *dests, + u16 *cong_link_cnt); int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr); diff --git a/net/tipc/link.c b/net/tipc/link.c index b17b9e155469..ddd2dd6f77aa 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -515,6 +515,10 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, if (link_is_bc_sndlink(l)) l->state = LINK_ESTABLISHED; + /* Disable replicast if even a single peer doesn't support it */ + if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST)) + tipc_bcast_disable_rcast(net); + return true; } diff --git a/net/tipc/node.h b/net/tipc/node.h index 39ef54c1f2ad..898c22916984 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -47,11 +47,13 @@ enum { TIPC_BCAST_SYNCH = (1 << 1), TIPC_BCAST_STATE_NACK = (1 << 2), - TIPC_BLOCK_FLOWCTL = (1 << 3) + TIPC_BLOCK_FLOWCTL = (1 << 3), + TIPC_BCAST_RCAST = (1 << 4) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ TIPC_BCAST_STATE_NACK | \ + TIPC_BCAST_RCAST | \ TIPC_BLOCK_FLOWCTL) #define INVALID_BEARER_ID -1 diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 93b6ae3154c9..5bec8aac5008 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -79,6 +79,7 @@ enum { * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm * @node: hash table node + * @mc_method: cookie for use between socket and broadcast layer * @rcu: rcu struct for tipc_sock */ struct tipc_sock { @@ -103,6 +104,7 @@ struct tipc_sock { u16 rcv_win; struct sockaddr_tipc peer; struct rhash_head node; + struct tipc_mc_method mc_method; struct rcu_head rcu; }; @@ -740,6 +742,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); int mtu = tipc_bcast_get_mtu(net); + struct tipc_mc_method *method = &tsk->mc_method; u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE); struct sk_buff_head pkts; struct tipc_nlist dsts; @@ -773,7 +776,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, /* Send message if build was successful */ if (unlikely(rc == dlen)) - rc = tipc_mcast_xmit(net, &pkts, &dsts, + rc = tipc_mcast_xmit(net, &pkts, method, &dsts, &tsk->cong_link_cnt); tipc_nlist_purge(&dsts); @@ -2344,18 +2347,29 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - u32 value; + u32 value = 0; int res; if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) return 0; if (lvl != SOL_TIPC) return -ENOPROTOOPT; - if (ol < sizeof(value)) - return -EINVAL; - res = get_user(value, (u32 __user *)ov); - if (res) - return res; + + switch (opt) { + case TIPC_IMPORTANCE: + case TIPC_SRC_DROPPABLE: + case TIPC_DEST_DROPPABLE: + case TIPC_CONN_TIMEOUT: + if (ol < sizeof(value)) + return -EINVAL; + res = get_user(value, (u32 __user *)ov); + if (res) + return res; + break; + default: + if (ov || ol) + return -EINVAL; + } 
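+
+	/* Note: TIPC_MCAST_BROADCAST and TIPC_MCAST_REPLICAST carry no option
+	 * value, so the expected call is e.g.
+	 * setsockopt(sd, SOL_TIPC, TIPC_MCAST_REPLICAST, NULL, 0);
+	 * any non-NULL optval or non-zero optlen is rejected above.
+	 */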
lock_sock(sk); @@ -2376,6 +2390,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, tipc_sk(sk)->conn_timeout = value; /* no need to set "res", since already 0 at this point */ break; + case TIPC_MCAST_BROADCAST: + tsk->mc_method.rcast = false; + tsk->mc_method.mandatory = true; + break; + case TIPC_MCAST_REPLICAST: + tsk->mc_method.rcast = true; + tsk->mc_method.mandatory = true; + break; default: res = -EINVAL; } -- cgit v1.2.3 From b95a5c4db09bc7c253636cb84dc9b12c577fd5a0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Sat, 21 Jan 2017 17:26:11 +0100 Subject: bpf: add a longest prefix match trie map implementation This trie implements a longest prefix match algorithm that can be used to match IP addresses to a stored set of ranges. Internally, data is stored in an unbalanced trie of nodes that has a maximum height of n, where n is the prefixlen the trie was created with. Tries may be created with prefix lengths that are multiples of 8, in the range from 8 to 2048. The key used for lookup and update operations is a struct bpf_lpm_trie_key, and the value is a uint64_t. The code carries more information about the internal implementation. Signed-off-by: Daniel Mack Reviewed-by: David Herrmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 + kernel/bpf/Makefile | 2 +- kernel/bpf/lpm_trie.c | 503 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 511 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/lpm_trie.c (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54a5894bb4ea..bd3068485410 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -63,6 +63,12 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -89,6 +95,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1276474ac3cd..e1ce4f4fd7fd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c new file mode 100644 index 000000000000..ba19241d1979 --- /dev/null +++ b/kernel/bpf/lpm_trie.c @@ -0,0 +1,503 @@ +/* + * Longest prefix match list implementation + * + * Copyright (c) 2016,2017 Daniel Mack + * Copyright (c) 2016 David Herrmann + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. 
+ */ + +#include +#include +#include +#include +#include +#include + +/* Intermediate node */ +#define LPM_TREE_NODE_FLAG_IM BIT(0) + +struct lpm_trie_node; + +struct lpm_trie_node { + struct rcu_head rcu; + struct lpm_trie_node __rcu *child[2]; + u32 prefixlen; + u32 flags; + u8 data[0]; +}; + +struct lpm_trie { + struct bpf_map map; + struct lpm_trie_node __rcu *root; + size_t n_entries; + size_t max_prefixlen; + size_t data_size; + raw_spinlock_t lock; +}; + +/* This trie implements a longest prefix match algorithm that can be used to + * match IP addresses to a stored set of ranges. + * + * Data stored in @data of struct bpf_lpm_trie_key and struct lpm_trie_node is + * interpreted as big endian, so data[0] stores the most significant byte. + * + * Match ranges are internally stored in instances of struct lpm_trie_node + * which each contain their prefix length as well as two pointers that may + * lead to more nodes containing more specific matches. Each node also stores + * a value that is defined by and returned to userspace via the update_elem + * and lookup functions. + * + * For instance, let's start with a trie that was created with a prefix length + * of 32, so it can be used for IPv4 addresses, and a single element that + * matches 192.168.0.0/16. The data array would hence contain + * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will + * stick to IP-address notation for readability though. + * + * As the trie is empty initially, the new node (1) will be placed as the root + * node, denoted as (R) in the example below. As there are no other nodes, both + * child pointers are %NULL. + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * + * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already + * a node with the same data and a smaller prefix (i.e., a less specific one), + * node (2) will become a child of (1). Its child index depends on the next bit + * that is outside of what (1) matches, and that bit is 0, so (2) will be + * child[0] of (1): + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | + * +----------------+ + * | (2) | + * | 192.168.0.0/24 | + * | value: 2 | + * | [0] [1] | + * +----------------+ + * + * The child[1] slot of (1) could be filled with another node which has bit #17 + * (the next bit after the ones that (1) matches on) set to 1. For instance, + * 192.168.128.0/24: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (2) | | (3) | + * | 192.168.0.0/24 | | 192.168.128.0/24 | + * | value: 2 | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * + * Let's add another node (4) to the game for 192.168.1.0/24. In order to place + * it, node (1) is looked at first, and because of the semantics laid out + * above (bit #17 of (4) is 0), it would normally be attached to (1) as + * child[0]. However, that slot is already allocated, so a new node is needed + * in between. That node does not have a value attached to it and it will never + * be returned to users as the result of a lookup. It is only there to + * differentiate the traversal further.
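+ * (Such nodes are marked with LPM_TREE_NODE_FLAG_IM.)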
It will get a prefix as wide as necessary to + * distinguish its two children: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (4) (I) | | (3) | + * | 192.168.0.0/23 | | 192.168.128.0/24 | + * | value: --- | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * | | + * +----------------+ +----------------+ + * | (2) | | (5) | + * | 192.168.0.0/24 | | 192.168.1.0/24 | + * | value: 2 | | value: 5 | + * | [0] [1] | | [0] [1] | + * +----------------+ +----------------+ + * + * 192.168.1.1/32 would be a child of (5) etc. + * + * An intermediate node will be turned into a 'real' node on demand. In the + * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. + * + * A fully populated trie would have a height of 32 nodes, as the trie was + * created with a prefix length of 32. + * + * The lookup starts at the root node. If the current node matches and if there + * is a child that can be used to become more specific, the trie is traversed + * downwards. The last node in the traversal that is a non-intermediate one is + * returned. + */ + +static inline int extract_bit(const u8 *data, size_t index) +{ + return !!(data[index / 8] & (1 << (7 - (index % 8)))); +} + +/** + * longest_prefix_match() - determine the longest prefix + * @trie: The trie to get internal sizes from + * @node: The node to operate on + * @key: The key to compare to @node + * + * Determine the longest prefix of @node that matches the bits in @key. + */ +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key *key) +{ + size_t prefixlen = 0; + size_t i; + + for (i = 0; i < trie->data_size; i++) { + size_t b; + + b = 8 - fls(node->data[i] ^ key->data[i]); + prefixlen += b; + + if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) + return min(node->prefixlen, key->prefixlen); + + if (b < 8) + break; + } + + return prefixlen; +} + +/* Called from syscall or from eBPF program */ +static void *trie_lookup_elem(struct bpf_map *map, void *_key) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *found = NULL; + struct bpf_lpm_trie_key *key = _key; + + /* Start walking the trie from the root node ... */ + + for (node = rcu_dereference(trie->root); node;) { + unsigned int next_bit; + size_t matchlen; + + /* Determine the longest prefix of @node that matches @key. + * If it's the maximum possible prefix for this trie, we have + * an exact match and can return it directly. + */ + matchlen = longest_prefix_match(trie, node, key); + if (matchlen == trie->max_prefixlen) { + found = node; + break; + } + + /* If the number of bits that match is smaller than the prefix + * length of @node, bail out and return the node we have seen + * last in the traversal (ie, the parent). + */ + if (matchlen < node->prefixlen) + break; + + /* Consider this node as return candidate unless it is an + * artificially added intermediate one. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + found = node; + + /* If the node match is fully satisfied, let's see if we can + * become more specific. Determine the next bit in the key and + * traverse down. 
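+		 * (With the example trie documented above, a lookup of
+		 * 192.168.1.5 walks (1), then intermediate node (4), then (5),
+		 * and the value of (5) is returned; a lookup of 10.0.0.1
+		 * matches nothing at the root and returns NULL.)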
+ */ + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + + if (!found) + return NULL; + + return found->data + trie->data_size; +} + +static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, + const void *value) +{ + struct lpm_trie_node *node; + size_t size = sizeof(struct lpm_trie_node) + trie->data_size; + + if (value) + size += trie->map.value_size; + + node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + node->flags = 0; + + if (value) + memcpy(node->data + trie->data_size, value, + trie->map.value_size); + + return node; +} + +/* Called from syscall or from eBPF program */ +static int trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *im_node, *new_node = NULL; + struct lpm_trie_node __rcu **slot; + struct bpf_lpm_trie_key *key = _key; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Allocate and fill a new node */ + + if (trie->n_entries == trie->map.max_entries) { + ret = -ENOSPC; + goto out; + } + + new_node = lpm_trie_node_alloc(trie, value); + if (!new_node) { + ret = -ENOMEM; + goto out; + } + + trie->n_entries++; + + new_node->prefixlen = key->prefixlen; + RCU_INIT_POINTER(new_node->child[0], NULL); + RCU_INIT_POINTER(new_node->child[1], NULL); + memcpy(new_node->data, key->data, trie->data_size); + + /* Now find a slot to attach the new node. To do that, walk the tree + * from the root and match as many bits as possible for each node until + * we either find an empty slot or a slot that needs to be replaced by + * an intermediate node. + */ + slot = &trie->root; + + while ((node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)))) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen || + node->prefixlen == trie->max_prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + slot = &node->child[next_bit]; + } + + /* If the slot is empty (a free child pointer or an empty root), + * simply assign the @new_node to that slot and be done. + */ + if (!node) { + rcu_assign_pointer(*slot, new_node); + goto out; + } + + /* If the slot we picked already exists, replace it with @new_node + * which already has the correct data array set. + */ + if (node->prefixlen == matchlen) { + new_node->child[0] = node->child[0]; + new_node->child[1] = node->child[1]; + + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + trie->n_entries--; + + rcu_assign_pointer(*slot, new_node); + kfree_rcu(node, rcu); + + goto out; + } + + /* If the new node matches the prefix completely, it must be inserted + * as an ancestor. Simply insert it between @node and *@slot. 
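+	 * (For example, adding 192.168.0.0/16 to a trie whose only node is
+	 * 192.168.0.0/24 installs the new /16 node as the root, with the old
+	 * /24 node becoming its child[0].)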
+ */ + if (matchlen == key->prefixlen) { + next_bit = extract_bit(node->data, matchlen); + rcu_assign_pointer(new_node->child[next_bit], node); + rcu_assign_pointer(*slot, new_node); + goto out; + } + + im_node = lpm_trie_node_alloc(trie, NULL); + if (!im_node) { + ret = -ENOMEM; + goto out; + } + + im_node->prefixlen = matchlen; + im_node->flags |= LPM_TREE_NODE_FLAG_IM; + memcpy(im_node->data, node->data, trie->data_size); + + /* Now determine which child to install in which slot */ + if (extract_bit(key->data, matchlen)) { + rcu_assign_pointer(im_node->child[0], node); + rcu_assign_pointer(im_node->child[1], new_node); + } else { + rcu_assign_pointer(im_node->child[0], new_node); + rcu_assign_pointer(im_node->child[1], node); + } + + /* Finally, assign the intermediate node to the determined spot */ + rcu_assign_pointer(*slot, im_node); + +out: + if (ret) { + if (new_node) + trie->n_entries--; + + kfree(new_node); + kfree(im_node); + } + + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; +} + +static int trie_delete_elem(struct bpf_map *map, void *key) +{ + /* TODO */ + return -ENOSYS; +} + +static struct bpf_map *trie_alloc(union bpf_attr *attr) +{ + size_t cost, cost_per_node; + struct lpm_trie *trie; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || + attr->map_flags != BPF_F_NO_PREALLOC || + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + if (!trie) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + trie->map.map_type = attr->map_type; + trie->map.key_size = attr->key_size; + trie->map.value_size = attr->value_size; + trie->map.max_entries = attr->max_entries; + trie->data_size = attr->key_size - + offsetof(struct bpf_lpm_trie_key, data); + trie->max_prefixlen = trie->data_size * 8; + + cost_per_node = sizeof(struct lpm_trie_node) + + attr->value_size + trie->data_size; + cost = sizeof(*trie) + attr->max_entries * cost_per_node; + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(trie->map.pages); + if (ret) { + kfree(trie); + return ERR_PTR(ret); + } + + raw_spin_lock_init(&trie->lock); + + return &trie->map; +} + +static void trie_free(struct bpf_map *map) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node __rcu **slot; + struct lpm_trie_node *node; + + raw_spin_lock(&trie->lock); + + /* Always start at the root and walk down to a node that has no + * children. Then free that node, nullify its reference in the parent + * and start over. 
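+	 * Each pass of the outer loop frees exactly one leaf, so the walk
+	 * terminates once the root slot itself has been cleared.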
+ */ + + for (;;) { + slot = &trie->root; + + for (;;) { + node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)); + if (!node) + goto unlock; + + if (rcu_access_pointer(node->child[0])) { + slot = &node->child[0]; + continue; + } + + if (rcu_access_pointer(node->child[1])) { + slot = &node->child[1]; + continue; + } + + kfree(node); + RCU_INIT_POINTER(*slot, NULL); + break; + } + } + +unlock: + raw_spin_unlock(&trie->lock); +} + +static const struct bpf_map_ops trie_ops = { + .map_alloc = trie_alloc, + .map_free = trie_free, + .map_lookup_elem = trie_lookup_elem, + .map_update_elem = trie_update_elem, + .map_delete_elem = trie_delete_elem, +}; + +static struct bpf_map_type_list trie_type __read_mostly = { + .ops = &trie_ops, + .type = BPF_MAP_TYPE_LPM_TRIE, +}; + +static int __init register_trie_map(void) +{ + bpf_register_map_type(&trie_type); + return 0; +} +late_initcall(register_trie_map); -- cgit v1.2.3 From 12a6075cabc0d9ffbc0366b44daa22f278606312 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 Jan 2017 18:52:06 +0100 Subject: can: dev: add CAN interface termination API This patch adds a netlink interface to configure the CAN bus termination of CAN interfaces. Inside the driver an array of supported termination values is defined: const u16 drvname_termination[] = { 60, 120, CAN_TERMINATION_DISABLED }; struct drvname_priv *priv; priv = netdev_priv(dev); priv->termination_const = drvname_termination; priv->termination_const_cnt = ARRAY_SIZE(drvname_termination); priv->termination = CAN_TERMINATION_DISABLED; And the function to set the value has to be defined: priv->do_set_termination = drvname_set_termination; Signed-off-by: Oliver Hartkopp Reviewed-by: Ramesh Shanmugasundaram Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 49 +++++++++++++++++++++++++++++++++++++++- include/linux/can/dev.h | 4 ++++ include/uapi/linux/can/netlink.h | 5 ++++ 3 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 8d6208c0b400..fefe2cd17721 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -958,6 +958,30 @@ static int can_changelink(struct net_device *dev, } } + if (data[IFLA_CAN_TERMINATION]) { + const u16 termval = nla_get_u16(data[IFLA_CAN_TERMINATION]); + const unsigned int num_term = priv->termination_const_cnt; + unsigned int i; + + if (!priv->do_set_termination) + return -EOPNOTSUPP; + + /* check whether given value is supported by the interface */ + for (i = 0; i < num_term; i++) { + if (termval == priv->termination_const[i]) + break; + } + if (i >= num_term) + return -EINVAL; + + /* Finally, set the termination value */ + err = priv->do_set_termination(dev, termval); + if (err) + return err; + + priv->termination = termval; + } + return 0; } @@ -980,6 +1004,11 @@ static size_t can_get_size(const struct net_device *dev) size += nla_total_size(sizeof(struct can_bittiming)); if (priv->data_bittiming_const) /* IFLA_CAN_DATA_BITTIMING_CONST */ size += nla_total_size(sizeof(struct can_bittiming_const)); + if (priv->termination_const) { + size += nla_total_size(sizeof(priv->termination)); /* IFLA_CAN_TERMINATION */ + size += nla_total_size(sizeof(*priv->termination_const) * /* IFLA_CAN_TERMINATION_CONST */ + priv->termination_const_cnt); + } return size; } @@ -1018,7 +1047,15 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) (priv->data_bittiming_const && nla_put(skb, IFLA_CAN_DATA_BITTIMING_CONST,
sizeof(*priv->data_bittiming_const), - priv->data_bittiming_const))) + priv->data_bittiming_const)) || + + (priv->termination_const && + (nla_put_u16(skb, IFLA_CAN_TERMINATION, priv->termination) || + nla_put(skb, IFLA_CAN_TERMINATION_CONST, + sizeof(*priv->termination_const) * + priv->termination_const_cnt, + priv->termination_const)))) + return -EMSGSIZE; return 0; @@ -1073,6 +1110,16 @@ static struct rtnl_link_ops can_link_ops __read_mostly = { */ int register_candev(struct net_device *dev) { + struct can_priv *priv = netdev_priv(dev); + + /* Ensure termination_const, termination_const_cnt and + * do_set_termination consistency. All must be either set or + * unset. + */ + if ((!priv->termination_const != !priv->termination_const_cnt) || + (!priv->termination_const != !priv->do_set_termination)) + return -EINVAL; + dev->rtnl_link_ops = &can_link_ops; return register_netdev(dev); } diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 5f5270941ba0..f6a57f322f00 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -38,6 +38,9 @@ struct can_priv { struct can_bittiming bittiming, data_bittiming; const struct can_bittiming_const *bittiming_const, *data_bittiming_const; + const u16 *termination_const; + unsigned int termination_const_cnt; + u16 termination; struct can_clock clock; enum can_state state; @@ -53,6 +56,7 @@ struct can_priv { int (*do_set_bittiming)(struct net_device *dev); int (*do_set_data_bittiming)(struct net_device *dev); int (*do_set_mode)(struct net_device *dev, enum can_mode mode); + int (*do_set_termination)(struct net_device *dev, u16 term); int (*do_get_state)(const struct net_device *dev, enum can_state *state); int (*do_get_berr_counter)(const struct net_device *dev, diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 94ffe0c83ce7..7414771926fb 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -127,9 +127,14 @@ enum { IFLA_CAN_BERR_COUNTER, IFLA_CAN_DATA_BITTIMING, IFLA_CAN_DATA_BITTIMING_CONST, + IFLA_CAN_TERMINATION, + IFLA_CAN_TERMINATION_CONST, __IFLA_CAN_MAX }; #define IFLA_CAN_MAX (__IFLA_CAN_MAX - 1) +/* u16 termination range: 1..65535 Ohms */ +#define CAN_TERMINATION_DISABLED 0 + #endif /* !_UAPI_CAN_NETLINK_H */ -- cgit v1.2.3 From 431af779256cd6cb8328ac23c5696bae63c33a51 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 11 Jan 2017 17:05:35 +0100 Subject: can: dev: add CAN interface API for fixed bitrates Some CAN interfaces only support fixed bitrates. This patch adds a netlink interface to get the list of the CAN interface's fixed bitrates and data bitrates. Inside the driver, arrays of supported bitrate and data-bitrate values are defined.
const u32 drvname_bitrate[] = { 20000, 50000, 100000 }; const u32 drvname_data_bitrate[] = { 200000, 500000, 1000000 }; struct drvname_priv *priv; priv = netdev_priv(dev); priv->bitrate_const = drvname_bitrate; priv->bitrate_const_cnt = ARRAY_SIZE(drvname_bitrate); priv->data_bitrate_const = drvname_data_bitrate; priv->data_bitrate_const_cnt = ARRAY_SIZE(drvname_data_bitrate); Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 81 ++++++++++++++++++++++++++++++++-------- include/linux/can/dev.h | 4 ++ include/uapi/linux/can/netlink.h | 2 + 3 files changed, 71 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index afcf487382c9..611d16a7061d 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -279,8 +279,29 @@ static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt, return 0; } +/* Checks the validity of predefined bitrate settings */ +static int can_validate_bitrate(struct net_device *dev, struct can_bittiming *bt, + const u32 *bitrate_const, + const unsigned int bitrate_const_cnt) +{ + struct can_priv *priv = netdev_priv(dev); + unsigned int i; + + for (i = 0; i < bitrate_const_cnt; i++) { + if (bt->bitrate == bitrate_const[i]) + break; + } + + if (i >= priv->bitrate_const_cnt) + return -EINVAL; + + return 0; +} + static int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, - const struct can_bittiming_const *btc) + const struct can_bittiming_const *btc, + const u32 *bitrate_const, + const unsigned int bitrate_const_cnt) { int err; @@ -290,10 +311,13 @@ static int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, * alternatively the CAN timing parameters (tq, prop_seg, etc.) are * provided directly which are then checked and fixed up. 
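 * For hardware that only supports a fixed set of bitrates there is now
 * a third path: the requested bitrate is not calculated from timing
 * constants but simply validated against the driver's bitrate_const
 * list.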
*/ - if (!bt->tq && bt->bitrate) + if (!bt->tq && bt->bitrate && btc) err = can_calc_bittiming(dev, bt, btc); - else if (bt->tq && !bt->bitrate) + else if (bt->tq && !bt->bitrate && btc) err = can_fixup_bittiming(dev, bt, btc); + else if (!bt->tq && bt->bitrate && bitrate_const) + err = can_validate_bitrate(dev, bt, bitrate_const, + bitrate_const_cnt); else err = -EINVAL; @@ -878,12 +902,12 @@ static int can_changelink(struct net_device *dev, return -EOPNOTSUPP; memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt)); - if (priv->bittiming_const) { - err = can_get_bittiming(dev, &bt, - priv->bittiming_const); - if (err) - return err; - } + err = can_get_bittiming(dev, &bt, + priv->bittiming_const, + priv->bitrate_const, + priv->bitrate_const_cnt); + if (err) + return err; memcpy(&priv->bittiming, &bt, sizeof(bt)); if (priv->do_set_bittiming) { @@ -962,12 +986,12 @@ static int can_changelink(struct net_device *dev, memcpy(&dbt, nla_data(data[IFLA_CAN_DATA_BITTIMING]), sizeof(dbt)); - if (priv->data_bittiming_const) { - err = can_get_bittiming(dev, &dbt, - priv->data_bittiming_const); - if (err) - return err; - } + err = can_get_bittiming(dev, &dbt, + priv->data_bittiming_const, + priv->data_bitrate_const, + priv->data_bitrate_const_cnt); + if (err) + return err; memcpy(&priv->data_bittiming, &dbt, sizeof(dbt)); if (priv->do_set_data_bittiming) { @@ -1029,6 +1053,12 @@ static size_t can_get_size(const struct net_device *dev) size += nla_total_size(sizeof(*priv->termination_const) * /* IFLA_CAN_TERMINATION_CONST */ priv->termination_const_cnt); } + if (priv->bitrate_const) /* IFLA_CAN_BITRATE_CONST */ + size += nla_total_size(sizeof(*priv->bitrate_const) * + priv->bitrate_const_cnt); + if (priv->data_bitrate_const) /* IFLA_CAN_DATA_BITRATE_CONST */ + size += nla_total_size(sizeof(*priv->data_bitrate_const) * + priv->data_bitrate_const_cnt); return size; } @@ -1074,7 +1104,20 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put(skb, IFLA_CAN_TERMINATION_CONST, sizeof(*priv->termination_const) * priv->termination_const_cnt, - priv->termination_const)))) + priv->termination_const))) || + + (priv->bitrate_const && + nla_put(skb, IFLA_CAN_BITRATE_CONST, + sizeof(*priv->bitrate_const) * + priv->bitrate_const_cnt, + priv->bitrate_const)) || + + (priv->data_bitrate_const && + nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST, + sizeof(*priv->data_bitrate_const) * + priv->data_bitrate_const_cnt, + priv->data_bitrate_const)) + ) return -EMSGSIZE; @@ -1140,6 +1183,12 @@ int register_candev(struct net_device *dev) (!priv->termination_const != !priv->do_set_termination)) return -EINVAL; + if (!priv->bitrate_const != !priv->bitrate_const_cnt) + return -EINVAL; + + if (!priv->data_bitrate_const != !priv->data_bitrate_const_cnt) + return -EINVAL; + dev->rtnl_link_ops = &can_link_ops; return register_netdev(dev); } diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index f6a57f322f00..141b05aade81 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -41,6 +41,10 @@ struct can_priv { const u16 *termination_const; unsigned int termination_const_cnt; u16 termination; + const u32 *bitrate_const; + unsigned int bitrate_const_cnt; + const u32 *data_bitrate_const; + unsigned int data_bitrate_const_cnt; struct can_clock clock; enum can_state state; diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 7414771926fb..fdf75f74fdaf 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -129,6 +129,8 
@@ enum { IFLA_CAN_DATA_BITTIMING_CONST, IFLA_CAN_TERMINATION, IFLA_CAN_TERMINATION_CONST, + IFLA_CAN_BITRATE_CONST, + IFLA_CAN_DATA_BITRATE_CONST, __IFLA_CAN_MAX }; -- cgit v1.2.3 From 6db6f0eae6052b70885562e1733896647ec1d807 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 21 Jan 2017 21:01:32 +0100 Subject: bridge: multicast to unicast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements an optional, per-bridge-port flag and feature to deliver multicast packets to each host on the respective port individually via unicast. This is done by copying the packet per host and changing the multicast destination MAC to a unicast one accordingly. multicast-to-unicast works on top of the multicast snooping feature of the bridge, which means unicast copies are only delivered to hosts that are interested in them and have previously signaled this via IGMP/MLD reports. This feature is intended for interface types which have a more reliable and/or efficient way to deliver unicast packets than broadcast ones (e.g. wifi). However, it should only be enabled on interfaces where no IGMPv2/MLDv1 report suppression takes place. This feature is disabled by default. The initial patch and idea are from Felix Fietkau. Signed-off-by: Felix Fietkau [linus.luessing@c0d3.blue: various bug + style fixes, commit message] Signed-off-by: Linus Lüssing Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 39 ++++++++++++++++++- net/bridge/br_mdb.c | 2 +- net/bridge/br_multicast.c | 90 ++++++++++++++++++++++++++++++++------------ net/bridge/br_netlink.c | 5 +++ net/bridge/br_private.h | 3 +- net/bridge/br_sysfs_if.c | 2 + 8 files changed, 114 insertions(+), 29 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index c6587c01d951..debc9d5904e5 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -46,6 +46,7 @@ struct br_ip_list { #define BR_LEARNING_SYNC BIT(9) #define BR_PROXYARP_WIFI BIT(10) #define BR_MCAST_FLOOD BIT(11) +#define BR_MULTICAST_TO_UNICAST BIT(12) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 184b16ed2b84..b9aa5641ebe5 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -321,6 +321,7 @@ enum { IFLA_BRPORT_MULTICAST_ROUTER, IFLA_BRPORT_PAD, IFLA_BRPORT_MCAST_FLOOD, + IFLA_BRPORT_MCAST_TO_UCAST, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 7cb41aee4c82..a0f9d0037d24 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -174,6 +174,31 @@ out: return p; } +static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, + const unsigned char *addr, bool local_orig) +{ + struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; + const unsigned char *src = eth_hdr(skb)->h_source; + + if (!should_deliver(p, skb)) + return; + + /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */ + if (skb->dev == p->dev && ether_addr_equal(src, addr)) + return; + + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) { + dev->stats.tx_dropped++; + return; + } + + if (!is_broadcast_ether_addr(addr)) + memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN); + + __br_forward(p, skb, local_orig); +} + /* called under rcu_read_lock */ void br_flood(struct net_bridge *br, struct sk_buff *skb,
enum br_pkt_type pkt_type, bool local_rcv, bool local_orig) @@ -241,10 +266,20 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) : NULL; - port = (unsigned long)lport > (unsigned long)rport ? - lport : rport; + if ((unsigned long)lport > (unsigned long)rport) { + port = lport; + + if (port->flags & BR_MULTICAST_TO_UNICAST) { + maybe_deliver_addr(lport, skb, p->eth_addr, + local_orig); + goto delivered; + } + } else { + port = rport; + } prev = maybe_deliver(prev, port, skb, local_orig); +delivered: if (IS_ERR(prev)) goto out; if (prev == port) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 7dbc80d01eb0..056e6ac49d8f 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -531,7 +531,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } - p = br_multicast_new_port_group(port, group, *pp, state); + p = br_multicast_new_port_group(port, group, *pp, state, NULL); if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index f66346122dc4..1de3438e36bf 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -43,12 +43,14 @@ static void br_multicast_add_router(struct net_bridge *br, static void br_ip4_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, - __u16 vid); + __u16 vid, + const unsigned char *src); + #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, const struct in6_addr *group, - __u16 vid); + __u16 vid, const unsigned char *src); #endif unsigned int br_mdb_rehash_seq; @@ -711,7 +713,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char flags) + unsigned char flags, + const unsigned char *src) { struct net_bridge_port_group *p; @@ -726,12 +729,32 @@ struct net_bridge_port_group *br_multicast_new_port_group( hlist_add_head(&p->mglist, &port->mglist); setup_timer(&p->timer, br_multicast_port_group_expired, (unsigned long)p); + + if (src) + memcpy(p->eth_addr, src, ETH_ALEN); + else + memset(p->eth_addr, 0xff, ETH_ALEN); + return p; } +static bool br_port_group_equal(struct net_bridge_port_group *p, + struct net_bridge_port *port, + const unsigned char *src) +{ + if (p->port != port) + return false; + + if (!(port->flags & BR_MULTICAST_TO_UNICAST)) + return true; + + return ether_addr_equal(src, p->eth_addr); +} + static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group) + struct br_ip *group, + const unsigned char *src) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; @@ -758,13 +781,13 @@ static int br_multicast_add_group(struct net_bridge *br, for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { - if (p->port == port) + if (br_port_group_equal(p, port, src)) goto found; if ((unsigned long)p->port < (unsigned long)port) break; } - p = br_multicast_new_port_group(port, group, *pp, 0); + p = br_multicast_new_port_group(port, group, *pp, 0, src); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); @@ -783,7 +806,8 @@ err: static int br_ip4_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, - __u16 vid) + __u16 vid, + const unsigned char *src) { struct br_ip br_group; @@ -794,14 +818,15 @@ static int 
br_ip4_multicast_add_group(struct net_bridge *br, br_group.proto = htons(ETH_P_IP); br_group.vid = vid; - return br_multicast_add_group(br, port, &br_group); + return br_multicast_add_group(br, port, &br_group, src); } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, const struct in6_addr *group, - __u16 vid) + __u16 vid, + const unsigned char *src) { struct br_ip br_group; @@ -812,7 +837,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; - return br_multicast_add_group(br, port, &br_group); + return br_multicast_add_group(br, port, &br_group, src); } #endif @@ -1081,6 +1106,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, struct sk_buff *skb, u16 vid) { + const unsigned char *src; struct igmpv3_report *ih; struct igmpv3_grec *grec; int i; @@ -1121,12 +1147,14 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, continue; } + src = eth_hdr(skb)->h_source; if ((type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE) && ntohs(grec->grec_nsrcs) == 0) { - br_ip4_multicast_leave_group(br, port, group, vid); + br_ip4_multicast_leave_group(br, port, group, vid, src); } else { - err = br_ip4_multicast_add_group(br, port, group, vid); + err = br_ip4_multicast_add_group(br, port, group, vid, + src); if (err) break; } @@ -1141,6 +1169,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, struct sk_buff *skb, u16 vid) { + const unsigned char *src; struct icmp6hdr *icmp6h; struct mld2_grec *grec; int i; @@ -1188,14 +1217,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, continue; } + src = eth_hdr(skb)->h_source; if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && ntohs(*nsrcs) == 0) { br_ip6_multicast_leave_group(br, port, &grec->grec_mca, - vid); + vid, src); } else { err = br_ip6_multicast_add_group(br, port, - &grec->grec_mca, vid); + &grec->grec_mca, vid, + src); if (err) break; } @@ -1511,7 +1542,8 @@ br_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *group, struct bridge_mcast_other_query *other_query, - struct bridge_mcast_own_query *own_query) + struct bridge_mcast_own_query *own_query, + const unsigned char *src) { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; @@ -1535,7 +1567,7 @@ br_multicast_leave_group(struct net_bridge *br, for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { - if (p->port != port) + if (!br_port_group_equal(p, port, src)) continue; rcu_assign_pointer(*pp, p->next); @@ -1566,7 +1598,7 @@ br_multicast_leave_group(struct net_bridge *br, for (p = mlock_dereference(mp->ports, br); p != NULL; p = mlock_dereference(p->next, br)) { - if (p->port != port) + if (!br_port_group_equal(p, port, src)) continue; if (!hlist_unhashed(&p->mglist) && @@ -1617,7 +1649,8 @@ out: static void br_ip4_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, - __u16 vid) + __u16 vid, + const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; @@ -1632,14 +1665,15 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, br_group.vid = vid; br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query, - own_query); + own_query, src); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, const struct 
in6_addr *group, - __u16 vid) + __u16 vid, + const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; @@ -1654,7 +1688,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, br_group.vid = vid; br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query, - own_query); + own_query, src); } #endif @@ -1712,6 +1746,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, u16 vid) { struct sk_buff *skb_trimmed = NULL; + const unsigned char *src; struct igmphdr *ih; int err; @@ -1731,13 +1766,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, } ih = igmp_hdr(skb); + src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->igmp = ih->type; switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip4_multicast_add_group(br, port, ih->group, vid); + err = br_ip4_multicast_add_group(br, port, ih->group, vid, src); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); @@ -1746,7 +1782,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: - br_ip4_multicast_leave_group(br, port, ih->group, vid); + br_ip4_multicast_leave_group(br, port, ih->group, vid, src); break; } @@ -1766,6 +1802,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, u16 vid) { struct sk_buff *skb_trimmed = NULL; + const unsigned char *src; struct mld_msg *mld; int err; @@ -1785,8 +1822,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, switch (mld->mld_type) { case ICMPV6_MGM_REPORT: + src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); + err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid, + src); break; case ICMPV6_MLD2_REPORT: err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); @@ -1795,7 +1834,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); + src = eth_hdr(skb)->h_source; + br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src); break; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 71c7453268c1..6c087cd049b9 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -123,6 +123,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ @@ -173,6 +174,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_ROOT_BLOCK)) || nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || + nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST, + !!(p->flags & BR_MULTICAST_TO_UNICAST)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || @@ -586,6 +589,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = 
NLA_U8 }, + [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -636,6 +640,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8ce621e8345c..0b82a227fc34 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -177,6 +177,7 @@ struct net_bridge_port_group { struct timer_list timer; struct br_ip addr; unsigned char flags; + unsigned char eth_addr[ETH_ALEN]; }; struct net_bridge_mdb_entry @@ -599,7 +600,7 @@ void br_multicast_free_pg(struct rcu_head *head); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char flags); + unsigned char flags, const unsigned char *src); void br_mdb_init(void); void br_mdb_uninit(void); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 8bd569695e76..05e8946ccc03 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -188,6 +188,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, store_multicast_router); BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); +BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST); #endif static const struct brport_attribute *brport_attrs[] = { @@ -214,6 +215,7 @@ static const struct brport_attribute *brport_attrs[] = { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, + &brport_attr_multicast_to_unicast, #endif &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, -- cgit v1.2.3 From 6ae0a6286171154661b74f7f550f9441c6008424 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 23 Jan 2017 11:07:08 +0100 Subject: net: Introduce psample, a new genetlink channel for packet sampling Add a general way for kernel modules to sample packets, without being tied to any specific subsystem. This netlink channel can be used by tc, iptables, etc. and allows standardizing packet sampling in the kernel. For every sampled packet, the psample module adds the following metadata fields: PSAMPLE_ATTR_IIFINDEX - the packet's input ifindex, if applicable PSAMPLE_ATTR_OIFINDEX - the packet's output ifindex, if applicable PSAMPLE_ATTR_ORIGSIZE - the packet's original size, in case it has been truncated during sampling PSAMPLE_ATTR_SAMPLE_GROUP - the packet's sample group, which is set by the user who initiated the sampling. This field allows the user to differentiate between several samplers working simultaneously and to filter the packets relevant to them PSAMPLE_ATTR_GROUP_SEQ - sequence counter of the last sent packet. The sequence is kept for each group PSAMPLE_ATTR_SAMPLE_RATE - the sampling rate used for sampling the packets PSAMPLE_ATTR_DATA - the actual packet bits The sampled packets are sent to the PSAMPLE_NL_MCGRP_SAMPLE multicast group. In addition, add the GET_GROUPS netlink command which allows the user to see the current sample groups, their refcount and sequence number.
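For illustration, a minimal in-kernel sampler might drive this API as follows, given a struct net *net and a struct sk_buff *skb (the group number, truncation size and rate below are arbitrary; error handling is elided):

	struct psample_group *grp;

	grp = psample_group_get(net, 1);	/* group 1 is created on first use */
	if (!grp)
		return -ENOMEM;

	/* copy at most 128 bytes of each packet, sampled at a 1:256 rate */
	psample_sample_packet(grp, skb, 128, skb->dev->ifindex, 0, 256);

	psample_group_put(grp);	/* group is destroyed when refcount drops to 0 */

Such groups are exactly what the GET_GROUPS command reports.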
This command currently supports only netlink dump mode. Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Reviewed-by: Jamal Hadi Salim Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- MAINTAINERS | 7 + include/net/psample.h | 36 ++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/psample.h | 35 +++++ net/Kconfig | 1 + net/Makefile | 1 + net/psample/Kconfig | 15 +++ net/psample/Makefile | 5 + net/psample/psample.c | 301 +++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 402 insertions(+) create mode 100644 include/net/psample.h create mode 100644 include/uapi/linux/psample.h create mode 100644 net/psample/Kconfig create mode 100644 net/psample/Makefile create mode 100644 net/psample/psample.c (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 3c84a8fecc09..d76fccd09266 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9957,6 +9957,13 @@ L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/block/ps3vram.c +PSAMPLE PACKET SAMPLING SUPPORT: +M: Yotam Gigi +S: Maintained +F: net/psample +F: include/net/psample.h +F: include/uapi/linux/psample.h + PSTORE FILESYSTEM M: Anton Vorontsov M: Colin Cross diff --git a/include/net/psample.h b/include/net/psample.h new file mode 100644 index 000000000000..8888b0e1a82e --- /dev/null +++ b/include/net/psample.h @@ -0,0 +1,36 @@ +#ifndef __NET_PSAMPLE_H +#define __NET_PSAMPLE_H + +#include +#include +#include + +struct psample_group { + struct list_head list; + struct net *net; + u32 group_num; + u32 refcount; + u32 seq; +}; + +struct psample_group *psample_group_get(struct net *net, u32 group_num); +void psample_group_put(struct psample_group *group); + +#if IS_ENABLED(CONFIG_PSAMPLE) + +void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, + u32 trunc_size, int in_ifindex, int out_ifindex, + u32 sample_rate); + +#else + +static inline void psample_sample_packet(struct psample_group *group, + struct sk_buff *skb, u32 trunc_size, + int in_ifindex, int out_ifindex, + u32 sample_rate) +{ +} + +#endif + +#endif /* __NET_PSAMPLE_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index e600b50be77e..80ad741a42fa 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -305,6 +305,7 @@ header-y += netrom.h header-y += net_namespace.h header-y += net_tstamp.h header-y += nfc.h +header-y += psample.h header-y += nfs2.h header-y += nfs3.h header-y += nfs4.h diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h new file mode 100644 index 000000000000..ed48996ec0e8 --- /dev/null +++ b/include/uapi/linux/psample.h @@ -0,0 +1,35 @@ +#ifndef __UAPI_PSAMPLE_H +#define __UAPI_PSAMPLE_H + +enum { + /* sampled packet metadata */ + PSAMPLE_ATTR_IIFINDEX, + PSAMPLE_ATTR_OIFINDEX, + PSAMPLE_ATTR_ORIGSIZE, + PSAMPLE_ATTR_SAMPLE_GROUP, + PSAMPLE_ATTR_GROUP_SEQ, + PSAMPLE_ATTR_SAMPLE_RATE, + PSAMPLE_ATTR_DATA, + + /* commands attributes */ + PSAMPLE_ATTR_GROUP_REFCOUNT, + + __PSAMPLE_ATTR_MAX +}; + +enum psample_command { + PSAMPLE_CMD_SAMPLE, + PSAMPLE_CMD_GET_GROUP, + PSAMPLE_CMD_NEW_GROUP, + PSAMPLE_CMD_DEL_GROUP, +}; + +/* Can be overridden at runtime by module option */ +#define PSAMPLE_ATTR_MAX (__PSAMPLE_ATTR_MAX - 1) + +#define PSAMPLE_NL_MCGRP_CONFIG_NAME "config" +#define PSAMPLE_NL_MCGRP_SAMPLE_NAME "packets" +#define PSAMPLE_GENL_NAME "psample" +#define PSAMPLE_GENL_VERSION 1 + +#endif diff --git a/net/Kconfig b/net/Kconfig index 92ae1500d9e1..ce4aee69fc0d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -390,6 +390,7 @@ source 
"net/9p/Kconfig" source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +source "net/psample/Kconfig" config LWTUNNEL bool "Network light weight tunnels" diff --git a/net/Makefile b/net/Makefile index 5d6e0e5ff7f8..7d41de48310e 100644 --- a/net/Makefile +++ b/net/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ +obj-$(CONFIG_PSAMPLE) += psample/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ obj-$(CONFIG_MPLS) += mpls/ diff --git a/net/psample/Kconfig b/net/psample/Kconfig new file mode 100644 index 000000000000..d850246a6059 --- /dev/null +++ b/net/psample/Kconfig @@ -0,0 +1,15 @@ +# +# psample packet sampling configuration +# + +menuconfig PSAMPLE + depends on NET + tristate "Packet-sampling netlink channel" + default n + help + Say Y here to add support for packet-sampling netlink channel + This netlink channel allows transferring packets alongside some + metadata to userspace. + + To compile this support as a module, choose M here: the module will + be called psample. diff --git a/net/psample/Makefile b/net/psample/Makefile new file mode 100644 index 000000000000..609b0a79c9f3 --- /dev/null +++ b/net/psample/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the psample netlink channel +# + +obj-$(CONFIG_PSAMPLE) += psample.o diff --git a/net/psample/psample.c b/net/psample/psample.c new file mode 100644 index 000000000000..8aa58a918783 --- /dev/null +++ b/net/psample/psample.c @@ -0,0 +1,301 @@ +/* + * net/psample/psample.c - Netlink channel for packet sampling + * Copyright (c) 2017 Yotam Gigi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PSAMPLE_MAX_PACKET_SIZE 0xffff + +static LIST_HEAD(psample_groups_list); +static DEFINE_SPINLOCK(psample_groups_lock); + +/* multicast groups */ +enum psample_nl_multicast_groups { + PSAMPLE_NL_MCGRP_CONFIG, + PSAMPLE_NL_MCGRP_SAMPLE, +}; + +static const struct genl_multicast_group psample_nl_mcgrps[] = { + [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, + [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME }, +}; + +static struct genl_family psample_nl_family __ro_after_init; + +static int psample_group_nl_fill(struct sk_buff *msg, + struct psample_group *group, + enum psample_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int ret; + + hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); + if (ret < 0) + goto error; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount); + if (ret < 0) + goto error; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq); + if (ret < 0) + goto error; + + genlmsg_end(msg, hdr); + return 0; + +error: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct psample_group *group; + int start = cb->args[0]; + int idx = 0; + int err; + + spin_lock(&psample_groups_lock); + list_for_each_entry(group, &psample_groups_list, list) { + if (!net_eq(group->net, sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + break; + idx++; + } + + spin_unlock(&psample_groups_lock); + cb->args[0] = idx; + return msg->len; +} + +static const struct genl_ops psample_nl_ops[] = { + { + .cmd = PSAMPLE_CMD_GET_GROUP, + .dumpit = psample_nl_cmd_get_group_dumpit, + /* can be retrieved by unprivileged users */ + } +}; + +static struct genl_family psample_nl_family __ro_after_init = { + .name = PSAMPLE_GENL_NAME, + .version = PSAMPLE_GENL_VERSION, + .maxattr = PSAMPLE_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .mcgrps = psample_nl_mcgrps, + .ops = psample_nl_ops, + .n_ops = ARRAY_SIZE(psample_nl_ops), + .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps), +}; + +static void psample_group_notify(struct psample_group *group, + enum psample_command cmd) +{ + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return; + + err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI); + if (!err) + genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0, + PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC); + else + nlmsg_free(msg); +} + +static struct psample_group *psample_group_create(struct net *net, + u32 group_num) +{ + struct psample_group *group; + + group = kzalloc(sizeof(*group), GFP_ATOMIC); + if (!group) + return NULL; + + group->net = net; + group->group_num = group_num; + list_add_tail(&group->list, &psample_groups_list); + + psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP); + return group; +} + +static void psample_group_destroy(struct psample_group *group) +{ + psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP); + list_del(&group->list); + kfree(group); +} + +static struct psample_group * +psample_group_lookup(struct net *net, u32 group_num) +{ + 
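+	/* Editorial note, not part of the original patch: this helper does
+	 * no locking of its own; its only caller, psample_group_get()
+	 * below, holds psample_groups_lock across the lookup.
+	 */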
struct psample_group *group; + + list_for_each_entry(group, &psample_groups_list, list) + if ((group->group_num == group_num) && (group->net == net)) + return group; + return NULL; +} + +struct psample_group *psample_group_get(struct net *net, u32 group_num) +{ + struct psample_group *group; + + spin_lock(&psample_groups_lock); + + group = psample_group_lookup(net, group_num); + if (!group) { + group = psample_group_create(net, group_num); + if (!group) + goto out; + } + group->refcount++; + +out: + spin_unlock(&psample_groups_lock); + return group; +} +EXPORT_SYMBOL_GPL(psample_group_get); + +void psample_group_put(struct psample_group *group) +{ + spin_lock(&psample_groups_lock); + + if (--group->refcount == 0) + psample_group_destroy(group); + + spin_unlock(&psample_groups_lock); +} +EXPORT_SYMBOL_GPL(psample_group_put); + +void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, + u32 trunc_size, int in_ifindex, int out_ifindex, + u32 sample_rate) +{ + struct sk_buff *nl_skb; + int data_len; + int meta_len; + void *data; + int ret; + + meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + + (out_ifindex ? nla_total_size(sizeof(u16)) : 0) + + nla_total_size(sizeof(u32)) + /* sample_rate */ + nla_total_size(sizeof(u32)) + /* orig_size */ + nla_total_size(sizeof(u32)) + /* group_num */ + nla_total_size(sizeof(u32)); /* seq */ + + data_len = min(skb->len, trunc_size); + if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) + data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN + - NLA_ALIGNTO; + + nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC); + if (unlikely(!nl_skb)) + return; + + data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0, + PSAMPLE_CMD_SAMPLE); + if (unlikely(!data)) + goto error; + + if (in_ifindex) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex); + if (unlikely(ret < 0)) + goto error; + } + + if (out_ifindex) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex); + if (unlikely(ret < 0)) + goto error; + } + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++); + if (unlikely(ret < 0)) + goto error; + + if (data_len) { + int nla_len = nla_total_size(data_len); + struct nlattr *nla; + + nla = (struct nlattr *)skb_put(nl_skb, nla_len); + nla->nla_type = PSAMPLE_ATTR_DATA; + nla->nla_len = nla_attr_size(data_len); + + if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) + goto error; + } + + genlmsg_end(nl_skb, data); + genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, + PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); + + return; +error: + pr_err_ratelimited("Could not create psample log message\n"); + nlmsg_free(nl_skb); +} +EXPORT_SYMBOL_GPL(psample_sample_packet); + +static int __init psample_module_init(void) +{ + return genl_register_family(&psample_nl_family); +} + +static void __exit psample_module_exit(void) +{ + genl_unregister_family(&psample_nl_family); +} + +module_init(psample_module_init); +module_exit(psample_module_exit); + +MODULE_AUTHOR("Yotam Gigi "); +MODULE_DESCRIPTION("netlink channel for packet sampling"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 5c5670fae43027778e84b9d9ff3b9d91a10a8131 Mon Sep 17 00:00:00 2001 From: Yotam Gigi 
Date: Mon, 23 Jan 2017 11:07:09 +0100 Subject: net/sched: Introduce sample tc action This action allows the user to sample traffic matched by a tc classifier. The sampling consists of choosing packets at random and handing them to the psample module. The user can configure the psample group number, the sampling rate, and the packet truncation size (to reduce kernel-to-user traffic). Example: To sample ingress traffic from interface eth1, one may use the commands: tc qdisc add dev eth1 handle ffff: ingress tc filter add dev eth1 parent ffff: \ matchall action sample rate 12 group 4 Here the first command adds an ingress qdisc and the second starts sampling randomly, with an average of one sampled packet per 12 packets on dev eth1, to psample group 4. Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tc_act/tc_sample.h | 50 +++++++ include/uapi/linux/tc_act/Kbuild | 1 + include/uapi/linux/tc_act/tc_sample.h | 26 ++++ net/sched/Kconfig | 12 ++ net/sched/Makefile | 1 + net/sched/act_sample.c | 274 ++++++++++++++++++++++++++++++++++ 6 files changed, 364 insertions(+) create mode 100644 include/net/tc_act/tc_sample.h create mode 100644 include/uapi/linux/tc_act/tc_sample.h create mode 100644 net/sched/act_sample.c (limited to 'include/uapi') diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h new file mode 100644 index 000000000000..89e9305be880 --- /dev/null +++ b/include/net/tc_act/tc_sample.h @@ -0,0 +1,50 @@ +#ifndef __NET_TC_SAMPLE_H +#define __NET_TC_SAMPLE_H + +#include +#include +#include + +struct tcf_sample { + struct tc_action common; + u32 rate; + bool truncate; + u32 trunc_size; + struct psample_group __rcu *psample_group; + u32 psample_group_num; + struct list_head tcfm_list; + struct rcu_head rcu; +}; +#define to_sample(a) ((struct tcf_sample *)a) + +static inline bool is_tcf_sample(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + return a->ops && a->ops->type == TCA_ACT_SAMPLE; +#else + return false; +#endif +} + +static inline __u32 tcf_sample_rate(const struct tc_action *a) +{ + return to_sample(a)->rate; +} + +static inline bool tcf_sample_truncate(const struct tc_action *a) +{ + return to_sample(a)->truncate; +} + +static inline int tcf_sample_trunc_size(const struct tc_action *a) +{ + return to_sample(a)->trunc_size; +} + +static inline struct psample_group * +tcf_sample_psample_group(const struct tc_action *a) +{ + return rcu_dereference(to_sample(a)->psample_group); +} + +#endif /* __NET_TC_SAMPLE_H */ diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index e3db7403296f..ba62ddf0e58a 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -4,6 +4,7 @@ header-y += tc_defact.h header-y += tc_gact.h header-y += tc_ipt.h header-y += tc_mirred.h +header-y += tc_sample.h header-y += tc_nat.h header-y += tc_pedit.h header-y += tc_skbedit.h diff --git a/include/uapi/linux/tc_act/tc_sample.h b/include/uapi/linux/tc_act/tc_sample.h new file mode 100644 index 000000000000..edc9058bb30d --- /dev/null +++ b/include/uapi/linux/tc_act/tc_sample.h @@ -0,0 +1,26 @@ +#ifndef __LINUX_TC_SAMPLE_H +#define __LINUX_TC_SAMPLE_H + +#include +#include +#include + +#define TCA_ACT_SAMPLE 26 + +struct tc_sample { + tc_gen; +}; + +enum { + TCA_SAMPLE_UNSPEC, + TCA_SAMPLE_TM, + TCA_SAMPLE_PARMS, + TCA_SAMPLE_RATE, + TCA_SAMPLE_TRUNC_SIZE, + TCA_SAMPLE_PSAMPLE_GROUP, + TCA_SAMPLE_PAD, + __TCA_SAMPLE_MAX };
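+/* Editorial note, not part of the original patch: TCA_SAMPLE_PAD exists
+ * so that the 64-bit TCA_SAMPLE_TM attribute can be padded via
+ * nla_put_64bit(); see tcf_sample_dump() in act_sample.c below.
+ */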
+#define TCA_SAMPLE_MAX (__TCA_SAMPLE_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a9aa38d43fa7..72cfa3a6bac0 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -650,6 +650,18 @@ config NET_ACT_MIRRED To compile this code as a module, choose M here: the module will be called act_mirred. +config NET_ACT_SAMPLE + tristate "Traffic Sampling" + depends on NET_CLS_ACT + select PSAMPLE + ---help--- + Say Y here to allow packet sampling tc action. The packet sample + action consists of statistically choosing packets and sampling + them using the psample module. + + To compile this code as a module, choose M here: the + module will be called act_sample. + config NET_ACT_IPT tristate "IPtables targets" depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES diff --git a/net/sched/Makefile b/net/sched/Makefile index 4bdda3634e0b..7b915d226de7 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += act_police.o obj-$(CONFIG_NET_ACT_GACT) += act_gact.o obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o +obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c new file mode 100644 index 000000000000..39229756de07 --- /dev/null +++ b/net/sched/act_sample.c @@ -0,0 +1,274 @@ +/* + * net/sched/act_sample.c - Packet sampling tc action + * Copyright (c) 2017 Yotam Gigi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define SAMPLE_TAB_MASK 7 +static unsigned int sample_net_id; +static struct tc_action_ops act_sample_ops; + +static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { + [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) }, + [TCA_SAMPLE_RATE] = { .type = NLA_U32 }, + [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 }, + [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 }, +}; + +static int tcf_sample_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + struct nlattr *tb[TCA_SAMPLE_MAX + 1]; + struct psample_group *psample_group; + struct tc_sample *parm; + struct tcf_sample *s; + bool exists = false; + int ret; + + if (!nla) + return -EINVAL; + ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy); + if (ret < 0) + return ret; + if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] || + !tb[TCA_SAMPLE_PSAMPLE_GROUP]) + return -EINVAL; + + parm = nla_data(tb[TCA_SAMPLE_PARMS]); + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_sample_ops, bind, false); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + s = to_sample(*a); + + ASSERT_RTNL(); + s->tcf_action = parm->action; + s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); + s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); + psample_group = psample_group_get(net, s->psample_group_num); + if (!psample_group) + return -ENOMEM; 
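+	/* Editorial note, not part of the original patch: the group is
+	 * looked up by number in the action's netns and created on first
+	 * use; the reference taken here is dropped again through
+	 * tcf_sample_cleanup() -> psample_group_put().
+	 */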
+ RCU_INIT_POINTER(s->psample_group, psample_group); + + if (tb[TCA_SAMPLE_TRUNC_SIZE]) { + s->truncate = true; + s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); + } + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + return ret; +} + +static void tcf_sample_cleanup_rcu(struct rcu_head *rcu) +{ + struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu); + struct psample_group *psample_group; + + psample_group = rcu_dereference_protected(s->psample_group, 1); + RCU_INIT_POINTER(s->psample_group, NULL); + psample_group_put(psample_group); +} + +static void tcf_sample_cleanup(struct tc_action *a, int bind) +{ + struct tcf_sample *s = to_sample(a); + + call_rcu(&s->rcu, tcf_sample_cleanup_rcu); +} + +static bool tcf_sample_dev_ok_push(struct net_device *dev) +{ + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + return false; + default: + return true; + } +} + +static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_sample *s = to_sample(a); + struct psample_group *psample_group; + int retval; + int size; + int iif; + int oif; + + tcf_lastuse_update(&s->tcf_tm); + bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); + retval = READ_ONCE(s->tcf_action); + + rcu_read_lock(); + psample_group = rcu_dereference(s->psample_group); + + /* randomly sample packets according to rate */ + if (psample_group && (prandom_u32() % s->rate == 0)) { + if (!skb_at_tc_ingress(skb)) { + iif = skb->skb_iif; + oif = skb->dev->ifindex; + } else { + iif = skb->dev->ifindex; + oif = 0; + } + + /* on ingress, the mac header gets popped, so push it back */ + if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) + skb_push(skb, skb->mac_len); + + size = s->truncate ? 
s->trunc_size : skb->len; + psample_sample_packet(psample_group, skb, size, iif, oif, + s->rate); + + if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) + skb_pull(skb, skb->mac_len); + } + + rcu_read_unlock(); + return retval; +} + +static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_sample *s = to_sample(a); + struct tc_sample opt = { + .index = s->tcf_index, + .action = s->tcf_action, + .refcnt = s->tcf_refcnt - ref, + .bindcnt = s->tcf_bindcnt - bind, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + tcf_tm_dump(&t, &s->tcf_tm); + if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate)) + goto nla_put_failure; + + if (s->truncate) + if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_sample_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_sample_ops = { + .kind = "sample", + .type = TCA_ACT_SAMPLE, + .owner = THIS_MODULE, + .act = tcf_sample_act, + .dump = tcf_sample_dump, + .init = tcf_sample_init, + .cleanup = tcf_sample_cleanup, + .walk = tcf_sample_walker, + .lookup = tcf_sample_search, + .size = sizeof(struct tcf_sample), +}; + +static __net_init int sample_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK); +} + +static void __net_exit sample_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations sample_net_ops = { + .init = sample_init_net, + .exit = sample_exit_net, + .id = &sample_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init sample_init_module(void) +{ + return tcf_register_action(&act_sample_ops, &sample_net_ops); +} + +static void __exit sample_cleanup_module(void) +{ + tcf_unregister_action(&act_sample_ops, &sample_net_ops); +} + +module_init(sample_init_module); +module_exit(sample_cleanup_module); + +MODULE_AUTHOR("Yotam Gigi "); +MODULE_DESCRIPTION("Packet sampling action"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 7898489880f55a9c3a954cd5660a0fb4fd81b625 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 30 Nov 2016 20:33:33 +0200 Subject: IB/mlx5: Enable Eth VFs to query their min-inline value for user-space For some mlx5 HW models (CX4, CX4Lx), the VF driver needs to put part of the packet headers on the TX descriptor so the e-switch can do proper matching and steering. This is called "min-inline"; it is advertised to the VFs by the FW and also enforced on them by the HW, such that if they don't obey, their packets are dropped. SRIOV VF libmlx5 instances should take into account the min-inline value of their vports.
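(Editor's aside, not part of the commit: a hedged sketch of how a user-space driver might decode the off-by-one eth_min_inline encoding that the next paragraph and the mlx5-abi.h hunk below describe. The MLX5_USER_INLINE_MODE_* names come from this patch; the helper itself is invented for illustration.)

	/* __u8 comes from <linux/types.h>; the enum values are taken
	 * from the mlx5-abi.h hunk in this patch. */
	static int decode_eth_min_inline(__u8 eth_min_inline)
	{
		if (eth_min_inline == MLX5_USER_INLINE_MODE_NA)
			return -1;	/* kernel too old to report: guess */
		return eth_min_inline - 1;	/* raw FW min-inline mode */
	}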
To that end, we provide this value through the vendor response part of the init_ucontext command. The min-inline value is reported in a way that lets newer libmlx5 instances realize that they are running over an older kernel and act accordingly (e.g., apply an educated guess). Signed-off-by: Or Gerlitz Reviewed-by: Matan Barak Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 9 +++++++++ include/uapi/rdma/mlx5-abi.h | 14 +++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8727116a4cab..9d8535385bb8 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "mlx5_ib.h" #define DRIVER_NAME "mlx5_ib" @@ -1202,6 +1203,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.response_length += sizeof(resp.cmds_supp_uhw); } + if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) { + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { + mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline); + resp.eth_min_inline++; + } + resp.response_length += sizeof(resp.eth_min_inline); + } + /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 85dc966ea70b..da7cd62bace7 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -90,6 +90,17 @@ enum mlx5_user_cmds_supp_uhw { MLX5_USER_CMDS_SUPP_UHW_CREATE_AH = 1 << 1, }; +/* The eth_min_inline response value is set to off-by-one vs the FW + * returned value to allow user-space to deal with older kernels. + */ +enum mlx5_user_inline_mode { + MLX5_USER_INLINE_MODE_NA, + MLX5_USER_INLINE_MODE_NONE, + MLX5_USER_INLINE_MODE_L2, + MLX5_USER_INLINE_MODE_IP, + MLX5_USER_INLINE_MODE_TCP_UDP, +}; + struct mlx5_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 bf_reg_size; @@ -106,7 +117,8 @@ struct mlx5_ib_alloc_ucontext_resp { __u32 response_length; __u8 cqe_version; __u8 cmds_supp_uhw; - __u16 reserved2; + __u8 eth_min_inline; + __u8 reserved2; __u64 hca_core_clock_offset; __u32 log_uar_size; __u32 num_uars_per_page; -- cgit v1.2.3 From d1b662adcdb87944b7f6f7bd2f95cbb1404dbf18 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Jan 2017 01:06:28 +0100 Subject: bpf: allow option for setting bpf_l4_csum_replace from scratch When programs need to calculate the csum from scratch for small UDP packets and use bpf_l4_csum_replace() to feed the result from helpers like bpf_csum_diff(), then we need a flag besides BPF_F_MARK_MANGLED_0 that would ignore the case of the current csum being 0, while still allowing the helper to set the csum and, when needed, transform it to CSUM_MANGLED_0. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd3068485410..e07fd5a324e6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -522,6 +522,7 @@ enum bpf_func_id { /* BPF_FUNC_l4_csum_replace flags.
*/ #define BPF_F_PSEUDO_HDR (1ULL << 4) #define BPF_F_MARK_MANGLED_0 (1ULL << 5) +#define BPF_F_MARK_ENFORCE (1ULL << 6) /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ #define BPF_F_INGRESS (1ULL << 0) diff --git a/net/core/filter.c b/net/core/filter.c index e2263da505be..1e00737e3bc3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1522,10 +1522,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; + bool do_mforce = flags & BPF_F_MARK_ENFORCE; __sum16 *ptr; - if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | - BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1533,7 +1534,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, return -EFAULT; ptr = (__sum16 *)(skb->data + offset); - if (is_mmzero && !*ptr) + if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { -- cgit v1.2.3 From e5ff5ce6e20ee22511398bb31fb912466cf82a36 Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Wed, 25 Jan 2017 14:03:36 +1300 Subject: nsfs: Add an ioctl() to return the namespace type Linux 4.9 added two ioctl() operations that can be used to discover: * the parental relationships for hierarchical namespaces (user and PID) [NS_GET_PARENT] * the user namespace that owns a specified non-user namespace [NS_GET_USERNS] For no good reason that I can glean, NS_GET_USERNS was made synonymous with NS_GET_PARENT for user namespaces. It might have been better if NS_GET_USERNS had returned an error if the supplied file descriptor referred to a user namespace, since it suggests that the caller may be confused. More particularly, if it had generated an error, then I wouldn't need the new ioctl() operation proposed here. (On the other hand, what I propose here may be more generally useful.) I would like to write code that discovers namespace relationships for the purpose of understanding the namespace setup on a running system. In particular, given a file descriptor (or pathname) for a namespace, N, I'd like to obtain the corresponding user namespace. Namespace N might be a user namespace (in which case my code would just use N) or a non-user namespace (in which case my code will use NS_GET_USERNS to get the user namespace associated with N). The problem is that there is no way to tell the difference by looking at the file descriptor (and if I try to use NS_GET_USERNS on an N that is a user namespace, I get the parent user namespace of N, which is not what I want). This patch therefore adds a new ioctl(), NS_GET_NSTYPE, which, given a file descriptor that refers to a namespace, returns the namespace type (one of the CLONE_NEW* constants). Signed-off-by: Michael Kerrisk Signed-off-by: Eric W.
Biederman --- fs/nsfs.c | 2 ++ include/uapi/linux/nsfs.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/fs/nsfs.c b/fs/nsfs.c index 8c9fb29c6673..5d534763c662 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -172,6 +172,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (!ns->ops->get_parent) return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); + case NS_GET_NSTYPE: + return ns->ops->type; default: return -ENOTTY; } diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 3af617230d1b..2b48df11056a 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -9,5 +9,8 @@ #define NS_GET_USERNS _IO(NSIO, 0x1) /* Returns a file descriptor that refers to a parent namespace */ #define NS_GET_PARENT _IO(NSIO, 0x2) +/* Returns the type of namespace (CLONE_NEW* value) referred to by + file descriptor */ +#define NS_GET_NSTYPE _IO(NSIO, 0x3) #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From 1045ba77a5962a22bce7777678ef46714107ea63 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Tue, 24 Jan 2017 07:02:41 -0500 Subject: net sched actions: Add support for user cookies Introduce an optional 128-bit action cookie. Like all other cookie schemes in the networking world (e.g. in protocols like HTTP, or the existing kernel fib protocol field) the idea is to save user state that, when retrieved, serves as a correlator. The kernel _should not_ interpret it. The user can store whatever they wish in the 128 bits. Sample exercise (showing variable-length use of the cookie) .. create an accept action with cookie a1b2c3d4 sudo $TC actions add action ok index 1 cookie a1b2c3d4 .. dump all gact actions.. sudo $TC -s actions ls action gact action order 0: gact action pass random type none pass val 0 index 1 ref 1 bind 0 installed 5 sec used 5 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 cookie a1b2c3d4 .. bind the accept action to a filter.. sudo $TC filter add dev lo parent ffff: protocol ip prio 1 \ u32 match ip dst 127.0.0.1/32 flowid 1:1 action gact index 1 ... send some traffic.. $ ping 127.0.0.1 -c 3 PING 127.0.0.1 (127.0.0.1) 56(84) bytes of data. 64 bytes from 127.0.0.1: icmp_seq=1 ttl=64 time=0.020 ms 64 bytes from 127.0.0.1: icmp_seq=2 ttl=64 time=0.027 ms 64 bytes from 127.0.0.1: icmp_seq=3 ttl=64 time=0.038 ms Signed-off-by: David S.
Miller --- include/net/act_api.h | 1 + include/net/pkt_cls.h | 8 ++++++++ include/uapi/linux/pkt_cls.h | 3 +++ net/sched/act_api.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) (limited to 'include/uapi') diff --git a/include/net/act_api.h b/include/net/act_api.h index 1d716449209e..cfa2ae33da9a 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -41,6 +41,7 @@ struct tc_action { struct rcu_head tcfa_rcu; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; + struct tc_cookie *act_cookie; }; #define tcf_head common.tcfa_head #define tcf_index common.tcfa_index diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index f0a051480c6c..b43077e47d35 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -515,4 +515,12 @@ struct tc_cls_bpf_offload { u32 gen_flags; }; + +/* This structure holds cookie structure that is passed from user + * to the kernel for actions and classifiers + */ +struct tc_cookie { + u8 *data; + u32 len; +}; #endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index fd373ebd5a44..345551e71410 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -4,6 +4,8 @@ #include #include +#define TC_COOKIE_MAX_SIZE 16 + /* Action attributes */ enum { TCA_ACT_UNSPEC, @@ -12,6 +14,7 @@ enum { TCA_ACT_INDEX, TCA_ACT_STATS, TCA_ACT_PAD, + TCA_ACT_COOKIE, __TCA_ACT_MAX }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index cd08df91351d..3c5e29ba6594 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,12 @@ static void free_tcf(struct rcu_head *head) free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); + + if (p->act_cookie) { + kfree(p->act_cookie->data); + kfree(p->act_cookie); + } + kfree(p); } @@ -475,6 +482,12 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; + if (a->act_cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len, + a->act_cookie->data)) + goto nla_put_failure; + } + nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; @@ -516,6 +529,22 @@ errout: return err; } +int nla_memdup_cookie(struct tc_action *a, struct nlattr **tb) +{ + a->act_cookie = kzalloc(sizeof(*a->act_cookie), GFP_KERNEL); + if (!a->act_cookie) + return -ENOMEM; + + a->act_cookie->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL); + if (!a->act_cookie->data) { + kfree(a->act_cookie); + return -ENOMEM; + } + a->act_cookie->len = nla_len(tb[TCA_ACT_COOKIE]); + + return 0; +} + struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind) @@ -575,6 +604,22 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, if (err < 0) goto err_mod; + if (tb[TCA_ACT_COOKIE]) { + int cklen = nla_len(tb[TCA_ACT_COOKIE]); + + if (cklen > TC_COOKIE_MAX_SIZE) { + err = -EINVAL; + tcf_hash_release(a, bind); + goto err_mod; + } + + err = nla_memdup_cookie(a, tb); + if (err < 0) { + tcf_hash_release(a, bind); + goto err_mod; + } + } + /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then * ACT_P_CREATED is not returned (a zero is). 
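(Editor's aside, not part of the commit: the kernel treats the cookie as an opaque blob of at most TC_COOKIE_MAX_SIZE (16) bytes, so a controller is free to define any layout it likes. A hypothetical C sketch follows; the constant comes from the pkt_cls.h hunk above, everything else is invented for illustration.)

	#include <stdint.h>
	#include <string.h>

	#define TC_COOKIE_MAX_SIZE 16	/* from pkt_cls.h above */

	/* Pack a 64-bit flow id plus a 32-bit tag into the cookie blob;
	 * returns the number of bytes used (12, within the 16-byte cap). */
	static size_t pack_act_cookie(uint8_t buf[TC_COOKIE_MAX_SIZE],
				      uint64_t flow_id, uint32_t tag)
	{
		memcpy(buf, &flow_id, sizeof(flow_id));
		memcpy(buf + sizeof(flow_id), &tag, sizeof(tag));
		return sizeof(flow_id) + sizeof(tag);
	}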
-- cgit v1.2.3 From 19f6d3f3c8422d65b5e3d2162e30ef07c6e21ea2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 23 Jan 2017 10:59:22 -0800 Subject: net/tcp-fastopen: Add new API support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an alternative way to perform Fast Open on the active side (client). Prior to this patch, a client needs to replace the connect() call with sendto(MSG_FASTOPEN). This can be cumbersome for applications that want to use Fast Open: these socket operations are often done in lower-layer libraries used by many other applications. Changing these libraries and/or the socket call sequences is not trivial. A more convenient approach is to perform Fast Open by simply enabling a socket option when the socket is created, without changing the rest of the socket call sequence: s = socket() create a new socket setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …); newly introduced sockopt If set, the new functionality described below will be used. Returns ENOTSUPP if TFO is not supported or not enabled in the kernel. connect() With a cookie present, return 0 immediately. With no cookie, initiate the 3WHS with the TFO cookie-request option and return -1 with errno = EINPROGRESS. write()/sendmsg() With a cookie present, send out SYN with data and return the number of bytes buffered. With no cookie, and the 3WHS not yet completed, return -1 with errno = EINPROGRESS. No MSG_FASTOPEN flag is needed. read() Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() has been called but write() has not been called yet. Return -1 with errno = EWOULDBLOCK/EAGAIN if the connection is established but no msg has been received yet. Return the number of bytes read if the connection is established and a msg has been received. The new API simplifies life for applications that always perform a write() immediately after a successful connect(). Such applications can now take advantage of Fast Open by merely making one new setsockopt() call at the time of creating the socket. Nothing else about the application's socket call sequence needs to change. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/net/inet_sock.h | 6 +++++- include/net/tcp.h | 1 + include/uapi/linux/tcp.h | 1 + net/ipv4/af_inet.c | 31 ++++++++++++++++++++++++------- net/ipv4/tcp.c | 35 ++++++++++++++++++++++++++++++++++- net/ipv4/tcp_fastopen.c | 33 +++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 7 ++++++- net/ipv6/tcp_ipv6.c | 5 +++++ 9 files changed, 111 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5371b3d70cfe..f88f4649ba6f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -222,7 +222,8 @@ struct tcp_sock { u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ - unused:5; + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + unused:4; u8 nonagle : 4,/* Disable Nagle algorithm?
*/ thin_lto : 1,/* Use linear timeouts for thin streams */ unused1 : 1, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index c9cff977a7fb..aa95053dfc78 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -206,7 +206,11 @@ struct inet_sock { transparent:1, mc_all:1, nodefrag:1; - __u8 bind_address_no_port:1; + __u8 bind_address_no_port:1, + defer_connect:1; /* Indicates that fastopen_connect is set + * and cookie exists so we defer connect + * until first data frame is written + */ __u8 rcv_tos; __u8 convert_csum; int uc_index; diff --git a/include/net/tcp.h b/include/net/tcp.h index de67541d7adf..6ec4ea652f3f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1495,6 +1495,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, void tcp_fastopen_init_key_once(bool publish); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); +bool tcp_fastopen_defer_connect(struct sock *sk, int *err); #define TCP_FASTOPEN_KEY_LENGTH 16 /* Fastopen key context */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index c53de2691cec..6ff35eb48d10 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -116,6 +116,7 @@ enum { #define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ #define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ #define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */ +#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 28fe8da4e1ac..92e7f3e957fa 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -576,13 +576,24 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int err; long timeo; - if (addr_len < sizeof(uaddr->sa_family)) - return -EINVAL; + /* + * uaddr can be NULL and addr_len can be 0 if: + * sk is a TCP fastopen active socket and + * TCP_FASTOPEN_CONNECT sockopt is set and + * we already have a valid cookie for this socket. + * In this case, user can call write() after connect(). + * write() will invoke tcp_sendmsg_fastopen() which calls + * __inet_stream_connect(). + */ + if (uaddr) { + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; - if (uaddr->sa_family == AF_UNSPEC) { - err = sk->sk_prot->disconnect(sk, flags); - sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; - goto out; + if (uaddr->sa_family == AF_UNSPEC) { + err = sk->sk_prot->disconnect(sk, flags); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + goto out; + } } switch (sock->state) { @@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -EISCONN; goto out; case SS_CONNECTING: - err = -EALREADY; + if (inet_sk(sk)->defer_connect) + err = -EINPROGRESS; + else + err = -EALREADY; /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: @@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTING; + if (!err && inet_sk(sk)->defer_connect) + goto out; + /* Just entered SS_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. 
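(Editor's aside, not part of the commit: the client call sequence described in the commit message, written out as a hedged C sketch. It assumes a kernel carrying this patch, headers that expose TCP_FASTOPEN_CONNECT, and client TFO enabled via net.ipv4.tcp_fastopen; error handling is omitted for brevity.)

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int tfo_client_send(const struct sockaddr_in *dst,
			    const void *buf, size_t len)
	{
		int on = 1;
		int s = socket(AF_INET, SOCK_STREAM, 0);

		/* opt in to deferred-connect Fast Open */
		setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT,
			   &on, sizeof(on));
		/* with a cached cookie this returns 0 immediately;
		 * without one it falls back to a regular handshake
		 * that also requests a cookie for next time */
		connect(s, (const struct sockaddr *)dst, sizeof(*dst));
		/* with a cookie available, the data rides on the SYN */
		write(s, buf, len);
		return s;
	}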
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c43eb1a831d7..d9735b76d073 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -533,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; + } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + /* Active TCP fastopen socket with defer_connect + * Return POLLOUT so application can call write() + * in order for kernel to generate SYN+data + */ + mask |= POLLOUT | POLLWRNORM; } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); @@ -1071,6 +1077,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_sock *inet = inet_sk(sk); int err, flags; if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) @@ -1085,9 +1092,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, tp->fastopen_req->data = msg; tp->fastopen_req->size = size; + if (inet->defer_connect) { + err = tcp_connect(sk); + /* Same failure procedure as in tcp_v4/6_connect */ + if (err) { + tcp_set_state(sk, TCP_CLOSE); + inet->inet_dport = 0; + sk->sk_route_caps = 0; + } + } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, flags); + inet->defer_connect = 0; *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); return err; @@ -1107,7 +1124,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) lock_sock(sk); flags = msg->msg_flags; - if (flags & MSG_FASTOPEN) { + if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); if (err == -EINPROGRESS && copied_syn > 0) goto out; @@ -2656,6 +2673,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EINVAL; } break; + case TCP_FASTOPEN_CONNECT: + if (val > 1 || val < 0) { + err = -EINVAL; + } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + if (sk->sk_state == TCP_CLOSE) + tp->fastopen_connect = val; + else + err = -EINVAL; + } else { + err = -EOPNOTSUPP; + } + break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; @@ -3016,6 +3045,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = icsk->icsk_accept_queue.fastopenq.max_qlen; break; + case TCP_FASTOPEN_CONNECT: + val = tp->fastopen_connect; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f90e09e1ff4c..9674bec4a0f8 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -346,3 +346,36 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, } return cookie->len > 0; } + +/* This function checks if we want to defer sending SYN until the first + * write(). We defer under the following conditions: + * 1. fastopen_connect sockopt is set + * 2. 
we have a valid cookie + * Return value: return true if we want to defer until application writes data + * return false if we want to send out SYN immediately + */ +bool tcp_fastopen_defer_connect(struct sock *sk, int *err) +{ + struct tcp_fastopen_cookie cookie = { .len = 0 }; + struct tcp_sock *tp = tcp_sk(sk); + u16 mss; + + if (tp->fastopen_connect && !tp->fastopen_req) { + if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { + inet_sk(sk)->defer_connect = 1; + return true; + } + + /* Alloc fastopen_req in order for FO option to be included + * in SYN + */ + tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req), + sk->sk_allocation); + if (tp->fastopen_req) + tp->fastopen_req->cookie = cookie; + else + *err = -ENOBUFS; + } + return false; +} +EXPORT_SYMBOL(tcp_fastopen_defer_connect); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a90b4540c11e..8c9e9aa17d66 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -232,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); + rt = NULL; if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, @@ -242,9 +243,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_id = tp->write_seq ^ jiffies; + if (tcp_fastopen_defer_connect(sk, &err)) + return err; + if (err) + goto failure; + err = tcp_connect(sk); - rt = NULL; if (err) goto failure; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0b7cd3d009b6..95c05e5293b1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -287,6 +287,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_dport, &tp->tsoffset); + if (tcp_fastopen_defer_connect(sk, &err)) + return err; + if (err) + goto late_failure; + err = tcp_connect(sk); if (err) goto late_failure; -- cgit v1.2.3 From e60bf3ea67673fc6dd2645946e6dcf135fd7e30c Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 11 Dec 2016 13:16:07 +0100 Subject: uapi: install batman_adv.h header 09748a22f4ab ("batman-adv: add generic netlink family for batman-adv") introduced the new batman_adv.h which describes the netlink attributes and commands of batman-adv. But the Kbuild entry to install the header was not added. All currently known tools ship their own copy of batman_adv.h but it should be installed anyway to later be able to migrate to the system batman_adv.h. 
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index a8b93e685239..7fdceb2ac5b7 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -64,6 +64,7 @@ header-y += auto_fs.h header-y += auxvec.h header-y += ax25.h header-y += b1lli.h +header-y += batman_adv.h header-y += baycom.h header-y += bcm933xx_hcs.h header-y += bfs_fs.h -- cgit v1.2.3 From ac79cbb96b58614ce13c4fccc00a9b4d43c2f79b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 1 Jan 2017 00:00:00 +0100 Subject: batman-adv: update copyright years for 2017 Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 2 +- net/batman-adv/Makefile | 2 +- net/batman-adv/bat_algo.c | 2 +- net/batman-adv/bat_algo.h | 2 +- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bat_iv_ogm.h | 2 +- net/batman-adv/bat_v.c | 2 +- net/batman-adv/bat_v.h | 2 +- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_elp.h | 2 +- net/batman-adv/bat_v_ogm.c | 2 +- net/batman-adv/bat_v_ogm.h | 2 +- net/batman-adv/bitarray.c | 2 +- net/batman-adv/bitarray.h | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/bridge_loop_avoidance.h | 2 +- net/batman-adv/debugfs.c | 2 +- net/batman-adv/debugfs.h | 2 +- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/distributed-arp-table.h | 2 +- net/batman-adv/fragmentation.c | 2 +- net/batman-adv/fragmentation.h | 2 +- net/batman-adv/gateway_client.c | 2 +- net/batman-adv/gateway_client.h | 2 +- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/gateway_common.h | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/hard-interface.h | 2 +- net/batman-adv/hash.c | 2 +- net/batman-adv/hash.h | 2 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/icmp_socket.h | 2 +- net/batman-adv/log.c | 2 +- net/batman-adv/log.h | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/multicast.c | 2 +- net/batman-adv/multicast.h | 2 +- net/batman-adv/netlink.c | 2 +- net/batman-adv/netlink.h | 2 +- net/batman-adv/network-coding.c | 2 +- net/batman-adv/network-coding.h | 2 +- net/batman-adv/originator.c | 2 +- net/batman-adv/originator.h | 2 +- net/batman-adv/packet.h | 2 +- net/batman-adv/routing.c | 2 +- net/batman-adv/routing.h | 2 +- net/batman-adv/send.c | 2 +- net/batman-adv/send.h | 2 +- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/soft-interface.h | 2 +- net/batman-adv/sysfs.c | 2 +- net/batman-adv/sysfs.h | 2 +- net/batman-adv/tp_meter.c | 2 +- net/batman-adv/tp_meter.h | 2 +- net/batman-adv/translation-table.c | 2 +- net/batman-adv/translation-table.h | 2 +- net/batman-adv/tvlv.c | 2 +- net/batman-adv/tvlv.h | 2 +- net/batman-adv/types.h | 2 +- 60 files changed, 60 insertions(+), 60 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 734fe83ab645..a83ddb7b63db 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index f724d3c98a81..915987bc6d29 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2016 B.A.T.M.A.N. 
contributors: +# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 623d04302aa2..44fd073b7546 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 3b5b69cdd12b..29f6312f9bf1 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f00f666e2ccd..7c3d994e90d8 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index b9f3550faaf7..ae2ab526bdb1 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 2ac612d7bab4..0acd081dd286 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index 83b77639729e..dd7c4b647e6b 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index f2fb2f05b6bf..b90c9903e246 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index be17c0b1369e..376ead280ab9 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 38b9aab83fc0..03a35c9f456d 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 4c4d45caa422..2068770b542d 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. 
contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 032271421a20..2b070c7e31da 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 0e6e9d09078c..cc262c9d97e0 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index e7f690b571ea..2d22fd5ba96c 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 2827cd3c13d2..e157986bd01c 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 77925504379d..5406148b9497 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index e49121ee55f6..9c5d4a65b98c 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 49576c5a3fe3..dab466f97ccf 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 813ecea96cf9..ec364a3c1c66 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 9c561e683f4b..42bfbd801a1b 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index b95f619606af..1a2d6c308745 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. 
contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 52b8bd6ec431..de9955d5224d 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 859166d03561..3baa3d466e5e 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 21184810d89f..5db2e43e3775 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 8a5e1ddf1175..0a6a97d201f2 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 61a431a9772b..e348f76ea8c1 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index d6309a423629..9f9890ff7a22 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index a0a0fdb85805..b5f7e13918ac 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 557a7044cfbc..0c905e91c5e2 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index b310f381ae02..6308c9f0fd96 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index e44a7da51431..f3fec40aae86 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. 
contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index c73c31769aba..4ef4bde2cc2d 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 3284a7b0325d..7a2b9f4da078 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d46415edd3be..5000c540614d 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 8683542067ba..57a8103dbce7 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 090a69fc342e..952ba81a565b 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 2cddaf52a21d..2a78cddab0e9 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 062738163bdc..ab13b4d58733 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 52eb16281aba..f1cd8c5da966 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index ab5a3bf0765f..e1f6fc72fe3e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index d6d7fb4ec5d5..c66efb81d2f4 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8f3b2969cc4e..8e2a4b205257 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. 
contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index ebc56183f358..d94220a6d21a 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 7a36bcfa0ba0..8e8a5db197cb 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 6713bdf414cd..5f050fbdfff7 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 05c3ff42e181..5ede16c32f15 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 49021b7124f3..d7308263b8fa 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index a94e1e8639ca..f21166d10323 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 1f55b4b9181c..4a9923a95e8a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index ec303ddbf647..639c3abb214a 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 17c844196eb2..0ae8b30e4eaa 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index c76021b4e198..e487412e256b 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. 
contributors: * * Marek Lindner * diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 981e8c5b07e9..07f64b60b528 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index ba922c425e56..a8ada5c123bd 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 30ecbfb40adf..941afad92121 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 783fdba84db2..411d586191da 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index a783420356ae..1d9e267caec9 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index e4369b547b43..4d01400ada30 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index e913aee28c98..8f64a5c01345 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * -- cgit v1.2.3 From d35a00b8e33dab7385f724e713ae71c8be0a49f4 Mon Sep 17 00:00:00 2001 From: Felix Jia Date: Thu, 26 Jan 2017 16:59:17 +1300 Subject: net/ipv6: allow sysctl to change link-local address generation mode The address generation mode for IPv6 link-local addresses can only be configured by netlink messages. This patch adds the ability to change the address generation mode via sysctl. v1 -> v2 Removed the rtnl lock and switched to using the RCU lock to iterate through the netdev list. v2 -> v3 Removed the addrgenmode variable from the idev structure and used the sysctl storage for the flag. Simplified the sysctl handling logic by removing support for the "all" operation. Added support for more types of tunnel interfaces for link-local address generation. Rebased the patches onto net-next. v3 -> v4 Removed unnecessary whitespace changes. Signed-off-by: Felix Jia Signed-off-by: David S.
Miller --- include/linux/ipv6.h | 1 + include/net/if_inet6.h | 1 - include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 104 +++++++++++++++++++++++++++++++++++++--------- 4 files changed, 86 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 671d014e6429..71be5b330d21 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -69,6 +69,7 @@ struct ipv6_devconf { __s32 seg6_require_hmac; #endif __u32 enhanced_dad; + __u32 addr_gen_mode; struct ctl_table_header *sysctl_header; }; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 0fa4c324b713..f656f9051aca 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -205,7 +205,6 @@ struct inet6_dev { __s32 rs_interval; /* in jiffies */ __u8 rs_probes; - __u8 addr_gen_mode; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index eaf65dc82e22..8ef9e75e004e 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -182,6 +182,7 @@ enum { DEVCONF_SEG6_ENABLED, DEVCONF_SEG6_REQUIRE_HMAC, DEVCONF_ENHANCED_DAD, + DEVCONF_ADDR_GEN_MODE, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ac9bd5620f81..e35259dd17ba 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -243,6 +243,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .seg6_require_hmac = 0, #endif .enhanced_dad = 1, + .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -294,6 +295,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .seg6_require_hmac = 0, #endif .enhanced_dad = 1, + .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, }; /* Check if a valid qdisc is available */ @@ -386,9 +388,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); if (ndev->cnf.stable_secret.initialized) - ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; else - ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64; + ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode; ndev->cnf.mtu6 = dev->mtu; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); @@ -2387,8 +2389,8 @@ static void manage_tempaddrs(struct inet6_dev *idev, static bool is_addr_mode_generate_stable(struct inet6_dev *idev) { - return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || - idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; + return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; } int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, @@ -3152,7 +3154,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - switch (idev->addr_gen_mode) { + switch (idev->cnf.addr_gen_mode) { case IN6_ADDR_GEN_MODE_RANDOM: ipv6_gen_mode_random_init(idev); /* fallthrough */ @@ -3204,8 +3206,8 @@ static void addrconf_dev_config(struct net_device *dev) /* this device type has no EUI support */ if (dev->type == ARPHRD_NONE && - idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) - idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) + idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; addrconf_addr_gen(idev, false); } @@ -4982,6 +4984,7 @@ static inline void 
ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac; #endif array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; + array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; } static inline size_t inet6_ifla6_size(void) @@ -5093,7 +5096,7 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, if (!nla) goto nla_put_failure; - if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) goto nla_put_failure; read_lock_bh(&idev->lock); @@ -5211,6 +5214,26 @@ static int inet6_validate_link_af(const struct net_device *dev, return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy); } +static int check_addr_gen_mode(int mode) +{ + if (mode != IN6_ADDR_GEN_MODE_EUI64 && + mode != IN6_ADDR_GEN_MODE_NONE && + mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + mode != IN6_ADDR_GEN_MODE_RANDOM) + return -EINVAL; + return 1; +} + +static int check_stable_privacy(struct inet6_dev *idev, struct net *net, + int mode) +{ + if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + !idev->cnf.stable_secret.initialized && + !net->ipv6.devconf_dflt->stable_secret.initialized) + return -EINVAL; + return 1; +} + static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) { int err = -EINVAL; @@ -5232,18 +5255,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (tb[IFLA_INET6_ADDR_GEN_MODE]) { u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); - if (mode != IN6_ADDR_GEN_MODE_EUI64 && - mode != IN6_ADDR_GEN_MODE_NONE && - mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY && - mode != IN6_ADDR_GEN_MODE_RANDOM) - return -EINVAL; - - if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && - !idev->cnf.stable_secret.initialized && - !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized) + if (check_addr_gen_mode(mode) < 0 || + check_stable_privacy(idev, dev_net(dev), mode) < 0) return -EINVAL; - idev->addr_gen_mode = mode; + idev->cnf.addr_gen_mode = mode; err = 0; } @@ -5652,6 +5668,47 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, return ret; } +static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = 0; + int new_val; + struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; + struct net *net = (struct net *)ctl->extra2; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write) { + new_val = *((int *)ctl->data); + + if (check_addr_gen_mode(new_val) < 0) + return -EINVAL; + + /* request for default */ + if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) { + ipv6_devconf_dflt.addr_gen_mode = new_val; + + /* request for individual net device */ + } else { + if (!idev) + return ret; + + if (check_stable_privacy(idev, net, new_val) < 0) + return -EINVAL; + + if (idev->cnf.addr_gen_mode != new_val) { + idev->cnf.addr_gen_mode = new_val; + rtnl_lock(); + addrconf_dev_config(idev->dev); + rtnl_unlock(); + } + } + } + + return ret; +} + static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -5702,14 +5759,14 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, struct inet6_dev *idev = __in6_dev_get(dev); if (idev) { - idev->addr_gen_mode = + idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; } } } else { struct inet6_dev *idev = ctl->extra1; - idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + 
idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; } out: @@ -6096,6 +6153,13 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "addr_gen_mode", + .data = &ipv6_devconf.addr_gen_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_addr_gen_mode, + }, { /* sentinel */ } -- cgit v1.2.3
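As a usage illustration (not part of the patch): with this change applied, the mode can be flipped per interface through procfs. A minimal sketch, assuming an interface named eth0 and the IN6_ADDR_GEN_MODE_* values from include/uapi/linux/if_link.h (EUI64 = 0, NONE = 1, STABLE_PRIVACY = 2, RANDOM = 3):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Switch eth0's link-local address generation to IN6_ADDR_GEN_MODE_RANDOM. */
int main(void)
{
	const char *path = "/proc/sys/net/ipv6/conf/eth0/addr_gen_mode";
	const char *mode = "3"; /* IN6_ADDR_GEN_MODE_RANDOM */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mode, strlen(mode)) < 0)
		perror("write");
	close(fd);
	return 0;
}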
From 7e98102f489775d8c000884fca8a0d995ea688a9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 27 Jan 2017 16:24:38 -0800 Subject: tcp: record pkts sent and retransmitted Add two stats in SCM_TIMESTAMPING_OPT_STATS: TCP_NLA_DATA_SEGS_OUT: total data packets sent including retransmission TCP_NLA_TOTAL_RETRANS: total data packets retransmitted The names are picked to be consistent with the corresponding fields in TCP_INFO. This allows applications that use the timestamping API to measure latency stats to also retrieve the retransmission rate of application writes. Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6ff35eb48d10..38a2b07afdff 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -227,6 +227,8 @@ enum { TCP_NLA_BUSY, /* Time (usec) busy sending data */ TCP_NLA_RWND_LIMITED, /* Time (usec) limited by receive window */ TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */ + TCP_NLA_DATA_SEGS_OUT, /* Data pkts sent including retransmission */ + TCP_NLA_TOTAL_RETRANS, /* Data pkts retransmitted */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2ed472ebf3b5..b751abc56935 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2870,7 +2870,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) struct sk_buff *stats; struct tcp_info info; - stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); if (!stats) return NULL; @@ -2881,6 +2881,10 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) info.tcpi_rwnd_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, info.tcpi_sndbuf_limited, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, + tp->data_segs_out, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, + tp->total_retrans, TCP_NLA_PAD); return stats; } -- cgit v1.2.3
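For illustration (not part of the patch): the stats are delivered to userspace as a blob of netlink attributes inside a control message. A minimal parsing sketch, assuming the socket was configured via setsockopt(SOL_SOCKET, SO_TIMESTAMPING, ...) with SOF_TIMESTAMPING_OPT_STATS set, and that the blob arrives as a SOL_SOCKET / SCM_TIMESTAMPING_OPT_STATS control message on the socket error queue (both taken as assumptions here):

#include <linux/netlink.h>
#include <linux/tcp.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

/* Walk the netlink-attribute blob carried by one control message whose
 * level/type matched SOL_SOCKET / SCM_TIMESTAMPING_OPT_STATS, as obtained
 * from recvmsg(fd, &msg, MSG_ERRQUEUE). */
static void dump_tcp_tx_stats(struct cmsghdr *cm)
{
	struct nlattr *nla = (struct nlattr *)CMSG_DATA(cm);
	int len = (int)(cm->cmsg_len - CMSG_LEN(0));

	while (len >= NLA_HDRLEN && nla->nla_len >= NLA_HDRLEN &&
	       nla->nla_len <= len) {
		if ((nla->nla_type == TCP_NLA_DATA_SEGS_OUT ||
		     nla->nla_type == TCP_NLA_TOTAL_RETRANS) &&
		    nla->nla_len >= NLA_HDRLEN + sizeof(uint64_t)) {
			uint64_t v;

			memcpy(&v, (char *)nla + NLA_HDRLEN, sizeof(v));
			printf("nla type %u: %llu\n", nla->nla_type,
			       (unsigned long long)v);
		}
		len -= NLA_ALIGN(nla->nla_len);
		nla = (struct nlattr *)((char *)nla + NLA_ALIGN(nla->nla_len));
	}
}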
From c92701322711682de89b2bd0f32affad040b6e86 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:41 +1100 Subject: KVM: PPC: Book3S HV: Add userspace interfaces for POWER9 MMU This adds two capabilities and two ioctls to allow userspace to find out about and configure the POWER9 MMU in a guest. The two capabilities tell userspace whether KVM can support a guest using the radix MMU, or using the hashed page table (HPT) MMU with a process table and segment tables. (Note that the MMUs in the POWER9 processor cores do not use the process and segment tables when in HPT mode, but the nest MMU does). The KVM_PPC_CONFIGURE_V3_MMU ioctl allows userspace to specify whether a guest will use the radix MMU or the HPT MMU, and to specify the size and location (in guest space) of the process table. The KVM_PPC_GET_RMMU_INFO ioctl gives userspace information about the radix MMU. It returns a list of supported radix tree geometries (base page size and number of bits indexed at each level of the radix tree) and the encoding used to specify the various page sizes for the TLB invalidate entry instruction. Initially, both capabilities return 0 and the ioctls return -EINVAL, until the necessary infrastructure for them to operate correctly is added. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- Documentation/virtual/kvm/api.txt | 83 +++++++++++++++++++++++++++++++++++++ arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/include/uapi/asm/kvm.h | 20 +++++++++ arch/powerpc/kvm/book3s_hv.c | 13 ++++++ arch/powerpc/kvm/powerpc.c | 32 ++++++++++++++ include/uapi/linux/kvm.h | 6 +++ 6 files changed, 156 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 03145b7cafaa..4470671b0c26 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3201,6 +3201,71 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x). +4.99 KVM_PPC_CONFIGURE_V3_MMU + +Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_mmuv3_cfg (in) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. + +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; +}; + +There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's space. This field is formatted +as the second doubleword of the partition table entry, as defined in +the Power ISA V3.00, Book III section 5.7.6.1. + +4.100 KVM_PPC_GET_RMMU_INFO + +Capability: KVM_CAP_PPC_RADIX_MMU +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_rmmu_info (out) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list +containing supported radix tree geometries, and (b) a list that maps +page sizes to put in the "AP" (actual page size) field for the tlbie +(TLB invalidate entry) instruction. + +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + 5. The kvm_run structure ------------------------ @@ -3942,3 +4007,21 @@ In order to use SynIC, it has to be activated by setting this capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_PPC_RADIX_MMU + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_HASH_MMU_V3 + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables. diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2da67bf1f2ec..48c760f89590 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -291,6 +291,8 @@ struct kvmppc_ops { struct irq_bypass_producer *); void (*irq_bypass_del_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); + int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); + int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 3603b6f51b11..cc0908b6c2a0 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -413,6 +413,26 @@ struct kvm_get_htab_header { __u16 n_invalid; }; +/* For KVM_PPC_CONFIGURE_V3_MMU */ +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; /* second doubleword of partition table entry */ +}; + +/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */ +#define KVM_PPC_MMUV3_RADIX 1 /* 1 = radix mode, 0 = HPT */ +#define KVM_PPC_MMUV3_GTSE 2 /* global translation shootdown enb.
*/ + +/* For KVM_PPC_GET_RMMU_INFO */ +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec34e39471a7..5f08ed070ae5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3657,6 +3657,17 @@ static void init_default_hcalls(void) } } +/* dummy implementations for now */ +static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) +{ + return -EINVAL; +} + +static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) +{ + return -EINVAL; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -3694,6 +3705,8 @@ static struct kvmppc_ops kvm_ops_hv = { .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, #endif + .configure_mmu = kvmhv_configure_mmu, + .get_rmmu_info = kvmhv_get_rmmu_info, }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cd892dec7cb6..38c0d154c01e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -565,6 +565,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_HWRNG: r = kvmppc_hwrng_present(); break; + case KVM_CAP_PPC_MMU_RADIX: + r = !!(0 && hv_enabled && radix_enabled()); + break; + case KVM_CAP_PPC_MMU_HASH_V3: + r = !!(0 && hv_enabled && !radix_enabled() && + cpu_has_feature(CPU_FTR_ARCH_300)); + break; #endif case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -1468,6 +1475,31 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_rtas_define_token(kvm, argp); break; } + case KVM_PPC_CONFIGURE_V3_MMU: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_mmuv3_cfg cfg; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->configure_mmu) + goto out; + r = -EFAULT; + if (copy_from_user(&cfg, argp, sizeof(cfg))) + goto out; + r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg); + break; + } + case KVM_PPC_GET_RMMU_INFO: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_rmmu_info info; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->get_rmmu_info) + goto out; + r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) + r = -EFAULT; + break; + } default: { struct kvm *kvm = filp->private_data; r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cac48eda1075..e0035808c814 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -871,6 +871,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_PPC_MMU_RADIX 134 +#define KVM_CAP_PPC_MMU_HASH_V3 135 #ifdef KVM_CAP_IRQ_ROUTING @@ -1187,6 +1189,10 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) /* Available with KVM_CAP_PPC_RTAS */ #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) +/* Available with 
KVM_CAP_PPC_RADIX_MMU */ +#define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) -- cgit v1.2.3
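For illustration (not part of the patch): a userspace sketch querying the supported radix geometries once the in-kernel implementation lands. It assumes a powerpc build where <linux/kvm.h> exposes the definitions above and that vmfd is a VM file descriptor obtained from KVM_CREATE_VM:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* vmfd: file descriptor returned by ioctl(kvmfd, KVM_CREATE_VM, 0) */
static void show_radix_geometries(int vmfd)
{
	struct kvm_ppc_rmmu_info info;
	int i;

	if (ioctl(vmfd, KVM_PPC_GET_RMMU_INFO, &info) < 0) {
		perror("KVM_PPC_GET_RMMU_INFO");
		return;
	}
	for (i = 0; i < 8; i++) {
		if (!info.geometries[i].page_shift)
			continue; /* unused entry has page_shift == 0 */
		printf("geometry %d: page_shift %u, level bits %u/%u/%u/%u\n",
		       i, info.geometries[i].page_shift,
		       info.geometries[i].level_bits[0],
		       info.geometries[i].level_bits[1],
		       info.geometries[i].level_bits[2],
		       info.geometries[i].level_bits[3]);
	}
}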
From ef1ead0c3b1dfb43d33caa4f50c8d214f86b6bc8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:58 +1100 Subject: KVM: PPC: Book3S HV: HPT resizing documentation and reserved numbers This adds a new powerpc-specific KVM_CAP_SPAPR_RESIZE_HPT capability to advertise whether KVM is capable of handling the PAPR extensions for resizing the hashed page table during guest runtime. It also adds definitions for two new VM ioctl()s to implement this extension, and documentation of the same. Note that HPT resizing is already possible with KVM PR without kernel modification, since the HPT is managed within userspace (qemu). The capability defined here will only be set where an in-kernel implementation of resizing is necessary, i.e. for KVM HV. To determine if the userspace resize implementation can be used, it's necessary to check KVM_CAP_PPC_ALLOC_HTAB. Unfortunately older kernels incorrectly set KVM_CAP_PPC_ALLOC_HTAB even with KVM PR. If userspace wants to support resizing with KVM PR on such kernels, it will need a workaround. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 95 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 11 +++++ 2 files changed, 106 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index aca994a90355..64f217af0416 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3266,6 +3266,101 @@ The ap_encodings gives the supported page sizes and their AP field encodings, encoded with the AP value in the top 3 bits and the log base 2 of the page size in the bottom 6 bits. +4.102 KVM_PPC_RESIZE_HPT_PREPARE + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete + -EFAULT if struct kvm_ppc_resize_hpt cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENOMEM if unable to allocate the new HPT + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete. + +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above. + +If called when there is a pending HPT of the size requested, will: + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion; currently setting any bits in +flags will result in -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + -EFAULT if struct kvm_ppc_resize_hpt cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENXIO if there is no pending HPT, or the pending HPT doesn't + have the requested size + -EBUSY if the pending HPT is not fully prepared + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started, +but failed). + +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On successful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. + +On failure, the guest will still be operating on its previous HPT. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + 5.
The kvm_run structure ------------------------ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e0035808c814..7964b970b9ad 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -685,6 +685,13 @@ struct kvm_ppc_smmu_info { struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ @@ -871,6 +878,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_SPAPR_RESIZE_HPT 133 #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 @@ -1189,6 +1197,9 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) /* Available with KVM_CAP_PPC_RTAS */ #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_SPAPR_RESIZE_HPT */ +#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) +#define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) /* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ -- cgit v1.2.3 From 84d4add793c65b5bda802dcefcf0d7ab1a8e22ed Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:16 +0100 Subject: lightnvm: add ioctls for vector I/Os MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable user-space to issue vector I/O commands through ioctls. To issue a vector I/O, the ppa list with addresses is also required and must be mapped for the controller to access. For each ioctl, the result and status bits are returned as well, such that user-space can retrieve the open-channel SSD completion bits. The implementation covers the traditional use-cases of bad block management, and vectored read/write/erase. Signed-off-by: Matias Bjørling Metadata implementation, test, and fixes. Signed-off-by: Simon A.F. 
Lund Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 + drivers/nvme/host/lightnvm.c | 220 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 6 ++ include/uapi/linux/lightnvm.h | 50 ++++++++++ 4 files changed, 280 insertions(+) (limited to 'include/uapi') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2fc86dc7a8df..037ee999e759 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -784,6 +784,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_sg_io(ns, (void __user *)arg); #endif default: +#ifdef CONFIG_NVM + if (ns->ndev) + return nvme_nvm_ioctl(ns, cmd, arg); +#endif return -ENOTTY; } } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 733992a25d6a..3b6cd9bdba7e 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include enum nvme_nvm_admin_opcode { nvme_nvm_admin_identity = 0xe2, @@ -583,6 +585,224 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; +static void nvme_nvm_end_user_vio(struct request *rq, int error) +{ + struct completion *waiting = rq->end_io_data; + + complete(waiting); +} + +static int nvme_nvm_submit_user_cmd(struct request_queue *q, + struct nvme_ns *ns, + struct nvme_nvm_command *vcmd, + void __user *ubuf, unsigned int bufflen, + void __user *meta_buf, unsigned int meta_len, + void __user *ppa_buf, unsigned int ppa_len, + u32 *result, u64 *status, unsigned int timeout) +{ + bool write = nvme_is_write((struct nvme_command *)vcmd); + struct nvm_dev *dev = ns->ndev; + struct gendisk *disk = ns->disk; + struct request *rq; + struct bio *bio = NULL; + __le64 *ppa_list = NULL; + dma_addr_t ppa_dma; + __le64 *metadata = NULL; + dma_addr_t metadata_dma; + DECLARE_COMPLETION_ONSTACK(wait); + int ret; + + rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0, + NVME_QID_ANY); + if (IS_ERR(rq)) { + ret = -ENOMEM; + goto err_cmd; + } + + rq->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; + rq->end_io_data = &wait; + + if (ppa_buf && ppa_len) { + ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); + if (!ppa_list) { + ret = -ENOMEM; + goto err_rq; + } + if (copy_from_user(ppa_list, (void __user *)ppa_buf, + sizeof(u64) * (ppa_len + 1))) { + ret = -EFAULT; + goto err_ppa; + } + vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); + } else { + vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); + } + + if (ubuf && bufflen) { + ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); + if (ret) + goto err_ppa; + bio = rq->bio; + + if (meta_buf && meta_len) { + metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, + &metadata_dma); + if (!metadata) { + ret = -ENOMEM; + goto err_map; + } + + if (write) { + if (copy_from_user(metadata, + (void __user *)meta_buf, + meta_len)) { + ret = -EFAULT; + goto err_meta; + } + } + vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); + } + + if (!disk) + goto submit; + + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto err_meta; + } + } + +submit: + blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_user_vio); + + wait_for_completion_io(&wait); + + ret = nvme_error_status(rq->errors); + if (result) + *result = rq->errors & 0x7ff; + if (status) + *status = le64_to_cpu(nvme_req(rq)->result.u64); + + if (metadata && !ret && !write) { + if (copy_to_user(meta_buf, (void *)metadata, meta_len)) + ret = -EFAULT; + } +err_meta: + if (meta_buf && meta_len) + dma_pool_free(dev->dma_pool, metadata, metadata_dma); +err_map: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } +err_ppa: + if (ppa_buf && ppa_len) + dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); +err_rq: + blk_mq_free_request(rq); +err_cmd: + return ret; +} + +static int nvme_nvm_submit_vio(struct nvme_ns *ns, + struct nvm_user_vio __user *uvio) +{ + struct nvm_user_vio vio; + struct nvme_nvm_command c; + unsigned int length; + int ret; + + if (copy_from_user(&vio, uvio, sizeof(vio))) + return -EFAULT; + if (vio.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.ph_rw.opcode = vio.opcode; + c.ph_rw.nsid = cpu_to_le32(ns->ns_id); + c.ph_rw.control = cpu_to_le16(vio.control); + c.ph_rw.length = cpu_to_le16(vio.nppas); + + length = (vio.nppas + 1) << ns->lba_shift; + + ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c, + (void __user *)(uintptr_t)vio.addr, length, + (void __user *)(uintptr_t)vio.metadata, + vio.metadata_len, + (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, + &vio.result, &vio.status, 0); + + if (ret && copy_to_user(uvio, &vio, sizeof(vio))) + return -EFAULT; + + return ret; +} + +static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, + struct nvm_passthru_vio __user *uvcmd) +{ + struct nvm_passthru_vio vcmd; + struct nvme_nvm_command c; + struct request_queue *q; + unsigned int timeout = 0; + int ret; + + if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) + return -EFAULT; + if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) + return -EACCES; + if (vcmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = vcmd.opcode; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); + /* cdw11-12 */ + c.ph_rw.length = cpu_to_le16(vcmd.nppas); + c.ph_rw.control = cpu_to_le32(vcmd.control); + c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15); 
+ + if (vcmd.timeout_ms) + timeout = msecs_to_jiffies(vcmd.timeout_ms); + + q = admin ? ns->ctrl->admin_q : ns->queue; + + ret = nvme_nvm_submit_user_cmd(q, ns, + (struct nvme_nvm_command *)&c, + (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, + (void __user *)(uintptr_t)vcmd.metadata, + vcmd.metadata_len, + (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, + &vcmd.result, &vcmd.status, timeout); + + if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) + return -EFAULT; + + return ret; +} + +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case NVME_NVM_IOCTL_ADMIN_VIO: + return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg); + case NVME_NVM_IOCTL_IO_VIO: + return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg); + case NVME_NVM_IOCTL_SUBMIT_VIO: + return nvme_nvm_submit_vio(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { struct request_queue *q = ns->queue; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6377e14586dc..330713c4abdb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -326,6 +326,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); int nvme_nvm_register_sysfs(struct nvme_ns *ns); void nvme_nvm_unregister_sysfs(struct nvme_ns *ns); +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) @@ -343,6 +344,11 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i { return 0; } +static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} #endif /* CONFIG_NVM */ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 774a43128a7a..fd19f36b3129 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -122,6 +122,44 @@ struct nvm_ioctl_dev_factory { __u32 flags; }; +struct nvm_user_vio { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nppas; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 ppa_list; + __u32 metadata_len; + __u32 data_len; + __u64 status; + __u32 result; + __u32 rsvd3[3]; +}; + +struct nvm_passthru_vio { + __u8 opcode; + __u8 flags; + __u8 rsvd[2]; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u64 ppa_list; + __u16 nppas; + __u16 control; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u64 status; + __u32 result; + __u32 timeout_ms; +}; + /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { /* top level cmds */ @@ -137,6 +175,11 @@ enum { /* Factory reset device */ NVM_DEV_FACTORY_CMD, + + /* Vector user I/O */ + NVM_DEV_VIO_ADMIN_CMD = 0x41, + NVM_DEV_VIO_CMD = 0x42, + NVM_DEV_VIO_USER_CMD = 0x43, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -154,6 +197,13 @@ enum { #define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ struct nvm_ioctl_dev_factory) +#define NVME_NVM_IOCTL_IO_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \ + struct nvm_passthru_vio) +#define NVME_NVM_IOCTL_ADMIN_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\ + struct nvm_passthru_vio) +#define NVME_NVM_IOCTL_SUBMIT_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\ + struct nvm_user_vio) + #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 #define 
NVM_VERSION_PATCHLEVEL 0 -- cgit v1.2.3
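For illustration (not part of the patch): a sketch of issuing one vectored erase through the new interface, using the struct nvm_user_vio layout above. The device node path and PPA values are placeholders, and the opcode value 0x90 (vector erase in the open-channel SSD 1.2 specification) is an assumption; note that nppas is 0's based, as the (vio.nppas + 1) arithmetic in nvme_nvm_submit_vio above shows.

#include <fcntl.h>
#include <linux/lightnvm.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Erase two blocks with one vectored command. The PPAs are device-format
 * addresses and purely illustrative. */
int main(void)
{
	uint64_t ppas[2] = { 0x0, 0x1 }; /* placeholder addresses */
	struct nvm_user_vio vio;
	int fd = open("/dev/nvme0n1", O_RDWR); /* hypothetical node */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&vio, 0, sizeof(vio)); /* flags must be zero */
	vio.opcode = 0x90;            /* assumed vector-erase opcode */
	vio.nppas = 1;                /* 0's based: two PPAs */
	vio.ppa_list = (uint64_t)(uintptr_t)ppas;

	if (ioctl(fd, NVME_NVM_IOCTL_SUBMIT_VIO, &vio) < 0)
		perror("NVME_NVM_IOCTL_SUBMIT_VIO");
	else
		printf("status 0x%llx result 0x%x\n",
		       (unsigned long long)vio.status, vio.result);
	close(fd);
	return 0;
}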
From 32ddd944a056c786f6acdd95ed29e994adc613a2 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 3 Jan 2017 12:30:11 -0500 Subject: nfsd: opt in to labeled nfs per export Currently turning on NFSv4.2 results in 4.2 clients suddenly seeing the individual file labels as they're set on the server. This is not what they've previously seen, and not appropriate in many cases. (In particular, if clients have heterogeneous security policies then one client's labels may not even make sense to another.) Labeled NFS should be opted into only in those cases where the administrator knows it makes sense. It's helpful to be able to turn 4.2 on by default, and otherwise the protocol upgrade seems free of regressions. So, default labeled NFS to off and provide an export flag to reenable it. Users wanting labeled NFS support on an export will henceforth need to: - make sure 4.2 support is enabled on client and server (as before), and - upgrade the server nfs-utils to a version supporting the new "security_label" export flag. - set that "security_label" flag on the export. This commit may be seen as a regression to anyone currently depending on security labels. We believe those cases are currently rare. Reported-by: tibbs@math.uh.edu Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 1 + fs/nfsd/nfs4proc.c | 4 ++++ fs/nfsd/nfs4xdr.c | 5 ++++- include/uapi/linux/nfsd/export.h | 5 +++-- 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 43e109cc0ccc..e71f11b1a180 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1102,6 +1102,7 @@ static struct flags { { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, + { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2d4eb47177d1..171f2d7ecfdd 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -95,11 +95,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u32 *bmval, u32 *writable) { struct dentry *dentry = cstate->current_fh.fh_dentry; + struct svc_export *exp = cstate->current_fh.fh_export; if (!nfsd_attrs_supported(cstate->minorversion, bmval)) return nfserr_attrnotsupp; if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry))) return nfserr_attrnotsupp; + if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) && + !(exp->ex_flags & NFSEXP_SECURITY_LABEL)) + return nfserr_attrnotsupp; if (writable && !bmval_is_subset(bmval, writable)) return nfserr_inval; if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 12cc6b29e78c..3cc190755958 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2417,8 +2417,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - err = security_inode_getsecctx(d_inode(dentry), + if (exp->ex_flags & NFSEXP_SECURITY_LABEL) + err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); + else + err = -EOPNOTSUPP; contextsupport = (err == 0); if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 0df7bd5d2fb1..c3be256107c6 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -32,7 +32,8 @@ #define NFSEXP_ASYNC 0x0010 #define NFSEXP_GATHERED_WRITES 0x0020 #define NFSEXP_NOREADDIRPLUS 0x0040 -/* 80 100 currently unused */ +#define NFSEXP_SECURITY_LABEL 0x0080 +/* 0x100 currently unused */ #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ @@ -53,7 +54,7 @@ #define NFSEXP_PNFS 0x20000 /* All flags that we claim to support. (Note we don't support NOACL.) */ -#define NFSEXP_ALLFLAGS 0x3FE7F +#define NFSEXP_ALLFLAGS 0x3FEFF /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ -- cgit v1.2.3 From 93faccbbfa958a9668d3ab4e30f38dd205cee8d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Feb 2017 06:06:16 +1300 Subject: fs: Better permission checking for submounts To support unprivileged users mounting filesystems two permission checks have to be performed: a test to see if the user is allowed to create a mount in the mount namespace, and a test to see if the user is allowed to access the specified filesystem. The automount case is special in that mounting the original filesystem grants permission to mount the sub-filesystems, to any user who happens to stumble across their mountpoint and satisfies the ordinary filesystem permission checks. Attempting to handle the automount case by using override_creds almost works. It preserves the idea that permission to mount the original filesystem is permission to mount the sub-filesystem. Unfortunately using override_creds messes up the filesystem's ordinary permission checks. Solve this by being explicit that a mount is a submount by introducing vfs_submount, and using it where appropriate. vfs_submount uses a new internal mount flag, MS_SUBMOUNT, to let sget and friends know that a mount is a submount so they can take appropriate action. sget and sget_userns are modified to not perform any permission checks on submounts. follow_automount is modified to stop using override_creds as that has proven problematic. do_mount is modified to always remove the new MS_SUBMOUNT flag so that we know userspace will never be able to specify it. autofs4 is modified to stop using current_real_cred that was put in there to handle the previous version of submount permission checking. cifs is modified to pass the mountpoint all of the way down to vfs_submount. debugfs is modified to pass the mountpoint all of the way down to trace_automount by adding a new parameter. To make this change easier a new typedef debugfs_automount_t is introduced to capture the type of the debugfs automount function. Cc: stable@vger.kernel.org Fixes: 069d5ac9ae0d ("autofs: Fix automounts by using current_real_cred()->uid") Fixes: aeaa4a79ff6a ("fs: Call d_automount with the filesystems creds") Reviewed-by: Trond Myklebust Reviewed-by: Seth Forshee Signed-off-by: "Eric W.
Biederman" --- fs/afs/mntpt.c | 2 +- fs/autofs4/waitq.c | 4 ++-- fs/cifs/cifs_dfs_ref.c | 7 ++++--- fs/debugfs/inode.c | 8 ++++---- fs/namei.c | 3 --- fs/namespace.c | 17 ++++++++++++++++- fs/nfs/namespace.c | 2 +- fs/nfs/nfs4namespace.c | 2 +- fs/super.c | 13 ++++++++++--- include/linux/debugfs.h | 3 ++- include/linux/mount.h | 3 +++ include/uapi/linux/fs.h | 1 + kernel/trace/trace.c | 4 ++-- 13 files changed, 47 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 81dd075356b9..d4fb0afc0097 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* try and do the mount */ _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); _debug("--- mount result %p ---", mnt); free_page((unsigned long) devname); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 1278335ce366..79fbd85db4ba 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_real_cred()->uid; - wq->gid = current_real_cred()->gid; + wq->uid = current_cred()->uid; + wq->gid = current_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ec9dbbcca3b9..9156be545b0f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -245,7 +245,8 @@ compose_mount_options_err: * @fullpath: full path in UNC format * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, +static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, const char *fullpath, const struct dfs_info3_param *ref) { struct vfsmount *mnt; @@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; - mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata); kfree(mountdata); kfree(devname); return mnt; @@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-EINVAL); break; } - mnt = cifs_dfs_do_refmount(cifs_sb, + mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, referrals + i); cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, referrals[i].node_name, mnt); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f17fcf89e18e..1e30f74a9527 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = { static struct vfsmount *debugfs_automount(struct path *path) { - struct vfsmount *(*f)(void *); - f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(d_inode(path->dentry)->i_private); + debugfs_automount_t f; + f = (debugfs_automount_t)path->dentry->d_fsdata; + return f(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -504,7 +504,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data) { 
struct dentry *dentry = start_creating(name, parent); diff --git a/fs/namei.c b/fs/namei.c index 6fa3e9138fe4..da689c9c005e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; - const struct cred *old_cred; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, if (nd->total_link_count >= 40) return -ELOOP; - old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); - revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate diff --git a/fs/namespace.c b/fs/namespace.c index 487ba30bb5c6..089a6b23135a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -989,6 +989,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void } EXPORT_SYMBOL_GPL(vfs_kern_mount); +struct vfsmount * +vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, + const char *name, void *data) +{ + /* Until it is worked out how to pass the user namespace + * through from the parent mount to the submount don't support + * unprivileged mounts with submounts. + */ + if (mountpoint->d_sb->s_user_ns != &init_user_ns) + return ERR_PTR(-EPERM); + + return vfs_kern_mount(type, MS_SUBMOUNT, name, data); +} +EXPORT_SYMBOL_GPL(vfs_submount); + static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -2794,7 +2809,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME | MS_NOREMOTELOCK); + MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 5551e8ef67fd..e49d831c4e85 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { - return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata); } /** diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d21104912676..d8b040bd9814 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mountdata->hostname, mountdata->mnt_path); - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata); if (!IS_ERR(mnt)) break; } diff --git a/fs/super.c b/fs/super.c index 1709ed029a2c..4185844f7a12 100644 --- a/fs/super.c +++ b/fs/super.c @@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; - if (!(flags & MS_KERNMOUNT) && + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -499,7 +499,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags, user_ns); + s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type, { struct 
user_namespace *user_ns = current_user_ns(); + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. + */ + if (flags & MS_SUBMOUNT) + user_ns = &init_user_ns; + /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 014cc564d1c4..233006be30aa 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -97,9 +97,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *dest); +typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data); void debugfs_remove(struct dentry *dentry); diff --git a/include/linux/mount.h b/include/linux/mount.h index c6f55158d5e5..8e0352af06b7 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -90,6 +90,9 @@ struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); +extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, + struct file_system_type *type, + const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 36da93fbf188..048a85e9f017 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -132,6 +132,7 @@ struct inodes_stat_t { #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define MS_SUBMOUNT (1<<26) #define MS_NOREMOTELOCK (1<<27) #define MS_NOSEC (1<<28) #define MS_BORN (1<<29) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..310f0ea0d1a2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } -static struct vfsmount *trace_automount(void *ingore) +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; @@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; -- cgit v1.2.3 From d95fa3c76a66b6d76b1e109ea505c55e66360f3c Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Wed, 25 Jan 2017 14:04:15 +1300 Subject: nsfs: Add an ioctl() to return owner UID of a userns I'd like to write code that discovers the user namespace hierarchy on a running system, and also shows who owns the various user namespaces. Currently, there is no way of getting the owner UID of a user namespace. 
Therefore, this patch adds a new NS_GET_OWNER_UID ioctl() that fetches the UID (as seen in the user namespace of the caller) of the owner of the user namespace referred to by the specified file descriptor. If the supplied file descriptor does not refer to a user namespace, the operation fails with the error EINVAL. If the owner UID does not have a mapping in the caller's user namespace, return the overflow UID, as that appears easier to deal with in practice in user-space applications. -- EWB Changed the handling of unmapped UIDs from -EOVERFLOW back to the overflow uid, per conversation with Michael Kerrisk after examining his test code. Acked-by: Andrey Vagin Signed-off-by: Michael Kerrisk Signed-off-by: Eric W. Biederman --- fs/nsfs.c | 11 +++++++++++ include/uapi/linux/nsfs.h | 8 +++++--- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/fs/nsfs.c b/fs/nsfs.c index 5d534763c662..1656843e87d2 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -7,6 +7,7 @@ #include #include #include +#include static struct vfsmount *nsfs_mnt; @@ -163,7 +164,10 @@ int open_related_ns(struct ns_common *ns, static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { + struct user_namespace *user_ns; struct ns_common *ns = get_proc_ns(file_inode(filp)); + uid_t __user *argp; + uid_t uid; switch (ioctl) { case NS_GET_USERNS: @@ -174,6 +178,13 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: return ns->ops->type; + case NS_GET_OWNER_UID: + if (ns->ops->type != CLONE_NEWUSER) + return -EINVAL; + user_ns = container_of(ns, struct user_namespace, ns); + argp = (uid_t __user *) arg; + uid = from_kuid_munged(current_user_ns(), user_ns->owner); + return put_user(uid, argp); default: return -ENOTTY; } diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 2b48df11056a..1a3ca79f466b 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -6,11 +6,13 @@ #define NSIO 0xb7 /* Returns a file descriptor that refers to an owning user namespace */ -#define NS_GET_USERNS _IO(NSIO, 0x1) +#define NS_GET_USERNS _IO(NSIO, 0x1) /* Returns a file descriptor that refers to a parent namespace */ -#define NS_GET_PARENT _IO(NSIO, 0x2) +#define NS_GET_PARENT _IO(NSIO, 0x2) /* Returns the type of namespace (CLONE_NEW* value) referred to by file descriptor */ -#define NS_GET_NSTYPE _IO(NSIO, 0x3) +#define NS_GET_NSTYPE _IO(NSIO, 0x3) +/* Get owner UID (in the caller's user namespace) for a user namespace */ +#define NS_GET_OWNER_UID _IO(NSIO, 0x4) #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From ba94f3088b792b16ea576a256a6030feddc87f24 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 1 Feb 2017 11:00:45 -0800 Subject: unix: add ioctl to open a unix socket file with O_PATH This ioctl opens the file to which a socket is bound and returns a file descriptor. The caller has to have CAP_NET_ADMIN in the socket's network namespace. Currently it is impossible to get the path and mount point for a socket file. socket_diag reports the address, device ID and inode number for unix sockets, but an address can contain a relative path, a file may be moved somewhere, and these properties say nothing about the mount namespace and mount point of a socket file. With the introduced ioctl, we can get a path by reading /proc/self/fd/X and get mnt_id from /proc/self/fdinfo/X. In CRIU we are going to use this ioctl to dump and restore unix sockets. 
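As a hedged illustration (not part of the patch), a minimal C test program along the lines of the ./test binary traced below might look like this sketch; the socket path, the fallback SIOCUNIXFILE definition, and the abbreviated error handling are assumptions for the example:

/* Sketch: bind a unix socket, then request an O_PATH fd for the
 * bound file via SIOCUNIXFILE. Needs CAP_NET_ADMIN in the socket's
 * network namespace, per the patch below.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <linux/sockios.h>

#ifndef SIOCUNIXFILE
#define SIOCUNIXFILE (SIOCPROTOPRIVATE + 0) /* mirrors the uapi change below */
#endif

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX };
	int sk, pathfd;

	strcpy(addr.sun_path, "/tmp/test_sock");
	sk = socket(AF_UNIX, SOCK_STREAM, 0);
	if (sk < 0 || bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	pathfd = ioctl(sk, SIOCUNIXFILE, 0); /* O_PATH fd for the socket file */
	if (pathfd < 0)
		return 1;

	/* /proc/self/fd/<pathfd> gives the path, fdinfo gives mnt_id */
	printf("O_PATH fd: %d\n", pathfd);
	return 0;
}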
Here is an example of how it can be used: $ strace -e socket,bind,ioctl ./test /tmp/test_sock socket(AF_UNIX, SOCK_STREAM, 0) = 3 bind(3, {sa_family=AF_UNIX, sun_path="test_sock"}, 11) = 0 ioctl(3, SIOCUNIXFILE, 0) = 4 ^Z $ ss -a | grep test_sock u_str LISTEN 0 1 test_sock 17798 * 0 $ ls -l /proc/760/fd/{3,4} lrwx------ 1 root root 64 Feb 1 09:41 3 -> 'socket:[17798]' l--------- 1 root root 64 Feb 1 09:41 4 -> /tmp/test_sock $ cat /proc/760/fdinfo/4 pos: 0 flags: 012000000 mnt_id: 40 $ cat /proc/self/mountinfo | grep "^40\s" 40 19 0:37 / /tmp rw shared:23 - tmpfs tmpfs rw Signed-off-by: Andrei Vagin Signed-off-by: David S. Miller --- include/uapi/linux/un.h | 2 ++ net/unix/af_unix.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/un.h b/include/uapi/linux/un.h index 3ed3e46c1b1f..4f0ab3a548ad 100644 --- a/include/uapi/linux/un.h +++ b/include/uapi/linux/un.h @@ -10,4 +10,6 @@ struct sockaddr_un { char sun_path[UNIX_PATH_MAX]; /* pathname */ }; +#define SIOCUNIXFILE (SIOCPROTOPRIVATE + 0) /* open a socket file with O_PATH */ + #endif /* _LINUX_UN_H */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cef79873b09d..e2d18b9f910f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,6 +117,7 @@ #include #include #include +#include struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); @@ -2592,6 +2593,43 @@ long unix_outq_len(struct sock *sk) } EXPORT_SYMBOL_GPL(unix_outq_len); +static int unix_open_file(struct sock *sk) +{ + struct path path; + struct file *f; + int fd; + + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + unix_state_lock(sk); + path = unix_sk(sk)->path; + if (!path.dentry) { + unix_state_unlock(sk); + return -ENOENT; + } + + path_get(&path); + unix_state_unlock(sk); + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + goto out; + + f = dentry_open(&path, O_PATH, current_cred()); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + goto out; + } + + fd_install(fd, f); +out: + path_put(&path); + + return fd; +} + static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -2610,6 +2648,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) else err = put_user(amount, (int __user *)arg); break; + case SIOCUNIXFILE: + err = unix_open_file(sk); + break; default: err = -ENOIOCTLCMD; break; -- cgit v1.2.3 From 8fe809a992639b2013c0d8da2ba55cdea28a959a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Feb 2017 20:47:59 -0800 Subject: net: add LINUX_MIB_PFMEMALLOCDROP counter Debugging issues caused by pfmemalloc is often tedious. Add a new SNMP counter to more easily diagnose these problems. Signed-off-by: Eric Dumazet Cc: Josef Bacik Acked-by: Josef Bacik Signed-off-by: David S. 
Miller --- include/uapi/linux/snmp.h | 1 + net/core/filter.c | 5 +++-- net/ipv4/proc.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index e7a31f830690..3b2bed7ca9a4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -240,6 +240,7 @@ enum LINUX_MIB_SACKMERGED, LINUX_MIB_SACKSHIFTFALLBACK, LINUX_MIB_TCPBACKLOGDROP, + LINUX_MIB_PFMEMALLOCDROP, LINUX_MIB_TCPMINTTLDROP, /* RFC 5082 */ LINUX_MIB_TCPDEFERACCEPTDROP, LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */ diff --git a/net/core/filter.c b/net/core/filter.c index 1e00737e3bc3..0b753cbb2536 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -76,9 +76,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ - if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) + if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return -ENOMEM; - + } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) return err; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a9deeb90dd36..69cf49e8356d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -262,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), + SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), -- cgit v1.2.3 From 1ce8460496c05379c66edc178c3c55ca4e953044 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 1 Feb 2017 15:30:02 +0200 Subject: net: Introduce ife encapsulation module This module is responsible for the ife encapsulation protocol encode/decode logic. The module can: - ife_encode: encode skb and reserve space for the ife meta header - ife_decode: decode skb and extract the meta header size - ife_tlv_meta_encode: encode one tlv entry into the reserved ife header space - ife_tlv_meta_decode: decode one tlv entry from the packet - ife_tlv_meta_next: advance to the next tlv Reviewed-by: Jiri Pirko Signed-off-by: Yotam Gigi Signed-off-by: Jamal Hadi Salim Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- MAINTAINERS | 7 +++ include/net/ife.h | 51 +++++++++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/ife.h | 18 ++++++ net/Kconfig | 1 + net/Makefile | 1 + net/ife/Kconfig | 16 ++++++ net/ife/Makefile | 5 ++ net/ife/ife.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 242 insertions(+) create mode 100644 include/net/ife.h create mode 100644 include/uapi/linux/ife.h create mode 100644 net/ife/Kconfig create mode 100644 net/ife/Makefile create mode 100644 net/ife/ife.c (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 5e637e2b3ff9..2abda6cb3150 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6250,6 +6250,13 @@ F: include/net/cfg802154.h F: include/net/ieee802154_netdev.h F: Documentation/networking/ieee802154.txt +IFE PROTOCOL +M: Yotam Gigi +M: Jamal Hadi Salim +F: net/ife +F: include/net/ife.h +F: include/uapi/linux/ife.h + IGORPLUG-USB IR RECEIVER M: Sean Young L: linux-media@vger.kernel.org diff --git a/include/net/ife.h b/include/net/ife.h new file mode 100644 index 000000000000..2d87d6898b0a --- /dev/null +++ b/include/net/ife.h @@ -0,0 +1,51 @@ +#ifndef __NET_IFE_H +#define __NET_IFE_H + +#include +#include +#include +#include + +#if IS_ENABLED(CONFIG_NET_IFE) + +void *ife_encode(struct sk_buff *skb, u16 metalen); +void *ife_decode(struct sk_buff *skb, u16 *metalen); + +void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen); +int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, + const void *dval); + +void *ife_tlv_meta_next(void *skbdata); + +#else + +static inline void *ife_encode(struct sk_buff *skb, u16 metalen) +{ + return NULL; +} + +static inline void *ife_decode(struct sk_buff *skb, u16 *metalen) +{ + return NULL; +} + +static inline void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, + u16 *totlen) +{ + return NULL; +} + +static inline int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, + const void *dval) +{ + return 0; +} + +static inline void *ife_tlv_meta_next(void *skbdata) +{ + return NULL; +} + +#endif + +#endif /* __NET_IFE_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 486e050e64c5..a2e90722a4c4 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -195,6 +195,7 @@ header-y += if_tun.h header-y += if_tunnel.h header-y += if_vlan.h header-y += if_x25.h +header-y += ife.h header-y += igmp.h header-y += ila.h header-y += in6.h diff --git a/include/uapi/linux/ife.h b/include/uapi/linux/ife.h new file mode 100644 index 000000000000..2954da32e012 --- /dev/null +++ b/include/uapi/linux/ife.h @@ -0,0 +1,18 @@ +#ifndef __UAPI_IFE_H +#define __UAPI_IFE_H + +#define IFE_METAHDRLEN 2 + +enum { + IFE_META_SKBMARK = 1, + IFE_META_HASHID, + IFE_META_PRIO, + IFE_META_QMAP, + IFE_META_TCINDEX, + __IFE_META_MAX +}; + +/*Can be overridden at runtime by module option*/ +#define IFE_META_MAX (__IFE_META_MAX - 1) + +#endif diff --git a/net/Kconfig b/net/Kconfig index ce4aee69fc0d..2f2842d2d3ed 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -391,6 +391,7 @@ source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" source "net/psample/Kconfig" +source "net/ife/Kconfig" config LWTUNNEL bool "Network light weight tunnels" diff --git a/net/Makefile b/net/Makefile index 7d41de48310e..9b681550e3a3 100644 --- a/net/Makefile +++ b/net/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ obj-$(CONFIG_PSAMPLE) += psample/ 
+obj-$(CONFIG_NET_IFE) += ife/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ obj-$(CONFIG_MPLS) += mpls/ diff --git a/net/ife/Kconfig b/net/ife/Kconfig new file mode 100644 index 000000000000..31e48b652c7c --- /dev/null +++ b/net/ife/Kconfig @@ -0,0 +1,16 @@ +# +# IFE subsystem configuration +# + +menuconfig NET_IFE + depends on NET + tristate "Inter-FE based on IETF ForCES InterFE LFB" + default n + help + Say Y here to add support of IFE encapsulation protocol + For details refer to netdev01 paper: + "Distributing Linux Traffic Control Classifier-Action Subsystem" + Authors: Jamal Hadi Salim and Damascene M. Joachimpillai + + To compile this support as a module, choose M here: the module will + be called ife. diff --git a/net/ife/Makefile b/net/ife/Makefile new file mode 100644 index 000000000000..2a90d97746cc --- /dev/null +++ b/net/ife/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the IFE encapsulation protocol +# + +obj-$(CONFIG_NET_IFE) += ife.o diff --git a/net/ife/ife.c b/net/ife/ife.c new file mode 100644 index 000000000000..f360341c72eb --- /dev/null +++ b/net/ife/ife.c @@ -0,0 +1,142 @@ +/* + * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB + * Copyright (c) 2015 Jamal Hadi Salim + * Copyright (c) 2017 Yotam Gigi + * + * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper: + * "Distributing Linux Traffic Control Classifier-Action Subsystem" + * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ifeheadr { + __be16 metalen; + u8 tlv_data[]; +}; + +void *ife_encode(struct sk_buff *skb, u16 metalen) +{ + /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA + * where ORIGDATA = original ethernet header ... 
+ */ + int hdrm = metalen + IFE_METAHDRLEN; + int total_push = hdrm + skb->dev->hard_header_len; + struct ifeheadr *ifehdr; + struct ethhdr *iethh; /* inner ether header */ + int skboff = 0; + int err; + + err = skb_cow_head(skb, total_push); + if (unlikely(err)) + return NULL; + + iethh = (struct ethhdr *) skb->data; + + __skb_push(skb, total_push); + memcpy(skb->data, iethh, skb->dev->hard_header_len); + skb_reset_mac_header(skb); + skboff += skb->dev->hard_header_len; + + /* total metadata length */ + ifehdr = (struct ifeheadr *) (skb->data + skboff); + metalen += IFE_METAHDRLEN; + ifehdr->metalen = htons(metalen); + + return ifehdr->tlv_data; +} +EXPORT_SYMBOL_GPL(ife_encode); + +void *ife_decode(struct sk_buff *skb, u16 *metalen) +{ + struct ifeheadr *ifehdr; + int total_pull; + u16 ifehdrln; + + ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len); + ifehdrln = ntohs(ifehdr->metalen); + total_pull = skb->dev->hard_header_len + ifehdrln; + + if (unlikely(ifehdrln < 2)) + return NULL; + + if (unlikely(!pskb_may_pull(skb, total_pull))) + return NULL; + + skb_set_mac_header(skb, total_pull); + __skb_pull(skb, total_pull); + *metalen = ifehdrln - IFE_METAHDRLEN; + + return &ifehdr->tlv_data; +} +EXPORT_SYMBOL_GPL(ife_decode); + +struct meta_tlvhdr { + __be16 type; + __be16 len; +}; + +/* Caller takes care of presenting data in network order + */ +void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen) +{ + struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata; + + *dlen = ntohs(tlv->len) - NLA_HDRLEN; + *attrtype = ntohs(tlv->type); + + if (totlen) + *totlen = nla_total_size(*dlen); + + return skbdata + sizeof(struct meta_tlvhdr); +} +EXPORT_SYMBOL_GPL(ife_tlv_meta_decode); + +void *ife_tlv_meta_next(void *skbdata) +{ + struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata; + u16 tlvlen = ntohs(tlv->len); + + tlvlen = NLA_ALIGN(tlvlen); + + return skbdata + tlvlen; +} +EXPORT_SYMBOL_GPL(ife_tlv_meta_next); + +/* Caller takes care of presenting data in network order + */ +int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) +{ + __be32 *tlv = (__be32 *) (skbdata); + u16 totlen = nla_total_size(dlen); /*alignment + hdr */ + char *dptr = (char *) tlv + NLA_HDRLEN; + u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN); + + *tlv = htonl(htlv); + memset(dptr, 0, totlen - NLA_HDRLEN); + memcpy(dptr, dval, dlen); + + return totlen; +} +EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); + +MODULE_AUTHOR("Jamal Hadi Salim "); +MODULE_AUTHOR("Yotam Gigi "); +MODULE_DESCRIPTION("Inter-FE LFB action"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 295a6e06d21e1f469c9f38b00125a13b60ad4e7c Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 1 Feb 2017 15:30:03 +0200 Subject: net/sched: act_ife: Change to use ife module Use the encode/decode functionality from the ife module instead of using implementation inside the act_ife. Reviewed-by: Jiri Pirko Signed-off-by: Yotam Gigi Signed-off-by: Jamal Hadi Salim Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- include/net/tc_act/tc_ife.h | 1 - include/uapi/linux/tc_act/tc_ife.h | 10 +--- net/sched/Kconfig | 1 + net/sched/act_ife.c | 110 +++++++++++-------------------------- 4 files changed, 34 insertions(+), 88 deletions(-) (limited to 'include/uapi') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index f37e7516ab28..30ba459ddd34 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -6,7 +6,6 @@ #include #include -#define IFE_METAHDRLEN 2 struct tcf_ife_info { struct tc_action common; u8 eth_dst[ETH_ALEN]; diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h index cd18360eca24..7c2817866c97 100644 --- a/include/uapi/linux/tc_act/tc_ife.h +++ b/include/uapi/linux/tc_act/tc_ife.h @@ -3,6 +3,7 @@ #include #include +#include #define TCA_ACT_IFE 25 /* Flag bits for now just encoding/decoding; mutually exclusive */ @@ -28,13 +29,4 @@ enum { }; #define TCA_IFE_MAX (__TCA_IFE_MAX - 1) -#define IFE_META_SKBMARK 1 -#define IFE_META_HASHID 2 -#define IFE_META_PRIO 3 -#define IFE_META_QMAP 4 -#define IFE_META_TCINDEX 5 -/*Can be overridden at runtime by module option*/ -#define __IFE_META_MAX 6 -#define IFE_META_MAX (__IFE_META_MAX - 1) - #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 72cfa3a6bac0..403790cce7d2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -776,6 +776,7 @@ config NET_ACT_SKBMOD config NET_ACT_IFE tristate "Inter-FE action based on IETF ForCES InterFE LFB" depends on NET_CLS_ACT + select NET_IFE ---help--- Say Y here to allow for sourcing and terminating metadata For details refer to netdev01 paper: diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 70148c10ede9..71e7ff22f7c9 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -32,6 +32,7 @@ #include #include #include +#include #define IFE_TAB_MASK 15 @@ -46,23 +47,6 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { [TCA_IFE_TYPE] = { .type = NLA_U16}, }; -/* Caller takes care of presenting data in network order -*/ -static int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, - const void *dval) -{ - u32 *tlv = (u32 *)(skbdata); - u16 totlen = nla_total_size(dlen); /*alignment + hdr */ - char *dptr = (char *)tlv + NLA_HDRLEN; - u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN); - - *tlv = htonl(htlv); - memset(dptr, 0, totlen - NLA_HDRLEN); - memcpy(dptr, dval, dlen); - - return totlen; -} - int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) { u16 edata = 0; @@ -637,69 +621,59 @@ int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, return 0; } -struct ifeheadr { - __be16 metalen; - u8 tlv_data[]; -}; - -struct meta_tlvhdr { - __be16 type; - __be16 len; -}; - static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; - struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; - int ifehdrln = (int)ifehdr->metalen; - struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data); + u8 *ifehdr_end; + u8 *tlv_data; + u16 metalen; spin_lock(&ife->tcf_lock); bstats_update(&ife->tcf_bstats, skb); tcf_lastuse_update(&ife->tcf_tm); spin_unlock(&ife->tcf_lock); - ifehdrln = ntohs(ifehdrln); - if (unlikely(!pskb_may_pull(skb, ifehdrln))) { + if (skb_at_tc_ingress(skb)) + skb_push(skb, skb->dev->hard_header_len); + + tlv_data = ife_decode(skb, &metalen); + if (unlikely(!tlv_data)) { spin_lock(&ife->tcf_lock); ife->tcf_qstats.drops++; 
spin_unlock(&ife->tcf_lock); return TC_ACT_SHOT; } - skb_set_mac_header(skb, ifehdrln); - __skb_pull(skb, ifehdrln); - skb->protocol = eth_type_trans(skb, skb->dev); - ifehdrln -= IFE_METAHDRLEN; - - while (ifehdrln > 0) { - u8 *tlvdata = (u8 *)tlv; - u16 mtype = tlv->type; - u16 mlen = tlv->len; - u16 alen; + ifehdr_end = tlv_data + metalen; + for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) { + u8 *curr_data; + u16 mtype; + u16 dlen; - mtype = ntohs(mtype); - mlen = ntohs(mlen); - alen = NLA_ALIGN(mlen); + curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL); - if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN), - (void *)(tlvdata + NLA_HDRLEN))) { + if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata * but dont have an ops for it */ - pr_info_ratelimited("Unknown metaid %d alnlen %d\n", - mtype, mlen); + pr_info_ratelimited("Unknown metaid %d dlen %d\n", + mtype, dlen); ife->tcf_qstats.overlimits++; } + } - tlvdata += alen; - ifehdrln -= alen; - tlv = (struct meta_tlvhdr *)tlvdata; + if (WARN_ON(tlv_data != ifehdr_end)) { + spin_lock(&ife->tcf_lock); + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; } + skb->protocol = eth_type_trans(skb, skb->dev); skb_reset_network_header(skb); + return action; } @@ -727,7 +701,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ethhdr *oethh; /* outer ether header */ - struct ethhdr *iethh; /* inner eth header */ struct tcf_meta_info *e; /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA @@ -735,10 +708,11 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, */ u16 metalen = ife_get_sz(skb, ife); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; - unsigned int skboff = skb->dev->hard_header_len; + unsigned int skboff = 0; int new_len = skb->len + hdrm; bool exceed_mtu = false; - int err; + void *ife_meta; + int err = 0; if (!skb_at_tc_ingress(skb)) { if (new_len > skb->dev->mtu) @@ -765,27 +739,10 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - err = skb_cow_head(skb, hdrm); - if (unlikely(err)) { - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); - return TC_ACT_SHOT; - } - if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); - iethh = (struct ethhdr *)skb->data; - __skb_push(skb, hdrm); - memcpy(skb->data, iethh, skb->mac_len); - skb_reset_mac_header(skb); - oethh = eth_hdr(skb); - - /*total metadata length */ - metalen += IFE_METAHDRLEN; - metalen = htons(metalen); - memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN); - skboff += IFE_METAHDRLEN; + ife_meta = ife_encode(skb, metalen); /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by @@ -793,7 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, */ list_for_each_entry(e, &ife->metalist, metalist) { if (e->ops->encode) { - err = e->ops->encode(skb, (void *)(skb->data + skboff), + err = e->ops->encode(skb, (void *)(ife_meta + skboff), e); } if (err < 0) { @@ -804,15 +761,12 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, } skboff += err; } + oethh = (struct ethhdr *)skb->data; if (!is_zero_ether_addr(ife->eth_src)) ether_addr_copy(oethh->h_source, ife->eth_src); - else - ether_addr_copy(oethh->h_source, iethh->h_source); if 
(!is_zero_ether_addr(ife->eth_dst)) ether_addr_copy(oethh->h_dest, ife->eth_dst); - else - ether_addr_copy(oethh->h_dest, iethh->h_dest); oethh->h_proto = htons(ife->eth_type); if (skb_at_tc_ingress(skb)) -- cgit v1.2.3 From 3ad7a4b141ebd6091494913672d7166d5c2764e4 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 31 Jan 2017 22:59:52 -0800 Subject: vxlan: support fdb and learning in COLLECT_METADATA mode Vxlan COLLECT_METADATA mode today solves the per-vni netdev scalability problem in l3 networks. It expects all forwarding information to be present in dst_metadata. This patch series enhances collect metadata mode to include the case where only the vni is present in dst_metadata, and the vxlan driver can then use the rest of the forwarding information database to make forwarding decisions. There is no change to the default COLLECT_METADATA behaviour. These changes only apply to COLLECT_METADATA when used with the bridging use-case with a special dst_metadata tunnel info flag (e.g. where the vxlan device is part of a bridge). For all this to work, the vxlan driver now needs to support a single fdb table hashed by mac + vni. This series essentially makes this happen. Use-case and workflow: a vxlan collect metadata device participates in bridging vlans to vn-segments. The bridge driver above the vxlan device sends the vni corresponding to the vlan in the dst_metadata. The vxlan driver looks up the forwarding database with (mac + vni) for the required remote destination information to forward the packet. Changes introduced by this patch: - allow learning and forwarding database state in the vxlan netdev in COLLECT_METADATA mode. Current behaviour is not changed by default. The tunnel info flag IP_TUNNEL_INFO_BRIDGE is used to support the new bridge-friendly mode. - a single fdb table hashed by (mac, vni) to allow fdb entries with multiple vnis in the same fdb table - the rx path already has the vni - the tx path expects a vni in the packet with dst_metadata - prior to this series, fdb remote_dsts carried the remote vni and the vxlan device carrying the fdb table represented the source vni. With the vxlan device now representing multiple vnis, this patch adds a src vni attribute to the fdb entry. The remote vni already uses the NDA_VNI attribute. This patch introduces the NDA_SRC_VNI netlink attribute to represent the src vni in a multi-vni fdb table. iproute2 example (patched and pruned iproute2 output to just show relevant fdb entries): the example shows the same host mac learnt on two vnis. before (netdev per vni): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self after this patch with collect metadata in bridged mode (single netdev): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 196 ++++++++++++++++++++++++++--------------- include/uapi/linux/neighbour.h | 1 + 2 files changed, 126 insertions(+), 71 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2e48ce22eabf..2374a75dcb55 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -75,6 +75,7 @@ struct vxlan_fdb { struct list_head remotes; u8 eth_addr[ETH_ALEN]; u16 state; /* see ndm_state */ + __be32 vni; u8 flags; /* see ndm_flags */ }; @@ -302,6 +303,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (rdst->remote_vni != vxlan->default_dst.remote_vni && nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) goto nla_put_failure; + if ((vxlan->flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && + nla_put_u32(skb, NDA_SRC_VNI, + be32_to_cpu(fdb->vni))) + goto nla_put_failure; if (rdst->remote_ifindex && nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) goto nla_put_failure; @@ -400,34 +405,51 @@ static u32 eth_hash(const unsigned char *addr) return hash_64(value, FDB_HASH_BITS); } +static u32 eth_vni_hash(const unsigned char *addr, __be32 vni) +{ + /* use 1 byte of OUI and 3 bytes of NIC */ + u32 key = get_unaligned((u32 *)(addr + 2)); + + return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); +} + /* Hash chain to use given mac address */ static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, - const u8 *mac) + const u8 *mac, __be32 vni) { - return &vxlan->fdb_head[eth_hash(mac)]; + if (vxlan->flags & VXLAN_F_COLLECT_METADATA) + return &vxlan->fdb_head[eth_vni_hash(mac, vni)]; + else + return &vxlan->fdb_head[eth_hash(mac)]; } /* Look up Ethernet address in forwarding table */ static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac) + const u8 *mac, __be32 vni) { - struct hlist_head *head = vxlan_fdb_head(vxlan, mac); + struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); struct vxlan_fdb *f; hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->eth_addr)) - return f; + if (ether_addr_equal(mac, f->eth_addr)) { + if (vxlan->flags & VXLAN_F_COLLECT_METADATA) { + if (vni == f->vni) + return f; + } else { + return f; + } + } } return NULL; } static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac) + const u8 *mac, __be32 vni) { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac); + f = __vxlan_find_mac(vxlan, mac, vni); if (f) f->used = jiffies; @@ -605,15 +627,15 @@ static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) static int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, - __be16 port, __be32 vni, __u32 ifindex, - __u8 ndm_flags) + __be16 port, __be32 src_vni, __be32 vni, + __u32 ifindex, __u8 ndm_flags) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int notify = 0; int rc; - f = __vxlan_find_mac(vxlan, mac); + f = __vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, @@ -670,6 +692,7 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; + f->vni = src_vni; INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); @@ -681,7 +704,7 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac)); + vxlan_fdb_head(vxlan, mac, src_vni)); } if (notify) { @@ -718,8 +741,8 @@ static void 
vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) } static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, - union vxlan_addr *ip, __be16 *port, __be32 *vni, - u32 *ifindex) + union vxlan_addr *ip, __be16 *port, __be32 *src_vni, + __be32 *vni, u32 *ifindex) { struct net *net = dev_net(vxlan->dev); int err; @@ -757,6 +780,14 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, *vni = vxlan->default_dst.remote_vni; } + if (tb[NDA_SRC_VNI]) { + if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) + return -EINVAL; + *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); + } else { + *src_vni = vxlan->default_dst.remote_vni; + } + if (tb[NDA_IFINDEX]) { struct net_device *tdev; @@ -782,7 +813,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], /* struct net *net = dev_net(vxlan->dev); */ union vxlan_addr ip; __be16 port; - __be32 vni; + __be32 src_vni, vni; u32 ifindex; int err; @@ -795,7 +826,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (tb[NDA_DST] == NULL) return -EINVAL; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); if (err) return err; @@ -804,36 +835,24 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, - port, vni, ifindex, ndm->ndm_flags); + port, src_vni, vni, ifindex, ndm->ndm_flags); spin_unlock_bh(&vxlan->hash_lock); return err; } -/* Delete entry (via netlink) */ -static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) +static int __vxlan_fdb_delete(struct vxlan_dev *vxlan, + const unsigned char *addr, union vxlan_addr ip, + __be16 port, __be32 src_vni, u32 vni, u32 ifindex, + u16 vid) { - struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; struct vxlan_rdst *rd = NULL; - union vxlan_addr ip; - __be16 port; - __be32 vni; - u32 ifindex; - int err; + int err = -ENOENT; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); - if (err) - return err; - - err = -ENOENT; - - spin_lock_bh(&vxlan->hash_lock); - f = vxlan_find_mac(vxlan, addr); + f = vxlan_find_mac(vxlan, addr, src_vni); if (!f) - goto out; + return err; if (!vxlan_addr_any(&ip)) { rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); @@ -841,8 +860,6 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], goto out; } - err = 0; - /* remove a destination if it's not the only one on the list, * otherwise destroy the fdb entry */ @@ -856,6 +873,28 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], vxlan_fdb_destroy(vxlan, f); out: + return 0; +} + +/* Delete entry (via netlink) */ +static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + union vxlan_addr ip; + __be32 src_vni, vni; + __be16 port; + u32 ifindex; + int err; + + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + if (err) + return err; + + spin_lock_bh(&vxlan->hash_lock); + err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, + vid); spin_unlock_bh(&vxlan->hash_lock); return err; @@ -901,12 +940,13 @@ out: * Return true if packet is bogus and should be dropped. 
*/ static bool vxlan_snoop(struct net_device *dev, - union vxlan_addr *src_ip, const u8 *src_mac) + union vxlan_addr *src_ip, const u8 *src_mac, + __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; - f = vxlan_find_mac(vxlan, src_mac); + f = vxlan_find_mac(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); @@ -935,6 +975,7 @@ static bool vxlan_snoop(struct net_device *dev, NUD_REACHABLE, NLM_F_EXCL|NLM_F_CREATE, vxlan->cfg.dst_port, + vni, vxlan->default_dst.remote_vni, 0, NTF_SELF); spin_unlock(&vxlan->hash_lock); @@ -1202,7 +1243,7 @@ static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed, static bool vxlan_set_mac(struct vxlan_dev *vxlan, struct vxlan_sock *vs, - struct sk_buff *skb) + struct sk_buff *skb, __be32 vni) { union vxlan_addr saddr; @@ -1226,7 +1267,7 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan, } if ((vxlan->flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) + vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, vni)) return false; return true; @@ -1268,6 +1309,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) __be16 protocol = htons(ETH_P_TEB); bool raw_proto = false; void *oiph; + __be32 vni = 0; /* Need UDP and VXLAN header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) @@ -1289,7 +1331,12 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (!vs) goto drop; - vxlan = vxlan_vs_find_vni(vs, vxlan_vni(vxlan_hdr(skb)->vx_vni)); + vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); + + if ((vs->flags & VXLAN_F_COLLECT_METADATA) && !vni) + goto drop; + + vxlan = vxlan_vs_find_vni(vs, vni); if (!vxlan) goto drop; @@ -1307,7 +1354,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) goto drop; if (vxlan_collect_metadata(vs)) { - __be32 vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); struct metadata_dst *tun_dst; tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, @@ -1345,7 +1391,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) } if (!raw_proto) { - if (!vxlan_set_mac(vxlan, vs, skb)) + if (!vxlan_set_mac(vxlan, vs, skb, vni)) goto drop; } else { skb_reset_mac_header(skb); @@ -1377,7 +1423,7 @@ drop: return 0; } -static int arp_reduce(struct net_device *dev, struct sk_buff *skb) +static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct arphdr *parp; @@ -1424,7 +1470,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) goto out; } - f = vxlan_find_mac(vxlan, n->ha); + f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); @@ -1548,7 +1594,7 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request, return reply; } -static int neigh_reduce(struct net_device *dev, struct sk_buff *skb) +static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct nd_msg *msg; @@ -1585,7 +1631,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb) goto out; } - f = vxlan_find_mac(vxlan, n->ha); + f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); @@ -1906,7 +1952,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev 
*src_vxlan, - struct vxlan_dev *dst_vxlan) + struct vxlan_dev *dst_vxlan, __be32 vni) { struct pcpu_sw_netstats *tx_stats, *rx_stats; union vxlan_addr loopback; @@ -1932,7 +1978,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, } if (dst_vxlan->flags & VXLAN_F_LEARN) - vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source); + vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source, vni); u64_stats_update_begin(&tx_stats->syncp); tx_stats->tx_packets++; @@ -1976,7 +2022,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, return -ENOENT; } - vxlan_encap_bypass(skb, vxlan, dst_vxlan); + vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni); return 1; } @@ -1984,7 +2030,8 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, } static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, - struct vxlan_rdst *rdst, bool did_rsc) + __be32 default_vni, struct vxlan_rdst *rdst, + bool did_rsc) { struct dst_cache *dst_cache; struct ip_tunnel_info *info; @@ -2011,14 +2058,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (vxlan_addr_any(dst)) { if (did_rsc) { /* short-circuited back to local bridge */ - vxlan_encap_bypass(skb, vxlan, vxlan); + vxlan_encap_bypass(skb, vxlan, vxlan, default_vni); return; } goto drop; } dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; - vni = rdst->remote_vni; + vni = (rdst->remote_vni) ? : default_vni; src = &vxlan->cfg.saddr; dst_cache = &rdst->dst_cache; md->gbp = skb->mark; @@ -2173,23 +2220,29 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; struct vxlan_fdb *f; + __be32 vni = 0; info = skb_tunnel_info(skb); skb_reset_mac_header(skb); if (vxlan->flags & VXLAN_F_COLLECT_METADATA) { - if (info && info->mode & IP_TUNNEL_INFO_TX) - vxlan_xmit_one(skb, dev, NULL, false); - else - kfree_skb(skb); - return NETDEV_TX_OK; + if (info && info->mode & IP_TUNNEL_INFO_BRIDGE && + info->mode & IP_TUNNEL_INFO_TX) { + vni = tunnel_id_to_key32(info->key.tun_id); + } else { + if (info && info->mode & IP_TUNNEL_INFO_TX) + vxlan_xmit_one(skb, dev, vni, NULL, false); + else + kfree_skb(skb); + return NETDEV_TX_OK; + } } if (vxlan->flags & VXLAN_F_PROXY) { eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_ARP) - return arp_reduce(dev, skb); + return arp_reduce(dev, skb, vni); #if IS_ENABLED(CONFIG_IPV6) else if (ntohs(eth->h_proto) == ETH_P_IPV6 && pskb_may_pull(skb, sizeof(struct ipv6hdr) @@ -2200,13 +2253,13 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) msg = (struct nd_msg *)skb_transport_header(skb); if (msg->icmph.icmp6_code == 0 && msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) - return neigh_reduce(dev, skb); + return neigh_reduce(dev, skb, vni); } #endif } eth = eth_hdr(skb); - f = vxlan_find_mac(vxlan, eth->h_dest); + f = vxlan_find_mac(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) && @@ -2214,11 +2267,11 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) - f = vxlan_find_mac(vxlan, eth->h_dest); + f = vxlan_find_mac(vxlan, eth->h_dest, vni); } if (f == NULL) { - f = vxlan_find_mac(vxlan, all_zeros_mac); + f = vxlan_find_mac(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->flags & VXLAN_F_L2MISS) && 
!is_multicast_ether_addr(eth->h_dest)) @@ -2239,11 +2292,11 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) - vxlan_xmit_one(skb1, dev, rdst, did_rsc); + vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } if (fdst) - vxlan_xmit_one(skb, dev, fdst, did_rsc); + vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); else kfree_skb(skb); return NETDEV_TX_OK; @@ -2307,12 +2360,12 @@ static int vxlan_init(struct net_device *dev) return 0; } -static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) +static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_fdb *f; spin_lock_bh(&vxlan->hash_lock); - f = __vxlan_find_mac(vxlan, all_zeros_mac); + f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); if (f) vxlan_fdb_destroy(vxlan, f); spin_unlock_bh(&vxlan->hash_lock); @@ -2322,7 +2375,7 @@ static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - vxlan_fdb_delete_default(vxlan); + vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); free_percpu(dev->tstats); } @@ -2923,6 +2976,7 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, NLM_F_EXCL|NLM_F_CREATE, vxlan->cfg.dst_port, vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_vni, vxlan->default_dst.remote_ifindex, NTF_SELF); if (err) @@ -2931,7 +2985,7 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, err = register_netdevice(dev); if (err) { - vxlan_fdb_delete_default(vxlan); + vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); return err; } diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index bd99a8d80f36..f3d16dbe09d6 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -26,6 +26,7 @@ enum { NDA_IFINDEX, NDA_MASTER, NDA_LINK_NETNSID, + NDA_SRC_VNI, __NDA_MAX }; -- cgit v1.2.3 From b3c7ef0adadc5768e0baa786213c6bd1ce521a77 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 31 Jan 2017 22:59:53 -0800 Subject: bridge: uapi: add per vlan tunnel info New nested netlink attribute to associate tunnel info per vlan. This is used by the bridge driver to send tunnel metadata to bridge ports in vlan tunnel mode. This patch also adds a new per-port flag, IFLA_BRPORT_VLAN_TUNNEL, to enable vlan tunnel mode; it is off by default. One example use for this is a vxlan bridging gateway or vtep which maps vlans to vn-segments (or vnis). The user can configure per-vlan tunnel information, which the bridge driver can use to bridge the vlan into the corresponding vn-segment. Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_bridge.h | 11 +++++++++++ include/uapi/linux/if_link.h | 1 + 3 files changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index debc9d5904e5..c5847dc75a93 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -47,6 +47,7 @@ struct br_ip_list { #define BR_PROXYARP_WIFI BIT(10) #define BR_MCAST_FLOOD BIT(11) #define BR_MULTICAST_TO_UNICAST BIT(12) +#define BR_VLAN_TUNNEL BIT(13) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index ab92bca6d448..a9e6244ce438 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -118,6 +118,7 @@ enum { IFLA_BRIDGE_FLAGS, IFLA_BRIDGE_MODE, IFLA_BRIDGE_VLAN_INFO, + IFLA_BRIDGE_VLAN_TUNNEL_INFO, __IFLA_BRIDGE_MAX, }; #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1) @@ -134,6 +135,16 @@ struct bridge_vlan_info { __u16 vid; }; +enum { + IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC, + IFLA_BRIDGE_VLAN_TUNNEL_ID, + IFLA_BRIDGE_VLAN_TUNNEL_VID, + IFLA_BRIDGE_VLAN_TUNNEL_FLAGS, + __IFLA_BRIDGE_VLAN_TUNNEL_MAX, +}; + +#define IFLA_BRIDGE_VLAN_TUNNEL_MAX (__IFLA_BRIDGE_VLAN_TUNNEL_MAX - 1) + struct bridge_vlan_xstats { __u64 rx_bytes; __u64 rx_packets; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b9aa5641ebe5..320fc1e747ee 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -322,6 +322,7 @@ enum { IFLA_BRPORT_PAD, IFLA_BRPORT_MCAST_FLOOD, IFLA_BRPORT_MCAST_TO_UCAST, + IFLA_BRPORT_VLAN_TUNNEL, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From a963d710f367f68cd13d562a07db55ccb8daade9 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Thu, 26 Jan 2017 14:49:43 -0800 Subject: netfilter: ctnetlink: Fix regression in CTA_STATUS processing The libnetfilter_conntrack userland library always sets IPS_CONFIRMED when building a CTA_STATUS attribute. If this toggles the bit from 0->1, the parser will return an error. On Linux 4.4+ this will cause any NFQA_EXP attribute in the packet to be ignored. This breaks conntrackd's userland helpers because they operate on unconfirmed connections. Instead of returning -EBUSY if the user program asks to modify an unchangeable bit, simply ignore the change. Also, fix the logic so that user programs are allowed to clear the bits that they are allowed to change. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 4 ++++ net/netfilter/nf_conntrack_netlink.c | 26 +++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 6d074d14ee27..6a8e33dd4ecb 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -82,6 +82,10 @@ enum ip_conntrack_status { IPS_DYING_BIT = 9, IPS_DYING = (1 << IPS_DYING_BIT), + /* Bits that cannot be altered from userland. */ + IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | + IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING), + /* Connection has fixed timeout. 
*/ IPS_FIXED_TIMEOUT_BIT = 10, IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 27540455dc62..bf04b7e9d6f7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2269,6 +2269,30 @@ nla_put_failure: return -ENOSPC; } +static int +ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); + unsigned long d = ct->status ^ status; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EBUSY; + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EBUSY; + + /* This check is less strict than ctnetlink_change_status() + * because callers often flip IPS_EXPECTED bits when sending + * an NFQA_CT attribute to the kernel. So ignore the + * unchangeable bits but do not error out. + */ + ct->status = (status & ~IPS_UNCHANGEABLE_MASK) | + (ct->status & IPS_UNCHANGEABLE_MASK); + return 0; +} + static int ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) { @@ -2280,7 +2304,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) return err; } if (cda[CTA_STATUS]) { - err = ctnetlink_change_status(ct, cda); + err = ctnetlink_update_status(ct, cda); if (err < 0) return err; } -- cgit v1.2.3 From 19641f2d7674fbf2891e9579f61c1b23821086e8 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 3 Feb 2017 12:50:30 -0700 Subject: Include: Uapi: Add user ABI for Sed/Opal Signed-off-by: Scott Bauer Signed-off-by: Rafael Antognolli Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 118 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 include/uapi/linux/sed-opal.h (limited to 'include/uapi') diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h new file mode 100644 index 000000000000..31799526082a --- /dev/null +++ b/include/uapi/linux/sed-opal.h @@ -0,0 +1,118 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli + * Scott Bauer + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef _UAPI_SED_OPAL_H +#define _UAPI_SED_OPAL_H + +#include <linux/types.h> + +#define OPAL_KEY_MAX 256 +#define OPAL_MAX_LRS 9 + +enum opal_mbr { + OPAL_MBR_ENABLE = 0x0, + OPAL_MBR_DISABLE = 0x01, +}; + +enum opal_user { + OPAL_ADMIN1 = 0x0, + OPAL_USER1 = 0x01, + OPAL_USER2 = 0x02, + OPAL_USER3 = 0x03, + OPAL_USER4 = 0x04, + OPAL_USER5 = 0x05, + OPAL_USER6 = 0x06, + OPAL_USER7 = 0x07, + OPAL_USER8 = 0x08, + OPAL_USER9 = 0x09, +}; + +enum opal_lock_state { + OPAL_RO = 0x01, /* 0001 */ + OPAL_RW = 0x02, /* 0010 */ + OPAL_LK = 0x04, /* 0100 */ +}; + +struct opal_key { + uint8_t lr; + uint8_t key_len; + char key[OPAL_KEY_MAX]; +}; + +struct opal_lr_act { + int sum; + uint8_t num_lrs; + uint8_t lr[OPAL_MAX_LRS]; + struct opal_key key; +}; + +struct opal_session_info { + int sum; + enum opal_user who; + struct opal_key opal_key; + uint8_t __align[2]; +}; + +struct opal_user_lr_setup { + size_t range_start; + size_t range_length; + int RLE; /* Read Lock enabled */ + int WLE; /* Write Lock Enabled */ + struct opal_session_info session; + uint8_t __align[4]; +}; + +struct opal_lock_unlock { + enum opal_lock_state l_state; + struct opal_session_info session; +}; + +struct opal_new_pw { + struct opal_session_info session; + + /* When we're not operating in sum, and we first set + * passwords we need to set them via ADMIN authority. + * After passwords are changed, we can set them via + * User authorities. + * Because of this restriction we need to know about + * two different users. One in 'session' which we will use + * to start the session and new_user_pw as the user we're + * changing the pw for. + */ + struct opal_session_info new_user_pw; +}; + +struct opal_mbr_data { + u8 enable_disable; + struct opal_key key; + uint8_t __align[5]; +}; + +#define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) +#define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) +#define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) +#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_key) +#define IOC_OPAL_SET_PW _IOW('p', 224, struct opal_new_pw) +#define IOC_OPAL_ACTIVATE_USR _IOW('p', 225, struct opal_session_info) +#define IOC_OPAL_REVERT_TPR _IOW('p', 226, struct opal_key) +#define IOC_OPAL_LR_SETUP _IOW('p', 227, struct opal_user_lr_setup) +#define IOC_OPAL_ADD_USR_TO_LR _IOW('p', 228, struct opal_lock_unlock) +#define IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data) +#define IOC_OPAL_ERASE_LR _IOW('p', 230, struct opal_session_info) +#define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) + +#endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 8c87fe722053658467bcc9e5ea82051ce3d3a693 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Mon, 6 Feb 2017 17:22:49 -0700 Subject: Fix SED-OPAL UAPI structs to prevent 32/64 bit size differences. This patch is a quick fixup of the user structures that will prevent the structures from being different sizes on 32 and 64 bit archs. Taking this fix will allow us to *NOT* have to do compat ioctls for the sed code.
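As an illustration of the resulting ABI (a hedged sketch, not part of the patch): a minimal userspace program that unlocks the global locking range through the fixed-size structures shown in the diff below. The device path and password are placeholder assumptions, the drive is assumed to be already provisioned, and error handling is trimmed:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

int main(void)
{
	struct opal_lock_unlock lk;
	int fd = open("/dev/nvme0n1", O_WRONLY);	/* hypothetical Opal drive */

	if (fd < 0)
		return 1;

	memset(&lk, 0, sizeof(lk));
	lk.l_state = OPAL_RW;			/* unlock for read/write */
	lk.session.sum = 0;			/* not in single-user mode */
	lk.session.who = OPAL_ADMIN1;
	lk.session.opal_key.lr = 0;		/* global locking range */
	lk.session.opal_key.key_len = strlen("secret");
	memcpy(lk.session.opal_key.key, "secret", lk.session.opal_key.key_len);

	if (ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lk) < 0)
		perror("IOC_OPAL_LOCK_UNLOCK");

	close(fd);
	return 0;
}

With every field explicitly sized and padded to an 8-byte boundary, the structure layout is identical for 32- and 64-bit callers, which is what makes the compat ioctl path unnecessary.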
Signed-off-by: Scott Bauer Fixes: 19641f2d7674 ("Include: Uapi: Add user ABI for Sed/Opal") Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 31799526082a..fc06e3a20a51 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -48,37 +48,38 @@ enum opal_lock_state { }; struct opal_key { - uint8_t lr; - uint8_t key_len; - char key[OPAL_KEY_MAX]; + __u8 lr; + __u8 key_len; + __u8 __align[6]; + __u8 key[OPAL_KEY_MAX]; }; struct opal_lr_act { - int sum; - uint8_t num_lrs; - uint8_t lr[OPAL_MAX_LRS]; struct opal_key key; + __u32 sum; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 align[2]; /* Align to 8 byte boundary */ }; struct opal_session_info { - int sum; - enum opal_user who; + __u32 sum; + __u32 who; struct opal_key opal_key; - uint8_t __align[2]; }; struct opal_user_lr_setup { - size_t range_start; - size_t range_length; - int RLE; /* Read Lock enabled */ - int WLE; /* Write Lock Enabled */ + __u64 range_start; + __u64 range_length; + __u32 RLE; /* Read Lock enabled */ + __u32 WLE; /* Write Lock Enabled */ struct opal_session_info session; - uint8_t __align[4]; }; struct opal_lock_unlock { - enum opal_lock_state l_state; struct opal_session_info session; + __u32 l_state; + __u8 __align[4]; }; struct opal_new_pw { @@ -97,9 +98,9 @@ struct opal_new_pw { }; struct opal_mbr_data { - u8 enable_disable; struct opal_key key; - uint8_t __align[5]; + __u8 enable_disable; + __u8 __align[7]; }; #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) -- cgit v1.2.3 From 55dd00a73a518281bc846dc5de1a718349431eb2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:39 -0200 Subject: KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall Add a hypercall to retrieve the host realtime clock and the TSC value used to calculate that clock read. Used to implement clock synchronization between host and guest. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/hypercalls.txt | 35 +++++++++++++++++ arch/x86/include/uapi/asm/kvm_para.h | 9 +++++ arch/x86/kvm/x86.c | 66 ++++++++++++++++++++++++++++++++ include/uapi/linux/kvm_para.h | 2 + 4 files changed, 112 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index c8d040e27046..feaaa634f154 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) is used in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to synchronize host and guest clocks. +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_pairing" structure. + +a1: clock_type, at the moment only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock.
+ * tsc: guest TSC value used to calculate sec/nsec pair + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant. + +Returns KVM_EOPNOTSUPP if the host does not use the TSC clocksource, +or if the clock type is different from KVM_CLOCK_PAIRING_WALLCLOCK. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 1421a6585126..cff0bb6556f8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -50,6 +50,15 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; +}; + #define KVM_STEAL_ALIGNMENT_BITS 5 #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4fd4d4f35caf..09e5d31dac98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1142,6 +1142,7 @@ struct pvclock_gtod_data { u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1165,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1626,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(&gtod->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { @@ -1635,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -6112,6 +6148,33 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + unsigned long clock_type) +{ + struct kvm_clock_pairing clock_pairing; + struct timespec ts; + cycle_t cycle; + int ret; + + if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) + return -KVM_EOPNOTSUPP; + + if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + return -KVM_EOPNOTSUPP; + + clock_pairing.sec = ts.tv_sec; + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, + sizeof(struct kvm_clock_pairing))) + ret = -KVM_EFAULT; + + return ret; +} + /* *
kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -6176,6 +6239,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; + case KVM_HC_CLOCK_PAIRING: + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; default: ret = -KVM_ENOSYS; break; diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index bf6cd7d5cac2..fed506aeff62 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -14,6 +14,7 @@ #define KVM_EFAULT EFAULT #define KVM_E2BIG E2BIG #define KVM_EPERM EPERM +#define KVM_EOPNOTSUPP 95 #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 @@ -23,6 +24,7 @@ #define KVM_HC_MIPS_GET_CLOCK_FREQ 6 #define KVM_HC_MIPS_EXIT_VM 7 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 +#define KVM_HC_CLOCK_PAIRING 9 /* * hypercalls use architecture specific -- cgit v1.2.3 From bee427b86217b78a0a5fc85575cc155e4c32bbf9 Mon Sep 17 00:00:00 2001 From: Andrzej Zaborowski Date: Wed, 25 Jan 2017 12:43:41 +0100 Subject: cfg80211: Pass new RSSI level in CQM RSSI notification Update the drivers to pass the RSSI level as a cfg80211_cqm_rssi_notify parameter and pass this value to userspace in a new nl80211 attribute. This helps userspace and also helps the implementation of the multiple RSSI thresholds CQM mechanism. Note that for marvell/mwifiex I pass 0 for the RSSI value because the new RSSI value is not available to the driver at the time of the cfg80211_cqm_rssi_notify call, but the driver queries the new value immediately after that, so it is actually available just a moment later if we wanted to defer calling cfg80211_cqm_rssi_notify until that moment. Without this, the new cfg80211 code (patch 3) will call .get_station which will send a duplicate HostCmd_CMD_RSSI_INFO command to the hardware.
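For a driver, the conversion is mechanical: pass the measured level, or 0 when it is unknown, at the notification site. A hedged sketch of a hypothetical driver's event handler (the function name is illustrative):

#include <net/cfg80211.h>

/* Hypothetical driver event path: forward a low-RSSI CQM trigger
 * together with the measured level in dBm, or 0 when the level is
 * not available.
 */
static void drv_report_cqm_low(struct net_device *ndev, s32 rssi_dbm)
{
	cfg80211_cqm_rssi_notify(ndev, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
				 rssi_dbm, GFP_KERNEL);
}

cfg80211 attaches the value to the event as NL80211_ATTR_CQM_RSSI_LEVEL only when it is non-zero, so drivers that pass 0 keep their previous userspace-visible behaviour.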
Signed-off-by: Andrew Zaborowski Signed-off-by: Johannes Berg --- drivers/net/wireless/marvell/mwifiex/sta_event.c | 4 ++-- drivers/net/wireless/rndis_wlan.c | 2 +- include/net/cfg80211.h | 3 ++- include/uapi/linux/nl80211.h | 3 +++ net/mac80211/mlme.c | 2 +- net/wireless/nl80211.c | 9 +++++++-- net/wireless/trace.h | 11 +++++++---- 7 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/wireless/marvell/mwifiex/sta_event.c b/drivers/net/wireless/marvell/mwifiex/sta_event.c index 9df0c4dc06ed..5cc3aa7c31cd 100644 --- a/drivers/net/wireless/marvell/mwifiex/sta_event.c +++ b/drivers/net/wireless/marvell/mwifiex/sta_event.c @@ -824,7 +824,7 @@ int mwifiex_process_sta_event(struct mwifiex_private *priv) case EVENT_RSSI_LOW: cfg80211_cqm_rssi_notify(priv->netdev, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, - GFP_KERNEL); + 0, GFP_KERNEL); mwifiex_send_cmd(priv, HostCmd_CMD_RSSI_INFO, HostCmd_ACT_GEN_GET, 0, NULL, false); priv->subsc_evt_rssi_state = RSSI_LOW_RECVD; @@ -839,7 +839,7 @@ int mwifiex_process_sta_event(struct mwifiex_private *priv) case EVENT_RSSI_HIGH: cfg80211_cqm_rssi_notify(priv->netdev, NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, - GFP_KERNEL); + 0, GFP_KERNEL); mwifiex_send_cmd(priv, HostCmd_CMD_RSSI_INFO, HostCmd_ACT_GEN_GET, 0, NULL, false); priv->subsc_evt_rssi_state = RSSI_HIGH_RECVD; diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index 603c90470225..785334f7a538 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -3187,7 +3187,7 @@ static void rndis_do_cqm(struct usbnet *usbdev, s32 rssi) return; priv->last_cqm_event_rssi = rssi; - cfg80211_cqm_rssi_notify(usbdev->net, event, GFP_KERNEL); + cfg80211_cqm_rssi_notify(usbdev->net, event, rssi, GFP_KERNEL); } #define DEVICE_POLLER_JIFFIES (HZ) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5cfd2806a078..a2c18b53e053 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5390,6 +5390,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event * @dev: network device * @rssi_event: the triggered RSSI event + * @rssi_level: new RSSI level value or 0 if not available * @gfp: context flags * * This function is called when a configured connection quality monitoring @@ -5397,7 +5398,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, */ void cfg80211_cqm_rssi_notify(struct net_device *dev, enum nl80211_cqm_rssi_threshold_event rssi_event, - gfp_t gfp); + s32 rssi_level, gfp_t gfp); /** * cfg80211_cqm_pktloss_notify - notify userspace about packetloss to peer diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d6c62ee9bd1d..cd547b864595 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3952,6 +3952,8 @@ enum nl80211_ps_state { * %NL80211_CMD_NOTIFY_CQM. Set to 0 to turn off TX error reporting. * @NL80211_ATTR_CQM_BEACON_LOSS_EVENT: flag attribute that's set in a beacon * loss event + * @NL80211_ATTR_CQM_RSSI_LEVEL: the RSSI value in dBm that triggered the + * RSSI threshold event. 
* @__NL80211_ATTR_CQM_AFTER_LAST: internal * @NL80211_ATTR_CQM_MAX: highest key attribute */ @@ -3965,6 +3967,7 @@ enum nl80211_attr_cqm { NL80211_ATTR_CQM_TXE_PKTS, NL80211_ATTR_CQM_TXE_INTVL, NL80211_ATTR_CQM_BEACON_LOSS_EVENT, + NL80211_ATTR_CQM_RSSI_LEVEL, /* keep last */ __NL80211_ATTR_CQM_AFTER_LAST, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ee423688c92e..6e90301154d5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5048,7 +5048,7 @@ void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level); - cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp); + cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp); } EXPORT_SYMBOL(ieee80211_cqm_rssi_notify); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b455898df63c..9d738f75bd4e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9474,6 +9474,7 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = { [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 }, [NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 }, [NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 }, + [NL80211_ATTR_CQM_RSSI_LEVEL] = { .type = NLA_S32 }, }; static int nl80211_set_cqm_txe(struct genl_info *info, @@ -13959,11 +13960,11 @@ static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp) void cfg80211_cqm_rssi_notify(struct net_device *dev, enum nl80211_cqm_rssi_threshold_event rssi_event, - gfp_t gfp) + s32 rssi_level, gfp_t gfp) { struct sk_buff *msg; - trace_cfg80211_cqm_rssi_notify(dev, rssi_event); + trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level); if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW && rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH)) @@ -13977,6 +13978,10 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, rssi_event)) goto nla_put_failure; + if (rssi_level && nla_put_s32(msg, NL80211_ATTR_CQM_RSSI_LEVEL, + rssi_level)) + goto nla_put_failure; + cfg80211_send_cqm(msg, gfp); return; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ea1b47e04fa4..2419c390f150 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2490,18 +2490,21 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, TRACE_EVENT(cfg80211_cqm_rssi_notify, TP_PROTO(struct net_device *netdev, - enum nl80211_cqm_rssi_threshold_event rssi_event), - TP_ARGS(netdev, rssi_event), + enum nl80211_cqm_rssi_threshold_event rssi_event, + s32 rssi_level), + TP_ARGS(netdev, rssi_event, rssi_level), TP_STRUCT__entry( NETDEV_ENTRY __field(enum nl80211_cqm_rssi_threshold_event, rssi_event) + __field(s32, rssi_level) ), TP_fast_assign( NETDEV_ASSIGN; __entry->rssi_event = rssi_event; + __entry->rssi_level = rssi_level; ), - TP_printk(NETDEV_PR_FMT ", rssi event: %d", - NETDEV_PR_ARG, __entry->rssi_event) + TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d", + NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level) ); TRACE_EVENT(cfg80211_reg_can_beacon, -- cgit v1.2.3 From c078ca3b0c5bf82c2b31906c446d6e2ad8ea0783 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 17 Jan 2017 22:51:26 +0100 Subject: netfilter: nft_exthdr: Add support for existence check If NFT_EXTHDR_F_PRESENT is set, exthdr will not copy any header field data into *dest, but instead set it to 1 if the header is found and 0 otherwise. 
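The register semantics can be summarized in a distilled model (an illustration of the behaviour, not the kernel code itself; names mirror the uapi constants):

#include <stdint.h>

#define NFT_EXTHDR_F_PRESENT (1 << 0)

/* Distilled model of the eval semantics: with the flag set, the
 * destination register holds a boolean instead of the copied header
 * bytes, and a missing header is a normal non-match rather than a
 * rule break.
 */
static void exthdr_presence_model(uint32_t *dest, int find_err,
				  uint8_t flags)
{
	if (flags & NFT_EXTHDR_F_PRESENT) {
		*dest = (find_err >= 0);	/* 1: header found, 0: absent */
		return;
	}
	/* without the flag: copy the field on success, or end rule
	 * evaluation with NFT_BREAK when the header is absent */
}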
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nft_exthdr.c | 22 ++++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7b730cab99bd..53aac8b8ed6b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -704,6 +704,10 @@ enum nft_payload_attributes { }; #define NFTA_PAYLOAD_MAX (__NFTA_PAYLOAD_MAX - 1) +enum nft_exthdr_flags { + NFT_EXTHDR_F_PRESENT = (1 << 0), +}; + /** * enum nft_exthdr_attributes - nf_tables IPv6 extension header expression netlink attributes * @@ -711,6 +715,7 @@ enum nft_payload_attributes { * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8) * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) + * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -718,6 +723,7 @@ enum nft_exthdr_attributes { NFTA_EXTHDR_TYPE, NFTA_EXTHDR_OFFSET, NFTA_EXTHDR_LEN, + NFTA_EXTHDR_FLAGS, __NFTA_EXTHDR_MAX }; #define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 47beb3abcc9d..a89e5ab150db 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -23,6 +23,7 @@ struct nft_exthdr { u8 offset; u8 len; enum nft_registers dreg:8; + u8 flags; }; static void nft_exthdr_eval(const struct nft_expr *expr, @@ -35,8 +36,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr, int err; err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); - if (err < 0) + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = (err >= 0); + return; + } else if (err < 0) { goto err; + } offset += priv->offset; dest[priv->len / NFT_REG32_SIZE] = 0; @@ -52,6 +57,7 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, + [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, @@ -59,7 +65,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 offset, len; + u32 offset, len, flags = 0; int err; if (tb[NFTA_EXTHDR_DREG] == NULL || @@ -76,10 +82,20 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (tb[NFTA_EXTHDR_FLAGS]) { + err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags); + if (err < 0) + return err; + + if (flags & ~NFT_EXTHDR_F_PRESENT) + return -EINVAL; + } + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); + priv->flags = flags; return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, priv->len); @@ -97,6 +113,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags))) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.3 From ab23821f7ecfb022a4aec78fb6f4fd0f6aa1ccab Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Feb 2017 13:35:48 +0100 Subject: netfilter: nft_ct: add zone id get 
support Just like with counters the direction attribute is optional. We set priv->dir to MAX unconditionally to avoid duplicating the assignment for all keys with optional direction. For keys where direction is mandatory, existing code already returns an error. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_ct.c | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 53aac8b8ed6b..3e60ed78c538 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -870,6 +870,7 @@ enum nft_rt_attributes { * @NFT_CT_PKTS: conntrack packets * @NFT_CT_BYTES: conntrack bytes * @NFT_CT_AVGPKT: conntrack average bytes per packet + * @NFT_CT_ZONE: conntrack zone */ enum nft_ct_keys { NFT_CT_STATE, @@ -889,6 +890,7 @@ enum nft_ct_keys { NFT_CT_PKTS, NFT_CT_BYTES, NFT_CT_AVGPKT, + NFT_CT_ZONE, }; /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 66a2377510e1..5bd4cdfdcda5 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -151,6 +151,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr, case NFT_CT_PROTOCOL: *dest = nf_ct_protonum(ct); return; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: { + const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + + if (priv->dir < IP_CT_DIR_MAX) + *dest = nf_ct_zone_id(zone, priv->dir); + else + *dest = zone->id; + + return; + } +#endif default: break; } @@ -266,6 +278,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + priv->dir = IP_CT_DIR_MAX; switch (priv->key) { case NFT_CT_DIRECTION: if (tb[NFTA_CT_DIRECTION] != NULL) @@ -333,11 +346,13 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: - /* no direction? return sum of original + reply */ - if (tb[NFTA_CT_DIRECTION] == NULL) - priv->dir = IP_CT_DIR_MAX; len = sizeof(u64); break; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: + len = sizeof(u16); + break; +#endif default: return -EOPNOTSUPP; } @@ -465,6 +480,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: + case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; -- cgit v1.2.3 From 935b7f643018878bd9d4193eea8b575aff736b9b Mon Sep 17 00:00:00 2001 From: Manuel Messner Date: Tue, 7 Feb 2017 03:14:53 +0100 Subject: netfilter: nft_exthdr: add TCP option matching This patch implements the kernel side of the TCP option patch. 
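As an assumed example of how a frontend would program the new expression (values chosen for illustration, not taken from the patch): to load the 16-bit MSS from a SYN's option list, the option kind is 2 (TCPOPT_MSS) and the value starts two bytes into the option, after the kind and length bytes. See the diff below for the kernel-side handling:

#include <stdint.h>

/* Assumed example: the values a userspace frontend would place in
 * the NFTA_EXTHDR_* netlink attributes to load the TCP MSS option
 * into a register.  A TCP option is laid out as kind, length, data.
 */
static const struct {
	uint32_t op;	 /* NFTA_EXTHDR_OP: NFT_EXTHDR_OP_TCPOPT (1) */
	uint8_t  type;	 /* NFTA_EXTHDR_TYPE: TCPOPT_MSS (2) */
	uint32_t offset; /* NFTA_EXTHDR_OFFSET: skip kind + length bytes */
	uint32_t len;	 /* NFTA_EXTHDR_LEN: two bytes of MSS value */
} mss_match = { 1, 2, 2, 2 };

At eval time the kernel walks the option list with a zero-length-safe length helper, finds the option whose kind equals the requested type, and copies len bytes from the requested offset into the destination register.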
Signed-off-by: Manuel Messner Reviewed-by: Florian Westphal Acked-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 17 ++++- net/netfilter/Kconfig | 4 +- net/netfilter/nft_exthdr.c | 119 +++++++++++++++++++++++++++---- 3 files changed, 124 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 3e60ed78c538..207951516ede 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -709,13 +709,27 @@ enum nft_exthdr_flags { }; /** - * enum nft_exthdr_attributes - nf_tables IPv6 extension header expression netlink attributes + * enum nft_exthdr_op - nf_tables match options + * + * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers + * @NFT_EXTHDR_OP_TCPOPT: match against tcp options + */ +enum nft_exthdr_op { + NFT_EXTHDR_OP_IPV6, + NFT_EXTHDR_OP_TCPOPT, + __NFT_EXTHDR_OP_MAX +}; +#define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) + +/** + * enum nft_exthdr_attributes - nf_tables extension header expression netlink attributes * * @NFTA_EXTHDR_DREG: destination register (NLA_U32: nft_registers) * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8) * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) + * @NFTA_EXTHDR_OP: option match type (NLA_U8) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -724,6 +738,7 @@ enum nft_exthdr_attributes { NFTA_EXTHDR_OFFSET, NFTA_EXTHDR_LEN, NFTA_EXTHDR_FLAGS, + NFTA_EXTHDR_OP, __NFTA_EXTHDR_MAX }; #define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ea479ed43373..9b28864cc36a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -467,10 +467,10 @@ config NF_TABLES_NETDEV This option enables support for the "netdev" table. config NFT_EXTHDR - tristate "Netfilter nf_tables IPv6 exthdr module" + tristate "Netfilter nf_tables exthdr module" help This option adds the "exthdr" expression that you can use to match - IPv6 extension headers. + IPv6 extension headers and tcp options.
config NFT_META tristate "Netfilter nf_tables meta module" diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a89e5ab150db..c308920b194c 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -15,20 +15,29 @@ #include #include #include -// FIXME: -#include <net/ipv6.h> +#include <net/tcp.h> struct nft_exthdr { u8 type; u8 offset; u8 len; + u8 op; enum nft_registers dreg:8; u8 flags; }; -static void nft_exthdr_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static unsigned int optlen(const u8 *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0) + return 1; + else + return opt[offset + 1]; +} + +static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; @@ -52,6 +61,53 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_tcp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int i, optl, tcphdr_len, offset; + u32 *dest = &regs->data[priv->dreg]; + struct tcphdr *tcph; + u8 *opt; + + if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) + goto err; + + tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff); + if (!tcph) + goto err; + + tcphdr_len = __tcp_hdrlen(tcph); + if (tcphdr_len < sizeof(*tcph)) + goto err; + + tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff); + if (!tcph) + goto err; + + opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { + optl = optlen(opt, i); + + if (priv->type != opt[i]) + continue; + + if (i + optl > tcphdr_len || priv->len + priv->offset > optl) + goto err; + + offset = i + priv->offset; + dest[priv->len / NFT_REG32_SIZE] = 0; + memcpy(dest, opt + offset, priv->len); + + return; + } + +err: + regs->verdict.code = NFT_BREAK; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -65,13 +121,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 offset, len, flags = 0; + u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; int err; - if (tb[NFTA_EXTHDR_DREG] == NULL || - tb[NFTA_EXTHDR_TYPE] == NULL || - tb[NFTA_EXTHDR_OFFSET] == NULL || - tb[NFTA_EXTHDR_LEN] == NULL) + if (!tb[NFTA_EXTHDR_DREG] || + !tb[NFTA_EXTHDR_TYPE] || + !tb[NFTA_EXTHDR_OFFSET] || + !tb[NFTA_EXTHDR_LEN]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); @@ -91,11 +147,18 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, return -EINVAL; } + if (tb[NFTA_EXTHDR_OP]) { + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); + if (err < 0) + return err; + } + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); priv->flags = flags; + priv->op = op; return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, priv->len); @@ -115,6 +178,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags))) goto nla_put_failure; + if
(nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op))) + goto nla_put_failure; return 0; nla_put_failure: @@ -122,17 +187,45 @@ nla_put_failure: } static struct nft_expr_type nft_exthdr_type; -static const struct nft_expr_ops nft_exthdr_ops = { +static const struct nft_expr_ops nft_exthdr_ipv6_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_ipv6_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + +static const struct nft_expr_ops nft_exthdr_tcp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), - .eval = nft_exthdr_eval, + .eval = nft_exthdr_tcp_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops * +nft_exthdr_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + u32 op; + + if (!tb[NFTA_EXTHDR_OP]) + return &nft_exthdr_ipv6_ops; + + op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP])); + switch (op) { + case NFT_EXTHDR_OP_TCPOPT: + return &nft_exthdr_tcp_ops; + case NFT_EXTHDR_OP_IPV6: + return &nft_exthdr_ipv6_ops; + } + + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_exthdr_type __read_mostly = { .name = "exthdr", - .ops = &nft_exthdr_ops, + .select_ops = &nft_exthdr_select_ops, .policy = nft_exthdr_policy, .maxattr = NFTA_EXTHDR_MAX, .owner = THIS_MODULE, -- cgit v1.2.3 From 8585989d146c61dd073d2135c5bb11d0f979d576 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Wed, 8 Feb 2017 15:00:34 +0200 Subject: cfg80211: fix NAN bands definition The nl80211_nan_dual_band_conf enumeration doesn't make much sense. The default value is assigned to a bit, which makes it weird if the default bit and other bits are set at the same time. To improve this, get rid of NL80211_NAN_BAND_DEFAULT and add a wiphy configuration to let the drivers define which bands are supported. This is exposed to the userspace, which then can make a decision on which band(s) to use. Additionally, rename all "dual_band" elements to "bands", to make things clearer. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 18 ++++++++++---- include/uapi/linux/nl80211.h | 57 ++++++++++++++++++++------------------------ net/mac80211/cfg.c | 4 ++-- net/mac80211/trace.h | 16 ++++++------- net/wireless/core.c | 3 ++- net/wireless/nl80211.c | 35 ++++++++++++++++++++------- net/wireless/trace.h | 16 ++++++------- 7 files changed, 86 insertions(+), 63 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a2c18b53e053..c92dc03c8528 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5,7 +5,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015-2016 Intel Deutschland GmbH + * Copyright 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -2416,11 +2416,13 @@ struct cfg80211_qos_map { * This struct defines NAN configuration parameters * * @master_pref: master preference (1 - 255) - * @dual: dual band operation mode, see &enum nl80211_nan_dual_band_conf + * @bands: operating bands, a bitmap of &enum nl80211_band values. + * For instance, for NL80211_BAND_2GHZ, bit 0 would be set + * (i.e. BIT(NL80211_BAND_2GHZ)). 
*/ struct cfg80211_nan_conf { u8 master_pref; - u8 dual; + u8 bands; }; /** @@ -2428,11 +2430,11 @@ struct cfg80211_nan_conf { * configuration * * @CFG80211_NAN_CONF_CHANGED_PREF: master preference - * @CFG80211_NAN_CONF_CHANGED_DUAL: dual band operation + * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands */ enum cfg80211_nan_conf_changes { CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), - CFG80211_NAN_CONF_CHANGED_DUAL = BIT(1), + CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1), }; /** @@ -3596,6 +3598,10 @@ struct wiphy_iftype_ext_capab { * attribute indices defined in &enum nl80211_bss_select_attr. * * @cookie_counter: unique generic cookie counter, used to identify objects. + * @nan_supported_bands: bands supported by the device in NAN mode, a + * bitmap of &enum nl80211_band values. For instance, for + * NL80211_BAND_2GHZ, bit 0 would be set + * (i.e. BIT(NL80211_BAND_2GHZ)). */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -3727,6 +3733,8 @@ struct wiphy { u64 cookie_counter; + u8 nan_supported_bands; + char priv[0] __aligned(NETDEV_ALIGN); }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index cd547b864595..5ed257c4cd4e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -10,7 +10,7 @@ * Copyright 2008, 2009 Luis R. Rodriguez * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2017 Intel Deutschland GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -854,12 +854,15 @@ * cfg80211_scan_done(). * * @NL80211_CMD_START_NAN: Start NAN operation, identified by its - * %NL80211_ATTR_WDEV interface. This interface must have been previously - * created with %NL80211_CMD_NEW_INTERFACE. After it has been started, the - * NAN interface will create or join a cluster. This command must have a - * valid %NL80211_ATTR_NAN_MASTER_PREF attribute and optional - * %NL80211_ATTR_NAN_DUAL attributes. - * After this command NAN functions can be added. + * %NL80211_ATTR_WDEV interface. This interface must have been + * previously created with %NL80211_CMD_NEW_INTERFACE. After it + * has been started, the NAN interface will create or join a + * cluster. This command must have a valid + * %NL80211_ATTR_NAN_MASTER_PREF attribute and optional + * %NL80211_ATTR_BANDS attributes. If %NL80211_ATTR_BANDS is + * omitted or set to 0, it means don't-care and the device will + * decide what to use. After this command NAN functions can be + * added. * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by * its %NL80211_ATTR_WDEV interface. * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined @@ -880,10 +883,14 @@ * This command is also used as a notification sent when a NAN function is * terminated. This will contain a %NL80211_ATTR_NAN_FUNC_INST_ID * and %NL80211_ATTR_COOKIE attributes. - * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN configuration. NAN - * must be operational (%NL80211_CMD_START_NAN was executed). - * It must contain at least one of the following attributes: - * %NL80211_ATTR_NAN_MASTER_PREF, %NL80211_ATTR_NAN_DUAL. + * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN + * configuration. NAN must be operational (%NL80211_CMD_START_NAN + * was executed). It must contain at least one of the following + * attributes: %NL80211_ATTR_NAN_MASTER_PREF, + * %NL80211_ATTR_BANDS. 
If %NL80211_ATTR_BANDS is omitted, the + * current configuration is not changed. If it is present but + * set to zero, the configuration is changed to don't-care + * (i.e. the device can decide what to do). * @NL80211_CMD_NAN_FUNC_MATCH: Notification sent when a match is reported. * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and * %NL80211_ATTR_COOKIE. @@ -1963,10 +1970,13 @@ enum nl80211_commands { * %NL80211_CMD_CHANGE_NAN_CONFIG. Its type is u8 and it can't be 0. * Also, values 1 and 255 are reserved for certification purposes and * should not be used during a normal device operation. - * @NL80211_ATTR_NAN_DUAL: NAN dual band operation config (see - * &enum nl80211_nan_dual_band_conf). This attribute is used with - * %NL80211_CMD_START_NAN and optionally with - * %NL80211_CMD_CHANGE_NAN_CONFIG. + * @NL80211_ATTR_BANDS: operating bands configuration. This is a u32 + * bitmask of BIT(NL80211_BAND_*) as described in %enum + * nl80211_band. For instance, for NL80211_BAND_2GHZ, bit 0 + * would be set. This attribute is used with + * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG, and + * it is optional. If no bands are set, it means don't-care and + * the device will decide what to use. * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. See * &enum nl80211_nan_func_attributes for description of this nested * attribute. @@ -2397,7 +2407,7 @@ enum nl80211_attrs { NL80211_ATTR_MESH_PEER_AID, NL80211_ATTR_NAN_MASTER_PREF, - NL80211_ATTR_NAN_DUAL, + NL80211_ATTR_BANDS, NL80211_ATTR_NAN_FUNC, NL80211_ATTR_NAN_MATCH, @@ -5070,21 +5080,6 @@ enum nl80211_bss_select_attr { NL80211_BSS_SELECT_ATTR_MAX = __NL80211_BSS_SELECT_ATTR_AFTER_LAST - 1 }; -/** - * enum nl80211_nan_dual_band_conf - NAN dual band configuration - * - * Defines the NAN dual band mode of operation - * - * @NL80211_NAN_BAND_DEFAULT: device default mode - * @NL80211_NAN_BAND_2GHZ: 2.4GHz mode - * @NL80211_NAN_BAND_5GHZ: 5GHz mode - */ -enum nl80211_nan_dual_band_conf { - NL80211_NAN_BAND_DEFAULT = 1 << 0, - NL80211_NAN_BAND_2GHZ = 1 << 1, - NL80211_NAN_BAND_5GHZ = 1 << 2, -}; - /** * enum nl80211_nan_function_type - NAN function type * diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a0be2f6cd121..ac879bb17870 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -208,8 +208,8 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy, if (changes & CFG80211_NAN_CONF_CHANGED_PREF) new_conf.master_pref = conf->master_pref; - if (changes & CFG80211_NAN_CONF_CHANGED_DUAL) - new_conf.dual = conf->dual; + if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) + new_conf.bands = conf->bands; ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); if (!ret) diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index f78d9f4f8711..0d645bc148d0 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1736,21 +1736,21 @@ TRACE_EVENT(drv_start_nan, LOCAL_ENTRY VIF_ENTRY __field(u8, master_pref) - __field(u8, dual) + __field(u8, bands) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->master_pref = conf->master_pref; - __entry->dual = conf->dual; + __entry->bands = conf->bands; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT - ", master preference: %u, dual: %d", + ", master preference: %u, bands: 0x%0x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, - __entry->dual + __entry->bands ) ); @@ -1787,7 +1787,7 @@ TRACE_EVENT(drv_nan_change_conf, LOCAL_ENTRY VIF_ENTRY __field(u8, master_pref) - __field(u8, dual) + __field(u8, bands) __field(u32, changes) ), @@ -1795,15 +1795,15 @@ 
TRACE_EVENT(drv_nan_change_conf, LOCAL_ASSIGN; VIF_ASSIGN; __entry->master_pref = conf->master_pref; - __entry->dual = conf->dual; + __entry->bands = conf->bands; __entry->changes = changes; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT - ", master preference: %u, dual: %d, changes: 0x%x", + ", master preference: %u, bands: 0x%0x, changes: 0x%x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, - __entry->dual, __entry->changes + __entry->bands, __entry->changes ) ); diff --git a/net/wireless/core.c b/net/wireless/core.c index 903fc419217a..e55e05bc4805 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -626,7 +626,8 @@ int wiphy_register(struct wiphy *wiphy) if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && (!rdev->ops->start_nan || !rdev->ops->stop_nan || - !rdev->ops->add_nan_func || !rdev->ops->del_nan_func))) + !rdev->ops->add_nan_func || !rdev->ops->del_nan_func || + !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ))))) return -EINVAL; #ifndef CONFIG_WIRELESS_WDS diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9d738f75bd4e..b5f755b3ac5d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -398,7 +398,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { }, [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, - [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 }, + [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, @@ -1886,6 +1886,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, } } + if (nla_put_u32(msg, NL80211_ATTR_BANDS, + rdev->wiphy.nan_supported_bands)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -10777,15 +10781,22 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - if (!info->attrs[NL80211_ATTR_NAN_DUAL]) - return -EINVAL; - conf.master_pref = nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); if (!conf.master_pref) return -EINVAL; - conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wdev->wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf.bands = bands; + } err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -11150,9 +11161,17 @@ static int nl80211_nan_change_config(struct sk_buff *skb, changed |= CFG80211_NAN_CONF_CHANGED_PREF; } - if (info->attrs[NL80211_ATTR_NAN_DUAL]) { - conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); - changed |= CFG80211_NAN_CONF_CHANGED_DUAL; + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wdev->wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf.bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; } if (!changed) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 2419c390f150..776e80cef9b4 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1915,18 +1915,18 @@ TRACE_EVENT(rdev_start_nan, WIPHY_ENTRY WDEV_ENTRY __field(u8, master_pref) - __field(u8, dual); + __field(u8, bands); ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->master_pref = 
conf->master_pref; - __entry->dual = conf->dual; + __entry->bands = conf->bands; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT - ", master preference: %u, dual: %d", + ", master preference: %u, bands: 0x%0x", WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, - __entry->dual) + __entry->bands) ); TRACE_EVENT(rdev_nan_change_conf, @@ -1937,20 +1937,20 @@ TRACE_EVENT(rdev_nan_change_conf, WIPHY_ENTRY WDEV_ENTRY __field(u8, master_pref) - __field(u8, dual); + __field(u8, bands); __field(u32, changes); ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->master_pref = conf->master_pref; - __entry->dual = conf->dual; + __entry->bands = conf->bands; __entry->changes = changes; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT - ", master preference: %u, dual: %d, changes: %x", + ", master preference: %u, bands: 0x%0x, changes: %x", WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, - __entry->dual, __entry->changes) + __entry->bands, __entry->changes) ); DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan, -- cgit v1.2.3 From 19ba1eb15a2a9b7298d1d984043025ab9496fbfb Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 7 Feb 2017 23:58:02 -0800 Subject: Input: psmouse - add a custom serio protocol to send extra information The tracksticks on the Lenovo thinkpads have their buttons connected through the touchpad device. We already fixed that in synaptics.c, but when we switch the device into RMI4 mode to have proper support, the pass-through functionality can't deal with them easily. We add a new PS/2 flag and protocol designed for psmouse. The RMI4 F03 pass-through can then emit a special set of commands to notify psmouse the state of the buttons. This patch implements the protocol in psmouse, while an other will do the same for rmi4-f03. Signed-off-by: Benjamin Tissoires Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/psmouse-base.c | 41 +++++++++++++++++++++++++++++++++++--- drivers/input/mouse/psmouse.h | 5 +++++ include/uapi/linux/serio.h | 7 ++++--- 3 files changed, 47 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c index bee267424972..a598b7223cef 100644 --- a/drivers/input/mouse/psmouse-base.c +++ b/drivers/input/mouse/psmouse-base.c @@ -127,6 +127,13 @@ struct psmouse_protocol { int (*init)(struct psmouse *); }; +static void psmouse_report_standard_buttons(struct input_dev *dev, u8 buttons) +{ + input_report_key(dev, BTN_LEFT, buttons & BIT(0)); + input_report_key(dev, BTN_MIDDLE, buttons & BIT(2)); + input_report_key(dev, BTN_RIGHT, buttons & BIT(1)); +} + /* * psmouse_process_byte() analyzes the PS/2 data stream and reports * relevant events to the input module once full packet has arrived. @@ -199,9 +206,8 @@ psmouse_ret_t psmouse_process_byte(struct psmouse *psmouse) } /* Generic PS/2 Mouse */ - input_report_key(dev, BTN_LEFT, packet[0] & 1); - input_report_key(dev, BTN_MIDDLE, (packet[0] >> 2) & 1); - input_report_key(dev, BTN_RIGHT, (packet[0] >> 1) & 1); + psmouse_report_standard_buttons(dev, + packet[0] | psmouse->extra_buttons); input_report_rel(dev, REL_X, packet[1] ? (int) packet[1] - (int) ((packet[0] << 4) & 0x100) : 0); input_report_rel(dev, REL_Y, packet[2] ? 
(int) ((packet[0] << 3) & 0x100) - (int) packet[2] : 0); @@ -282,6 +288,30 @@ static int psmouse_handle_byte(struct psmouse *psmouse) return 0; } +static void psmouse_handle_oob_data(struct psmouse *psmouse, u8 data) +{ + switch (psmouse->oob_data_type) { + case PSMOUSE_OOB_NONE: + psmouse->oob_data_type = data; + break; + + case PSMOUSE_OOB_EXTRA_BTNS: + psmouse_report_standard_buttons(psmouse->dev, data); + input_sync(psmouse->dev); + + psmouse->extra_buttons = data; + psmouse->oob_data_type = PSMOUSE_OOB_NONE; + break; + + default: + psmouse_warn(psmouse, + "unknown OOB_DATA type: 0x%02x\n", + psmouse->oob_data_type); + psmouse->oob_data_type = PSMOUSE_OOB_NONE; + break; + } +} + /* * psmouse_interrupt() handles incoming characters, either passing them * for normal processing or gathering them as command response. @@ -306,6 +336,11 @@ static irqreturn_t psmouse_interrupt(struct serio *serio, goto out; } + if (flags & SERIO_OOB_DATA) { + psmouse_handle_oob_data(psmouse, data); + goto out; + } + if (unlikely(psmouse->ps2dev.flags & PS2_FLAG_ACK)) if (ps2_handle_ack(&psmouse->ps2dev, data)) goto out; diff --git a/drivers/input/mouse/psmouse.h b/drivers/input/mouse/psmouse.h index e0ca6cda3d16..8c83b8e2505c 100644 --- a/drivers/input/mouse/psmouse.h +++ b/drivers/input/mouse/psmouse.h @@ -1,6 +1,9 @@ #ifndef _PSMOUSE_H #define _PSMOUSE_H +#define PSMOUSE_OOB_NONE 0x00 +#define PSMOUSE_OOB_EXTRA_BTNS 0x01 + #define PSMOUSE_CMD_SETSCALE11 0x00e6 #define PSMOUSE_CMD_SETSCALE21 0x00e7 #define PSMOUSE_CMD_SETRES 0x10e8 @@ -53,6 +56,8 @@ struct psmouse { unsigned char pktcnt; unsigned char pktsize; unsigned char type; + unsigned char oob_data_type; + unsigned char extra_buttons; bool ignore_parity; bool acks_disable_command; unsigned int model; diff --git a/include/uapi/linux/serio.h b/include/uapi/linux/serio.h index f2447a83ac8d..ccd0ccd00f47 100644 --- a/include/uapi/linux/serio.h +++ b/include/uapi/linux/serio.h @@ -17,9 +17,10 @@ /* * bit masks for use in "interrupt" flags (3rd argument) */ -#define SERIO_TIMEOUT 1 -#define SERIO_PARITY 2 -#define SERIO_FRAME 4 +#define SERIO_TIMEOUT BIT(0) +#define SERIO_PARITY BIT(1) +#define SERIO_FRAME BIT(2) +#define SERIO_OOB_DATA BIT(3) /* * Serio types -- cgit v1.2.3 From a92ce1a42dde1caaee4afae67531e3e7acecf6e4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 9 Feb 2017 01:18:18 +0800 Subject: sctp: implement sender-side procedures for SSN/TSN Reset Request Parameter This patch is to implement Sender-Side Procedures for the SSN/TSN Reset Request Parameter described in rfc6525 section 5.1.4. It is also to add sockopt SCTP_RESET_ASSOC in rfc6525 section 6.3.3 for users. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S.
Miller --- include/net/sctp/sctp.h | 1 + include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 29 +++++++++++++++++++++++++++++ net/sctp/stream.c | 40 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+) (limited to 'include/uapi') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 480b65a24aff..b60ca14068d8 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -198,6 +198,7 @@ int sctp_offload_init(void); */ int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params); +int sctp_send_reset_assoc(struct sctp_association *asoc); /* * Module global variables diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 03c27cefffb1..c0bd8c3d565a 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -117,6 +117,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_PR_ASSOC_STATUS 115 #define SCTP_ENABLE_STREAM_RESET 118 #define SCTP_RESET_STREAMS 119 +#define SCTP_RESET_ASSOC 120 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a8b4252fe084..45a7c417eb7f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3818,6 +3818,32 @@ out: return retval; } +static int sctp_setsockopt_reset_assoc(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + sctp_assoc_t associd; + int retval = -EINVAL; + + if (optlen != sizeof(associd)) + goto out; + + if (copy_from_user(&associd, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, associd); + if (!asoc) + goto out; + + retval = sctp_send_reset_assoc(asoc); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3990,6 +4016,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_RESET_STREAMS: retval = sctp_setsockopt_reset_streams(sk, optval, optlen); break; + case SCTP_RESET_ASSOC: + retval = sctp_setsockopt_reset_assoc(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 6a686e330c57..53e49fc2f0a3 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -177,3 +177,43 @@ int sctp_send_reset_streams(struct sctp_association *asoc, out: return retval; } + +int sctp_send_reset_assoc(struct sctp_association *asoc) +{ + struct sctp_chunk *chunk = NULL; + int retval; + __u16 i; + + if (!asoc->peer.reconf_capable || + !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) + return -ENOPROTOOPT; + + if (asoc->strreset_outstanding) + return -EINPROGRESS; + + chunk = sctp_make_strreset_tsnreq(asoc); + if (!chunk) + return -ENOMEM; + + /* Block further xmit of data until this request is completed */ + for (i = 0; i < asoc->stream->outcnt; i++) + asoc->stream->out[i].state = SCTP_STREAM_CLOSED; + + asoc->strreset_chunk = chunk; + sctp_chunk_hold(asoc->strreset_chunk); + + retval = sctp_send_reconf(asoc, chunk); + if (retval) { + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + + for (i = 0; i < asoc->stream->outcnt; i++) + asoc->stream->out[i].state = SCTP_STREAM_OPEN; + + return retval; + } + + asoc->strreset_outstanding = 1; + + return 0; +} -- cgit v1.2.3 From 242bd2d519d7194633e309286ba7ba29a1ad63e8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 9 Feb 2017 01:18:20 +0800 Subject: sctp: implement sender-side procedures for Add Incoming/Outgoing Streams Request Parameter This patch is to implement 
Sender-Side Procedures for the Add Outgoing and Incoming Streams Request Parameter described in rfc6525 section 5.1.5-5.1.6. It is also to add sockopt SCTP_ADD_STREAMS in rfc6525 section 6.3.4 for users. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 ++ include/uapi/linux/sctp.h | 7 +++++ net/sctp/socket.c | 29 ++++++++++++++++++ net/sctp/stream.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+) (limited to 'include/uapi') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index b60ca14068d8..6dfc5536a3e6 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -199,6 +199,8 @@ int sctp_offload_init(void); int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params); int sctp_send_reset_assoc(struct sctp_association *asoc); +int sctp_send_add_streams(struct sctp_association *asoc, + struct sctp_add_streams *params); /* * Module global variables diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index c0bd8c3d565a..a91a9cccbae6 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -118,6 +118,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_ENABLE_STREAM_RESET 118 #define SCTP_RESET_STREAMS 119 #define SCTP_RESET_ASSOC 120 +#define SCTP_ADD_STREAMS 121 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -1027,4 +1028,10 @@ struct sctp_reset_streams { uint16_t srs_stream_list[]; /* list if srs_num_streams is not 0 */ }; +struct sctp_add_streams { + sctp_assoc_t sas_assoc_id; + uint16_t sas_instrms; + uint16_t sas_outstrms; +}; + #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 45a7c417eb7f..75f35cea4371 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3844,6 +3844,32 @@ out: return retval; } +static int sctp_setsockopt_add_streams(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_add_streams params; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.sas_assoc_id); + if (!asoc) + goto out; + + retval = sctp_send_add_streams(asoc, &params); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4019,6 +4045,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_RESET_ASSOC: retval = sctp_setsockopt_reset_assoc(sk, optval, optlen); break; + case SCTP_ADD_STREAMS: + retval = sctp_setsockopt_add_streams(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 53e49fc2f0a3..eb02490245ba 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -217,3 +217,80 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) return 0; } + +int sctp_send_add_streams(struct sctp_association *asoc, + struct sctp_add_streams *params) +{ + struct sctp_stream *stream = asoc->stream; + struct sctp_chunk *chunk = NULL; + int retval = -ENOMEM; + __u32 outcnt, incnt; + __u16 out, in; + + if (!asoc->peer.reconf_capable || + !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { + retval = -ENOPROTOOPT; + goto out; + } + + if (asoc->strreset_outstanding) { + retval = -EINPROGRESS; + goto out; + } + + out = params->sas_outstrms; + in = params->sas_instrms; + outcnt = stream->outcnt + out; +
incnt = stream->incnt + in; + if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM || + (!out && !in)) { + retval = -EINVAL; + goto out; + } + + if (out) { + struct sctp_stream_out *streamout; + + streamout = krealloc(stream->out, outcnt * sizeof(*streamout), + GFP_KERNEL); + if (!streamout) + goto out; + + memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); + stream->out = streamout; + } + + if (in) { + struct sctp_stream_in *streamin; + + streamin = krealloc(stream->in, incnt * sizeof(*streamin), + GFP_KERNEL); + if (!streamin) + goto out; + + memset(streamin + stream->incnt, 0, in * sizeof(*streamin)); + stream->in = streamin; + } + + chunk = sctp_make_strreset_addstrm(asoc, out, in); + if (!chunk) + goto out; + + asoc->strreset_chunk = chunk; + sctp_chunk_hold(asoc->strreset_chunk); + + retval = sctp_send_reconf(asoc, chunk); + if (retval) { + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + goto out; + } + + stream->incnt = incnt; + stream->outcnt = outcnt; + + asoc->strreset_outstanding = !!out + !!in; + +out: + return retval; +} -- cgit v1.2.3 From cb80d58fae76d8ea93555149b2b16e19b89a1f4f Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 9 Feb 2017 11:21:55 -0800 Subject: openvswitch: Unionize ovs_key_ct_label with a u32 array. Make the array of labels in struct ovs_key_ct_label an union, adding a u32 array of the same byte size as the existing u8 array. It is faster to loop through the labels 32 bits at the time, which is also the alignment of netlink attributes. Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 8 ++++++-- net/openvswitch/conntrack.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 375d812fea36..96aee34ef55f 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -446,9 +446,13 @@ struct ovs_key_nd { __u8 nd_tll[ETH_ALEN]; }; -#define OVS_CT_LABELS_LEN 16 +#define OVS_CT_LABELS_LEN_32 4 +#define OVS_CT_LABELS_LEN (OVS_CT_LABELS_LEN_32 * sizeof(__u32)) struct ovs_key_ct_labels { - __u8 ct_labels[OVS_CT_LABELS_LEN]; + union { + __u8 ct_labels[OVS_CT_LABELS_LEN]; + __u32 ct_labels_32[OVS_CT_LABELS_LEN_32]; + }; }; /* OVS_KEY_ATTR_CT_STATE flags */ diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a6ff374d57d3..f23934ccce20 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -281,20 +281,21 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, /* Triggers a change event, which makes sense only for * confirmed connections. */ - int err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, - OVS_CT_LABELS_LEN / sizeof(u32)); + int err = nf_connlabels_replace(ct, labels->ct_labels_32, + mask->ct_labels_32, + OVS_CT_LABELS_LEN_32); if (err) return err; } else { u32 *dst = (u32 *)cl->bits; - const u32 *msk = (const u32 *)mask->ct_labels; - const u32 *lbl = (const u32 *)labels->ct_labels; + const u32 *msk = mask->ct_labels_32; + const u32 *lbl = labels->ct_labels_32; int i; /* No-one else has access to the non-confirmed entry, copy * labels over, keeping any bits we are not explicitly setting. 
*/ - for (i = 0; i < OVS_CT_LABELS_LEN / sizeof(u32); i++) + for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) dst[i] = (dst[i] & ~msk[i]) | (lbl[i] & msk[i]); } @@ -866,8 +867,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels) { size_t i; - for (i = 0; i < sizeof(*labels); i++) - if (labels->ct_labels[i]) + for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) + if (labels->ct_labels_32[i]) return true; return false; -- cgit v1.2.3 From 9dd7f8907c3705dc7a7a375d1c6e30b06e6daffc Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 9 Feb 2017 11:21:59 -0800 Subject: openvswitch: Add original direction conntrack tuple to sw_flow_key. Add the fields of the conntrack original direction 5-tuple to struct sw_flow_key. The new fields are initially marked as non-existent, and are populated whenever a conntrack action is executed and either finds or generates a conntrack entry. This means that these fields exist for all packets that were not rejected by conntrack as untrackable. The original tuple fields in the sw_flow_key are filled from the original direction tuple of the conntrack entry relating to the current packet, or from the original direction tuple of the master conntrack entry, if the current conntrack entry has a master. Generally, expected connections of connections having an assigned helper (e.g., FTP) have a master conntrack entry. The main purpose of the new conntrack original tuple fields is to allow matching on them for policy decision purposes, with the premise that the admissibility of tracked connections' reply packets (as well as original direction packets), and both direction packets of any related connections may be based on ACL rules applying to the master connection's original direction 5-tuple. This also makes it easier to make policy decisions when the actual packet headers might have been transformed by NAT, as the original direction 5-tuple represents the packet headers before any such transformation. When using the original direction 5-tuple, the admissibility of return and/or related packets need not be based on the mere existence of a conntrack entry, allowing separation of admission policy from the established conntrack state. While existence of a conntrack entry is required for admission of the return or related packets, policy changes can render connections that were initially admitted to be rejected or dropped afterwards. If the admission of the return and related packets was based on mere conntrack state (e.g., connection being in an established state), a policy change that would make the connection rejected or dropped would need to find and delete all conntrack entries affected by such a change. When using the original direction 5-tuple matching, the affected conntrack entries can be allowed to time out instead, as the established state of the connection would not need to be the basis for packet admission any more. It should be noted that the directionality of related connections may be the same as or different from that of the master connection, and neither the original direction 5-tuple nor the conntrack state bits carry this information. If needed, the directionality of the master connection can be stored in the master's conntrack mark or labels, which are automatically inherited by the expected related connections. The fact that neither ARP nor ND packets are trackable by conntrack allows mutual exclusion between ARP/ND and the new conntrack original tuple fields. Hence, the IP addresses are overlaid in union with ARP and ND fields.
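A compile-time sketch of that overlay (cut-down, mirrored definitions rather than the kernel's sw_flow_key, so the arithmetic is easy to check):

#include <assert.h>
#include <stdint.h>

#define ETH_ALEN 6

/* Cut-down mirror of the patched ipv4 section of the key: the conntrack
 * original-direction addresses share storage with the ARP fields.
 */
struct ipv4_key_sketch {
	struct {
		uint32_t src;			/* IP source address. */
		uint32_t dst;			/* IP destination address. */
	} addr;
	union {
		struct {
			uint32_t src;		/* CT orig tuple IP source. */
			uint32_t dst;		/* CT orig tuple IP destination. */
		} ct_orig;
		struct {
			uint8_t sha[ETH_ALEN];	/* ARP source hardware address. */
			uint8_t tha[ETH_ALEN];	/* ARP target hardware address. */
		} arp;
	};
};

/* The union is as large as its biggest member (the 12-byte ARP pair),
 * so the struct stays at 8 + 12 = 20 bytes with or without ct_orig.
 */
static_assert(sizeof(struct ipv4_key_sketch) ==
	      2 * sizeof(uint32_t) + 2 * ETH_ALEN,
	      "ct_orig must not grow the key");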
This allows the sw_flow_key to not grow much due to this patch, but it also means that we must be careful to never use the new key fields with ARP or ND packets. ARP is easy to distinguish and keep mutually exclusive based on the ethernet type, but ND being an ICMPv6 protocol requires a bit more attention. Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 20 +++++++++- net/openvswitch/actions.c | 2 + net/openvswitch/conntrack.c | 86 +++++++++++++++++++++++++++++++++++++--- net/openvswitch/conntrack.h | 10 ++++- net/openvswitch/flow.c | 34 +++++++++++++--- net/openvswitch/flow.h | 49 ++++++++++++++++++----- net/openvswitch/flow_netlink.c | 85 +++++++++++++++++++++++++++++---------- net/openvswitch/flow_netlink.h | 7 +++- 8 files changed, 246 insertions(+), 47 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 96aee34ef55f..90af8b8e10f8 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2017 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -331,6 +331,8 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */ OVS_KEY_ATTR_CT_MARK, /* u32 connection tracking mark */ OVS_KEY_ATTR_CT_LABELS, /* 16-octet connection tracking label */ + OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, /* struct ovs_key_ct_tuple_ipv4 */ + OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, /* struct ovs_key_ct_tuple_ipv6 */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -472,6 +474,22 @@ struct ovs_key_ct_labels { #define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) +struct ovs_key_ct_tuple_ipv4 { + __be32 ipv4_src; + __be32 ipv4_dst; + __be16 src_port; + __be16 dst_port; + __u8 ipv4_proto; +}; + +struct ovs_key_ct_tuple_ipv6 { + __be32 ipv6_src[4]; + __be32 ipv6_dst[4]; + __be16 src_port; + __be16 dst_port; + __u8 ipv6_proto; +}; + /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. 
* @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index efa9a8858cc6..b1beb2b94ec7 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1074,6 +1074,8 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_ZONE: case OVS_KEY_ATTR_CT_MARK: case OVS_KEY_ATTR_CT_LABELS: + case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4: + case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6: err = -EINVAL; break; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index f989ccf38eab..bfd7606c8be1 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -147,6 +147,20 @@ static void ovs_ct_get_labels(const struct nf_conn *ct, memset(labels, 0, OVS_CT_LABELS_LEN); } +static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key, + const struct nf_conntrack_tuple *orig, + u8 icmp_proto) +{ + key->ct.orig_proto = orig->dst.protonum; + if (orig->dst.protonum == icmp_proto) { + key->ct.orig_tp.src = htons(orig->dst.u.icmp.type); + key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code); + } else { + key->ct.orig_tp.src = orig->src.u.all; + key->ct.orig_tp.dst = orig->dst.u.all; + } +} + static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, const struct nf_conntrack_zone *zone, const struct nf_conn *ct) @@ -155,6 +169,35 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, key->ct.zone = zone->id; key->ct.mark = ovs_ct_get_mark(ct); ovs_ct_get_labels(ct, &key->ct.labels); + + if (ct) { + const struct nf_conntrack_tuple *orig; + + /* Use the master if we have one. */ + if (ct->master) + ct = ct->master; + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + + /* IP version must match with the master connection. */ + if (key->eth.type == htons(ETH_P_IP) && + nf_ct_l3num(ct) == NFPROTO_IPV4) { + key->ipv4.ct_orig.src = orig->src.u3.ip; + key->ipv4.ct_orig.dst = orig->dst.u3.ip; + __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP); + return; + } else if (key->eth.type == htons(ETH_P_IPV6) && + !sw_flow_key_is_nd(key) && + nf_ct_l3num(ct) == NFPROTO_IPV6) { + key->ipv6.ct_orig.src = orig->src.u3.in6; + key->ipv6.ct_orig.dst = orig->dst.u3.in6; + __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP); + return; + } + } + /* Clear 'ct.orig_proto' to mark the non-existence of conntrack + * original direction key fields. + */ + key->ct.orig_proto = 0; } /* Update 'key' based on skb->_nfct. 
If 'post_ct' is true, then OVS has @@ -208,24 +251,55 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) ovs_ct_update_key(skb, NULL, key, false, false); } -int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +#define IN6_ADDR_INITIALIZER(ADDR) \ + { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \ + (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] } + +int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { - if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct.state)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && - nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct.zone)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && - nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), - &key->ct.labels)) + nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels), + &output->ct.labels)) return -EMSGSIZE; + if (swkey->ct.orig_proto) { + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ct_tuple_ipv4 orig = { + output->ipv4.ct_orig.src, + output->ipv4.ct_orig.dst, + output->ct.orig_tp.src, + output->ct.orig_tp.dst, + output->ct.orig_proto, + }; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, + sizeof(orig), &orig)) + return -EMSGSIZE; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ct_tuple_ipv6 orig = { + IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src), + IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst), + output->ct.orig_tp.src, + output->ct.orig_tp.dst, + output->ct.orig_proto, + }; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, + sizeof(orig), &orig)) + return -EMSGSIZE; + } + } + return 0; } diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 8f6230bd6183..9e92445dc092 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); -int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); +int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb); void ovs_ct_free_action(const struct nlattr *a); #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ @@ -79,9 +80,14 @@ static inline void ovs_ct_fill_key(const struct sk_buff *skb, key->ct.zone = 0; key->ct.mark = 0; memset(&key->ct.labels, 0, sizeof(key->ct.labels)); + /* Clear 'ct.orig_proto' to mark the non-existence of original + * direction key fields. + */ + key->ct.orig_proto = 0; } -static inline int ovs_ct_put_key(const struct sw_flow_key *key, +static inline int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { return 0; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2c0a00f7f1b7..9d4bb8eb63f2 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -765,7 +765,7 @@ static int key_extract_mac_proto(struct sk_buff *skb) int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { - int res; + int res, err; /* Extract metadata from packet. 
*/ if (tun_info) { @@ -792,7 +792,6 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; - ovs_ct_fill_key(skb, key); key->ovs_flow_hash = 0; res = key_extract_mac_proto(skb); if (res < 0) @@ -800,17 +799,26 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->mac_proto = res; key->recirc_id = 0; - return key_extract(skb, key); + err = key_extract(skb, key); + if (!err) + ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */ + return err; } int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log) { + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; int err; + err = parse_flow_nlattrs(attr, a, &attrs, log); + if (err) + return -EINVAL; + /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(net, attr, key, log); + err = ovs_nla_get_flow_metadata(net, a, attrs, key, log); if (err) return err; @@ -824,5 +832,21 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, */ skb->protocol = key->eth.type; - return key_extract(skb, key); + err = key_extract(skb, key); + if (err) + return err; + + /* Check that we have conntrack original direction tuple metadata only + * for packets for which it makes sense. Otherwise the key may be + * corrupted due to overlapping key fields. + */ + if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) && + key->eth.type != htons(ETH_P_IP)) + return -EINVAL; + if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) && + (key->eth.type != htons(ETH_P_IPV6) || + sw_flow_key_is_nd(key))) + return -EINVAL; + + return 0; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index f61cae7f9030..76e05b25f030 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2017 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -107,10 +107,16 @@ struct sw_flow_key { __be32 src; /* IP source address. */ __be32 dst; /* IP destination address. */ } addr; - struct { - u8 sha[ETH_ALEN]; /* ARP source hardware address. */ - u8 tha[ETH_ALEN]; /* ARP target hardware address. */ - } arp; + union { + struct { + __be32 src; + __be32 dst; + } ct_orig; /* Conntrack original direction fields. */ + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. */ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; + }; } ipv4; struct { struct { @@ -118,23 +124,44 @@ struct sw_flow_key { struct in6_addr dst; /* IPv6 destination address. */ } addr; __be32 label; /* IPv6 flow label. */ - struct { - struct in6_addr target; /* ND target address. */ - u8 sll[ETH_ALEN]; /* ND source link layer address. */ - u8 tll[ETH_ALEN]; /* ND target link layer address. */ - } nd; + union { + struct { + struct in6_addr src; + struct in6_addr dst; + } ct_orig; /* Conntrack original direction fields. */ + struct { + struct in6_addr target; /* ND target address. */ + u8 sll[ETH_ALEN]; /* ND source link layer address. */ + u8 tll[ETH_ALEN]; /* ND target link layer address. */ + } nd; + }; } ipv6; }; struct { /* Connection tracking fields. */ + u8 state; + u8 orig_proto; /* CT orig tuple IP protocol. */ u16 zone; u32 mark; - u8 state; + struct { + __be16 src; /* CT orig tuple tp src port. 
*/ + __be16 dst; /* CT orig tuple tp dst port. */ + } orig_tp; + struct ovs_key_ct_labels labels; } ct; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ +static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key) +{ + return key->eth.type == htons(ETH_P_IPV6) && + key->ip.proto == NEXTHDR_ICMP && + key->tp.dst == 0 && + (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)); +} + struct sw_flow_key_range { unsigned short int start; unsigned short int end; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c87d359b9b37..989f38f120bb 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -129,7 +129,9 @@ static bool match_validate(const struct sw_flow_match *match, /* The following mask attributes allowed only if they * pass the validation tests. */ mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) | (1 << OVS_KEY_ATTR_TCP) | (1 << OVS_KEY_ATTR_TCP_FLAGS) | (1 << OVS_KEY_ATTR_UDP) @@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->eth.type == htons(ETH_P_IP)) { key_expected |= 1 << OVS_KEY_ATTR_IPV4; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + if (match->mask && match->mask->key.eth.type == htons(0xffff)) { mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4; + } if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { if (match->key->ip.proto == IPPROTO_UDP) { @@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->eth.type == htons(ETH_P_IPV6)) { key_expected |= 1 << OVS_KEY_ATTR_IPV6; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + if (match->mask && match->mask->key.eth.type == htons(0xffff)) { mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6; + } if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { if (match->key->ip.proto == IPPROTO_UDP) { @@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match, htons(NDISC_NEIGHBOUR_SOLICITATION) || match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { key_expected |= 1 << OVS_KEY_ATTR_ND; + /* Original direction conntrack tuple + * uses the same space as the ND fields + * in the key, so both are not allowed + * at the same time. + */ + mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); if (match->mask && (match->mask->key.tp.src == htons(0xff))) mask_allowed |= 1 << OVS_KEY_ATTR_ND; } @@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. 
*/ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void) + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, + [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = { + .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, + [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { + .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr, return __parse_flow_nlattrs(attr, a, attrsp, log, true); } -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp, - bool log) +int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], + u64 *attrsp, bool log) { return __parse_flow_nlattrs(attr, a, attrsp, log, false); } @@ -1082,6 +1098,34 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, sizeof(*cl), is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); } + if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) { + const struct ovs_key_ct_tuple_ipv4 *ct; + + ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]); + + SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_proto, ct->ipv4_proto, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4); + } + if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) { + const struct ovs_key_ct_tuple_ipv6 *ct; + + ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]); + + SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src, + sizeof(match->key->ipv6.ct_orig.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst, + sizeof(match->key->ipv6.ct_orig.dst), + is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_proto, ct->ipv6_proto, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); + } /* For layer 3 packets the Ethernet type is provided * and treated as metadata but no MAC addresses are provided. @@ -1493,9 +1537,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) /** * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. - * @key: Receives extracted in_port, priority, tun_key and skb_mark. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. + * @net: Network namespace. + * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack + * metadata. + * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink + * attributes. 
+ * @attrs: Bit mask for the netlink attributes included in @a. * @log: Boolean to allow kernel error logging. Normally true, but when * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. @@ -1504,25 +1551,23 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) * take the same form accepted by flow_from_nlattrs(), but only enough of it to * get the metadata, that is, the parts of the flow key that cannot be * extracted from the packet itself. + * + * This must be called before the packet key fields are filled in 'key'. */ -int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, - struct sw_flow_key *key, - bool log) +int ovs_nla_get_flow_metadata(struct net *net, + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], + u64 attrs, struct sw_flow_key *key, bool log) { - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; struct sw_flow_match match; - u64 attrs = 0; - int err; - - err = parse_flow_nlattrs(attr, a, &attrs, log); - if (err) - return -EINVAL; memset(&match, 0, sizeof(match)); match.key = key; memset(&key->ct, 0, sizeof(key->ct)); + memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig)); + memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig)); + key->phy.in_port = DP_MAX_PORTS; return metadata_from_nlattrs(net, &match, &attrs, a, false, log); @@ -1584,7 +1629,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; - if (ovs_ct_put_key(output, skb)) + if (ovs_ct_put_key(swkey, output, skb)) goto nla_put_failure; if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) { diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 45f9769e5aac..929c665ac3aa 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, - struct sw_flow_key *, bool log); +int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], + u64 *attrsp, bool log); +int ovs_nla_get_flow_metadata(struct net *net, + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], + u64 attrs, struct sw_flow_key *key, bool log); int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); -- cgit v1.2.3 From dd41d33f0b033885211a5d6f3ee19e73238aa9ee Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 9 Feb 2017 11:22:00 -0800 Subject: openvswitch: Add force commit. Stateful network admission policy may allow connections in one direction and reject connections initiated in the other direction. After policy change it is possible that for a new connection an overlapping conntrack entry already exists, where the original direction of the existing connection is opposed to the new connection's initial packet. Most importantly, conntrack state relating to the current packet gets the "reply" designation based on whether the original direction tuple or the reply direction tuple matched. If this "directionality" is wrong w.r.t. the stateful network admission policy it may happen that packets in neither direction are correctly admitted. This patch adds a new "force commit" option to the OVS conntrack action that checks the original direction of an existing conntrack entry.
If that direction is opposed to the current packet, the existing conntrack entry is deleted and a new one is subsequently created in the correct direction. Signed-off-by: Jarno Rajahalme Acked-by: Pravin B Shelar Acked-by: Joe Stringer Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 5 +++++ net/openvswitch/conntrack.c | 26 ++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 90af8b8e10f8..7f41f7d0000f 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -674,6 +674,10 @@ struct ovs_action_hash { * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG. * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address * translation (NAT) on the packet. + * @OVS_CT_ATTR_FORCE_COMMIT: Like %OVS_CT_ATTR_COMMIT, but instead of doing + * nothing if the connection is already committed will check that the current + * packet is in conntrack entry's original direction. If directionality does + * not match, will delete the existing conntrack entry and commit a new one. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -684,6 +688,7 @@ enum ovs_ct_attr { OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of related connections. */ OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ + OVS_CT_ATTR_FORCE_COMMIT, /* No argument */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index bfd7606c8be1..8b15bab70583 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -65,6 +65,7 @@ struct ovs_conntrack_info { struct nf_conn *ct; u8 commit : 1; u8 nat : 3; /* enum ovs_ct_nat */ + u8 force : 1; u16 family; struct md_mark mark; struct md_labels labels; @@ -613,10 +614,13 @@ static bool skb_nfct_cached(struct net *net, */ if (!ct && key->ct.state & OVS_CS_F_TRACKED && !(key->ct.state & OVS_CS_F_INVALID) && - key->ct.zone == info->zone.id) + key->ct.zone == info->zone.id) { ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, !!(key->ct.state & OVS_CS_F_NAT_MASK)); + if (ct) + nf_ct_get(skb, &ctinfo); + } if (!ct) return false; if (!net_eq(net, read_pnet(&ct->ct_net))) @@ -630,6 +634,18 @@ static bool skb_nfct_cached(struct net *net, if (help && rcu_access_pointer(help->helper) != info->helper) return false; } + /* Force conntrack entry direction to the current packet? */ + if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + /* Delete the conntrack entry if confirmed, else just release + * the reference. + */ + if (nf_ct_is_confirmed(ct)) + nf_ct_delete(ct, 0, 0); + else + nf_conntrack_put(&ct->ct_general); + nf_ct_set(skb, NULL, 0); + return false; + } return true; } @@ -1207,6 +1223,7 @@ static int parse_nat(const struct nlattr *attr, static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, + [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), .maxlen = sizeof(u16) }, [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), @@ -1246,6 +1263,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, } switch (type) { + case OVS_CT_ATTR_FORCE_COMMIT: + info->force = true; + /* fall through. 
*/ case OVS_CT_ATTR_COMMIT: info->commit = true; break; @@ -1472,7 +1492,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, if (!start) return -EMSGSIZE; - if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) + if (ct_info->commit && nla_put_flag(skb, ct_info->force + ? OVS_CT_ATTR_FORCE_COMMIT + : OVS_CT_ATTR_COMMIT)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) -- cgit v1.2.3 From 7795753661f1a9423c3c8fbde322f6a2a8b94b68 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 8 Feb 2017 17:09:05 +0100 Subject: serial: exar: Fix feature control register constants According to the XR17V352 manual, bit 4 is IrDA control and bit 5 for 485. Fortunately, no driver used them so far. Signed-off-by: Jan Kiszka Reviewed-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_reg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index 274d8fc206e3..25b93a764a1a 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -374,8 +374,8 @@ #define UART_EXAR_DVID 0x8d /* Device identification */ #define UART_EXAR_FCTR 0x08 /* Feature Control Register */ -#define UART_FCTR_EXAR_IRDA 0x08 /* IrDa data encode select */ -#define UART_FCTR_EXAR_485 0x10 /* Auto 485 half duplex dir ctl */ +#define UART_FCTR_EXAR_IRDA 0x10 /* IrDa data encode select */ +#define UART_FCTR_EXAR_485 0x20 /* Auto 485 half duplex dir ctl */ #define UART_FCTR_EXAR_TRGA 0x00 /* FIFO trigger table A */ #define UART_FCTR_EXAR_TRGB 0x60 /* FIFO trigger table B */ #define UART_FCTR_EXAR_TRGC 0x80 /* FIFO trigger table C */ -- cgit v1.2.3 From 7e12357ed64afdc8e60d64b8f8f17d711acf950a Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 8 Feb 2017 17:09:08 +0100 Subject: serial: exar: Move register defines from uapi header to consumer site None of these registers is relevant for the userspace API. 
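An eventual in-kernel user of the corrected constants would combine them roughly as below (a sketch only; exar_fctr_enable_rs485() is a hypothetical helper, not an existing kernel function):

#define UART_FCTR_EXAR_IRDA	0x10	/* bit 4: IrDA data encode select */
#define UART_FCTR_EXAR_485	0x20	/* bit 5: auto 485 half duplex dir */

/* Select automatic RS-485 direction control: set bit 5, and make sure
 * the IrDA encoder (bit 4) stays off, as the two modes are independent.
 */
static unsigned char exar_fctr_enable_rs485(unsigned char fctr)
{
	return (fctr & ~UART_FCTR_EXAR_IRDA) | UART_FCTR_EXAR_485;
}

With the old, wrong values, enabling RS-485 (0x10) would actually have set bit 4, which the manual assigns to IrDA.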
Signed-off-by: Jan Kiszka Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_exar.c | 13 +++++++++++++ drivers/tty/serial/8250/8250_port.c | 6 ++++++ include/uapi/linux/serial_reg.h | 18 ------------------ 3 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c index 9af4266eff96..f612da326e82 100644 --- a/drivers/tty/serial/8250/8250_exar.c +++ b/drivers/tty/serial/8250/8250_exar.c @@ -34,6 +34,19 @@ #define PCI_DEVICE_ID_EXAR_XR17V4358 0x4358 #define PCI_DEVICE_ID_EXAR_XR17V8358 0x8358 +#define UART_EXAR_8XMODE 0x88 /* 8X sampling rate select */ + +#define UART_EXAR_FCTR 0x08 /* Feature Control Register */ +#define UART_FCTR_EXAR_IRDA 0x10 /* IrDa data encode select */ +#define UART_FCTR_EXAR_485 0x20 /* Auto 485 half duplex dir ctl */ +#define UART_FCTR_EXAR_TRGA 0x00 /* FIFO trigger table A */ +#define UART_FCTR_EXAR_TRGB 0x60 /* FIFO trigger table B */ +#define UART_FCTR_EXAR_TRGC 0x80 /* FIFO trigger table C */ +#define UART_FCTR_EXAR_TRGD 0xc0 /* FIFO trigger table D programmable */ + +#define UART_EXAR_TXTRG 0x0a /* Tx FIFO trigger level write-only */ +#define UART_EXAR_RXTRG 0x0b /* Rx FIFO trigger level write-only */ + #define UART_EXAR_MPIOINT_7_0 0x8f /* MPIOINT[7:0] */ #define UART_EXAR_MPIOLVL_7_0 0x90 /* MPIOLVL[7:0] */ #define UART_EXAR_MPIO3T_7_0 0x91 /* MPIO3T[7:0] */ diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index ec6b5e3dcaaa..6119516ef5fc 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -44,6 +44,12 @@ #include "8250.h" +/* + * These are definitions for the Exar XR17V35X and XR17(C|D)15X + */ +#define UART_EXAR_SLEEP 0x8b /* Sleep mode */ +#define UART_EXAR_DVID 0x8d /* Device identification */ + /* * Debugging. */ diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index 25b93a764a1a..5db76880b4ad 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -366,24 +366,6 @@ #define UART_OMAP_MDR1_CIR_MODE 0x06 /* CIR mode */ #define UART_OMAP_MDR1_DISABLE 0x07 /* Disable (default state) */ -/* - * These are definitions for the Exar XR17V35X and XR17(C|D)15X - */ -#define UART_EXAR_8XMODE 0x88 /* 8X sampling rate select */ -#define UART_EXAR_SLEEP 0x8b /* Sleep mode */ -#define UART_EXAR_DVID 0x8d /* Device identification */ - -#define UART_EXAR_FCTR 0x08 /* Feature Control Register */ -#define UART_FCTR_EXAR_IRDA 0x10 /* IrDa data encode select */ -#define UART_FCTR_EXAR_485 0x20 /* Auto 485 half duplex dir ctl */ -#define UART_FCTR_EXAR_TRGA 0x00 /* FIFO trigger table A */ -#define UART_FCTR_EXAR_TRGB 0x60 /* FIFO trigger table B */ -#define UART_FCTR_EXAR_TRGC 0x80 /* FIFO trigger table C */ -#define UART_FCTR_EXAR_TRGD 0xc0 /* FIFO trigger table D programmable */ - -#define UART_EXAR_TXTRG 0x0a /* Tx FIFO trigger level write-only */ -#define UART_EXAR_RXTRG 0x0b /* Rx FIFO trigger level write-only */ - /* * These are definitions for the Altera ALTR_16550_F32/F64/F128 * Normalized from 0x100 to 0x40 because of shift by 2 (32 bit regs). 
-- cgit v1.2.3 From feba3900cabb8e7c87368faa28e7a6936809ba22 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 3 Feb 2017 14:40:45 -0800 Subject: binder: Split flat_binder_object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flat_binder_object is used for both handling binder objects and file descriptors, even though the two are mostly independent. Since we'll have more fixup objects in binder in the future, instead of extending flat_binder_object again, split out file descriptors to their own object while retaining backwards compatibility with existing user-space clients. All binder objects just share a header. Cc: Greg Kroah-Hartman Cc: Martijn Coenen Cc: Arve Hjønnevåg Cc: Amit Pundir Cc: Serban Constantinescu Cc: Dmitry Shmidt Cc: Rom Lemarchand Cc: Android Kernel Team Signed-off-by: Martijn Coenen Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 158 +++++++++++++++++++++++++----------- include/uapi/linux/android/binder.h | 31 ++++++- 2 files changed, 137 insertions(+), 52 deletions(-) (limited to 'include/uapi') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3c71b982bf2a..331d2abca9a2 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -145,6 +145,11 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, binder_stop_on_user_error = 2; \ } while (0) +#define to_flat_binder_object(hdr) \ + container_of(hdr, struct flat_binder_object, hdr) + +#define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -1240,6 +1245,47 @@ static void binder_send_failed_reply(struct binder_transaction *t, } } +/** + * binder_validate_object() - checks for a valid metadata object in a buffer. + * @buffer: binder_buffer that we're parsing. + * @offset: offset in the buffer at which to validate an object. + * + * Return: If there's a valid metadata object at @offset in @buffer, the + * size of that object. Otherwise, it returns zero. + */ +static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) +{ + /* Check if we can read a header first */ + struct binder_object_header *hdr; + size_t object_size = 0; + + if (offset > buffer->data_size - sizeof(*hdr) || + buffer->data_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) + return 0; + + /* Ok, now see if we can read a complete object.
*/ + hdr = (struct binder_object_header *)(buffer->data + offset); + switch (hdr->type) { + case BINDER_TYPE_BINDER: + case BINDER_TYPE_WEAK_BINDER: + case BINDER_TYPE_HANDLE: + case BINDER_TYPE_WEAK_HANDLE: + object_size = sizeof(struct flat_binder_object); + break; + case BINDER_TYPE_FD: + object_size = sizeof(struct binder_fd_object); + break; + default: + return 0; + } + if (offset <= buffer->data_size - object_size && + buffer->data_size >= object_size) + return object_size; + else + return 0; +} + static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, binder_size_t *failed_at) @@ -1262,21 +1308,23 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, else off_end = (void *)offp + buffer->offsets_size; for (; offp < off_end; offp++) { - struct flat_binder_object *fp; + struct binder_object_header *hdr; + size_t object_size = binder_validate_object(buffer, *offp); - if (*offp > buffer->data_size - sizeof(*fp) || - buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - pr_err("transaction release %d bad offset %lld, size %zd\n", + if (object_size == 0) { + pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)*offp, buffer->data_size); continue; } - fp = (struct flat_binder_object *)(buffer->data + *offp); - switch (fp->type) { + hdr = (struct binder_object_header *)(buffer->data + *offp); + switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { - struct binder_node *node = binder_get_node(proc, fp->binder); + struct flat_binder_object *fp; + struct binder_node *node; + fp = to_flat_binder_object(hdr); + node = binder_get_node(proc, fp->binder); if (node == NULL) { pr_err("transaction release %d bad node %016llx\n", debug_id, (u64)fp->binder); @@ -1285,15 +1333,17 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx\n", node->debug_id, (u64)node->ptr); - binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0); + binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, + 0); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { + struct flat_binder_object *fp; struct binder_ref *ref; + fp = to_flat_binder_object(hdr); ref = binder_get_ref(proc, fp->handle, - fp->type == BINDER_TYPE_HANDLE); - + hdr->type == BINDER_TYPE_HANDLE); if (ref == NULL) { pr_err("transaction release %d bad handle %d\n", debug_id, fp->handle); @@ -1302,19 +1352,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d (node %d)\n", ref->debug_id, ref->desc, ref->node->debug_id); - binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE); + binder_dec_ref(ref, hdr->type == BINDER_TYPE_HANDLE); } break; - case BINDER_TYPE_FD: + case BINDER_TYPE_FD: { + struct binder_fd_object *fp = to_binder_fd_object(hdr); + binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d\n", fp->handle); + " fd %d\n", fp->fd); if (failed_at) - task_close_fd(proc, fp->handle); - break; + task_close_fd(proc, fp->fd); + } break; default: pr_err("transaction release %d bad object type %x\n", - debug_id, fp->type); + debug_id, hdr->type); break; } } @@ -1531,28 +1583,29 @@ static void binder_transaction(struct binder_proc *proc, off_end = (void *)offp + tr->offsets_size; off_min = 0; for (; offp < off_end; offp++) { - struct flat_binder_object *fp; + struct binder_object_header *hdr; + size_t object_size = binder_validate_object(t->buffer, *offp); - if 
(*offp > t->buffer->data_size - sizeof(*fp) || - *offp < off_min || - t->buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - binder_user_error("%d:%d got transaction with invalid offset, %lld (min %lld, max %lld)\n", + if (object_size == 0 || *offp < off_min) { + binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, (u64)*offp, (u64)off_min, - (u64)(t->buffer->data_size - - sizeof(*fp))); + (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; goto err_bad_offset; } - fp = (struct flat_binder_object *)(t->buffer->data + *offp); - off_min = *offp + sizeof(struct flat_binder_object); - switch (fp->type) { + + hdr = (struct binder_object_header *)(t->buffer->data + *offp); + off_min = *offp + object_size; + switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { + struct flat_binder_object *fp; + struct binder_node *node; struct binder_ref *ref; - struct binder_node *node = binder_get_node(proc, fp->binder); + fp = to_flat_binder_object(hdr); + node = binder_get_node(proc, fp->binder); if (node == NULL) { node = binder_new_node(proc, fp->binder, fp->cookie); if (node == NULL) { @@ -1580,14 +1633,14 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } - if (fp->type == BINDER_TYPE_BINDER) - fp->type = BINDER_TYPE_HANDLE; + if (hdr->type == BINDER_TYPE_BINDER) + hdr->type = BINDER_TYPE_HANDLE; else - fp->type = BINDER_TYPE_WEAK_HANDLE; + hdr->type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; fp->handle = ref->desc; fp->cookie = 0; - binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, + binder_inc_ref(ref, hdr->type == BINDER_TYPE_HANDLE, &thread->todo); trace_binder_transaction_node_to_ref(t, node, ref); @@ -1598,11 +1651,12 @@ static void binder_transaction(struct binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { + struct flat_binder_object *fp; struct binder_ref *ref; + fp = to_flat_binder_object(hdr); ref = binder_get_ref(proc, fp->handle, - fp->type == BINDER_TYPE_HANDLE); - + hdr->type == BINDER_TYPE_HANDLE); if (ref == NULL) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, @@ -1616,13 +1670,15 @@ static void binder_transaction(struct binder_proc *proc, goto err_binder_get_ref_failed; } if (ref->node->proc == target_proc) { - if (fp->type == BINDER_TYPE_HANDLE) - fp->type = BINDER_TYPE_BINDER; + if (hdr->type == BINDER_TYPE_HANDLE) + hdr->type = BINDER_TYPE_BINDER; else - fp->type = BINDER_TYPE_WEAK_BINDER; + hdr->type = BINDER_TYPE_WEAK_BINDER; fp->binder = ref->node->ptr; fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL); + binder_inc_node(ref->node, + hdr->type == BINDER_TYPE_BINDER, + 0, NULL); trace_binder_transaction_ref_to_node(t, ref); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", @@ -1639,7 +1695,9 @@ static void binder_transaction(struct binder_proc *proc, fp->binder = 0; fp->handle = new_ref->desc; fp->cookie = 0; - binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); + binder_inc_ref(new_ref, + hdr->type == BINDER_TYPE_HANDLE, + NULL); trace_binder_transaction_ref_to_ref(t, ref, new_ref); binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1652,25 +1710,26 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_FD: { int target_fd; struct file *file; + struct binder_fd_object *fp = to_binder_fd_object(hdr); 
if (reply) { if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } } else if (!target_node->accept_fds) { binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } - file = fget(fp->handle); + file = fget(fp->fd); if (file == NULL) { binder_user_error("%d:%d got transaction with invalid fd, %d\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fget_failed; } @@ -1688,17 +1747,18 @@ static void binder_transaction(struct binder_proc *proc, goto err_get_unused_fd_failed; } task_fd_install(target_proc, target_fd, file); - trace_binder_transaction_fd(t, fp->handle, target_fd); + trace_binder_transaction_fd(t, fp->fd, target_fd); binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d -> %d\n", fp->handle, target_fd); + " fd %d -> %d\n", fp->fd, + target_fd); /* TODO: fput? */ - fp->binder = 0; - fp->handle = target_fd; + fp->pad_binder = 0; + fp->fd = target_fd; } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", - proc->pid, thread->pid, fp->type); + proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; goto err_bad_object_type; } diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 41420e341e75..f67c2b1c0713 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -48,6 +48,14 @@ typedef __u64 binder_size_t; typedef __u64 binder_uintptr_t; #endif +/** + * struct binder_object_header - header shared by all binder metadata objects. + * @type: type of the object + */ +struct binder_object_header { + __u32 type; +}; + /* * This is the flattened representation of a Binder object for transfer * between processes. The 'offsets' supplied as part of a binder transaction @@ -56,9 +64,8 @@ typedef __u64 binder_uintptr_t; * between processes. */ struct flat_binder_object { - /* 8 bytes for large_flat_header. */ - __u32 type; - __u32 flags; + struct binder_object_header hdr; + __u32 flags; /* 8 bytes of data. */ union { @@ -70,6 +77,24 @@ struct flat_binder_object { binder_uintptr_t cookie; }; +/** + * struct binder_fd_object - describes a filedescriptor to be fixed up. + * @hdr: common header structure + * @pad_flags: padding to remain compatible with old userspace code + * @pad_binder: padding to remain compatible with old userspace code + * @fd: file descriptor + * @cookie: opaque data, used by user-space + */ +struct binder_fd_object { + struct binder_object_header hdr; + __u32 pad_flags; + union { + binder_uintptr_t pad_binder; + __u32 fd; + }; + + binder_uintptr_t cookie; +}; /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. -- cgit v1.2.3 From 7980240b6d63e0694f5023c29cbc648fafdf3e23 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 3 Feb 2017 14:40:51 -0800 Subject: binder: Add support for scatter-gather MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously all data passed over binder needed to be serialized, with the exception of Binder objects and file descriptors. 
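File descriptors also illustrate why the object split in the previous patch is safe for old clients: the new binder_fd_object's fields land on the same bytes the old flat_binder_object used. A compile-time sketch of that invariant (mirrored definitions assuming a 64-bit userspace, not the real uapi header):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t binder_uintptr_t;

struct old_flat_binder_object {		/* pre-split layout */
	uint32_t type;
	uint32_t flags;
	union {
		binder_uintptr_t binder;
		uint32_t handle;	/* doubled as the fd */
	};
	binder_uintptr_t cookie;
};

struct new_binder_fd_object {		/* post-split layout */
	struct { uint32_t type; } hdr;
	uint32_t pad_flags;
	union {
		binder_uintptr_t pad_binder;
		uint32_t fd;
	};
	binder_uintptr_t cookie;
};

static_assert(sizeof(struct new_binder_fd_object) ==
	      sizeof(struct old_flat_binder_object), "size must not change");
static_assert(offsetof(struct new_binder_fd_object, fd) ==
	      offsetof(struct old_flat_binder_object, handle),
	      "fd must alias the old handle field");
static_assert(offsetof(struct new_binder_fd_object, cookie) ==
	      offsetof(struct old_flat_binder_object, cookie),
	      "cookie must not move");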
This patch adds support for scatter-gathering raw memory buffers into a binder transaction, avoiding the need to first serialize them into a Parcel. To remain backwards compatible with existing binder clients, it introduces two new command ioctls for this purpose - BC_TRANSACTION_SG and BC_REPLY_SG. These commands may only be used with the new binder_transaction_data_sg structure, which adds a field for the total size of the buffers we are scatter-gathering. Because memory buffers may contain pointers to other buffers, we allow callers to specify a parent buffer and an offset into it, to indicate this is a location pointing to the buffer that we are fixing up. The kernel will then take care of fixing up the pointer to that buffer as well. Cc: Greg Kroah-Hartman Cc: Martijn Coenen Cc: Arve Hjønnevåg Cc: Amit Pundir Cc: Serban Constantinescu Cc: Dmitry Shmidt Cc: Rom Lemarchand Cc: Android Kernel Team Signed-off-by: Martijn Coenen [jstultz: Fold in small fix from Amit Pundir ] Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 244 ++++++++++++++++++++++++++++++++++-- include/uapi/linux/android/binder.h | 45 +++++++ 2 files changed, 276 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 25aa452d2738..3d241fbab533 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -152,6 +152,9 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) +#define to_binder_buffer_object(hdr) \ + container_of(hdr, struct binder_buffer_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -165,7 +168,7 @@ enum binder_stat_types { struct binder_stats { int br[_IOC_NR(BR_FAILED_REPLY) + 1]; - int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1]; + int bc[_IOC_NR(BC_REPLY_SG) + 1]; int obj_created[BINDER_STAT_COUNT]; int obj_deleted[BINDER_STAT_COUNT]; }; @@ -1304,6 +1307,9 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) case BINDER_TYPE_FD: object_size = sizeof(struct binder_fd_object); break; + case BINDER_TYPE_PTR: + object_size = sizeof(struct binder_buffer_object); + break; default: return 0; } @@ -1314,11 +1320,111 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) return 0; } +/** + * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer. + * @b: binder_buffer containing the object + * @index: index in offset array at which the binder_buffer_object is + * located + * @start: points to the start of the offset array + * @num_valid: the number of valid offsets in the offset array + * + * Return: If @index is within the valid range of the offset array + * described by @start and @num_valid, and if there's a valid + * binder_buffer_object at the offset found in index @index + * of the offset array, that object is returned. Otherwise, + * %NULL is returned. + * Note that the offset found in index @index itself is not + * verified; this function assumes that @num_valid elements + * from @start were previously verified to have valid offsets.
+ */ +static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b, + binder_size_t index, + binder_size_t *start, + binder_size_t num_valid) +{ + struct binder_buffer_object *buffer_obj; + binder_size_t *offp; + + if (index >= num_valid) + return NULL; + + offp = start + index; + buffer_obj = (struct binder_buffer_object *)(b->data + *offp); + if (buffer_obj->hdr.type != BINDER_TYPE_PTR) + return NULL; + + return buffer_obj; +} + +/** + * binder_validate_fixup() - validates pointer/fd fixups happen in order. + * @b: transaction buffer + * @objects_start start of objects buffer + * @buffer: binder_buffer_object in which to fix up + * @offset: start offset in @buffer to fix up + * @last_obj: last binder_buffer_object that we fixed up in + * @last_min_offset: minimum fixup offset in @last_obj + * + * Return: %true if a fixup in buffer @buffer at offset @offset is + * allowed. + * + * For safety reasons, we only allow fixups inside a buffer to happen + * at increasing offsets; additionally, we only allow fixup on the last + * buffer object that was verified, or one of its parents. + * + * Example of what is allowed: + * + * A + * B (parent = A, offset = 0) + * C (parent = A, offset = 16) + * D (parent = C, offset = 0) + * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) + * + * Examples of what is not allowed: + * + * Decreasing offsets within the same parent: + * A + * C (parent = A, offset = 16) + * B (parent = A, offset = 0) // decreasing offset within A + * + * Referring to a parent that wasn't the last object or any of its parents: + * A + * B (parent = A, offset = 0) + * C (parent = A, offset = 0) + * C (parent = A, offset = 16) + * D (parent = B, offset = 0) // B is not A or any of A's parents + */ +static bool binder_validate_fixup(struct binder_buffer *b, + binder_size_t *objects_start, + struct binder_buffer_object *buffer, + binder_size_t fixup_offset, + struct binder_buffer_object *last_obj, + binder_size_t last_min_offset) +{ + if (!last_obj) { + /* Nothing to fix up in */ + return false; + } + + while (last_obj != buffer) { + /* + * Safe to retrieve the parent of last_obj, since it + * was already previously verified by the driver. 
+ */ + if ((last_obj->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) + return false; + last_min_offset = last_obj->parent_offset + sizeof(uintptr_t); + last_obj = (struct binder_buffer_object *) + (b->data + *(objects_start + last_obj->parent)); + } + return (fixup_offset >= last_min_offset); +} + static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, binder_size_t *failed_at) { - binder_size_t *offp, *off_end; + binder_size_t *offp, *off_start, *off_end; int debug_id = buffer->debug_id; binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1329,13 +1435,13 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); - offp = (binder_size_t *)(buffer->data + - ALIGN(buffer->data_size, sizeof(void *))); + off_start = (binder_size_t *)(buffer->data + + ALIGN(buffer->data_size, sizeof(void *))); if (failed_at) off_end = failed_at; else - off_end = (void *)offp + buffer->offsets_size; - for (; offp < off_end; offp++) { + off_end = (void *)off_start + buffer->offsets_size; + for (offp = off_start; offp < off_end; offp++) { struct binder_object_header *hdr; size_t object_size = binder_validate_object(buffer, *offp); @@ -1391,7 +1497,12 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (failed_at) task_close_fd(proc, fp->fd); } break; - + case BINDER_TYPE_PTR: + /* + * Nothing to do here, this will get cleaned up when the + * transaction buffer gets freed + */ + break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); @@ -1561,6 +1672,53 @@ err_fd_not_accepted: return ret; } +static int binder_fixup_parent(struct binder_transaction *t, + struct binder_thread *thread, + struct binder_buffer_object *bp, + binder_size_t *off_start, + binder_size_t num_valid, + struct binder_buffer_object *last_fixup_obj, + binder_size_t last_fixup_min_off) +{ + struct binder_buffer_object *parent; + u8 *parent_buffer; + struct binder_buffer *b = t->buffer; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT)) + return 0; + + parent = binder_validate_ptr(b, bp->parent, off_start, num_valid); + if (!parent) { + binder_user_error("%d:%d got transaction with invalid parent offset or type\n", + proc->pid, thread->pid); + return -EINVAL; + } + + if (!binder_validate_fixup(b, off_start, + parent, bp->parent_offset, + last_fixup_obj, + last_fixup_min_off)) { + binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", + proc->pid, thread->pid); + return -EINVAL; + } + + if (parent->length < sizeof(binder_uintptr_t) || + bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) { + /* No space for a pointer here! 
*/ + binder_user_error("%d:%d got transaction with invalid parent offset\n", + proc->pid, thread->pid); + return -EINVAL; + } + parent_buffer = (u8 *)(parent->buffer - + target_proc->user_buffer_offset); + *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; + + return 0; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -1569,8 +1727,9 @@ static void binder_transaction(struct binder_proc *proc, int ret; struct binder_transaction *t; struct binder_work *tcomplete; - binder_size_t *offp, *off_end; + binder_size_t *offp, *off_end, *off_start; binder_size_t off_min; + u8 *sg_bufp, *sg_buf_end; struct binder_proc *target_proc; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; @@ -1579,6 +1738,8 @@ static void binder_transaction(struct binder_proc *proc, struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error; + struct binder_buffer_object *last_fixup_obj = NULL; + binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; e = binder_transaction_log_add(&binder_transaction_log); @@ -1753,8 +1914,9 @@ static void binder_transaction(struct binder_proc *proc, if (target_node) binder_inc_node(target_node, 1, 0, NULL); - offp = (binder_size_t *)(t->buffer->data + - ALIGN(tr->data_size, sizeof(void *))); + off_start = (binder_size_t *)(t->buffer->data + + ALIGN(tr->data_size, sizeof(void *))); + offp = off_start; if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t) tr->data.ptr.buffer, tr->data_size)) { @@ -1776,7 +1938,16 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_bad_offset; } - off_end = (void *)offp + tr->offsets_size; + if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { + binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", + proc->pid, thread->pid, + (u64)extra_buffers_size); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + off_end = (void *)off_start + tr->offsets_size; + sg_bufp = (u8 *)(PTR_ALIGN(off_end, sizeof(void *))); + sg_buf_end = sg_bufp + extra_buffers_size; off_min = 0; for (; offp < off_end; offp++) { struct binder_object_header *hdr; @@ -1829,7 +2000,41 @@ static void binder_transaction(struct binder_proc *proc, fp->pad_binder = 0; fp->fd = target_fd; } break; - + case BINDER_TYPE_PTR: { + struct binder_buffer_object *bp = + to_binder_buffer_object(hdr); + size_t buf_left = sg_buf_end - sg_bufp; + + if (bp->length > buf_left) { + binder_user_error("%d:%d got transaction with too large buffer\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + if (copy_from_user(sg_bufp, + (const void __user *)(uintptr_t) + bp->buffer, bp->length)) { + binder_user_error("%d:%d got transaction with invalid offsets ptr\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_copy_data_failed; + } + /* Fixup buffer pointer to target proc address space */ + bp->buffer = (uintptr_t)sg_bufp + + target_proc->user_buffer_offset; + sg_bufp += ALIGN(bp->length, sizeof(u64)); + + ret = binder_fixup_parent(t, thread, bp, off_start, + offp - off_start, + last_fixup_obj, + last_fixup_min_off); + if (ret < 0) { + return_error = BR_FAILED_REPLY; + goto err_translate_failed; + } + last_fixup_obj = bp; + last_fixup_min_off = 0; + } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", 
proc->pid, thread->pid, hdr->type); @@ -2083,6 +2288,17 @@ static int binder_thread_write(struct binder_proc *proc, break; } + case BC_TRANSACTION_SG: + case BC_REPLY_SG: { + struct binder_transaction_data_sg tr; + + if (copy_from_user(&tr, ptr, sizeof(tr))) + return -EFAULT; + ptr += sizeof(tr); + binder_transaction(proc, thread, &tr.transaction_data, + cmd == BC_REPLY_SG, tr.buffers_size); + break; + } case BC_TRANSACTION: case BC_REPLY: { struct binder_transaction_data tr; @@ -3609,7 +3825,9 @@ static const char * const binder_command_strings[] = { "BC_EXIT_LOOPER", "BC_REQUEST_DEATH_NOTIFICATION", "BC_CLEAR_DEATH_NOTIFICATION", - "BC_DEAD_BINDER_DONE" + "BC_DEAD_BINDER_DONE", + "BC_TRANSACTION_SG", + "BC_REPLY_SG", }; static const char * const binder_objstat_strings[] = { diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index f67c2b1c0713..f3ef6e2634ba 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -33,6 +33,7 @@ enum { BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), + BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; enum { @@ -95,6 +96,39 @@ struct binder_fd_object { binder_uintptr_t cookie; }; + +/* struct binder_buffer_object - object describing a userspace buffer + * @hdr: common header structure + * @flags: one or more BINDER_BUFFER_* flags + * @buffer: address of the buffer + * @length: length of the buffer + * @parent: index in offset array pointing to parent buffer + * @parent_offset: offset in @parent pointing to this buffer + * + * A binder_buffer object represents an object that the + * binder kernel driver can copy verbatim to the target + * address space. A buffer itself may be pointed to from + * within another buffer, meaning that the pointer inside + * that other buffer needs to be fixed up as well. This + * can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT + * flag in @flags, by setting @parent buffer to the index + * in the offset array pointing to the parent binder_buffer_object, + * and by setting @parent_offset to the offset in the parent buffer + * at which the pointer to this buffer is located. + */ +struct binder_buffer_object { + struct binder_object_header hdr; + __u32 flags; + binder_uintptr_t buffer; + binder_size_t length; + binder_size_t parent; + binder_size_t parent_offset; +}; + +enum { + BINDER_BUFFER_FLAG_HAS_PARENT = 0x01, +}; + /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. @@ -187,6 +221,11 @@ struct binder_transaction_data { } data; }; +struct binder_transaction_data_sg { + struct binder_transaction_data transaction_data; + binder_size_t buffers_size; +}; + struct binder_ptr_cookie { binder_uintptr_t ptr; binder_uintptr_t cookie; @@ -371,6 +410,12 @@ enum binder_driver_command_protocol { /* * void *: cookie */ + + BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg), + BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg), + /* + * binder_transaction_data_sg: the sent command. 
+ */ }; #endif /* _UAPI_LINUX_BINDER_H */ -- cgit v1.2.3 From def95c73567dfacb22900cd0c4f01caff39e4c9e Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 3 Feb 2017 14:40:52 -0800 Subject: binder: Add support for file-descriptor arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a new binder_fd_array object, that allows us to support one or more file descriptors embedded in a buffer that is scatter-gathered. Cc: Greg Kroah-Hartman Cc: Martijn Coenen Cc: Arve Hjønnevåg Cc: Amit Pundir Cc: Serban Constantinescu Cc: Dmitry Shmidt Cc: Rom Lemarchand Cc: Android Kernel Team Signed-off-by: Martijn Coenen Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 137 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 28 ++++++++ 2 files changed, 165 insertions(+) (limited to 'include/uapi') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3d241fbab533..9451b762fa1c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -155,6 +155,9 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_buffer_object(hdr) \ container_of(hdr, struct binder_buffer_object, hdr) +#define to_binder_fd_array_object(hdr) \ + container_of(hdr, struct binder_fd_array_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -1310,6 +1313,9 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) case BINDER_TYPE_PTR: object_size = sizeof(struct binder_buffer_object); break; + case BINDER_TYPE_FDA: + object_size = sizeof(struct binder_fd_array_object); + break; default: return 0; } @@ -1503,6 +1509,47 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, * transaction buffer gets freed */ break; + case BINDER_TYPE_FDA: { + struct binder_fd_array_object *fda; + struct binder_buffer_object *parent; + uintptr_t parent_buffer; + u32 *fd_array; + size_t fd_index; + binder_size_t fd_buf_size; + + fda = to_binder_fd_array_object(hdr); + parent = binder_validate_ptr(buffer, fda->parent, + off_start, + offp - off_start); + if (!parent) { + pr_err("transaction release %d bad parent offset", + debug_id); + continue; + } + /* + * Since the parent was already fixed up, convert it + * back to kernel address space to access it + */ + parent_buffer = parent->buffer - + proc->user_buffer_offset; + + fd_buf_size = sizeof(u32) * fda->num_fds; + if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { + pr_err("transaction release %d invalid number of fds (%lld)\n", + debug_id, (u64)fda->num_fds); + continue; + } + if (fd_buf_size > parent->length || + fda->parent_offset > parent->length - fd_buf_size) { + /* No space for all file descriptors here. 
*/ + pr_err("transaction release %d not enough space for %lld fds in buffer\n", + debug_id, (u64)fda->num_fds); + continue; + } + fd_array = (u32 *)(parent_buffer + fda->parent_offset); + for (fd_index = 0; fd_index < fda->num_fds; fd_index++) + task_close_fd(proc, fd_array[fd_index]); + } break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); @@ -1672,6 +1719,63 @@ err_fd_not_accepted: return ret; } +static int binder_translate_fd_array(struct binder_fd_array_object *fda, + struct binder_buffer_object *parent, + struct binder_transaction *t, + struct binder_thread *thread, + struct binder_transaction *in_reply_to) +{ + binder_size_t fdi, fd_buf_size, num_installed_fds; + int target_fd; + uintptr_t parent_buffer; + u32 *fd_array; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + fd_buf_size = sizeof(u32) * fda->num_fds; + if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { + binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n", + proc->pid, thread->pid, (u64)fda->num_fds); + return -EINVAL; + } + if (fd_buf_size > parent->length || + fda->parent_offset > parent->length - fd_buf_size) { + /* No space for all file descriptors here. */ + binder_user_error("%d:%d not enough space to store %lld fds in buffer\n", + proc->pid, thread->pid, (u64)fda->num_fds); + return -EINVAL; + } + /* + * Since the parent was already fixed up, convert it + * back to the kernel address space to access it + */ + parent_buffer = parent->buffer - target_proc->user_buffer_offset; + fd_array = (u32 *)(parent_buffer + fda->parent_offset); + if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { + binder_user_error("%d:%d parent offset not aligned correctly.\n", + proc->pid, thread->pid); + return -EINVAL; + } + for (fdi = 0; fdi < fda->num_fds; fdi++) { + target_fd = binder_translate_fd(fd_array[fdi], t, thread, + in_reply_to); + if (target_fd < 0) + goto err_translate_fd_failed; + fd_array[fdi] = target_fd; + } + return 0; + +err_translate_fd_failed: + /* + * Failed to allocate fd or security error, free fds + * installed so far. 
+ */ + num_installed_fds = fdi; + for (fdi = 0; fdi < num_installed_fds; fdi++) + task_close_fd(target_proc, fd_array[fdi]); + return target_fd; +} + static int binder_fixup_parent(struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, @@ -2000,6 +2104,38 @@ static void binder_transaction(struct binder_proc *proc, fp->pad_binder = 0; fp->fd = target_fd; } break; + case BINDER_TYPE_FDA: { + struct binder_fd_array_object *fda = + to_binder_fd_array_object(hdr); + struct binder_buffer_object *parent = + binder_validate_ptr(t->buffer, fda->parent, + off_start, + offp - off_start); + if (!parent) { + binder_user_error("%d:%d got transaction with invalid parent offset or type\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_parent; + } + if (!binder_validate_fixup(t->buffer, off_start, + parent, fda->parent_offset, + last_fixup_obj, + last_fixup_min_off)) { + binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_parent; + } + ret = binder_translate_fd_array(fda, parent, t, thread, + in_reply_to); + if (ret < 0) { + return_error = BR_FAILED_REPLY; + goto err_translate_failed; + } + last_fixup_obj = parent; + last_fixup_min_off = + fda->parent_offset + sizeof(u32) * fda->num_fds; + } break; case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); @@ -2070,6 +2206,7 @@ static void binder_transaction(struct binder_proc *proc, err_translate_failed: err_bad_object_type: err_bad_offset: +err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index f3ef6e2634ba..51f891fb1b18 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -33,6 +33,7 @@ enum { BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), + BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE), BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; @@ -129,6 +130,33 @@ enum { BINDER_BUFFER_FLAG_HAS_PARENT = 0x01, }; +/* struct binder_fd_array_object - object describing an array of fds in a buffer + * @hdr: common header structure + * @num_fds: number of file descriptors in the buffer + * @parent: index in offset array to buffer holding the fd array + * @parent_offset: start offset of fd array in the buffer + * + * A binder_fd_array object represents an array of file + * descriptors embedded in a binder_buffer_object. It is + * different from a regular binder_buffer_object because it + * describes a list of file descriptors to fix up, not an opaque + * blob of memory, and hence the kernel needs to treat it differently. + * + * An example of how this would be used is with Android's + * native_handle_t object, which is a struct with a list of integers + * and a list of file descriptors. The native_handle_t struct itself + * will be represented by a struct binder_buffer_objct, whereas the + * embedded list of file descriptors is represented by a + * struct binder_fd_array_object with that binder_buffer_object as + * a parent. 
+ */
+struct binder_fd_array_object {
+	struct binder_object_header	hdr;
+	binder_size_t			num_fds;
+	binder_size_t			parent;
+	binder_size_t			parent_offset;
+};
+
 /*
  * On 64-bit platforms where user code may run in 32-bits the driver must
  * translate the buffer (and local binder) addresses appropriately.
--
cgit v1.2.3


From 71d0ed7079dffbc5cd0941d77d9b84e04109c9bb Mon Sep 17 00:00:00 2001
From: Amir Vadai
Date: Tue, 7 Feb 2017 09:56:07 +0200
Subject: net/act_pedit: Support using offset relative to the conventional
 network headers

Extend pedit to let the user set an offset relative to network headers.
This enables working with more complex header schemes (vs the simple
IPv4 case) where setting a fixed offset relative to the network header
is not enough.

After this patch, the action has information about the exact header
type and field inside this header. This information could be used later
on for hardware offloading of pedit.

Backward compatibility is maintained:
1. Old kernel <-> new userspace
2. New kernel <-> old userspace
3. add rule using new userspace <-> dump using old userspace
4. add rule using old userspace <-> dump using new userspace

When using the extended API, new netlink attributes are used. This way,
operation will fail in (1) and (3) - and no malformed rule will be
added or dumped. Of course, new user space that doesn't need the new
functionality can use the old netlink attributes and operation will
succeed. Since the action supports both APIs, (2) should work, and it
is easy to write the new user space to have (4) work.

The action enforces a strict check that only header types and commands
it can handle are accepted. This way future additions will be much
easier.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent ffff: \
    flower \
      ip_proto tcp \
      dst_port 80 \
    action pedit munge tcp dport set 8080 pipe \
    action mirred egress redirect dev veth0

This forwards TCP packets whose original destination port is 80, while
rewriting the destination port to 8080.

Signed-off-by: Amir Vadai
Reviewed-by: Or Gerlitz
Signed-off-by: David S. Miller
---
 include/net/tc_act/tc_pedit.h        |   5 +
 include/uapi/linux/tc_act/tc_pedit.h |  23 ++++
 net/sched/act_pedit.c                | 196 ++++++++++++++++++++++++++++++++---
 3 files changed, 208 insertions(+), 16 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 29e38d6823df..e076f22035a5 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -3,11 +3,16 @@

 #include

+struct tcf_pedit_key_ex {
+	enum pedit_header_type htype;
+};
+
 struct tcf_pedit {
 	struct tc_action	common;
 	unsigned char		tcfp_nkeys;
 	unsigned char		tcfp_flags;
 	struct tc_pedit_key	*tcfp_keys;
+	struct tcf_pedit_key_ex	*tcfp_keys_ex;
 };
 #define to_pedit(a) ((struct tcf_pedit *)a)

diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h
index 6389959a5157..22f19eeda997 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -11,10 +11,33 @@ enum {
 	TCA_PEDIT_TM,
 	TCA_PEDIT_PARMS,
 	TCA_PEDIT_PAD,
+	TCA_PEDIT_PARMS_EX,
+	TCA_PEDIT_KEYS_EX,
+	TCA_PEDIT_KEY_EX,
 	__TCA_PEDIT_MAX
 };
 #define TCA_PEDIT_MAX (__TCA_PEDIT_MAX - 1)

+enum {
+	TCA_PEDIT_KEY_EX_HTYPE = 1,
+	__TCA_PEDIT_KEY_EX_MAX
+};
+#define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
+
+ /* TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK is a special case for legacy users.
It + * means no specific header type - offset is relative to the network layer + */ +enum pedit_header_type { + TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0, + TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1, + TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2, + TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3, + TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4, + TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5, + __PEDIT_HDR_TYPE_MAX, +}; +#define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1) + struct tc_pedit_key { __u32 mask; /* AND */ __u32 val; /*XOR */ diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b27c4daec88f..fdd012bd3602 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -22,6 +22,7 @@ #include #include #include +#include #define PEDIT_TAB_MASK 15 @@ -30,18 +31,112 @@ static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, + [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED }, }; +static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = { + [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 }, +}; + +static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, + u8 n) +{ + struct tcf_pedit_key_ex *keys_ex; + struct tcf_pedit_key_ex *k; + const struct nlattr *ka; + int err = -EINVAL; + int rem; + + if (!nla || !n) + return NULL; + + keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); + if (!keys_ex) + return ERR_PTR(-ENOMEM); + + k = keys_ex; + + nla_for_each_nested(ka, nla, rem) { + struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1]; + + if (!n) { + err = -EINVAL; + goto err_out; + } + n--; + + if (nla_type(ka) != TCA_PEDIT_KEY_EX) { + err = -EINVAL; + goto err_out; + } + + err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka, + pedit_key_ex_policy); + if (err) + goto err_out; + + if (!tb[TCA_PEDIT_KEY_EX_HTYPE]) { + err = -EINVAL; + goto err_out; + } + + k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); + + if (k->htype > TCA_PEDIT_HDR_TYPE_MAX) { + err = -EINVAL; + goto err_out; + } + + k++; + } + + if (n) + goto err_out; + + return keys_ex; + +err_out: + kfree(keys_ex); + return ERR_PTR(err); +} + +static int tcf_pedit_key_ex_dump(struct sk_buff *skb, + struct tcf_pedit_key_ex *keys_ex, int n) +{ + struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX); + + for (; n > 0; n--) { + struct nlattr *key_start; + + key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX); + + if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype)) { + nlmsg_trim(skb, keys_start); + return -EINVAL; + } + + nla_nest_end(skb, key_start); + + keys_ex++; + } + + nla_nest_end(skb, keys_start); + + return 0; +} + static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; + struct nlattr *pattr; struct tc_pedit *parm; int ret = 0, err; struct tcf_pedit *p; struct tc_pedit_key *keys = NULL; + struct tcf_pedit_key_ex *keys_ex; int ksize; if (nla == NULL) @@ -51,13 +146,21 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (err < 0) return err; - if (tb[TCA_PEDIT_PARMS] == NULL) + pattr = tb[TCA_PEDIT_PARMS]; + if (!pattr) + pattr = tb[TCA_PEDIT_PARMS_EX]; + if (!pattr) return -EINVAL; - parm = nla_data(tb[TCA_PEDIT_PARMS]); + + parm = nla_data(pattr); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) + if (nla_len(pattr) < sizeof(*parm) + ksize) return -EINVAL; + keys_ex = 
tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); + if (IS_ERR(keys_ex)) + return PTR_ERR(keys_ex); + if (!tcf_hash_check(tn, parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; @@ -69,6 +172,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { tcf_hash_cleanup(*a, est); + kfree(keys_ex); return -ENOMEM; } ret = ACT_P_CREATED; @@ -81,8 +185,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); - if (keys == NULL) + if (!keys) { + kfree(keys_ex); return -ENOMEM; + } } } @@ -95,6 +201,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, p->tcfp_nkeys = parm->nkeys; } memcpy(p->tcfp_keys, parm->keys, ksize); + + kfree(p->tcfp_keys_ex); + p->tcfp_keys_ex = keys_ex; + spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) tcf_hash_insert(tn, *a); @@ -106,6 +216,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind) struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; kfree(keys); + kfree(p->tcfp_keys_ex); } static bool offset_valid(struct sk_buff *skb, int offset) @@ -119,38 +230,84 @@ static bool offset_valid(struct sk_buff *skb, int offset) return true; } +static int pedit_skb_hdr_offset(struct sk_buff *skb, + enum pedit_header_type htype, int *hoffset) +{ + int ret = -EINVAL; + + switch (htype) { + case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: + if (skb_mac_header_was_set(skb)) { + *hoffset = skb_mac_offset(skb); + ret = 0; + } + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK: + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: + *hoffset = skb_network_offset(skb); + ret = 0; + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: + case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: + if (skb_transport_header_was_set(skb)) { + *hoffset = skb_transport_offset(skb); + ret = 0; + } + break; + default: + ret = -EINVAL; + break; + }; + + return ret; +} + static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = to_pedit(a); int i; - unsigned int off; if (skb_unclone(skb, GFP_ATOMIC)) return p->tcf_action; - off = skb_network_offset(skb); - spin_lock(&p->tcf_lock); tcf_lastuse_update(&p->tcf_tm); if (p->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = p->tcfp_keys; + struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; + enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { u32 *ptr, _data; int offset = tkey->off; + int hoffset; + int rc; + + if (tkey_ex) { + htype = tkey_ex->htype; + tkey_ex++; + } + + rc = pedit_skb_hdr_offset(skb, htype, &hoffset); + if (rc) { + pr_info("tc filter pedit bad header type specified (0x%x)\n", + htype); + goto bad; + } if (tkey->offmask) { char *d, _d; - if (!offset_valid(skb, off + tkey->at)) { + if (!offset_valid(skb, hoffset + tkey->at)) { pr_info("tc filter pedit 'at' offset %d out of bounds\n", - off + tkey->at); + hoffset + tkey->at); goto bad; } - d = skb_header_pointer(skb, off + tkey->at, 1, + d = skb_header_pointer(skb, hoffset + tkey->at, 1, &_d); if (!d) goto bad; @@ -163,19 +320,19 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, goto bad; } - if (!offset_valid(skb, off + offset)) { + if (!offset_valid(skb, hoffset + offset)) { pr_info("tc filter pedit offset %d out of bounds\n", - offset); + hoffset + offset); goto bad; } - ptr = skb_header_pointer(skb, off + offset, 4, 
&_data); + ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data); if (!ptr) goto bad; /* just do it, baby */ *ptr = ((*ptr & tkey->mask) ^ tkey->val); if (ptr == &_data) - skb_store_bits(skb, off + offset, ptr, 4); + skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; @@ -215,8 +372,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->refcnt = p->tcf_refcnt - ref; opt->bindcnt = p->tcf_bindcnt - bind; - if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) - goto nla_put_failure; + if (p->tcfp_keys_ex) { + tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); + + if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) + goto nla_put_failure; + } else { + if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) + goto nla_put_failure; + } tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) -- cgit v1.2.3 From 853a14ba4682f820266469979c9297debc05f60c Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 7 Feb 2017 09:56:08 +0200 Subject: net/act_pedit: Introduce 'add' operation This command could be useful to inc/dec fields. For example, to forward any TCP packet and decrease its TTL: $ tc filter add dev enp0s9 protocol ip parent ffff: \ flower ip_proto tcp \ action pedit munge ip ttl add 0xff pipe \ action mirred egress redirect dev veth0 In the example above, adding 0xff to this u8 field is actually decreasing it by one, since the operation is masked. Signed-off-by: Amir Vadai Reviewed-by: Or Gerlitz Signed-off-by: David S. Miller --- include/net/tc_act/tc_pedit.h | 1 + include/uapi/linux/tc_act/tc_pedit.h | 8 ++++++++ net/sched/act_pedit.c | 30 ++++++++++++++++++++++++++---- 3 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index e076f22035a5..dfbd6ee0bc7c 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -5,6 +5,7 @@ struct tcf_pedit_key_ex { enum pedit_header_type htype; + enum pedit_cmd cmd; }; struct tcf_pedit { diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h index 22f19eeda997..143d2b31a316 100644 --- a/include/uapi/linux/tc_act/tc_pedit.h +++ b/include/uapi/linux/tc_act/tc_pedit.h @@ -20,6 +20,7 @@ enum { enum { TCA_PEDIT_KEY_EX_HTYPE = 1, + TCA_PEDIT_KEY_EX_CMD = 2, __TCA_PEDIT_KEY_EX_MAX }; #define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1) @@ -38,6 +39,13 @@ enum pedit_header_type { }; #define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1) +enum pedit_cmd { + TCA_PEDIT_KEY_EX_CMD_SET = 0, + TCA_PEDIT_KEY_EX_CMD_ADD = 1, + __PEDIT_CMD_MAX, +}; +#define TCA_PEDIT_CMD_MAX (__PEDIT_CMD_MAX - 1) + struct tc_pedit_key { __u32 mask; /* AND */ __u32 val; /*XOR */ diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index fdd012bd3602..c1310472f620 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -36,6 +36,7 @@ static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = { [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 }, + [TCA_PEDIT_KEY_EX_CMD] = { .type = NLA_U16 }, }; static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, @@ -75,14 +76,17 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, if (err) goto err_out; - if (!tb[TCA_PEDIT_KEY_EX_HTYPE]) { + if (!tb[TCA_PEDIT_KEY_EX_HTYPE] || + !tb[TCA_PEDIT_KEY_EX_CMD]) { err = -EINVAL; goto err_out; } k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); + 
k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);

-		if (k->htype > TCA_PEDIT_HDR_TYPE_MAX) {
+		if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
+		    k->cmd > TCA_PEDIT_CMD_MAX) {
 			err = -EINVAL;
 			goto err_out;
 		}
@@ -110,7 +114,8 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,

 		key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);

-		if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype)) {
+		if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
+		    nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
 			nlmsg_trim(skb, keys_start);
 			return -EINVAL;
 		}
@@ -280,15 +285,19 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 		struct tc_pedit_key *tkey = p->tcfp_keys;
 		struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
 		enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+		enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;

 		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
 			u32 *ptr, _data;
 			int offset = tkey->off;
 			int hoffset;
+			u32 val;
 			int rc;

 			if (tkey_ex) {
 				htype = tkey_ex->htype;
+				cmd = tkey_ex->cmd;
+
 				tkey_ex++;
 			}

@@ -330,7 +339,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 			if (!ptr)
 				goto bad;
 			/* just do it, baby */
-			*ptr = ((*ptr & tkey->mask) ^ tkey->val);
+			switch (cmd) {
+			case TCA_PEDIT_KEY_EX_CMD_SET:
+				val = tkey->val;
+				break;
+			case TCA_PEDIT_KEY_EX_CMD_ADD:
+				val = (*ptr + tkey->val) & ~tkey->mask;
+				break;
+			default:
+				pr_info("tc filter pedit bad command (%d)\n",
+					cmd);
+				goto bad;
+			}
+
+			*ptr = ((*ptr & tkey->mask) ^ val);
 			if (ptr == &_data)
 				skb_store_bits(skb, hoffset + offset, ptr, 4);
 		}
--
cgit v1.2.3


From adf200f31c000d707e4afe238ed1d1199e0cce7c Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Thu, 9 Feb 2017 15:54:33 +0100
Subject: devlink: fix the name of eswitch commands

The eswitch_[gs]et command is supposed to be similar to the port_[gs]et
command - for multiple eswitch attributes. However, when it was
introduced by 08f4b5918b2d ("net/devlink: Add E-Switch mode control")
it was wrongly named with the word "mode" in it. So fix this now; keep
the original enum values in place but mark them obsolete.

Signed-off-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 include/uapi/linux/devlink.h | 10 ++++++++--
 net/core/devlink.c           | 18 +++++++++---------
 2 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 9014c33d4e77..0f1f3a12e23c 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -57,8 +57,14 @@ enum devlink_command {
 	DEVLINK_CMD_SB_OCC_SNAPSHOT,
 	DEVLINK_CMD_SB_OCC_MAX_CLEAR,

-	DEVLINK_CMD_ESWITCH_MODE_GET,
-	DEVLINK_CMD_ESWITCH_MODE_SET,
+	DEVLINK_CMD_ESWITCH_GET,
+#define DEVLINK_CMD_ESWITCH_MODE_GET /* obsolete, never use this! */ \
+	DEVLINK_CMD_ESWITCH_GET
+
+	DEVLINK_CMD_ESWITCH_SET,
+#define DEVLINK_CMD_ESWITCH_MODE_SET /* obsolete, never use this!
*/ \ + DEVLINK_CMD_ESWITCH_SET + /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 2b5bf9efa720..7aa8e5369dc5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1435,8 +1435,8 @@ out: return err; } -static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; @@ -1450,7 +1450,7 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; - err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, + err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, info->snd_portid, info->snd_seq, 0); if (err) { @@ -1461,8 +1461,8 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; @@ -1629,15 +1629,15 @@ static const struct genl_ops devlink_nl_ops[] = { DEVLINK_NL_FLAG_LOCK_PORTS, }, { - .cmd = DEVLINK_CMD_ESWITCH_MODE_GET, - .doit = devlink_nl_cmd_eswitch_mode_get_doit, + .cmd = DEVLINK_CMD_ESWITCH_GET, + .doit = devlink_nl_cmd_eswitch_get_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { - .cmd = DEVLINK_CMD_ESWITCH_MODE_SET, - .doit = devlink_nl_cmd_eswitch_mode_set_doit, + .cmd = DEVLINK_CMD_ESWITCH_SET, + .doit = devlink_nl_cmd_eswitch_set_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, -- cgit v1.2.3 From abdbf4d635a9a8c956bb9757a9d4f08c2abe1f97 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 3 Feb 2017 16:46:13 -0500 Subject: PCI/DPC: Wait for Root Port busy to clear Per PCIe r3.1, sec 6.2.10 and sec 7.13.4, on Root Ports that support "RP Extensions for DPC", When the DPC Trigger Status bit is Set and the DPC RP Busy bit is Set, software must leave the Root Port in DPC until the DPC RP Busy bit reads 0b. Wait up to 1 second for the Root Port to become non-busy. 
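In pseudo-C, the wait the spec asks for boils down to the following
sketch (dpc_wait_rp_inactive() in the diff below is the real
implementation; pdev and cap_pos stand for the Root Port and its DPC
capability offset):

	/* Sketch: poll the DPC RP Busy bit for up to 1 second (HZ jiffies). */
	unsigned long timeout = jiffies + HZ;
	u16 status;

	pci_read_config_word(pdev, cap_pos + PCI_EXP_DPC_STATUS, &status);
	while ((status & PCI_EXP_DPC_RP_BUSY) && !time_after(jiffies, timeout)) {
		msleep(10);
		pci_read_config_word(pdev, cap_pos + PCI_EXP_DPC_STATUS, &status);
	}
	/* Only once RP Busy reads 0b may software release the port from DPC. */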
[bhelgaas: changelog, spec references] Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/pcie-dpc.c | 26 +++++++++++++++++++++++++- include/uapi/linux/pci_regs.h | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c index 5a261fd4f03d..d4d70ef4a2d7 100644 --- a/drivers/pci/pcie/pcie-dpc.c +++ b/drivers/pci/pcie/pcie-dpc.c @@ -19,8 +19,28 @@ struct dpc_dev { struct pcie_device *dev; struct work_struct work; int cap_pos; + bool rp; }; +static int dpc_wait_rp_inactive(struct dpc_dev *dpc) +{ + unsigned long timeout = jiffies + HZ; + struct pci_dev *pdev = dpc->dev->port; + u16 status; + + pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_STATUS, &status); + while (status & PCI_EXP_DPC_RP_BUSY && + !time_after(jiffies, timeout)) { + msleep(10); + pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_STATUS, &status); + } + if (status & PCI_EXP_DPC_RP_BUSY) { + dev_warn(&pdev->dev, "DPC root port still busy\n"); + return -EBUSY; + } + return 0; +} + static void dpc_wait_link_inactive(struct pci_dev *pdev) { unsigned long timeout = jiffies + HZ; @@ -33,7 +53,7 @@ static void dpc_wait_link_inactive(struct pci_dev *pdev) pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status); } if (lnk_status & PCI_EXP_LNKSTA_DLLLA) - dev_warn(&pdev->dev, "Link state not disabled for DPC event"); + dev_warn(&pdev->dev, "Link state not disabled for DPC event\n"); } static void interrupt_event_handler(struct work_struct *work) @@ -52,6 +72,8 @@ static void interrupt_event_handler(struct work_struct *work) pci_unlock_rescan_remove(); dpc_wait_link_inactive(pdev); + if (dpc->rp && dpc_wait_rp_inactive(dpc)) + return; pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_STATUS, PCI_EXP_DPC_STATUS_TRIGGER | PCI_EXP_DPC_STATUS_INTERRUPT); } @@ -115,6 +137,8 @@ static int dpc_probe(struct pcie_device *dev) pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap); pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl); + dpc->rp = (cap & PCI_EXP_DPC_CAP_RP_EXT); + ctl |= PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN; pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 174d1147081b..c1b94b044795 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -973,6 +973,7 @@ #define PCI_EXP_DPC_STATUS 8 /* DPC Status */ #define PCI_EXP_DPC_STATUS_TRIGGER 0x01 /* Trigger Status */ #define PCI_EXP_DPC_STATUS_INTERRUPT 0x08 /* Interrupt Status */ +#define PCI_EXP_DPC_RP_BUSY 0x10 /* Root Port Busy */ #define PCI_EXP_DPC_SOURCE_ID 10 /* DPC Source Identifier */ -- cgit v1.2.3 From 0fc1223f0e77a748f7040562faaa7027f7db71ca Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Mon, 2 Jan 2017 22:34:10 -0800 Subject: PCI/ASPM: Add L1 substate capability structure register definitions Add L1 substate capability structure register definitions for use in subsequent patches. See the PCIe r3.1 spec, sec 7.33. 
[bhelgaas: add PCIe spec reference]
Signed-off-by: Rajat Jain
Signed-off-by: Bjorn Helgaas
---
 include/uapi/linux/pci_regs.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 174d1147081b..f48d06e2bb4d 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -682,6 +682,7 @@
 #define PCI_EXT_CAP_ID_PMUX	0x1A	/* Protocol Multiplexing */
 #define PCI_EXT_CAP_ID_PASID	0x1B	/* Process Address Space ID */
 #define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
+#define PCI_EXT_CAP_ID_L1SS	0x1E	/* L1 PM Substates */
 #define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
 #define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PTM

@@ -985,4 +986,19 @@
 #define PCI_PTM_CTRL_ENABLE	0x00000001	/* PTM enable */
 #define PCI_PTM_CTRL_ROOT	0x00000002	/* Root select */

+/* L1 PM Substates */
+#define PCI_L1SS_CAP		4	/* capability register */
+#define PCI_L1SS_CAP_PCIPM_L1_2	1	/* PCI PM L1.2 Support */
+#define PCI_L1SS_CAP_PCIPM_L1_1	2	/* PCI PM L1.1 Support */
+#define PCI_L1SS_CAP_ASPM_L1_2	4	/* ASPM L1.2 Support */
+#define PCI_L1SS_CAP_ASPM_L1_1	8	/* ASPM L1.1 Support */
+#define PCI_L1SS_CAP_L1_PM_SS	16	/* L1 PM Substates Support */
+#define PCI_L1SS_CTL1		8	/* Control Register 1 */
+#define PCI_L1SS_CTL1_PCIPM_L1_2	1	/* PCI PM L1.2 Enable */
+#define PCI_L1SS_CTL1_PCIPM_L1_1	2	/* PCI PM L1.1 Support */
+#define PCI_L1SS_CTL1_ASPM_L1_2	4	/* ASPM L1.2 Support */
+#define PCI_L1SS_CTL1_ASPM_L1_1	8	/* ASPM L1.1 Support */
+#define PCI_L1SS_CTL1_L1SS_MASK	0x0000000F
+#define PCI_L1SS_CTL2		0xC	/* Control Register 2 */
+
 #endif /* LINUX_PCI_REGS_H */
--
cgit v1.2.3


From 8c4d4e8b5626fec965fd5034e5bd5e57790f243f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 10 Feb 2017 12:08:17 +0100
Subject: netfilter: nfnetlink: allow to check for generation ID

This patch allows userspace to specify the generation ID that has been
used to build an incremental batch update.

If userspace specifies the generation ID in the batch message as an
attribute, then nfnetlink compares it to the current generation ID so
you can be sure you are working against the right baseline. Otherwise,
bail out with ERESTART so userspace knows that its changeset is stale
and needs to respin.

Userspace can do this transparently at the cost of taking slightly more
time to refresh caches and rework the changeset.

This check is optional: if there is no NFNL_BATCH_GENID attribute in
the batch begin message, then no check is performed.
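For illustration, a libmnl-based userspace batch could carry the
generation ID roughly as follows (a sketch, not taken from this patch;
buf is an assumed message buffer and genid a previously fetched
generation ID):

	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct nfgenmsg *nfg;

	nlh->nlmsg_type = NFNL_MSG_BATCH_BEGIN;
	nlh->nlmsg_flags = NLM_F_REQUEST;
	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = AF_UNSPEC;
	nfg->version = NFNETLINK_V0;
	nfg->res_id = htons(NFNL_SUBSYS_NFTABLES);
	/* Pin the changeset to the generation it was computed against;
	 * the kernel reads this back with ntohl(nla_get_be32(...)).
	 */
	mnl_attr_put_u32(nlh, NFNL_BATCH_GENID, htonl(genid));

On ERESTART, userspace would refresh its cache and rebuild the batch
against the new generation ID.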
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 1 + include/uapi/linux/netfilter/nfnetlink.h | 12 ++++++++++++ net/netfilter/nfnetlink.c | 31 +++++++++++++++++++++++++++---- 3 files changed, 40 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 1d82dd5e9a08..1b49209dd5c7 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -28,6 +28,7 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb); + bool (*valid_genid)(struct net *net, u32 genid); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 4bb8cb7730e7..a09906a30d77 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -65,4 +65,16 @@ struct nfgenmsg { #define NFNL_MSG_BATCH_BEGIN NLMSG_MIN_TYPE #define NFNL_MSG_BATCH_END NLMSG_MIN_TYPE+1 +/** + * enum nfnl_batch_attributes - nfnetlink batch netlink attributes + * + * @NFNL_BATCH_GENID: generation ID for this changeset (NLA_U32) + */ +enum nfnl_batch_attributes { + NFNL_BATCH_UNSPEC, + NFNL_BATCH_GENID, + __NFNL_BATCH_MAX +}; +#define NFNL_BATCH_MAX (__NFNL_BATCH_MAX - 1) + #endif /* _UAPI_NFNETLINK_H */ diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index ca645a3b1375..a2148d0bc50e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -3,7 +3,7 @@ * * (C) 2001 by Jay Schulist , * (C) 2002-2005 by Harald Welte - * (C) 2005,2007 by Pablo Neira Ayuso + * (C) 2005-2017 by Pablo Neira Ayuso * * Initial netfilter messages via netlink development funded and * generally made possible by Network Robots, Inc. 
(www.networkrobots.com)
@@ -273,7 +273,7 @@ enum {
 };

 static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
-				u16 subsys_id)
+				u16 subsys_id, u32 genid)
 {
 	struct sk_buff *oskb = skb;
 	struct net *net = sock_net(skb->sk);
@@ -315,6 +315,12 @@ replay:
 		return kfree_skb(skb);
 	}

+	if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
+		nfnl_unlock(subsys_id);
+		netlink_ack(oskb, nlh, -ERESTART);
+		return kfree_skb(skb);
+	}
+
 	while (skb->len >= nlmsg_total_size(0)) {
 		int msglen, type;

@@ -436,11 +442,20 @@ done:
 	kfree_skb(skb);
 }

+static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
+	[NFNL_BATCH_GENID]	= { .type = NLA_U32 },
+};
+
 static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
+	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+	struct nlattr *attr = (void *)nlh + min_len;
+	struct nlattr *cda[NFNL_BATCH_MAX + 1];
+	int attrlen = nlh->nlmsg_len - min_len;
 	struct nfgenmsg *nfgenmsg;
+	int msglen, err;
+	u32 gen_id = 0;
 	u16 res_id;
-	int msglen;

 	msglen = NLMSG_ALIGN(nlh->nlmsg_len);
 	if (msglen > skb->len)
@@ -450,6 +465,14 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 	    skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
 		return;

+	err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy);
+	if (err < 0) {
+		netlink_ack(skb, nlh, err);
+		return;
+	}
+	if (cda[NFNL_BATCH_GENID])
+		gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID]));
+
 	nfgenmsg = nlmsg_data(nlh);
 	skb_pull(skb, msglen);
 	/* Work around old nft using host byte order */
@@ -458,7 +481,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
 	else
 		res_id = ntohs(nfgenmsg->res_id);

-	nfnetlink_rcv_batch(skb, nlh, res_id);
+	nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
 }

 static void nfnetlink_rcv(struct sk_buff *skb)
--
cgit v1.2.3


From 1a94e38d254b3622d5d53f74b3b716b0fcab0ba8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 10 Feb 2017 12:08:23 +0100
Subject: netfilter: nf_tables: add NFTA_RULE_ID attribute

This new attribute allows us to uniquely identify a rule within a
transaction. Robots may trigger an insertion followed by a deletion in
a batch; in that scenario we still don't have a public rule handle that
we can use to delete the rule. This is similar to the NFTA_SET_ID
attribute that allows us to refer to an anonymous set from a batch.
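As a sketch of the intended flow, both messages of such a batch would
carry the same userspace-chosen id (nlh_new and nlh_del are assumed
NFT_MSG_NEWRULE/NFT_MSG_DELRULE message headers built elsewhere in the
same batch; the kernel parses the attribute as big endian):

	uint32_t rule_id = 1;	/* unique within this transaction */

	/* Tag the not-yet-committed rule with an id on insertion... */
	mnl_attr_put_u32(nlh_new, NFTA_RULE_ID, htonl(rule_id));

	/* ...so the deletion in the same batch can name it without a handle. */
	mnl_attr_put_u32(nlh_del, NFTA_RULE_ID, htonl(rule_id));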
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 21ce50e6d0c5..ac84686aaafb 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1202,10 +1202,13 @@ struct nft_trans { struct nft_trans_rule { struct nft_rule *rule; + u32 rule_id; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) +#define nft_trans_rule_id(trans) \ + (((struct nft_trans_rule *)trans->data)->rule_id) struct nft_trans_set { struct nft_set *set; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 207951516ede..05215d30fe5c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -207,6 +207,7 @@ enum nft_chain_attributes { * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes) * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64) * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN) + * @NFTA_RULE_ID: uniquely identifies a rule in a transaction (NLA_U32) */ enum nft_rule_attributes { NFTA_RULE_UNSPEC, @@ -218,6 +219,7 @@ enum nft_rule_attributes { NFTA_RULE_POSITION, NFTA_RULE_USERDATA, NFTA_RULE_PAD, + NFTA_RULE_ID, __NFTA_RULE_MAX }; #define NFTA_RULE_MAX (__NFTA_RULE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 71c60a04b66b..6c782532615f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -240,6 +240,10 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, if (trans == NULL) return NULL; + if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) { + nft_trans_rule_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); + } nft_trans_rule(trans) = rule; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -2293,6 +2297,22 @@ err1: return err; } +static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_rule *rule = nft_trans_rule(trans); + + if (trans->msg_type == NFT_MSG_NEWRULE && + id == nft_trans_rule_id(trans)) + return rule; + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_delrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2330,6 +2350,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, if (IS_ERR(rule)) return PTR_ERR(rule); + err = nft_delrule(&ctx, rule); + } else if (nla[NFTA_RULE_ID]) { + rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); + if (IS_ERR(rule)) + return PTR_ERR(rule); + err = nft_delrule(&ctx, rule); } else { err = nft_delrule_by_chain(&ctx); -- cgit v1.2.3 From 7f677633379b4abb3281cdbe7e7006f049305c03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 10 Feb 2017 20:28:24 -0800 Subject: bpf: introduce BPF_F_ALLOW_OVERRIDE flag If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command to the given cgroup the descendent cgroup will be able to override effective bpf program that was inherited from this cgroup. 
By default it's not passed, therefore override is disallowed. Examples: 1. prog X attached to /A with default prog Y fails to attach to /A/B and /A/B/C Everything under /A runs prog X 2. prog X attached to /A with allow_override. prog Y fails to attach to /A/B with default (non-override) prog M attached to /A/B with allow_override. Everything under /A/B runs prog M only. 3. prog X attached to /A with allow_override. prog Y fails to attach to /A with default. The user has to detach first to switch the mode. In the future this behavior may be extended with a chain of non-overridable programs. Also fix the bug where detach from cgroup where nothing is attached was not throwing error. Return ENOENT in such case. Add several testcases and adjust libbpf. Fixes: 3007098494be ("cgroup: add support for eBPF programs") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Tejun Heo Acked-by: Daniel Mack Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 13 ++++---- include/uapi/linux/bpf.h | 7 +++++ kernel/bpf/cgroup.c | 59 +++++++++++++++++++++++++++------- kernel/bpf/syscall.c | 20 ++++++++---- kernel/cgroup.c | 9 +++--- samples/bpf/test_cgrp2_attach.c | 2 +- samples/bpf/test_cgrp2_attach2.c | 68 +++++++++++++++++++++++++++++++++++++--- samples/bpf/test_cgrp2_sock.c | 2 +- samples/bpf/test_cgrp2_sock2.c | 2 +- tools/lib/bpf/bpf.c | 4 ++- tools/lib/bpf/bpf.h | 3 +- 11 files changed, 151 insertions(+), 38 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 92bc89ae7e20..c970a25d2a49 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -21,20 +21,19 @@ struct cgroup_bpf { */ struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE]; + bool disallow_override[MAX_BPF_ATTACH_TYPE]; }; void cgroup_bpf_put(struct cgroup *cgrp); void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type); +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool overridable); /* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type); +int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, bool overridable); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eb0e87dbe9f..d2b0ac799d03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,12 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command + * to the given target_fd cgroup the descendent cgroup will be able to + * override effective bpf program that was inherited from this cgroup + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -171,6 +177,7 @@ union bpf_attr { __u32 target_fd; /* container object to attach to */ __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; + __u32 attach_flags; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a515f7b007c6..da0f53690295 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -52,6 +52,7 
@@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) e = rcu_dereference_protected(parent->bpf.effective[type], lockdep_is_held(&cgroup_mutex)); rcu_assign_pointer(cgrp->bpf.effective[type], e); + cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; } } @@ -82,30 +83,63 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) * * Must be called with cgroup_mutex held. */ -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type) +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool new_overridable) { - struct bpf_prog *old_prog, *effective; + struct bpf_prog *old_prog, *effective = NULL; struct cgroup_subsys_state *pos; + bool overridable = true; - old_prog = xchg(cgrp->bpf.prog + type, prog); + if (parent) { + overridable = !parent->bpf.disallow_override[type]; + effective = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + } + + if (prog && effective && !overridable) + /* if parent has non-overridable prog attached, disallow + * attaching new programs to descendent cgroup + */ + return -EPERM; + + if (prog && effective && overridable != new_overridable) + /* if parent has overridable prog attached, only + * allow overridable programs in descendent cgroup + */ + return -EPERM; - effective = (!prog && parent) ? - rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)) : - prog; + old_prog = cgrp->bpf.prog[type]; + + if (prog) { + overridable = new_overridable; + effective = prog; + if (old_prog && + cgrp->bpf.disallow_override[type] == new_overridable) + /* disallow attaching non-overridable on top + * of existing overridable in this cgroup + * and vice versa + */ + return -EPERM; + } + + if (!prog && !old_prog) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + + cgrp->bpf.prog[type] = prog; css_for_each_descendant_pre(pos, &cgrp->self) { struct cgroup *desc = container_of(pos, struct cgroup, self); /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) + if (desc->bpf.prog[type] && desc != cgrp) { pos = css_rightmost_descendant(pos); - else + } else { rcu_assign_pointer(desc->bpf.effective[type], effective); + desc->bpf.disallow_override[type] = !overridable; + } } if (prog) @@ -115,6 +149,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp, bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + return 0; } /** diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 19b6129eab23..bbb016adbaeb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -920,13 +920,14 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_type +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int bpf_prog_attach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; struct bpf_prog *prog; struct cgroup *cgrp; - enum bpf_prog_type ptype; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -934,6 +935,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; + if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + return -EINVAL; + switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: @@ -956,10 +960,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) return 
PTR_ERR(cgrp); } - cgroup_bpf_update(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, + attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + if (ret) + bpf_prog_put(prog); cgroup_put(cgrp); - return 0; + return ret; } #define BPF_PROG_DETACH_LAST_FIELD attach_type @@ -967,6 +974,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { struct cgroup *cgrp; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -982,7 +990,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - cgroup_bpf_update(cgrp, NULL, attr->attach_type); + ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; @@ -990,7 +998,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return -EINVAL; } - return 0; + return ret; } #endif /* CONFIG_CGROUP_BPF */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 688dd02af985..53bbca7c4859 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6498,15 +6498,16 @@ static __init int cgroup_namespaces_init(void) subsys_initcall(cgroup_namespaces_init); #ifdef CONFIG_CGROUP_BPF -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type) +int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, bool overridable) { struct cgroup *parent = cgroup_parent(cgrp); + int ret; mutex_lock(&cgroup_mutex); - __cgroup_bpf_update(cgrp, parent, prog, type); + ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); mutex_unlock(&cgroup_mutex); + return ret; } #endif /* CONFIG_CGROUP_BPF */ diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 504058631ffc..4bfcaf93fcf3 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ -104,7 +104,7 @@ static int attach_filter(int cg_fd, int type, int verdict) return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, type); + ret = bpf_prog_attach(prog_fd, cg_fd, type, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 6e69be37f87f..3049b1f26267 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -79,11 +79,12 @@ int main(int argc, char **argv) if (join_cgroup(FOO)) goto err; - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo"); goto err; } + printf("Attached DROP prog. This ping in cgroup /foo should fail...\n"); assert(system(PING_CMD) != 0); /* Create cgroup /foo/bar, get fd, and join it */ @@ -94,24 +95,27 @@ int main(int argc, char **argv) if (join_cgroup(BAR)) goto err; + printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo/bar"); goto err; } + printf("Attached PASS prog. 
This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { log_err("Detaching program from /foo/bar"); goto err; } + printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n" + "This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -121,8 +125,60 @@ int main(int argc, char **argv) goto err; } + printf("Attached PASS from /foo/bar and detached DROP from /foo.\n" + "This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { + errno = 0; + log_err("Unexpected success attaching prog to /foo/bar"); + goto err; + } + + if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching program from /foo/bar"); + goto err; + } + + if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { + errno = 0; + log_err("Unexpected success in double detach from /foo"); + goto err; + } + + if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching non-overridable prog to /foo"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { + errno = 0; + log_err("Unexpected success attaching non-overridable prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + errno = 0; + log_err("Unexpected success attaching overridable prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + errno = 0; + log_err("Unexpected success attaching overridable prog to /foo"); + goto err; + } + + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching different non-overridable prog to /foo"); + goto err; + } + goto out; err: @@ -132,5 +188,9 @@ out: close(foo); close(bar); cleanup_cgroup_environment(); + if (!rc) + printf("PASS\n"); + else + printf("FAIL\n"); return rc; } diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index 0791b949cbe4..c3cfb23e23b5 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -75,7 +75,7 @@ int main(int argc, char **argv) return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE); + ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c index 455ef0d06e93..db036077b644 100644 --- a/samples/bpf/test_cgrp2_sock2.c +++ b/samples/bpf/test_cgrp2_sock2.c @@ -55,7 +55,7 @@ int main(int argc, char **argv) } ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, - BPF_CGROUP_INET_SOCK_CREATE); + BPF_CGROUP_INET_SOCK_CREATE, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno));
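Before the libbpf change below, a minimal user-space sketch of the semantics the tests above exercise (illustrative only, not part of the patch: prog_fd_drop/prog_fd_allow are assumed fds of already-loaded programs, the cgroup paths are hypothetical, and the literal flag value 1 corresponds to BPF_F_ALLOW_OVERRIDE, as in the samples):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include "bpf.h"	/* tools/lib/bpf */

	static int attach_pair(int prog_fd_drop, int prog_fd_allow)
	{
		int parent = open("/sys/fs/cgroup/foo", O_RDONLY | O_DIRECTORY);
		int child = open("/sys/fs/cgroup/foo/bar", O_RDONLY | O_DIRECTORY);

		if (parent < 0 || child < 0)
			return -1;
		/* Overridable attach: descendant cgroups may install
		 * their own program on top of this one.
		 */
		if (bpf_prog_attach(prog_fd_drop, parent, BPF_CGROUP_INET_EGRESS, 1))
			perror("attach to /foo");
		/* Succeeds only because the parent's program is overridable
		 * and this attach is overridable as well; with flags == 0 the
		 * kernel rejects the mix with -EPERM.
		 */
		if (bpf_prog_attach(prog_fd_allow, child, BPF_CGROUP_INET_EGRESS, 1))
			perror("attach to /foo/bar");
		close(parent);
		close(child);
		return 0;
	}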
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 3ddb58a36d3c..ae752fa4eaa7 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -168,7 +168,8 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; @@ -176,6 +177,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; + attr.attach_flags = flags; return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index a2f9853dd882..4ac6c4b84100 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -41,7 +41,8 @@ int bpf_map_delete_elem(int fd, void *key); int bpf_map_get_next_key(int fd, void *key, void *next_key); int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); -int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type); +int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, + unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); -- cgit v1.2.3

From 35879ee4769099905fa3bda0b21e73d434e2df6a Mon Sep 17 00:00:00 2001
From: Hans Verkuil
Date: Fri, 10 Feb 2017 07:18:36 -0200
Subject: [media] videodev2.h: go back to limited range Y'CbCr for SRGB and ADOBERGB

This reverts commit 7e0739cd9c40 ("[media] videodev2.h: fix sYCC/AdobeYCC default quantization range"). The problem is that many drivers can convert R'G'B' content (often from sensors) to Y'CbCr, but they all produce limited range Y'CbCr. To stay backwards compatible the default quantization range for sRGB and AdobeRGB Y'CbCr encoding should be limited range, not full range, even though the corresponding standards specify full range. Update the V4L2_MAP_QUANTIZATION_DEFAULT define accordingly and also update the documentation.

Fixes: 7e0739cd9c40 ("[media] videodev2.h: fix sYCC/AdobeYCC default quantization range")
Signed-off-by: Hans Verkuil
Cc: # for v4.9 and up
Signed-off-by: Mauro Carvalho Chehab
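To make the resulting defaults concrete, a small sketch (illustrative only, not part of the change) compiled against the patched header:

	#include <assert.h>
	#include <linux/videodev2.h>

	int main(void)
	{
		/* Y'CbCr (is_rgb_or_hsv == 0) in the sRGB colorspace now
		 * defaults to limited range after this patch...
		 */
		assert(V4L2_MAP_QUANTIZATION_DEFAULT(0, V4L2_COLORSPACE_SRGB,
						     V4L2_YCBCR_ENC_601) ==
		       V4L2_QUANTIZATION_LIM_RANGE);
		/* ...while R'G'B'/HSV data for sRGB keeps its full-range
		 * default.
		 */
		assert(V4L2_MAP_QUANTIZATION_DEFAULT(1, V4L2_COLORSPACE_SRGB,
						     V4L2_YCBCR_ENC_601) ==
		       V4L2_QUANTIZATION_FULL_RANGE);
		return 0;
	}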
---
 Documentation/media/uapi/v4l/pixfmt-007.rst | 23 +++++++++++++++++------
 include/uapi/linux/videodev2.h              |  7 +++----
 2 files changed, 20 insertions(+), 10 deletions(-)
(limited to 'include/uapi')

diff --git a/Documentation/media/uapi/v4l/pixfmt-007.rst b/Documentation/media/uapi/v4l/pixfmt-007.rst index 44bb5a7059b3..95a23a28c595 100644 --- a/Documentation/media/uapi/v4l/pixfmt-007.rst +++ b/Documentation/media/uapi/v4l/pixfmt-007.rst @@ -211,7 +211,13 @@ Colorspace sRGB (V4L2_COLORSPACE_SRGB) The :ref:`srgb` standard defines the colorspace used by most webcams and computer graphics. The default transfer function is ``V4L2_XFER_FUNC_SRGB``. The default Y'CbCr encoding is -``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is full range. +``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is limited range. + +Note that the :ref:`sycc` standard specifies full range quantization, +however all current capture hardware supported by the kernel converts +R'G'B' to limited range Y'CbCr. So choosing full range as the default +would break how applications interpret the quantization range. + The chromaticities of the primary colors and the white reference are: @@ -276,7 +282,7 @@ the following ``V4L2_YCBCR_ENC_601`` encoding as defined by :ref:`sycc`: Y' is clamped to the range [0…1] and Cb and Cr are clamped to the range [-0.5…0.5]. This transform is identical to one defined in SMPTE -170M/BT.601. The Y'CbCr quantization is full range. +170M/BT.601. The Y'CbCr quantization is limited range. .. _col-adobergb: @@ -288,10 +294,15 @@ The :ref:`adobergb` standard defines the colorspace used by computer graphics that use the AdobeRGB colorspace. This is also known as the :ref:`oprgb` standard. The default transfer function is ``V4L2_XFER_FUNC_ADOBERGB``. The default Y'CbCr encoding is -``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is full -range. The chromaticities of the primary colors and the white reference -are: +``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is limited +range. + +Note that the :ref:`oprgb` standard specifies full range quantization, +however all current capture hardware supported by the kernel converts +R'G'B' to limited range Y'CbCr. So choosing full range as the default +would break how applications interpret the quantization range. +The chromaticities of the primary colors and the white reference are: .. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}| @@ -344,7 +355,7 @@ the following ``V4L2_YCBCR_ENC_601`` encoding: Y' is clamped to the range [0…1] and Cb and Cr are clamped to the range [-0.5…0.5]. This transform is identical to one defined in SMPTE -170M/BT.601. The Y'CbCr quantization is full range. +170M/BT.601. The Y'CbCr quantization is limited range. .. _col-bt2020: diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 46e8a2e369f9..45184a2ef66c 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -362,8 +362,8 @@ enum v4l2_quantization { /* * The default for R'G'B' quantization is always full range, except * for the BT2020 colorspace. For Y'CbCr the quantization is always - * limited range, except for COLORSPACE_JPEG, SRGB, ADOBERGB, - * XV601 or XV709: those are full range. + * limited range, except for COLORSPACE_JPEG, XV601 or XV709: those + * are full range. */ V4L2_QUANTIZATION_DEFAULT = 0, V4L2_QUANTIZATION_FULL_RANGE = 1, @@ -379,8 +379,7 @@ enum v4l2_quantization { (((is_rgb_or_hsv) && (colsp) == V4L2_COLORSPACE_BT2020) ? \ V4L2_QUANTIZATION_LIM_RANGE : \ (((is_rgb_or_hsv) || (ycbcr_enc) == V4L2_YCBCR_ENC_XV601 || \ - (ycbcr_enc) == V4L2_YCBCR_ENC_XV709 || (colsp) == V4L2_COLORSPACE_JPEG || \ - (colsp) == V4L2_COLORSPACE_ADOBERGB || (colsp) == V4L2_COLORSPACE_SRGB) ? \ + (ycbcr_enc) == V4L2_YCBCR_ENC_XV709 || (colsp) == V4L2_COLORSPACE_JPEG) ? \ V4L2_QUANTIZATION_FULL_RANGE : V4L2_QUANTIZATION_LIM_RANGE)) enum v4l2_priority { -- cgit v1.2.3

From ca86cad7380e373fa17bc0ee8aff121380323e69 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs
Date: Sat, 4 Feb 2017 13:10:38 -0500
Subject: audit: log module name on init_module

This adds a new auxiliary record MODULE_INIT to the SYSCALL event. We get finit_module for free since it made most sense to hook this in to load_module().
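As an illustration (not part of the patch), a minimal loader exercising the finit_module path, with the .ko path purely hypothetical:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		/* Path is hypothetical; any module will do. */
		int fd = open("/lib/modules/dummy_module.ko", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Lands in load_module() via finit_module(2), so the new
		 * AUDIT_KERN_MODULE auxiliary record is emitted alongside
		 * the SYSCALL event.
		 */
		if (syscall(SYS_finit_module, fd, "", 0))
			perror("finit_module");
		close(fd);
		return 0;
	}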
https://github.com/linux-audit/audit-kernel/issues/7
https://github.com/linux-audit/audit-kernel/wiki/RFE-Module-Load-Record-Format

Signed-off-by: Richard Guy Briggs
Acked-by: Jessica Yu
[PM: corrected links in the commit description]
Signed-off-by: Paul Moore
---
 include/linux/audit.h      | 12 ++++++++++++
 include/uapi/linux/audit.h |  1 +
 kernel/audit.h             |  3 +++
 kernel/auditsc.c           | 14 ++++++++++++++
 kernel/module.c            |  5 ++++-
 5 files changed, 34 insertions(+), 1 deletion(-)
(limited to 'include/uapi')

diff --git a/include/linux/audit.h b/include/linux/audit.h index 2be99b276d29..aba3a2684300 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -360,6 +360,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); +extern void __audit_log_kern_module(char *name); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -450,6 +451,12 @@ static inline void audit_mmap_fd(int fd, int flags) __audit_mmap_fd(fd, flags); } +static inline void audit_log_kern_module(char *name) +{ + if (!audit_dummy_context()) + __audit_log_kern_module(name); +} + extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ @@ -561,6 +568,11 @@ static inline void audit_log_capset(const struct cred *new, { } static inline void audit_mmap_fd(int fd, int flags) { } + +static inline void audit_log_kern_module(char *name) +{ +} + static inline void audit_ptrace(struct task_struct *t) { } #define audit_n_rules 0 diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 3f24110ae63c..3c02bb2ff779 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -111,6 +111,7 @@ #define AUDIT_PROCTITLE 1327 /* Proctitle emit event */ #define AUDIT_FEATURE_CHANGE 1328 /* audit log listing feature changes */ #define AUDIT_REPLACE 1329 /* Replace auditd if this packet unanswered */ +#define AUDIT_KERN_MODULE 1330 /* Kernel Module events */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/audit.h b/kernel/audit.h index 431444c3708b..144b7ebd2deb 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -199,6 +199,9 @@ struct audit_context { struct { int argc; } execve; + struct { + char *name; + } module; }; int fds[2]; struct audit_proctitle proctitle; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bb5f504592c6..bde3aac4deed 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1268,6 +1268,11 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_EXECVE: { audit_log_execve_info(context, &ab); break; } + case AUDIT_KERN_MODULE: + audit_log_format(ab, "name="); + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + break; } audit_log_end(ab); } @@ -2368,6 +2373,15 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +void __audit_log_kern_module(char *name) +{ + struct audit_context *context = current->audit_context; + + context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); + strcpy(context->module.name, name); + context->type = AUDIT_KERN_MODULE; +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid;
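Given the show_special() hunk above, the auxiliary record attached to the SYSCALL event comes out along these lines (timestamp, serial number and module name are illustrative):

	type=KERN_MODULE msg=audit(1486640836.392:124): name="dummy_module"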
#include "module-internal.h" @@ -3593,6 +3594,8 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + audit_log_kern_module(mod->name); + /* Reserve our place in the list. */ err = add_unformed_module(mod); if (err) @@ -3681,7 +3684,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->name, after_dashes); } - /* Link in to syfs. */ + /* Link in to sysfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto coming_cleanup; -- cgit v1.2.3 From 1ac5a404797523cedaf424a3aaa3cf8f9548dff8 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Fri, 10 Feb 2017 03:19:33 -0800 Subject: RDMA/bnxt_re: Add bnxt_re RoCE driver This patch introduces the RoCE driver for the Broadcom NetXtreme-E 10/25/40/50G RoCE HCAs. The RoCE driver is a two part driver that relies on the parent bnxt_en NIC driver to operate. The changes needed in the bnxt_en driver have already been incorporated via Dave Miller's net tree into the mainline kernel. The vendor official git repository for this driver is available on github as: https://github.com/Broadcom/linux-rdma-nxt/ Signed-off-by: Eddie Wai Signed-off-by: Devesh Sharma Signed-off-by: Somnath Kotur Signed-off-by: Sriharsha Basavapatna Signed-off-by: Selvin Xavier Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 146 ++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3202 ++++++++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 197 ++ drivers/infiniband/hw/bnxt_re/main.c | 1315 ++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 2167 +++++++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_fp.h | 439 ++++ drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 694 ++++++ drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 231 ++ drivers/infiniband/hw/bnxt_re/qplib_res.c | 825 +++++++ drivers/infiniband/hw/bnxt_re/qplib_res.h | 223 ++ drivers/infiniband/hw/bnxt_re/qplib_sp.c | 838 ++++++++ drivers/infiniband/hw/bnxt_re/qplib_sp.h | 160 ++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 2821 ++++++++++++++++++++++++ include/uapi/rdma/bnxt_re-abi.h | 89 + 14 files changed, 13347 insertions(+) create mode 100644 drivers/infiniband/hw/bnxt_re/bnxt_re.h create mode 100644 drivers/infiniband/hw/bnxt_re/ib_verbs.c create mode 100644 drivers/infiniband/hw/bnxt_re/ib_verbs.h create mode 100644 drivers/infiniband/hw/bnxt_re/main.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_fp.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_fp.h create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_rcfw.h create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_res.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_res.h create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_sp.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_sp.h create mode 100644 drivers/infiniband/hw/bnxt_re/roce_hsi.h create mode 100644 include/uapi/rdma/bnxt_re-abi.h (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h new file mode 100644 index 000000000000..ebf7be8d4139 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -0,0 +1,146 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Slow Path Operators (header) + * + */ + +#ifndef __BNXT_RE_H__ +#define __BNXT_RE_H__ +#define ROCE_DRV_MODULE_NAME "bnxt_re" +#define ROCE_DRV_MODULE_VERSION "1.0.0" + +#define BNXT_RE_DESC "Broadcom NetXtreme-C/E RoCE Driver" + +#define BNXT_RE_PAGE_SIZE_4K BIT(12) +#define BNXT_RE_PAGE_SIZE_8K BIT(13) +#define BNXT_RE_PAGE_SIZE_64K BIT(16) +#define BNXT_RE_PAGE_SIZE_2M BIT(21) +#define BNXT_RE_PAGE_SIZE_8M BIT(23) +#define BNXT_RE_PAGE_SIZE_1G BIT(30) + +#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) +#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) +#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) + +struct bnxt_re_work { + struct work_struct work; + unsigned long event; + struct bnxt_re_dev *rdev; + struct net_device *vlan_dev; +}; + +struct bnxt_re_sqp_entries { + struct bnxt_qplib_sge sge; + u64 wrid; + /* For storing the actual qp1 cqe */ + struct bnxt_qplib_cqe cqe; + struct bnxt_re_qp *qp1_qp; +}; + +#define BNXT_RE_MIN_MSIX 2 +#define BNXT_RE_MAX_MSIX 16 +#define BNXT_RE_AEQ_IDX 0 +#define BNXT_RE_NQ_IDX 1 + +struct bnxt_re_dev { + struct ib_device ibdev; + struct list_head list; + unsigned long flags; +#define BNXT_RE_FLAG_NETDEV_REGISTERED 0 +#define BNXT_RE_FLAG_IBDEV_REGISTERED 1 +#define BNXT_RE_FLAG_GOT_MSIX 2 +#define BNXT_RE_FLAG_RCFW_CHANNEL_EN 8 +#define BNXT_RE_FLAG_QOS_WORK_REG 16 + struct net_device *netdev; + unsigned int version, major, minor; + struct bnxt_en_dev *en_dev; + struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX]; + int num_msix; + + int id; + + struct delayed_work worker; + u8 cur_prio_map; + + /* FP Notification Queue (CQ & SRQ) */ + struct tasklet_struct nq_task; + + /* RCFW Channel */ + struct bnxt_qplib_rcfw rcfw; + + /* NQ */ + struct bnxt_qplib_nq nq; + + /* Device Resources */ + struct bnxt_qplib_dev_attr dev_attr; + struct bnxt_qplib_ctx qplib_ctx; + struct bnxt_qplib_res qplib_res; + struct bnxt_qplib_dpi dpi_privileged; + + atomic_t qp_count; + struct mutex qp_lock; /* protect qp list */ + struct list_head 
qp_list; + + atomic_t cq_count; + atomic_t srq_count; + atomic_t mr_count; + atomic_t mw_count; + /* Max of 2 lossless traffic classes supported per port */ + u16 cosq[2]; + + /* QP for handling QP1 packets */ + u32 sqp_id; + struct bnxt_re_qp *qp1_sqp; + struct bnxt_re_ah *sqp_ah; + struct bnxt_re_sqp_entries sqp_tbl[1024]; +}; + +#define to_bnxt_re_dev(ptr, member) \ + container_of((ptr), struct bnxt_re_dev, member) + +#define BNXT_RE_ROCE_V1_PACKET 0 +#define BNXT_RE_ROCEV2_IPV4_PACKET 2 +#define BNXT_RE_ROCEV2_IPV6_PACKET 3 + +static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) +{ + if (rdev) + return &rdev->ibdev.dev; + return NULL; +} + +#endif diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c new file mode 100644 index 000000000000..33af2e3de399 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -0,0 +1,3202 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + * Description: IB Verbs interpreter + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "bnxt_ulp.h" + +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" + +#include "bnxt_re.h" +#include "ib_verbs.h" +#include + +static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list, + struct bnxt_qplib_sge *sg_list, int num) +{ + int i, total = 0; + + for (i = 0; i < num; i++) { + sg_list[i].addr = ib_sg_list[i].addr; + sg_list[i].lkey = ib_sg_list[i].lkey; + sg_list[i].size = ib_sg_list[i].length; + total += sg_list[i].size; + } + return total; +} + +/* Device */ +struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct net_device *netdev = NULL; + + rcu_read_lock(); + if (rdev) + netdev = rdev->netdev; + if (netdev) + dev_hold(netdev); + + rcu_read_unlock(); + return netdev; +} + +int bnxt_re_query_device(struct ib_device *ibdev, + struct ib_device_attr *ib_attr, + struct ib_udata *udata) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + + memset(ib_attr, 0, sizeof(*ib_attr)); + + ib_attr->fw_ver = (u64)(unsigned long)(dev_attr->fw_ver); + bnxt_qplib_get_guid(rdev->netdev->dev_addr, + (u8 *)&ib_attr->sys_image_guid); + ib_attr->max_mr_size = ~0ull; + ib_attr->page_size_cap = BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_8K | + BNXT_RE_PAGE_SIZE_64K | BNXT_RE_PAGE_SIZE_2M | + BNXT_RE_PAGE_SIZE_8M | BNXT_RE_PAGE_SIZE_1G; + + ib_attr->vendor_id = rdev->en_dev->pdev->vendor; + ib_attr->vendor_part_id = rdev->en_dev->pdev->device; + ib_attr->hw_ver = rdev->en_dev->pdev->subsystem_device; + ib_attr->max_qp = dev_attr->max_qp; + ib_attr->max_qp_wr = dev_attr->max_qp_wqes; + ib_attr->device_cap_flags = + IB_DEVICE_CURR_QP_STATE_MOD + | IB_DEVICE_RC_RNR_NAK_GEN + | IB_DEVICE_SHUTDOWN_PORT + | IB_DEVICE_SYS_IMAGE_GUID + | IB_DEVICE_LOCAL_DMA_LKEY + | IB_DEVICE_RESIZE_MAX_WR + | IB_DEVICE_PORT_ACTIVE_EVENT + | IB_DEVICE_N_NOTIFY_CQ + | IB_DEVICE_MEM_WINDOW + | IB_DEVICE_MEM_WINDOW_TYPE_2B + | IB_DEVICE_MEM_MGT_EXTENSIONS; + ib_attr->max_sge = dev_attr->max_qp_sges; + ib_attr->max_sge_rd = dev_attr->max_qp_sges; + ib_attr->max_cq = dev_attr->max_cq; + ib_attr->max_cqe = dev_attr->max_cq_wqes; + ib_attr->max_mr = dev_attr->max_mr; + ib_attr->max_pd = dev_attr->max_pd; + ib_attr->max_qp_rd_atom = dev_attr->max_qp_rd_atom; + ib_attr->max_qp_init_rd_atom = dev_attr->max_qp_rd_atom; + ib_attr->atomic_cap = IB_ATOMIC_HCA; + ib_attr->masked_atomic_cap = IB_ATOMIC_HCA; + + ib_attr->max_ee_rd_atom = 0; + ib_attr->max_res_rd_atom = 0; + ib_attr->max_ee_init_rd_atom = 0; + ib_attr->max_ee = 0; + ib_attr->max_rdd = 0; + ib_attr->max_mw = dev_attr->max_mw; + ib_attr->max_raw_ipv6_qp = 0; + ib_attr->max_raw_ethy_qp = dev_attr->max_raw_ethy_qp; + ib_attr->max_mcast_grp = 0; + ib_attr->max_mcast_qp_attach = 0; + ib_attr->max_total_mcast_qp_attach = 0; + ib_attr->max_ah = dev_attr->max_ah; + + ib_attr->max_fmr = dev_attr->max_fmr; + ib_attr->max_map_per_fmr = 1; /* ? 
*/ + + ib_attr->max_srq = dev_attr->max_srq; + ib_attr->max_srq_wr = dev_attr->max_srq_wqes; + ib_attr->max_srq_sge = dev_attr->max_srq_sges; + + ib_attr->max_fast_reg_page_list_len = MAX_PBL_LVL_1_PGS; + + ib_attr->max_pkeys = 1; + ib_attr->local_ca_ack_delay = 0; + return 0; +} + +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + switch (device_modify_mask) { + case IB_DEVICE_MODIFY_SYS_IMAGE_GUID: + /* Modify the GUID requires the modification of the GID table */ + /* GUID should be made as READ-ONLY */ + break; + case IB_DEVICE_MODIFY_NODE_DESC: + /* Node Desc should be made as READ-ONLY */ + break; + default: + break; + } + return 0; +} + +static void __to_ib_speed_width(struct net_device *netdev, u8 *speed, u8 *width) +{ + struct ethtool_link_ksettings lksettings; + u32 espeed; + + if (netdev->ethtool_ops && netdev->ethtool_ops->get_link_ksettings) { + memset(&lksettings, 0, sizeof(lksettings)); + rtnl_lock(); + netdev->ethtool_ops->get_link_ksettings(netdev, &lksettings); + rtnl_unlock(); + espeed = lksettings.base.speed; + } else { + espeed = SPEED_UNKNOWN; + } + switch (espeed) { + case SPEED_1000: + *speed = IB_SPEED_SDR; + *width = IB_WIDTH_1X; + break; + case SPEED_10000: + *speed = IB_SPEED_QDR; + *width = IB_WIDTH_1X; + break; + case SPEED_20000: + *speed = IB_SPEED_DDR; + *width = IB_WIDTH_4X; + break; + case SPEED_25000: + *speed = IB_SPEED_EDR; + *width = IB_WIDTH_1X; + break; + case SPEED_40000: + *speed = IB_SPEED_QDR; + *width = IB_WIDTH_4X; + break; + case SPEED_50000: + break; + default: + *speed = IB_SPEED_SDR; + *width = IB_WIDTH_1X; + break; + } +} + +/* Port */ +int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, + struct ib_port_attr *port_attr) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + + memset(port_attr, 0, sizeof(*port_attr)); + + if (netif_running(rdev->netdev) && netif_carrier_ok(rdev->netdev)) { + port_attr->state = IB_PORT_ACTIVE; + port_attr->phys_state = 5; + } else { + port_attr->state = IB_PORT_DOWN; + port_attr->phys_state = 3; + } + port_attr->max_mtu = IB_MTU_4096; + port_attr->active_mtu = iboe_get_mtu(rdev->netdev->mtu); + port_attr->gid_tbl_len = dev_attr->max_sgid; + port_attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | + IB_PORT_IP_BASED_GIDS; + + /* Max MSG size set to 2G for now */ + port_attr->max_msg_sz = 0x80000000; + port_attr->bad_pkey_cntr = 0; + port_attr->qkey_viol_cntr = 0; + port_attr->pkey_tbl_len = dev_attr->max_pkey; + port_attr->lid = 0; + port_attr->sm_lid = 0; + port_attr->lmc = 0; + port_attr->max_vl_num = 4; + port_attr->sm_sl = 0; + port_attr->subnet_timeout = 0; + port_attr->init_type_reply = 0; + /* call the underlying netdev's ethtool hooks to query speed settings + * for which we acquire rtnl_lock _only_ if it's registered with + * IB stack to avoid race in the NETDEV_UNREG path + */ + if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) + __to_ib_speed_width(rdev->netdev, &port_attr->active_speed, + &port_attr->active_width); + return 0; +} + +int bnxt_re_modify_port(struct ib_device *ibdev, u8 port_num, + int port_modify_mask, + struct ib_port_modify *port_modify) +{ + switch (port_modify_mask) { + case IB_PORT_SHUTDOWN: + break; + case IB_PORT_INIT_TYPE: + break; + case IB_PORT_RESET_QKEY_CNTR: + break; + default: + break; + } + return 0; +} + +int bnxt_re_get_port_immutable(struct 
ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr port_attr; + + if (bnxt_re_query_port(ibdev, port_num, &port_attr)) + return -EINVAL; + + immutable->pkey_tbl_len = port_attr.pkey_tbl_len; + immutable->gid_tbl_len = port_attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + immutable->core_cap_flags |= RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + return 0; +} + +int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, + u16 index, u16 *pkey) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + + /* Ignore port_num */ + + memset(pkey, 0, sizeof(*pkey)); + return bnxt_qplib_get_pkey(&rdev->qplib_res, + &rdev->qplib_res.pkey_tbl, index, pkey); +} + +int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, + int index, union ib_gid *gid) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + int rc = 0; + + /* Ignore port_num */ + memset(gid, 0, sizeof(*gid)); + rc = bnxt_qplib_get_sgid(&rdev->qplib_res, + &rdev->qplib_res.sgid_tbl, index, + (struct bnxt_qplib_gid *)gid); + return rc; +} + +int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, + unsigned int index, void **context) +{ + int rc = 0; + struct bnxt_re_gid_ctx *ctx, **ctx_tbl; + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; + + /* Delete the entry from the hardware */ + ctx = *context; + if (!ctx) + return -EINVAL; + + if (sgid_tbl && sgid_tbl->active) { + if (ctx->idx >= sgid_tbl->max) + return -EINVAL; + ctx->refcnt--; + if (!ctx->refcnt) { + rc = bnxt_qplib_del_sgid + (sgid_tbl, + &sgid_tbl->tbl[ctx->idx], true); + if (rc) + dev_err(rdev_to_dev(rdev), + "Failed to remove GID: %#x", rc); + ctx_tbl = sgid_tbl->ctx; + ctx_tbl[ctx->idx] = NULL; + kfree(ctx); + } + } else { + return -EINVAL; + } + return rc; +} + +int bnxt_re_add_gid(struct ib_device *ibdev, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context) +{ + int rc; + u32 tbl_idx = 0; + u16 vlan_id = 0xFFFF; + struct bnxt_re_gid_ctx *ctx, **ctx_tbl; + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; + + if ((attr->ndev) && is_vlan_dev(attr->ndev)) + vlan_id = vlan_dev_vlan_id(attr->ndev); + + rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)gid, + rdev->qplib_res.netdev->dev_addr, + vlan_id, true, &tbl_idx); + if (rc == -EALREADY) { + ctx_tbl = sgid_tbl->ctx; + ctx_tbl[tbl_idx]->refcnt++; + *context = ctx_tbl[tbl_idx]; + return 0; + } + + if (rc < 0) { + dev_err(rdev_to_dev(rdev), "Failed to add GID: %#x", rc); + return rc; + } + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx_tbl = sgid_tbl->ctx; + ctx->idx = tbl_idx; + ctx->refcnt = 1; + ctx_tbl[tbl_idx] = ctx; + + return rc; +} + +enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +/* Protection Domains */ +int bnxt_re_dealloc_pd(struct ib_pd *ib_pd) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + int rc; + + if (ib_pd->uobject && pd->dpi.dbr) { + struct ib_ucontext *ib_uctx = ib_pd->uobject->context; + struct bnxt_re_ucontext *ucntx; + + /* Free DPI only if this is the first PD allocated by the + * application and mark the context dpi as NULL + */ + ucntx = container_of(ib_uctx, struct 
bnxt_re_ucontext, ib_uctx); + + rc = bnxt_qplib_dealloc_dpi(&rdev->qplib_res, + &rdev->qplib_res.dpi_tbl, + &pd->dpi); + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to deallocate HW DPI"); + /* Don't fail, continue */ + ucntx->dpi = NULL; + } + + rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res, + &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD"); + return rc; + } + + kfree(pd); + return 0; +} + +struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + struct ib_udata *udata) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_re_ucontext *ucntx = container_of(ucontext, + struct bnxt_re_ucontext, + ib_uctx); + struct bnxt_re_pd *pd; + int rc; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->rdev = rdev; + if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) { + dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD"); + rc = -ENOMEM; + goto fail; + } + + if (udata) { + struct bnxt_re_pd_resp resp; + + if (!ucntx->dpi) { + /* Allocate DPI in alloc_pd to avoid failing of + * ibv_devinfo and family of application when DPIs + * are depleted. + */ + if (bnxt_qplib_alloc_dpi(&rdev->qplib_res.dpi_tbl, + &pd->dpi, ucntx)) { + rc = -ENOMEM; + goto dbfail; + } + ucntx->dpi = &pd->dpi; + } + + resp.pdid = pd->qplib_pd.id; + /* Still allow mapping this DBR to the new user PD. */ + resp.dpi = ucntx->dpi->dpi; + resp.dbr = (u64)ucntx->dpi->umdbr; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to copy user response\n"); + goto dbfail; + } + } + + return &pd->ib_pd; +dbfail: + (void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); +fail: + kfree(pd); + return ERR_PTR(rc); +} + +/* Address Handles */ +int bnxt_re_destroy_ah(struct ib_ah *ib_ah) +{ + struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah); + struct bnxt_re_dev *rdev = ah->rdev; + int rc; + + rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to destroy HW AH"); + return rc; + } + kfree(ah); + return 0; +} + +struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd, + struct ib_ah_attr *ah_attr, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_ah *ah; + int rc; + u16 vlan_tag; + u8 nw_type; + + struct ib_gid_attr sgid_attr; + + if (!(ah_attr->ah_flags & IB_AH_GRH)) { + dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set"); + return ERR_PTR(-EINVAL); + } + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->rdev = rdev; + ah->qplib_ah.pd = &pd->qplib_pd; + + /* Supply the configuration for the HW */ + memcpy(ah->qplib_ah.dgid.data, ah_attr->grh.dgid.raw, + sizeof(union ib_gid)); + /* + * If RoCE V2 is enabled, stack will have two entries for + * each GID entry. Avoiding this duplicate entry in HW.
Dividing + * the GID index by 2 for RoCE V2 + */ + ah->qplib_ah.sgid_index = ah_attr->grh.sgid_index / 2; + ah->qplib_ah.host_sgid_index = ah_attr->grh.sgid_index; + ah->qplib_ah.traffic_class = ah_attr->grh.traffic_class; + ah->qplib_ah.flow_label = ah_attr->grh.flow_label; + ah->qplib_ah.hop_limit = ah_attr->grh.hop_limit; + ah->qplib_ah.sl = ah_attr->sl; + if (ib_pd->uobject && + !rdma_is_multicast_addr((struct in6_addr *) + ah_attr->grh.dgid.raw) && + !rdma_link_local_addr((struct in6_addr *) + ah_attr->grh.dgid.raw)) { + union ib_gid sgid; + + rc = ib_get_cached_gid(&rdev->ibdev, 1, + ah_attr->grh.sgid_index, &sgid, + &sgid_attr); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to query gid at index %d", + ah_attr->grh.sgid_index); + goto fail; + } + if (sgid_attr.ndev) { + if (is_vlan_dev(sgid_attr.ndev)) + vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev); + dev_put(sgid_attr.ndev); + } + /* Get network header type for this GID */ + nw_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + switch (nw_type) { + case RDMA_NETWORK_IPV4: + ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4; + break; + case RDMA_NETWORK_IPV6: + ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6; + break; + default: + ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V1; + break; + } + rc = rdma_addr_find_l2_eth_by_grh(&sgid, &ah_attr->grh.dgid, + ah_attr->dmac, &vlan_tag, + &sgid_attr.ndev->ifindex, + NULL); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to get dmac\n"); + goto fail; + } + } + + memcpy(ah->qplib_ah.dmac, ah_attr->dmac, ETH_ALEN); + rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to allocate HW AH"); + goto fail; + } + + /* Write AVID to shared page. */ + if (ib_pd->uobject) { + struct ib_ucontext *ib_uctx = ib_pd->uobject->context; + struct bnxt_re_ucontext *uctx; + unsigned long flag; + u32 *wrptr; + + uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); + spin_lock_irqsave(&uctx->sh_lock, flag); + wrptr = (u32 *)(uctx->shpg + BNXT_RE_AVID_OFFT); + *wrptr = ah->qplib_ah.id; + wmb(); /* make sure cache is updated. 
*/ + spin_unlock_irqrestore(&uctx->sh_lock, flag); + } + + return &ah->ib_ah; + +fail: + kfree(ah); + return ERR_PTR(rc); +} + +int bnxt_re_modify_ah(struct ib_ah *ib_ah, struct ib_ah_attr *ah_attr) +{ + return 0; +} + +int bnxt_re_query_ah(struct ib_ah *ib_ah, struct ib_ah_attr *ah_attr) +{ + struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah); + + memcpy(ah_attr->grh.dgid.raw, ah->qplib_ah.dgid.data, + sizeof(union ib_gid)); + ah_attr->grh.sgid_index = ah->qplib_ah.host_sgid_index; + ah_attr->grh.traffic_class = ah->qplib_ah.traffic_class; + ah_attr->sl = ah->qplib_ah.sl; + memcpy(ah_attr->dmac, ah->qplib_ah.dmac, ETH_ALEN); + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->port_num = 1; + ah_attr->static_rate = 0; + return 0; +} + +/* Queue Pairs */ +int bnxt_re_destroy_qp(struct ib_qp *ib_qp) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_re_dev *rdev = qp->rdev; + int rc; + + rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to destroy HW QP"); + return rc; + } + if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) { + rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, + &rdev->sqp_ah->qplib_ah); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to destroy HW AH for shadow QP"); + return rc; + } + + rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, + &rdev->qp1_sqp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to destroy Shadow QP"); + return rc; + } + mutex_lock(&rdev->qp_lock); + list_del(&rdev->qp1_sqp->list); + atomic_dec(&rdev->qp_count); + mutex_unlock(&rdev->qp_lock); + + kfree(rdev->sqp_ah); + kfree(rdev->qp1_sqp); + } + + if (qp->rumem && !IS_ERR(qp->rumem)) + ib_umem_release(qp->rumem); + if (qp->sumem && !IS_ERR(qp->sumem)) + ib_umem_release(qp->sumem); + + mutex_lock(&rdev->qp_lock); + list_del(&qp->list); + atomic_dec(&rdev->qp_count); + mutex_unlock(&rdev->qp_lock); + kfree(qp); + return 0; +} + +static u8 __from_ib_qp_type(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_GSI: + return CMDQ_CREATE_QP1_TYPE_GSI; + case IB_QPT_RC: + return CMDQ_CREATE_QP_TYPE_RC; + case IB_QPT_UD: + return CMDQ_CREATE_QP_TYPE_UD; + default: + return IB_QPT_MAX; + } +} + +static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, + struct bnxt_re_qp *qp, struct ib_udata *udata) +{ + struct bnxt_re_qp_req ureq; + struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp; + struct ib_umem *umem; + int bytes = 0; + struct ib_ucontext *context = pd->ib_pd.uobject->context; + struct bnxt_re_ucontext *cntx = container_of(context, + struct bnxt_re_ucontext, + ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return -EFAULT; + + bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE); + /* Consider mapping PSN search memory only for RC QPs. 
*/ + if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) + bytes += (qplib_qp->sq.max_wqe * sizeof(struct sq_psn_search)); + bytes = PAGE_ALIGN(bytes); + umem = ib_umem_get(context, ureq.qpsva, bytes, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) + return PTR_ERR(umem); + + qp->sumem = umem; + qplib_qp->sq.sglist = umem->sg_head.sgl; + qplib_qp->sq.nmap = umem->nmap; + qplib_qp->qp_handle = ureq.qp_handle; + + if (!qp->qplib_qp.srq) { + bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + bytes = PAGE_ALIGN(bytes); + umem = ib_umem_get(context, ureq.qprva, bytes, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) + goto rqfail; + qp->rumem = umem; + qplib_qp->rq.sglist = umem->sg_head.sgl; + qplib_qp->rq.nmap = umem->nmap; + } + + qplib_qp->dpi = cntx->dpi; + return 0; +rqfail: + ib_umem_release(qp->sumem); + qp->sumem = NULL; + qplib_qp->sq.sglist = NULL; + qplib_qp->sq.nmap = 0; + + return PTR_ERR(umem); +} + +static struct bnxt_re_ah *bnxt_re_create_shadow_qp_ah + (struct bnxt_re_pd *pd, + struct bnxt_qplib_res *qp1_res, + struct bnxt_qplib_qp *qp1_qp) +{ + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_ah *ah; + union ib_gid sgid; + int rc; + + ah = kzalloc(sizeof(*ah), GFP_KERNEL); + if (!ah) + return NULL; + + memset(ah, 0, sizeof(*ah)); + ah->rdev = rdev; + ah->qplib_ah.pd = &pd->qplib_pd; + + rc = bnxt_re_query_gid(&rdev->ibdev, 1, 0, &sgid); + if (rc) + goto fail; + + /* supply the dgid data same as sgid */ + memcpy(ah->qplib_ah.dgid.data, &sgid.raw, + sizeof(union ib_gid)); + ah->qplib_ah.sgid_index = 0; + + ah->qplib_ah.traffic_class = 0; + ah->qplib_ah.flow_label = 0; + ah->qplib_ah.hop_limit = 1; + ah->qplib_ah.sl = 0; + /* Have DMAC same as SMAC */ + ether_addr_copy(ah->qplib_ah.dmac, rdev->netdev->dev_addr); + + rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to allocate HW AH for Shadow QP"); + goto fail; + } + + return ah; + +fail: + kfree(ah); + return NULL; +} + +static struct bnxt_re_qp *bnxt_re_create_shadow_qp + (struct bnxt_re_pd *pd, + struct bnxt_qplib_res *qp1_res, + struct bnxt_qplib_qp *qp1_qp) +{ + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_qp *qp; + int rc; + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return NULL; + + memset(qp, 0, sizeof(*qp)); + qp->rdev = rdev; + + /* Initialize the shadow QP structure from the QP1 values */ + ether_addr_copy(qp->qplib_qp.smac, rdev->netdev->dev_addr); + + qp->qplib_qp.pd = &pd->qplib_pd; + qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp); + qp->qplib_qp.type = IB_QPT_UD; + + qp->qplib_qp.max_inline_data = 0; + qp->qplib_qp.sig_type = true; + + /* Shadow QP SQ depth should be same as QP1 RQ depth */ + qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.sq.max_sge = 2; + + qp->qplib_qp.scq = qp1_qp->scq; + qp->qplib_qp.rcq = qp1_qp->rcq; + + qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge; + + qp->qplib_qp.mtu = qp1_qp->mtu; + + qp->qplib_qp.sq_hdr_buf_size = 0; + qp->qplib_qp.rq_hdr_buf_size = BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV6; + qp->qplib_qp.dpi = &rdev->dpi_privileged; + + rc = bnxt_qplib_create_qp(qp1_res, &qp->qplib_qp); + if (rc) + goto fail; + + rdev->sqp_id = qp->qplib_qp.id; + + spin_lock_init(&qp->sq_lock); + INIT_LIST_HEAD(&qp->list); + mutex_lock(&rdev->qp_lock); + list_add_tail(&qp->list, &rdev->qp_list); + atomic_inc(&rdev->qp_count); + mutex_unlock(&rdev->qp_lock); + return qp; +fail: + kfree(qp); + return NULL; +} + +struct ib_qp 
*bnxt_re_create_qp(struct ib_pd *ib_pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_qp *qp; + struct bnxt_re_cq *cq; + int rc, entries; + + if ((qp_init_attr->cap.max_send_wr > dev_attr->max_qp_wqes) || + (qp_init_attr->cap.max_recv_wr > dev_attr->max_qp_wqes) || + (qp_init_attr->cap.max_send_sge > dev_attr->max_qp_sges) || + (qp_init_attr->cap.max_recv_sge > dev_attr->max_qp_sges) || + (qp_init_attr->cap.max_inline_data > dev_attr->max_inline_data)) + return ERR_PTR(-EINVAL); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->rdev = rdev; + ether_addr_copy(qp->qplib_qp.smac, rdev->netdev->dev_addr); + qp->qplib_qp.pd = &pd->qplib_pd; + qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp); + qp->qplib_qp.type = __from_ib_qp_type(qp_init_attr->qp_type); + if (qp->qplib_qp.type == IB_QPT_MAX) { + dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported", + qp->qplib_qp.type); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.max_inline_data = qp_init_attr->cap.max_inline_data; + qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type == + IB_SIGNAL_ALL_WR) ? true : false); + + entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1); + qp->qplib_qp.sq.max_wqe = min_t(u32, entries, + dev_attr->max_qp_wqes + 1); + + qp->qplib_qp.sq.max_sge = qp_init_attr->cap.max_send_sge; + if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges) + qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges; + + if (qp_init_attr->send_cq) { + cq = container_of(qp_init_attr->send_cq, struct bnxt_re_cq, + ib_cq); + if (!cq) { + dev_err(rdev_to_dev(rdev), "Send CQ not found"); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.scq = &cq->qplib_cq; + } + + if (qp_init_attr->recv_cq) { + cq = container_of(qp_init_attr->recv_cq, struct bnxt_re_cq, + ib_cq); + if (!cq) { + dev_err(rdev_to_dev(rdev), "Receive CQ not found"); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.rcq = &cq->qplib_cq; + } + + if (qp_init_attr->srq) { + dev_err(rdev_to_dev(rdev), "SRQ not supported"); + rc = -ENOTSUPP; + goto fail; + } else { + /* Allocate 1 more than what's provided so posting max doesn't + * mean empty + */ + entries = roundup_pow_of_two(qp_init_attr->cap.max_recv_wr + 1); + qp->qplib_qp.rq.max_wqe = min_t(u32, entries, + dev_attr->max_qp_wqes + 1); + + qp->qplib_qp.rq.max_sge = qp_init_attr->cap.max_recv_sge; + if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges) + qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges; + } + + qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); + + if (qp_init_attr->qp_type == IB_QPT_GSI) { + qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges; + if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges) + qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges; + qp->qplib_qp.sq.max_sge++; + if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges) + qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges; + + qp->qplib_qp.rq_hdr_buf_size = + BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2; + + qp->qplib_qp.sq_hdr_buf_size = + BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2; + qp->qplib_qp.dpi = &rdev->dpi_privileged; + rc = bnxt_qplib_create_qp1(&rdev->qplib_res, &qp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to create HW QP1"); + goto fail; + } + /* Create a shadow QP to handle the QP1 traffic */ + rdev->qp1_sqp = bnxt_re_create_shadow_qp(pd, &rdev->qplib_res, + &qp->qplib_qp); + 
if (!rdev->qp1_sqp) { + rc = -EINVAL; + dev_err(rdev_to_dev(rdev), + "Failed to create Shadow QP for QP1"); + goto qp_destroy; + } + rdev->sqp_ah = bnxt_re_create_shadow_qp_ah(pd, &rdev->qplib_res, + &qp->qplib_qp); + if (!rdev->sqp_ah) { + bnxt_qplib_destroy_qp(&rdev->qplib_res, + &rdev->qp1_sqp->qplib_qp); + rc = -EINVAL; + dev_err(rdev_to_dev(rdev), + "Failed to create AH entry for ShadowQP"); + goto qp_destroy; + } + + } else { + qp->qplib_qp.max_rd_atomic = dev_attr->max_qp_rd_atom; + qp->qplib_qp.max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom; + if (udata) { + rc = bnxt_re_init_user_qp(rdev, pd, qp, udata); + if (rc) + goto fail; + } else { + qp->qplib_qp.dpi = &rdev->dpi_privileged; + } + + rc = bnxt_qplib_create_qp(&rdev->qplib_res, &qp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to create HW QP"); + goto fail; + } + } + + qp->ib_qp.qp_num = qp->qplib_qp.id; + spin_lock_init(&qp->sq_lock); + + if (udata) { + struct bnxt_re_qp_resp resp; + + resp.qpid = qp->ib_qp.qp_num; + resp.rsvd = 0; + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to copy QP udata"); + goto qp_destroy; + } + } + INIT_LIST_HEAD(&qp->list); + mutex_lock(&rdev->qp_lock); + list_add_tail(&qp->list, &rdev->qp_list); + atomic_inc(&rdev->qp_count); + mutex_unlock(&rdev->qp_lock); + + return &qp->ib_qp; +qp_destroy: + bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); +fail: + kfree(qp); + return ERR_PTR(rc); +} + +static u8 __from_ib_qp_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return CMDQ_MODIFY_QP_NEW_STATE_RESET; + case IB_QPS_INIT: + return CMDQ_MODIFY_QP_NEW_STATE_INIT; + case IB_QPS_RTR: + return CMDQ_MODIFY_QP_NEW_STATE_RTR; + case IB_QPS_RTS: + return CMDQ_MODIFY_QP_NEW_STATE_RTS; + case IB_QPS_SQD: + return CMDQ_MODIFY_QP_NEW_STATE_SQD; + case IB_QPS_SQE: + return CMDQ_MODIFY_QP_NEW_STATE_SQE; + case IB_QPS_ERR: + default: + return CMDQ_MODIFY_QP_NEW_STATE_ERR; + } +} + +static enum ib_qp_state __to_ib_qp_state(u8 state) +{ + switch (state) { + case CMDQ_MODIFY_QP_NEW_STATE_RESET: + return IB_QPS_RESET; + case CMDQ_MODIFY_QP_NEW_STATE_INIT: + return IB_QPS_INIT; + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + return IB_QPS_RTR; + case CMDQ_MODIFY_QP_NEW_STATE_RTS: + return IB_QPS_RTS; + case CMDQ_MODIFY_QP_NEW_STATE_SQD: + return IB_QPS_SQD; + case CMDQ_MODIFY_QP_NEW_STATE_SQE: + return IB_QPS_SQE; + case CMDQ_MODIFY_QP_NEW_STATE_ERR: + default: + return IB_QPS_ERR; + } +} + +static u32 __from_ib_mtu(enum ib_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_256; + case IB_MTU_512: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_512; + case IB_MTU_1024: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_1024; + case IB_MTU_2048: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; + case IB_MTU_4096: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_4096; + default: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; + } +} + +static enum ib_mtu __to_ib_mtu(u32 mtu) +{ + switch (mtu & CREQ_QUERY_QP_RESP_SB_PATH_MTU_MASK) { + case CMDQ_MODIFY_QP_PATH_MTU_MTU_256: + return IB_MTU_256; + case CMDQ_MODIFY_QP_PATH_MTU_MTU_512: + return IB_MTU_512; + case CMDQ_MODIFY_QP_PATH_MTU_MTU_1024: + return IB_MTU_1024; + case CMDQ_MODIFY_QP_PATH_MTU_MTU_2048: + return IB_MTU_2048; + case CMDQ_MODIFY_QP_PATH_MTU_MTU_4096: + return IB_MTU_4096; + default: + return IB_MTU_2048; + } +} + +static int __from_ib_access_flags(int iflags) +{ + int qflags = 0; + + if (iflags & IB_ACCESS_LOCAL_WRITE) + qflags |= BNXT_QPLIB_ACCESS_LOCAL_WRITE; + if (iflags 
& IB_ACCESS_REMOTE_READ) + qflags |= BNXT_QPLIB_ACCESS_REMOTE_READ; + if (iflags & IB_ACCESS_REMOTE_WRITE) + qflags |= BNXT_QPLIB_ACCESS_REMOTE_WRITE; + if (iflags & IB_ACCESS_REMOTE_ATOMIC) + qflags |= BNXT_QPLIB_ACCESS_REMOTE_ATOMIC; + if (iflags & IB_ACCESS_MW_BIND) + qflags |= BNXT_QPLIB_ACCESS_MW_BIND; + if (iflags & IB_ZERO_BASED) + qflags |= BNXT_QPLIB_ACCESS_ZERO_BASED; + if (iflags & IB_ACCESS_ON_DEMAND) + qflags |= BNXT_QPLIB_ACCESS_ON_DEMAND; + return qflags; +}; + +static enum ib_access_flags __to_ib_access_flags(int qflags) +{ + enum ib_access_flags iflags = 0; + + if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE) + iflags |= IB_ACCESS_LOCAL_WRITE; + if (qflags & BNXT_QPLIB_ACCESS_REMOTE_WRITE) + iflags |= IB_ACCESS_REMOTE_WRITE; + if (qflags & BNXT_QPLIB_ACCESS_REMOTE_READ) + iflags |= IB_ACCESS_REMOTE_READ; + if (qflags & BNXT_QPLIB_ACCESS_REMOTE_ATOMIC) + iflags |= IB_ACCESS_REMOTE_ATOMIC; + if (qflags & BNXT_QPLIB_ACCESS_MW_BIND) + iflags |= IB_ACCESS_MW_BIND; + if (qflags & BNXT_QPLIB_ACCESS_ZERO_BASED) + iflags |= IB_ZERO_BASED; + if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND) + iflags |= IB_ACCESS_ON_DEMAND; + return iflags; +}; + +static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev, + struct bnxt_re_qp *qp1_qp, + int qp_attr_mask) +{ + struct bnxt_re_qp *qp = rdev->qp1_sqp; + int rc = 0; + + if (qp_attr_mask & IB_QP_STATE) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE; + qp->qplib_qp.state = qp1_qp->qplib_qp.state; + } + if (qp_attr_mask & IB_QP_PKEY_INDEX) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; + qp->qplib_qp.pkey_index = qp1_qp->qplib_qp.pkey_index; + } + + if (qp_attr_mask & IB_QP_QKEY) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; + /* Using a Random QKEY */ + qp->qplib_qp.qkey = 0x81818181; + } + if (qp_attr_mask & IB_QP_SQ_PSN) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_SQ_PSN; + qp->qplib_qp.sq.psn = qp1_qp->qplib_qp.sq.psn; + } + + rc = bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp); + if (rc) + dev_err(rdev_to_dev(rdev), + "Failed to modify Shadow QP for QP1"); + return rc; +} + +int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_re_dev *rdev = qp->rdev; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + enum ib_qp_state curr_qp_state, new_qp_state; + int rc, entries; + int status; + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + u8 nw_type; + + qp->qplib_qp.modify_flags = 0; + if (qp_attr_mask & IB_QP_STATE) { + curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); + new_qp_state = qp_attr->qp_state; + if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state, + ib_qp->qp_type, qp_attr_mask, + IB_LINK_LAYER_ETHERNET)) { + dev_err(rdev_to_dev(rdev), + "Invalid attribute mask: %#x specified ", + qp_attr_mask); + dev_err(rdev_to_dev(rdev), + "for qpn: %#x type: %#x", + ib_qp->qp_num, ib_qp->qp_type); + dev_err(rdev_to_dev(rdev), + "curr_qp_state=0x%x, new_qp_state=0x%x\n", + curr_qp_state, new_qp_state); + return -EINVAL; + } + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE; + qp->qplib_qp.state = __from_ib_qp_state(qp_attr->qp_state); + } + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_EN_SQD_ASYNC_NOTIFY; + qp->qplib_qp.en_sqd_async_notify = true; + } + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + qp->qplib_qp.modify_flags |= 
CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS; + qp->qplib_qp.access = + __from_ib_access_flags(qp_attr->qp_access_flags); + /* LOCAL_WRITE access must be set to allow RC receive */ + qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE; + } + if (qp_attr_mask & IB_QP_PKEY_INDEX) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; + qp->qplib_qp.pkey_index = qp_attr->pkey_index; + } + if (qp_attr_mask & IB_QP_QKEY) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; + qp->qplib_qp.qkey = qp_attr->qkey; + } + if (qp_attr_mask & IB_QP_AV) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_DGID | + CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL | + CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX | + CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT | + CMDQ_MODIFY_QP_MODIFY_MASK_TRAFFIC_CLASS | + CMDQ_MODIFY_QP_MODIFY_MASK_DEST_MAC | + CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID; + memcpy(qp->qplib_qp.ah.dgid.data, qp_attr->ah_attr.grh.dgid.raw, + sizeof(qp->qplib_qp.ah.dgid.data)); + qp->qplib_qp.ah.flow_label = qp_attr->ah_attr.grh.flow_label; + /* If RoCE V2 is enabled, stack will have two entries for + * each GID entry. Avoiding this duplicate entry in HW. Dividing + * the GID index by 2 for RoCE V2 + */ + qp->qplib_qp.ah.sgid_index = + qp_attr->ah_attr.grh.sgid_index / 2; + qp->qplib_qp.ah.host_sgid_index = + qp_attr->ah_attr.grh.sgid_index; + qp->qplib_qp.ah.hop_limit = qp_attr->ah_attr.grh.hop_limit; + qp->qplib_qp.ah.traffic_class = + qp_attr->ah_attr.grh.traffic_class; + qp->qplib_qp.ah.sl = qp_attr->ah_attr.sl; + ether_addr_copy(qp->qplib_qp.ah.dmac, qp_attr->ah_attr.dmac); + + status = ib_get_cached_gid(&rdev->ibdev, 1, + qp_attr->ah_attr.grh.sgid_index, + &sgid, &sgid_attr); + if (!status && sgid_attr.ndev) { + memcpy(qp->qplib_qp.smac, sgid_attr.ndev->dev_addr, + ETH_ALEN); + dev_put(sgid_attr.ndev); + nw_type = ib_gid_to_network_type(sgid_attr.gid_type, + &sgid); + switch (nw_type) { + case RDMA_NETWORK_IPV4: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV4; + break; + case RDMA_NETWORK_IPV6: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV6; + break; + default: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV1; + break; + } + } + } + + if (qp_attr_mask & IB_QP_PATH_MTU) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); + } else if (qp_attr->qp_state == IB_QPS_RTR) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = + __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); + } + + if (qp_attr_mask & IB_QP_TIMEOUT) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_TIMEOUT; + qp->qplib_qp.timeout = qp_attr->timeout; + } + if (qp_attr_mask & IB_QP_RETRY_CNT) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_RETRY_CNT; + qp->qplib_qp.retry_cnt = qp_attr->retry_cnt; + } + if (qp_attr_mask & IB_QP_RNR_RETRY) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_RNR_RETRY; + qp->qplib_qp.rnr_retry = qp_attr->rnr_retry; + } + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER; + qp->qplib_qp.min_rnr_timer = qp_attr->min_rnr_timer; + } + if (qp_attr_mask & IB_QP_RQ_PSN) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_RQ_PSN; + qp->qplib_qp.rq.psn = qp_attr->rq_psn; + } + if (qp_attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MAX_RD_ATOMIC; + qp->qplib_qp.max_rd_atomic =
qp_attr->max_rd_atomic; + } + if (qp_attr_mask & IB_QP_SQ_PSN) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_SQ_PSN; + qp->qplib_qp.sq.psn = qp_attr->sq_psn; + } + if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MAX_DEST_RD_ATOMIC; + qp->qplib_qp.max_dest_rd_atomic = qp_attr->max_dest_rd_atomic; + } + if (qp_attr_mask & IB_QP_CAP) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_SQ_SIZE | + CMDQ_MODIFY_QP_MODIFY_MASK_RQ_SIZE | + CMDQ_MODIFY_QP_MODIFY_MASK_SQ_SGE | + CMDQ_MODIFY_QP_MODIFY_MASK_RQ_SGE | + CMDQ_MODIFY_QP_MODIFY_MASK_MAX_INLINE_DATA; + if ((qp_attr->cap.max_send_wr >= dev_attr->max_qp_wqes) || + (qp_attr->cap.max_recv_wr >= dev_attr->max_qp_wqes) || + (qp_attr->cap.max_send_sge >= dev_attr->max_qp_sges) || + (qp_attr->cap.max_recv_sge >= dev_attr->max_qp_sges) || + (qp_attr->cap.max_inline_data >= + dev_attr->max_inline_data)) { + dev_err(rdev_to_dev(rdev), + "Modify QP failed - max exceeded"); + return -EINVAL; + } + entries = roundup_pow_of_two(qp_attr->cap.max_send_wr); + qp->qplib_qp.sq.max_wqe = min_t(u32, entries, + dev_attr->max_qp_wqes + 1); + qp->qplib_qp.sq.max_sge = qp_attr->cap.max_send_sge; + if (qp->qplib_qp.rq.max_wqe) { + entries = roundup_pow_of_two(qp_attr->cap.max_recv_wr); + qp->qplib_qp.rq.max_wqe = + min_t(u32, entries, dev_attr->max_qp_wqes + 1); + qp->qplib_qp.rq.max_sge = qp_attr->cap.max_recv_sge; + } else { + /* SRQ was used prior, just ignore the RQ caps */ + } + } + if (qp_attr_mask & IB_QP_DEST_QPN) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID; + qp->qplib_qp.dest_qpn = qp_attr->dest_qp_num; + } + rc = bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to modify HW QP"); + return rc; + } + if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) + rc = bnxt_re_modify_shadow_qp(rdev, qp, qp_attr_mask); + return rc; +} + +int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_re_dev *rdev = qp->rdev; + struct bnxt_qplib_qp qplib_qp; + int rc; + + memset(&qplib_qp, 0, sizeof(struct bnxt_qplib_qp)); + qplib_qp.id = qp->qplib_qp.id; + qplib_qp.ah.host_sgid_index = qp->qplib_qp.ah.host_sgid_index; + + rc = bnxt_qplib_query_qp(&rdev->qplib_res, &qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to query HW QP"); + return rc; + } + qp_attr->qp_state = __to_ib_qp_state(qplib_qp.state); + qp_attr->en_sqd_async_notify = qplib_qp.en_sqd_async_notify ?
1 : 0; + qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp.access); + qp_attr->pkey_index = qplib_qp.pkey_index; + qp_attr->qkey = qplib_qp.qkey; + memcpy(qp_attr->ah_attr.grh.dgid.raw, qplib_qp.ah.dgid.data, + sizeof(qplib_qp.ah.dgid.data)); + qp_attr->ah_attr.grh.flow_label = qplib_qp.ah.flow_label; + qp_attr->ah_attr.grh.sgid_index = qplib_qp.ah.host_sgid_index; + qp_attr->ah_attr.grh.hop_limit = qplib_qp.ah.hop_limit; + qp_attr->ah_attr.grh.traffic_class = qplib_qp.ah.traffic_class; + qp_attr->ah_attr.sl = qplib_qp.ah.sl; + ether_addr_copy(qp_attr->ah_attr.dmac, qplib_qp.ah.dmac); + qp_attr->path_mtu = __to_ib_mtu(qplib_qp.path_mtu); + qp_attr->timeout = qplib_qp.timeout; + qp_attr->retry_cnt = qplib_qp.retry_cnt; + qp_attr->rnr_retry = qplib_qp.rnr_retry; + qp_attr->min_rnr_timer = qplib_qp.min_rnr_timer; + qp_attr->rq_psn = qplib_qp.rq.psn; + qp_attr->max_rd_atomic = qplib_qp.max_rd_atomic; + qp_attr->sq_psn = qplib_qp.sq.psn; + qp_attr->max_dest_rd_atomic = qplib_qp.max_dest_rd_atomic; + qp_init_attr->sq_sig_type = qplib_qp.sig_type ? IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; + qp_attr->dest_qp_num = qplib_qp.dest_qpn; + + qp_attr->cap.max_send_wr = qp->qplib_qp.sq.max_wqe; + qp_attr->cap.max_send_sge = qp->qplib_qp.sq.max_sge; + qp_attr->cap.max_recv_wr = qp->qplib_qp.rq.max_wqe; + qp_attr->cap.max_recv_sge = qp->qplib_qp.rq.max_sge; + qp_attr->cap.max_inline_data = qp->qplib_qp.max_inline_data; + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +/* Routine for sending QP1 packets for RoCE V1 and V2 + */ +static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp, + struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe, + int payload_size) +{ + struct ib_device *ibdev = &qp->rdev->ibdev; + struct bnxt_re_ah *ah = container_of(ud_wr(wr)->ah, struct bnxt_re_ah, + ib_ah); + struct bnxt_qplib_ah *qplib_ah = &ah->qplib_ah; + struct bnxt_qplib_sge sge; + union ib_gid sgid; + u8 nw_type; + u16 ether_type; + struct ib_gid_attr sgid_attr; + union ib_gid dgid; + bool is_eth = false; + bool is_vlan = false; + bool is_grh = false; + bool is_udp = false; + u8 ip_version = 0; + u16 vlan_id = 0xFFFF; + void *buf; + int i, rc = 0, size; + + memset(&qp->qp1_hdr, 0, sizeof(qp->qp1_hdr)); + + rc = ib_get_cached_gid(ibdev, 1, + qplib_ah->host_sgid_index, &sgid, + &sgid_attr); + if (rc) { + dev_err(rdev_to_dev(qp->rdev), + "Failed to query gid at index %d", + qplib_ah->host_sgid_index); + return rc; + } + if (sgid_attr.ndev) { + if (is_vlan_dev(sgid_attr.ndev)) + vlan_id = vlan_dev_vlan_id(sgid_attr.ndev); + dev_put(sgid_attr.ndev); + } + /* Get network header type for this GID */ + nw_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + switch (nw_type) { + case RDMA_NETWORK_IPV4: + nw_type = BNXT_RE_ROCEV2_IPV4_PACKET; + break; + case RDMA_NETWORK_IPV6: + nw_type = BNXT_RE_ROCEV2_IPV6_PACKET; + break; + default: + nw_type = BNXT_RE_ROCE_V1_PACKET; + break; + } + memcpy(&dgid.raw, &qplib_ah->dgid, 16); + is_udp = sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; + if (is_udp) { + if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) { + ip_version = 4; + ether_type = ETH_P_IP; + } else { + ip_version = 6; + ether_type = ETH_P_IPV6; + } + is_grh = false; + } else { + ether_type = ETH_P_IBOE; + is_grh = true; + } + + is_eth = true; + is_vlan = (vlan_id && (vlan_id < 0x1000)) ?
true : false; + + ib_ud_header_init(payload_size, !is_eth, is_eth, is_vlan, is_grh, + ip_version, is_udp, 0, &qp->qp1_hdr); + + /* ETH */ + ether_addr_copy(qp->qp1_hdr.eth.dmac_h, ah->qplib_ah.dmac); + ether_addr_copy(qp->qp1_hdr.eth.smac_h, qp->qplib_qp.smac); + + /* For vlan, check the sgid for vlan existence */ + + if (!is_vlan) { + qp->qp1_hdr.eth.type = cpu_to_be16(ether_type); + } else { + qp->qp1_hdr.vlan.type = cpu_to_be16(ether_type); + qp->qp1_hdr.vlan.tag = cpu_to_be16(vlan_id); + } + + if (is_grh || (ip_version == 6)) { + memcpy(qp->qp1_hdr.grh.source_gid.raw, sgid.raw, sizeof(sgid)); + memcpy(qp->qp1_hdr.grh.destination_gid.raw, qplib_ah->dgid.data, + sizeof(sgid)); + qp->qp1_hdr.grh.hop_limit = qplib_ah->hop_limit; + } + + if (ip_version == 4) { + qp->qp1_hdr.ip4.tos = 0; + qp->qp1_hdr.ip4.id = 0; + qp->qp1_hdr.ip4.frag_off = htons(IP_DF); + qp->qp1_hdr.ip4.ttl = qplib_ah->hop_limit; + + memcpy(&qp->qp1_hdr.ip4.saddr, sgid.raw + 12, 4); + memcpy(&qp->qp1_hdr.ip4.daddr, qplib_ah->dgid.data + 12, 4); + qp->qp1_hdr.ip4.check = ib_ud_ip4_csum(&qp->qp1_hdr); + } + + if (is_udp) { + qp->qp1_hdr.udp.dport = htons(ROCE_V2_UDP_DPORT); + qp->qp1_hdr.udp.sport = htons(0x8CD1); + qp->qp1_hdr.udp.csum = 0; + } + + /* BTH */ + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + qp->qp1_hdr.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + qp->qp1_hdr.immediate_present = 1; + } else { + qp->qp1_hdr.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + } + if (wr->send_flags & IB_SEND_SOLICITED) + qp->qp1_hdr.bth.solicited_event = 1; + /* pad_count */ + qp->qp1_hdr.bth.pad_count = (4 - payload_size) & 3; + + /* P_key for QP1 is for all members */ + qp->qp1_hdr.bth.pkey = cpu_to_be16(0xFFFF); + qp->qp1_hdr.bth.destination_qpn = IB_QP1; + qp->qp1_hdr.bth.ack_req = 0; + qp->send_psn++; + qp->send_psn &= BTH_PSN_MASK; + qp->qp1_hdr.bth.psn = cpu_to_be32(qp->send_psn); + /* DETH */ + /* Use the privileged Q_Key for QP1 */ + qp->qp1_hdr.deth.qkey = cpu_to_be32(IB_QP1_QKEY); + qp->qp1_hdr.deth.source_qpn = IB_QP1; + + /* Pack the QP1 to the transmit buffer */ + buf = bnxt_qplib_get_qp1_sq_buf(&qp->qplib_qp, &sge); + if (buf) { + size = ib_ud_header_pack(&qp->qp1_hdr, buf); + for (i = wqe->num_sge; i; i--) { + wqe->sg_list[i].addr = wqe->sg_list[i - 1].addr; + wqe->sg_list[i].lkey = wqe->sg_list[i - 1].lkey; + wqe->sg_list[i].size = wqe->sg_list[i - 1].size; + } + + /* + * Max header buf size for IPV6 RoCE V2 is 86, + * which is the same as the QP1 SQ header buffer. + * Header buf size for IPV4 RoCE V2 can be 66: + * ETH(14) + VLAN(4) + IP(20) + UDP(8) + BTH(20). + * Subtract 20 bytes from QP1 SQ header buf size. + */ + if (is_udp && ip_version == 4) + sge.size -= 20; + /* + * Max header buf size for RoCE V1 is 78: + * ETH(14) + VLAN(4) + GRH(40) + BTH(20). + * Subtract 8 bytes from QP1 SQ header buf size. + */ + if (!is_udp) + sge.size -= 8; + + /* Subtract 4 bytes for non-VLAN packets */ + if (!is_vlan) + sge.size -= 4; + + wqe->sg_list[0].addr = sge.addr; + wqe->sg_list[0].lkey = sge.lkey; + wqe->sg_list[0].size = sge.size; + wqe->num_sge++; + + } else { + dev_err(rdev_to_dev(qp->rdev), "QP1 buffer is empty!"); + rc = -ENOMEM; + } + return rc; +} + +/* For the MAD layer, it only provides the recv SGE the size of + * ib_grh + MAD datagram. No Ethernet headers, Ethertype, BTH, DETH, + * nor RoCE iCRC. The Cu+ solution must provide buffer for the entire + * receive packet (334 bytes) with no VLAN and then copy the GRH + * and the MAD datagram out to the provided SGE.
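+ * Rough flow as implemented below: the shadow QP posts a
+ * driver-owned buffer for the raw frame, the consumer's original
+ * SGE is parked in rdev->sqp_tbl[], and
+ * bnxt_re_process_raw_qp_pkt_rx() later replays the GRH + MAD
+ * payload through a driver-built UD send so it lands in that
+ * parked SGE.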
+ */ +static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp, + struct ib_recv_wr *wr, + struct bnxt_qplib_swqe *wqe, + int payload_size) +{ + struct bnxt_qplib_sge ref, sge; + u32 rq_prod_index; + struct bnxt_re_sqp_entries *sqp_entry; + + rq_prod_index = bnxt_qplib_get_rq_prod_index(&qp->qplib_qp); + + if (!bnxt_qplib_get_qp1_rq_buf(&qp->qplib_qp, &sge)) + return -ENOMEM; + + /* Create 1 SGE to receive the entire + * ethernet packet + */ + /* Save the reference from ULP */ + ref.addr = wqe->sg_list[0].addr; + ref.lkey = wqe->sg_list[0].lkey; + ref.size = wqe->sg_list[0].size; + + sqp_entry = &qp->rdev->sqp_tbl[rq_prod_index]; + + /* SGE 1 */ + wqe->sg_list[0].addr = sge.addr; + wqe->sg_list[0].lkey = sge.lkey; + wqe->sg_list[0].size = BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2; + sge.size -= wqe->sg_list[0].size; + + sqp_entry->sge.addr = ref.addr; + sqp_entry->sge.lkey = ref.lkey; + sqp_entry->sge.size = ref.size; + /* Store the wrid for reporting completion */ + sqp_entry->wrid = wqe->wr_id; + /* change the wqe->wrid to table index */ + wqe->wr_id = rq_prod_index; + return 0; +} + +static int is_ud_qp(struct bnxt_re_qp *qp) +{ + return qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD; +} + +static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp, + struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_re_ah *ah = NULL; + + if (is_ud_qp(qp)) { + ah = container_of(ud_wr(wr)->ah, struct bnxt_re_ah, ib_ah); + wqe->send.q_key = ud_wr(wr)->remote_qkey; + wqe->send.dst_qp = ud_wr(wr)->remote_qpn; + wqe->send.avid = ah->qplib_ah.id; + } + switch (wr->opcode) { + case IB_WR_SEND: + wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND; + break; + case IB_WR_SEND_WITH_IMM: + wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM; + wqe->send.imm_data = wr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV; + wqe->send.inv_key = wr->ex.invalidate_rkey; + break; + default: + return -EINVAL; + } + if (wr->send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + if (wr->send_flags & IB_SEND_FENCE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT; + if (wr->send_flags & IB_SEND_INLINE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_INLINE; + + return 0; +} + +static int bnxt_re_build_rdma_wqe(struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE; + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM; + wqe->rdma.imm_data = wr->ex.imm_data; + break; + case IB_WR_RDMA_READ: + wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_READ; + wqe->rdma.inv_key = wr->ex.invalidate_rkey; + break; + default: + return -EINVAL; + } + wqe->rdma.remote_va = rdma_wr(wr)->remote_addr; + wqe->rdma.r_key = rdma_wr(wr)->rkey; + if (wr->send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + if (wr->send_flags & IB_SEND_FENCE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT; + if (wr->send_flags & IB_SEND_INLINE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_INLINE; + + return 0; +} + +static int bnxt_re_build_atomic_wqe(struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + wqe->type = BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP; + wqe->atomic.swap_data = atomic_wr(wr)->swap; + 
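+		/* IB compare-and-swap: the responder compares the 64-bit
+		 * value at remote_addr with atomic_wr(wr)->compare_add and,
+		 * on a match, writes atomic_wr(wr)->swap; the original value
+		 * is returned to the requester either way.
+		 */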
break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + wqe->type = BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD; + wqe->atomic.cmp_data = atomic_wr(wr)->compare_add; + break; + default: + return -EINVAL; + } + wqe->atomic.remote_va = atomic_wr(wr)->remote_addr; + wqe->atomic.r_key = atomic_wr(wr)->rkey; + if (wr->send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + if (wr->send_flags & IB_SEND_FENCE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT; + return 0; +} + +static int bnxt_re_build_inv_wqe(struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + wqe->type = BNXT_QPLIB_SWQE_TYPE_LOCAL_INV; + wqe->local_inv.inv_l_key = wr->ex.invalidate_rkey; + + if (wr->send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + if (wr->send_flags & IB_SEND_FENCE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT; + + return 0; +} + +static int bnxt_re_build_reg_wqe(struct ib_reg_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_re_mr *mr = container_of(wr->mr, struct bnxt_re_mr, ib_mr); + struct bnxt_qplib_frpl *qplib_frpl = &mr->qplib_frpl; + int access = wr->access; + + wqe->frmr.pbl_ptr = (__le64 *)qplib_frpl->hwq.pbl_ptr[0]; + wqe->frmr.pbl_dma_ptr = qplib_frpl->hwq.pbl_dma_ptr[0]; + wqe->frmr.page_list = mr->pages; + wqe->frmr.page_list_len = mr->npages; + wqe->frmr.levels = qplib_frpl->hwq.level + 1; + wqe->type = BNXT_QPLIB_SWQE_TYPE_REG_MR; + + if (wr->wr.send_flags & IB_SEND_FENCE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + if (wr->wr.send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + + if (access & IB_ACCESS_LOCAL_WRITE) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_LOCAL_WRITE; + if (access & IB_ACCESS_REMOTE_READ) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_READ; + if (access & IB_ACCESS_REMOTE_WRITE) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_WRITE; + if (access & IB_ACCESS_REMOTE_ATOMIC) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_ATOMIC; + if (access & IB_ACCESS_MW_BIND) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_WINDOW_BIND; + + wqe->frmr.l_key = wr->key; + wqe->frmr.length = wr->mr->length; + wqe->frmr.pbl_pg_sz_log = (wr->mr->page_size >> PAGE_SHIFT_4K) - 1; + wqe->frmr.va = wr->mr->iova; + return 0; +} + +static int bnxt_re_copy_inline_data(struct bnxt_re_dev *rdev, + struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + /* Copy the inline data to the data field */ + u8 *in_data; + u32 i, sge_len; + void *sge_addr; + + in_data = wqe->inline_data; + for (i = 0; i < wr->num_sge; i++) { + sge_addr = (void *)(unsigned long) + wr->sg_list[i].addr; + sge_len = wr->sg_list[i].length; + + if ((sge_len + wqe->inline_len) > + BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) { + dev_err(rdev_to_dev(rdev), + "Inline data size requested > supported value"); + return -EINVAL; + } + sge_len = wr->sg_list[i].length; + + memcpy(in_data, sge_addr, sge_len); + in_data += wr->sg_list[i].length; + wqe->inline_len += wr->sg_list[i].length; + } + return wqe->inline_len; +} + +static int bnxt_re_copy_wr_payload(struct bnxt_re_dev *rdev, + struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + int payload_sz = 0; + + if (wr->send_flags & IB_SEND_INLINE) + payload_sz = bnxt_re_copy_inline_data(rdev, wr, wqe); + else + payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe->sg_list, + wqe->num_sge); + 
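+	/* In both cases payload_sz is the total payload length in bytes;
+	 * a negative value from the inline copy (oversized data) is
+	 * propagated so callers can fail the work request.
+	 */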
+ return payload_sz; +} + +static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev, + struct bnxt_re_qp *qp, + struct ib_send_wr *wr) +{ + struct bnxt_qplib_swqe wqe; + int rc = 0, payload_sz = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->sq_lock, flags); + memset(&wqe, 0, sizeof(wqe)); + while (wr) { + /* House keeping */ + memset(&wqe, 0, sizeof(wqe)); + + /* Common */ + wqe.num_sge = wr->num_sge; + if (wr->num_sge > qp->qplib_qp.sq.max_sge) { + dev_err(rdev_to_dev(rdev), + "Limit exceeded for Send SGEs"); + rc = -EINVAL; + goto bad; + } + + payload_sz = bnxt_re_copy_wr_payload(qp->rdev, wr, &wqe); + if (payload_sz < 0) { + rc = -EINVAL; + goto bad; + } + wqe.wr_id = wr->wr_id; + + wqe.type = BNXT_QPLIB_SWQE_TYPE_SEND; + + rc = bnxt_re_build_send_wqe(qp, wr, &wqe); + if (!rc) + rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe); +bad: + if (rc) { + dev_err(rdev_to_dev(rdev), + "Post send failed opcode = %#x rc = %d", + wr->opcode, rc); + break; + } + wr = wr->next; + } + bnxt_qplib_post_send_db(&qp->qplib_qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + return rc; +} + +int bnxt_re_post_send(struct ib_qp *ib_qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_qplib_swqe wqe; + int rc = 0, payload_sz = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->sq_lock, flags); + while (wr) { + /* House keeping */ + memset(&wqe, 0, sizeof(wqe)); + + /* Common */ + wqe.num_sge = wr->num_sge; + if (wr->num_sge > qp->qplib_qp.sq.max_sge) { + dev_err(rdev_to_dev(qp->rdev), + "Limit exceeded for Send SGEs"); + rc = -EINVAL; + goto bad; + } + + payload_sz = bnxt_re_copy_wr_payload(qp->rdev, wr, &wqe); + if (payload_sz < 0) { + rc = -EINVAL; + goto bad; + } + wqe.wr_id = wr->wr_id; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (ib_qp->qp_type == IB_QPT_GSI) { + rc = bnxt_re_build_qp1_send_v2(qp, wr, &wqe, + payload_sz); + if (rc) + goto bad; + wqe.rawqp1.lflags |= + SQ_SEND_RAWETH_QP1_LFLAGS_ROCE_CRC; + } + /* send_flags is a bitmask, so test the bit rather + * than switch on the whole mask + */ + if (wr->send_flags & IB_SEND_IP_CSUM) + wqe.rawqp1.lflags |= + SQ_SEND_RAWETH_QP1_LFLAGS_IP_CHKSUM; + /* Fall through to build the wqe */ + case IB_WR_SEND_WITH_INV: + rc = bnxt_re_build_send_wqe(qp, wr, &wqe); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + case IB_WR_RDMA_READ: + rc = bnxt_re_build_rdma_wqe(wr, &wqe); + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + rc = bnxt_re_build_atomic_wqe(wr, &wqe); + break; + case IB_WR_RDMA_READ_WITH_INV: + dev_err(rdev_to_dev(qp->rdev), + "RDMA Read with Invalidate is not supported"); + rc = -EINVAL; + goto bad; + case IB_WR_LOCAL_INV: + rc = bnxt_re_build_inv_wqe(wr, &wqe); + break; + case IB_WR_REG_MR: + rc = bnxt_re_build_reg_wqe(reg_wr(wr), &wqe); + break; + default: + /* Unsupported WRs */ + dev_err(rdev_to_dev(qp->rdev), + "WR (%#x) is not supported", wr->opcode); + rc = -EINVAL; + goto bad; + } + if (!rc) + rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe); +bad: + if (rc) { + dev_err(rdev_to_dev(qp->rdev), + "post_send failed op:%#x qps = %#x rc = %d\n", + wr->opcode, qp->qplib_qp.state, rc); + *bad_wr = wr; + break; + } + wr = wr->next; + } + bnxt_qplib_post_send_db(&qp->qplib_qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + return rc; +} + +static int bnxt_re_post_recv_shadow_qp(struct bnxt_re_dev *rdev, + struct bnxt_re_qp *qp, + struct ib_recv_wr *wr) +{ + struct bnxt_qplib_swqe wqe; + int rc = 0,
payload_sz = 0; + + memset(&wqe, 0, sizeof(wqe)); + while (wr) { + /* House keeping */ + memset(&wqe, 0, sizeof(wqe)); + + /* Common */ + wqe.num_sge = wr->num_sge; + if (wr->num_sge > qp->qplib_qp.rq.max_sge) { + dev_err(rdev_to_dev(rdev), + "Limit exceeded for Receive SGEs"); + rc = -EINVAL; + break; + } + payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, + wr->num_sge); + wqe.wr_id = wr->wr_id; + wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; + + rc = bnxt_qplib_post_recv(&qp->qplib_qp, &wqe); + if (rc) + break; + + wr = wr->next; + } + if (!rc) + bnxt_qplib_post_recv_db(&qp->qplib_qp); + return rc; +} + +int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_qplib_swqe wqe; + int rc = 0, payload_sz = 0; + + while (wr) { + /* House keeping */ + memset(&wqe, 0, sizeof(wqe)); + + /* Common */ + wqe.num_sge = wr->num_sge; + if (wr->num_sge > qp->qplib_qp.rq.max_sge) { + dev_err(rdev_to_dev(qp->rdev), + "Limit exceeded for Receive SGEs"); + rc = -EINVAL; + *bad_wr = wr; + break; + } + + payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, + wr->num_sge); + wqe.wr_id = wr->wr_id; + wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; + + if (ib_qp->qp_type == IB_QPT_GSI) + rc = bnxt_re_build_qp1_shadow_qp_recv(qp, wr, &wqe, + payload_sz); + if (!rc) + rc = bnxt_qplib_post_recv(&qp->qplib_qp, &wqe); + if (rc) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + bnxt_qplib_post_recv_db(&qp->qplib_qp); + return rc; +} + +/* Completion Queues */ +int bnxt_re_destroy_cq(struct ib_cq *ib_cq) +{ + struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + struct bnxt_re_dev *rdev = cq->rdev; + int rc; + + rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to destroy HW CQ"); + return rc; + } + if (cq->umem && !IS_ERR(cq->umem)) + ib_umem_release(cq->umem); + + if (cq) { + kfree(cq->cql); + kfree(cq); + } + atomic_dec(&rdev->cq_count); + rdev->nq.budget--; + return 0; +} + +struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_cq *cq = NULL; + int rc, entries; + int cqe = attr->cqe; + + /* Validate CQ fields */ + if (cqe < 1 || cqe > dev_attr->max_cq_wqes) { + dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded"); + return ERR_PTR(-EINVAL); + } + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->rdev = rdev; + cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq); + + entries = roundup_pow_of_two(cqe + 1); + if (entries > dev_attr->max_cq_wqes + 1) + entries = dev_attr->max_cq_wqes + 1; + + if (context) { + struct bnxt_re_cq_req req; + struct bnxt_re_ucontext *uctx = container_of + (context, + struct bnxt_re_ucontext, + ib_uctx); + if (ib_copy_from_udata(&req, udata, sizeof(req))) { + rc = -EFAULT; + goto fail; + } + + cq->umem = ib_umem_get(context, req.cq_va, + entries * sizeof(struct cq_base), + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->umem)) { + rc = PTR_ERR(cq->umem); + goto fail; + } + cq->qplib_cq.sghead = cq->umem->sg_head.sgl; + cq->qplib_cq.nmap = cq->umem->nmap; + cq->qplib_cq.dpi = uctx->dpi; + } else { + cq->max_cql = min_t(u32, entries, MAX_CQL_PER_POLL); + cq->cql = kcalloc(cq->max_cql, sizeof(struct 
bnxt_qplib_cqe), + GFP_KERNEL); + if (!cq->cql) { + rc = -ENOMEM; + goto fail; + } + + cq->qplib_cq.dpi = &rdev->dpi_privileged; + cq->qplib_cq.sghead = NULL; + cq->qplib_cq.nmap = 0; + } + cq->qplib_cq.max_wqe = entries; + cq->qplib_cq.cnq_hw_ring_id = rdev->nq.ring_id; + + rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to create HW CQ"); + goto fail; + } + + cq->ib_cq.cqe = entries; + cq->cq_period = cq->qplib_cq.period; + rdev->nq.budget++; + + atomic_inc(&rdev->cq_count); + + if (context) { + struct bnxt_re_cq_resp resp; + + resp.cqid = cq->qplib_cq.id; + resp.tail = cq->qplib_cq.hwq.cons; + resp.phase = cq->qplib_cq.period; + resp.rsvd = 0; + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to copy CQ udata"); + bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + goto c2fail; + } + } + + return &cq->ib_cq; + +c2fail: + if (context) + ib_umem_release(cq->umem); +fail: + kfree(cq->cql); + kfree(cq); + return ERR_PTR(rc); +} + +static u8 __req_to_ib_wc_status(u8 qstatus) +{ + switch (qstatus) { + case CQ_REQ_STATUS_OK: + return IB_WC_SUCCESS; + case CQ_REQ_STATUS_BAD_RESPONSE_ERR: + return IB_WC_BAD_RESP_ERR; + case CQ_REQ_STATUS_LOCAL_LENGTH_ERR: + return IB_WC_LOC_LEN_ERR; + case CQ_REQ_STATUS_LOCAL_QP_OPERATION_ERR: + return IB_WC_LOC_QP_OP_ERR; + case CQ_REQ_STATUS_LOCAL_PROTECTION_ERR: + return IB_WC_LOC_PROT_ERR; + case CQ_REQ_STATUS_MEMORY_MGT_OPERATION_ERR: + return IB_WC_GENERAL_ERR; + case CQ_REQ_STATUS_REMOTE_INVALID_REQUEST_ERR: + return IB_WC_REM_INV_REQ_ERR; + case CQ_REQ_STATUS_REMOTE_ACCESS_ERR: + return IB_WC_REM_ACCESS_ERR; + case CQ_REQ_STATUS_REMOTE_OPERATION_ERR: + return IB_WC_REM_OP_ERR; + case CQ_REQ_STATUS_RNR_NAK_RETRY_CNT_ERR: + return IB_WC_RNR_RETRY_EXC_ERR; + case CQ_REQ_STATUS_TRANSPORT_RETRY_CNT_ERR: + return IB_WC_RETRY_EXC_ERR; + case CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR: + return IB_WC_WR_FLUSH_ERR; + default: + return IB_WC_GENERAL_ERR; + } + return 0; +} + +static u8 __rawqp1_to_ib_wc_status(u8 qstatus) +{ + switch (qstatus) { + case CQ_RES_RAWETH_QP1_STATUS_OK: + return IB_WC_SUCCESS; + case CQ_RES_RAWETH_QP1_STATUS_LOCAL_ACCESS_ERROR: + return IB_WC_LOC_ACCESS_ERR; + case CQ_RES_RAWETH_QP1_STATUS_HW_LOCAL_LENGTH_ERR: + return IB_WC_LOC_LEN_ERR; + case CQ_RES_RAWETH_QP1_STATUS_LOCAL_PROTECTION_ERR: + return IB_WC_LOC_PROT_ERR; + case CQ_RES_RAWETH_QP1_STATUS_LOCAL_QP_OPERATION_ERR: + return IB_WC_LOC_QP_OP_ERR; + case CQ_RES_RAWETH_QP1_STATUS_MEMORY_MGT_OPERATION_ERR: + return IB_WC_GENERAL_ERR; + case CQ_RES_RAWETH_QP1_STATUS_WORK_REQUEST_FLUSHED_ERR: + return IB_WC_WR_FLUSH_ERR; + case CQ_RES_RAWETH_QP1_STATUS_HW_FLUSH_ERR: + return IB_WC_WR_FLUSH_ERR; + default: + return IB_WC_GENERAL_ERR; + } +} + +static u8 __rc_to_ib_wc_status(u8 qstatus) +{ + switch (qstatus) { + case CQ_RES_RC_STATUS_OK: + return IB_WC_SUCCESS; + case CQ_RES_RC_STATUS_LOCAL_ACCESS_ERROR: + return IB_WC_LOC_ACCESS_ERR; + case CQ_RES_RC_STATUS_LOCAL_LENGTH_ERR: + return IB_WC_LOC_LEN_ERR; + case CQ_RES_RC_STATUS_LOCAL_PROTECTION_ERR: + return IB_WC_LOC_PROT_ERR; + case CQ_RES_RC_STATUS_LOCAL_QP_OPERATION_ERR: + return IB_WC_LOC_QP_OP_ERR; + case CQ_RES_RC_STATUS_MEMORY_MGT_OPERATION_ERR: + return IB_WC_GENERAL_ERR; + case CQ_RES_RC_STATUS_REMOTE_INVALID_REQUEST_ERR: + return IB_WC_REM_INV_REQ_ERR; + case CQ_RES_RC_STATUS_WORK_REQUEST_FLUSHED_ERR: + return IB_WC_WR_FLUSH_ERR; + case CQ_RES_RC_STATUS_HW_FLUSH_ERR: + return IB_WC_WR_FLUSH_ERR; + default: + 
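+	/* Any HW status not recognized above is reported conservatively
+	 * as a general error rather than dropped.
+	 */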
return IB_WC_GENERAL_ERR; + } +} + +static void bnxt_re_process_req_wc(struct ib_wc *wc, struct bnxt_qplib_cqe *cqe) +{ + switch (cqe->type) { + case BNXT_QPLIB_SWQE_TYPE_SEND: + wc->opcode = IB_WC_SEND; + break; + case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM: + wc->opcode = IB_WC_SEND; + wc->wc_flags |= IB_WC_WITH_IMM; + break; + case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV: + wc->opcode = IB_WC_SEND; + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + break; + case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM: + wc->opcode = IB_WC_RDMA_WRITE; + wc->wc_flags |= IB_WC_WITH_IMM; + break; + case BNXT_QPLIB_SWQE_TYPE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + break; + case BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP: + wc->opcode = IB_WC_COMP_SWAP; + break; + case BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD: + wc->opcode = IB_WC_FETCH_ADD; + break; + case BNXT_QPLIB_SWQE_TYPE_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case BNXT_QPLIB_SWQE_TYPE_REG_MR: + wc->opcode = IB_WC_REG_MR; + break; + default: + wc->opcode = IB_WC_SEND; + break; + } + + wc->status = __req_to_ib_wc_status(cqe->status); +} + +static int bnxt_re_check_packet_type(u16 raweth_qp1_flags, + u16 raweth_qp1_flags2) +{ + bool is_udp = false, is_ipv6 = false, is_ipv4 = false; + + /* raweth_qp1_flags Bit 9-6 indicates itype */ + if ((raweth_qp1_flags & CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_ROCE) + != CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_ROCE) + return -1; + + if (raweth_qp1_flags2 & + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_IP_CS_CALC && + raweth_qp1_flags2 & + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_L4_CS_CALC) { + is_udp = true; + /* raweth_qp1_flags2 Bit 8 indicates ip_type. 0-v4 1 - v6 */ + (raweth_qp1_flags2 & + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_IP_TYPE) ? + (is_ipv6 = true) : (is_ipv4 = true); + return ((is_ipv6) ? + BNXT_RE_ROCEV2_IPV6_PACKET : + BNXT_RE_ROCEV2_IPV4_PACKET); + } else { + return BNXT_RE_ROCE_V1_PACKET; + } +} + +static int bnxt_re_to_ib_nw_type(int nw_type) +{ + u8 nw_hdr_type = 0xFF; + + switch (nw_type) { + case BNXT_RE_ROCE_V1_PACKET: + nw_hdr_type = RDMA_NETWORK_ROCE_V1; + break; + case BNXT_RE_ROCEV2_IPV4_PACKET: + nw_hdr_type = RDMA_NETWORK_IPV4; + break; + case BNXT_RE_ROCEV2_IPV6_PACKET: + nw_hdr_type = RDMA_NETWORK_IPV6; + break; + } + return nw_hdr_type; +} + +static bool bnxt_re_is_loopback_packet(struct bnxt_re_dev *rdev, + void *rq_hdr_buf) +{ + u8 *tmp_buf = NULL; + struct ethhdr *eth_hdr; + u16 eth_type; + bool rc = false; + + tmp_buf = (u8 *)rq_hdr_buf; + /* + * If dest mac is not same as I/F mac, this could be a + * loopback address or multicast address, check whether + * it is a loopback packet + */ + if (!ether_addr_equal(tmp_buf, rdev->netdev->dev_addr)) { + tmp_buf += 4; + /* Check the ether type */ + eth_hdr = (struct ethhdr *)tmp_buf; + eth_type = ntohs(eth_hdr->h_proto); + switch (eth_type) { + case ETH_P_IBOE: + rc = true; + break; + case ETH_P_IP: + case ETH_P_IPV6: { + u32 len; + struct udphdr *udp_hdr; + + len = (eth_type == ETH_P_IP ? 
sizeof(struct iphdr) : + sizeof(struct ipv6hdr)); + tmp_buf += sizeof(struct ethhdr) + len; + udp_hdr = (struct udphdr *)tmp_buf; + if (ntohs(udp_hdr->dest) == + ROCE_V2_UDP_DPORT) + rc = true; + break; + } + default: + break; + } + } + + return rc; +} + +static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *qp1_qp, + struct bnxt_qplib_cqe *cqe) +{ + struct bnxt_re_dev *rdev = qp1_qp->rdev; + struct bnxt_re_sqp_entries *sqp_entry = NULL; + struct bnxt_re_qp *qp = rdev->qp1_sqp; + struct ib_send_wr *swr; + struct ib_ud_wr udwr; + struct ib_recv_wr rwr; + int pkt_type = 0; + u32 tbl_idx; + void *rq_hdr_buf; + dma_addr_t rq_hdr_buf_map; + dma_addr_t shrq_hdr_buf_map; + u32 offset = 0; + u32 skip_bytes = 0; + struct ib_sge s_sge[2]; + struct ib_sge r_sge[2]; + int rc; + + memset(&udwr, 0, sizeof(udwr)); + memset(&rwr, 0, sizeof(rwr)); + memset(&s_sge, 0, sizeof(s_sge)); + memset(&r_sge, 0, sizeof(r_sge)); + + swr = &udwr.wr; + tbl_idx = cqe->wr_id; + + rq_hdr_buf = qp1_qp->qplib_qp.rq_hdr_buf + + (tbl_idx * qp1_qp->qplib_qp.rq_hdr_buf_size); + rq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&qp1_qp->qplib_qp, + tbl_idx); + + /* Shadow QP header buffer */ + shrq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&qp->qplib_qp, + tbl_idx); + sqp_entry = &rdev->sqp_tbl[tbl_idx]; + + /* Store this cqe */ + memcpy(&sqp_entry->cqe, cqe, sizeof(struct bnxt_qplib_cqe)); + sqp_entry->qp1_qp = qp1_qp; + + /* Find packet type from the cqe */ + + pkt_type = bnxt_re_check_packet_type(cqe->raweth_qp1_flags, + cqe->raweth_qp1_flags2); + if (pkt_type < 0) { + dev_err(rdev_to_dev(rdev), "Invalid packet\n"); + return -EINVAL; + } + + /* Adjust the offset for the user buffer and post in the rq */ + + if (pkt_type == BNXT_RE_ROCEV2_IPV4_PACKET) + offset = 20; + + /* + * QP1 loopback packet has 4 bytes of internal header before + * ether header. Skip these four bytes. + */ + if (bnxt_re_is_loopback_packet(rdev, rq_hdr_buf)) + skip_bytes = 4; + + /* First send SGE . Skip the ether header*/ + s_sge[0].addr = rq_hdr_buf_map + BNXT_QPLIB_MAX_QP1_RQ_ETH_HDR_SIZE + + skip_bytes; + s_sge[0].lkey = 0xFFFFFFFF; + s_sge[0].length = offset ? 
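+	/* offset is 20 only for RoCE v2 IPv4, presumably so the 20-byte
+	 * IPv4 header lands in the tail of the 40-byte GRH area the
+	 * consumer expects; RoCE v1 and v2 IPv6 already carry a
+	 * full-size GRH.
+	 */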
BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV4 : + BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV6; + + /* Second Send SGE */ + s_sge[1].addr = s_sge[0].addr + s_sge[0].length + + BNXT_QPLIB_MAX_QP1_RQ_BDETH_HDR_SIZE; + if (pkt_type != BNXT_RE_ROCE_V1_PACKET) + s_sge[1].addr += 8; + s_sge[1].lkey = 0xFFFFFFFF; + s_sge[1].length = 256; + + /* First recv SGE */ + + r_sge[0].addr = shrq_hdr_buf_map; + r_sge[0].lkey = 0xFFFFFFFF; + r_sge[0].length = 40; + + r_sge[1].addr = sqp_entry->sge.addr + offset; + r_sge[1].lkey = sqp_entry->sge.lkey; + r_sge[1].length = BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV6 + 256 - offset; + + /* Create receive work request */ + rwr.num_sge = 2; + rwr.sg_list = r_sge; + rwr.wr_id = tbl_idx; + rwr.next = NULL; + + rc = bnxt_re_post_recv_shadow_qp(rdev, qp, &rwr); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to post Rx buffers to shadow QP"); + return -ENOMEM; + } + + swr->num_sge = 2; + swr->sg_list = s_sge; + swr->wr_id = tbl_idx; + swr->opcode = IB_WR_SEND; + swr->next = NULL; + + udwr.ah = &rdev->sqp_ah->ib_ah; + udwr.remote_qpn = rdev->qp1_sqp->qplib_qp.id; + udwr.remote_qkey = rdev->qp1_sqp->qplib_qp.qkey; + + /* Post the data received in the send queue and propagate any + * failure to the caller instead of discarding rc + */ + rc = bnxt_re_post_send_shadow_qp(rdev, qp, swr); + + return rc; +} + +static void bnxt_re_process_res_rawqp1_wc(struct ib_wc *wc, + struct bnxt_qplib_cqe *cqe) +{ + wc->opcode = IB_WC_RECV; + wc->status = __rawqp1_to_ib_wc_status(cqe->status); + wc->wc_flags |= IB_WC_GRH; +} + +static void bnxt_re_process_res_rc_wc(struct ib_wc *wc, + struct bnxt_qplib_cqe *cqe) +{ + wc->opcode = IB_WC_RECV; + wc->status = __rc_to_ib_wc_status(cqe->status); + + if (cqe->flags & CQ_RES_RC_FLAGS_IMM) + wc->wc_flags |= IB_WC_WITH_IMM; + if (cqe->flags & CQ_RES_RC_FLAGS_INV) + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + if ((cqe->flags & (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) == + (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; +} + +static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *qp, + struct ib_wc *wc, + struct bnxt_qplib_cqe *cqe) +{ + u32 tbl_idx; + struct bnxt_re_dev *rdev = qp->rdev; + struct bnxt_re_qp *qp1_qp = NULL; + struct bnxt_qplib_cqe *orig_cqe = NULL; + struct bnxt_re_sqp_entries *sqp_entry = NULL; + int nw_type; + + tbl_idx = cqe->wr_id; + + sqp_entry = &rdev->sqp_tbl[tbl_idx]; + qp1_qp = sqp_entry->qp1_qp; + orig_cqe = &sqp_entry->cqe; + + wc->wr_id = sqp_entry->wrid; + wc->byte_len = orig_cqe->length; + wc->qp = &qp1_qp->ib_qp; + + wc->ex.imm_data = orig_cqe->immdata; + wc->src_qp = orig_cqe->src_qp; + memcpy(wc->smac, orig_cqe->smac, ETH_ALEN); + wc->port_num = 1; + wc->vendor_err = orig_cqe->status; + + wc->opcode = IB_WC_RECV; + wc->status = __rawqp1_to_ib_wc_status(orig_cqe->status); + wc->wc_flags |= IB_WC_GRH; + + nw_type = bnxt_re_check_packet_type(orig_cqe->raweth_qp1_flags, + orig_cqe->raweth_qp1_flags2); + if (nw_type >= 0) { + wc->network_hdr_type = bnxt_re_to_ib_nw_type(nw_type); + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; + } +} + +static void bnxt_re_process_res_ud_wc(struct ib_wc *wc, + struct bnxt_qplib_cqe *cqe) +{ + wc->opcode = IB_WC_RECV; + wc->status = __rc_to_ib_wc_status(cqe->status); + + if (cqe->flags & CQ_RES_RC_FLAGS_IMM) + wc->wc_flags |= IB_WC_WITH_IMM; + if (cqe->flags & CQ_RES_RC_FLAGS_INV) + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + if ((cqe->flags & (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) == + (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; +} + +int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct
ib_wc *wc) +{ + struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + struct bnxt_re_qp *qp; + struct bnxt_qplib_cqe *cqe; + int i, ncqe, budget; + u32 tbl_idx; + struct bnxt_re_sqp_entries *sqp_entry = NULL; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + budget = min_t(u32, num_entries, cq->max_cql); + if (!cq->cql) { + dev_err(rdev_to_dev(cq->rdev), "POLL CQ : no CQL to use"); + goto exit; + } + cqe = &cq->cql[0]; + while (budget) { + ncqe = bnxt_qplib_poll_cq(&cq->qplib_cq, cqe, budget); + if (!ncqe) + break; + + for (i = 0; i < ncqe; i++, cqe++) { + /* Transcribe each qplib_wqe back to ib_wc */ + memset(wc, 0, sizeof(*wc)); + + wc->wr_id = cqe->wr_id; + wc->byte_len = cqe->length; + qp = container_of + ((struct bnxt_qplib_qp *) + (unsigned long)(cqe->qp_handle), + struct bnxt_re_qp, qplib_qp); + if (!qp) { + dev_err(rdev_to_dev(cq->rdev), + "POLL CQ : bad QP handle"); + continue; + } + wc->qp = &qp->ib_qp; + wc->ex.imm_data = cqe->immdata; + wc->src_qp = cqe->src_qp; + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->port_num = 1; + wc->vendor_err = cqe->status; + + switch (cqe->opcode) { + case CQ_BASE_CQE_TYPE_REQ: + if (qp->qplib_qp.id == + qp->rdev->qp1_sqp->qplib_qp.id) { + /* Handle this completion with + * the stored completion + */ + memset(wc, 0, sizeof(*wc)); + continue; + } + bnxt_re_process_req_wc(wc, cqe); + break; + case CQ_BASE_CQE_TYPE_RES_RAWETH_QP1: + if (!cqe->status) { + int rc = 0; + + rc = bnxt_re_process_raw_qp_pkt_rx + (qp, cqe); + if (!rc) { + memset(wc, 0, sizeof(*wc)); + continue; + } + cqe->status = -1; + } + /* Errors need not be looped back. + * But change the wr_id to the one + * stored in the table + */ + tbl_idx = cqe->wr_id; + sqp_entry = &cq->rdev->sqp_tbl[tbl_idx]; + wc->wr_id = sqp_entry->wrid; + bnxt_re_process_res_rawqp1_wc(wc, cqe); + break; + case CQ_BASE_CQE_TYPE_RES_RC: + bnxt_re_process_res_rc_wc(wc, cqe); + break; + case CQ_BASE_CQE_TYPE_RES_UD: + if (qp->qplib_qp.id == + qp->rdev->qp1_sqp->qplib_qp.id) { + /* Handle this completion with + * the stored completion + */ + if (cqe->status) { + continue; + } else { + bnxt_re_process_res_shadow_qp_wc + (qp, wc, cqe); + break; + } + } + bnxt_re_process_res_ud_wc(wc, cqe); + break; + default: + dev_err(rdev_to_dev(cq->rdev), + "POLL CQ : type 0x%x not handled", + cqe->opcode); + continue; + } + wc++; + budget--; + } + } +exit: + spin_unlock_irqrestore(&cq->cq_lock, flags); + return num_entries - budget; +} + +int bnxt_re_req_notify_cq(struct ib_cq *ib_cq, + enum ib_cq_notify_flags ib_cqn_flags) +{ + struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + int type = 0; + + /* Trigger on the very next completion */ + if (ib_cqn_flags & IB_CQ_NEXT_COMP) + type = DBR_DBR_TYPE_CQ_ARMALL; + /* Trigger on the next solicited completion */ + else if (ib_cqn_flags & IB_CQ_SOLICITED) + type = DBR_DBR_TYPE_CQ_ARMSE; + + bnxt_qplib_req_notify_cq(&cq->qplib_cq, type); + + return 0; +} + +/* Memory Regions */ +struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_mr *mr; + u64 pbl = 0; + int rc; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->rdev = rdev; + mr->qplib_mr.pd = &pd->qplib_pd; + mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + + /* Allocate and register 0 as the address */ + rc = 
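+	/* A DMA MR has no page translation: the PBL_LVL_MAX level and
+	 * total_size of -1 set below appear to tell the HW to treat
+	 * lkey-based access as covering the whole address space.
+	 */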
bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) + goto fail; + + mr->qplib_mr.hwq.level = PBL_LVL_MAX; + mr->qplib_mr.total_size = -1; /* Infinite length */ + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false); + if (rc) + goto fail_mr; + + mr->ib_mr.lkey = mr->qplib_mr.lkey; + if (mr_access_flags & (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_ATOMIC)) + mr->ib_mr.rkey = mr->ib_mr.lkey; + atomic_inc(&rdev->mr_count); + + return &mr->ib_mr; + +fail_mr: + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); +fail: + kfree(mr); + return ERR_PTR(rc); +} + +int bnxt_re_dereg_mr(struct ib_mr *ib_mr) +{ + struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + struct bnxt_re_dev *rdev = mr->rdev; + int rc = 0; + + if (mr->npages && mr->pages) { + rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res, + &mr->qplib_frpl); + kfree(mr->pages); + mr->npages = 0; + mr->pages = NULL; + } + rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); + + if (!IS_ERR(mr->ib_umem) && mr->ib_umem) + ib_umem_release(mr->ib_umem); + + kfree(mr); + atomic_dec(&rdev->mr_count); + return rc; +} + +static int bnxt_re_set_page(struct ib_mr *ib_mr, u64 addr) +{ + struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + + if (unlikely(mr->npages == mr->qplib_frpl.max_pg_ptrs)) + return -ENOMEM; + + mr->pages[mr->npages++] = addr; + return 0; +} + +int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + + mr->npages = 0; + return ib_sg_to_pages(ib_mr, sg, sg_nents, sg_offset, bnxt_re_set_page); +} + +struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, + u32 max_num_sg) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_mr *mr = NULL; + int rc; + + if (type != IB_MR_TYPE_MEM_REG) { + dev_dbg(rdev_to_dev(rdev), "MR type 0x%x not supported", type); + return ERR_PTR(-EINVAL); + } + if (max_num_sg > MAX_PBL_LVL_1_PGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->rdev = rdev; + mr->qplib_mr.pd = &pd->qplib_pd; + mr->qplib_mr.flags = BNXT_QPLIB_FR_PMR; + mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) + goto fail; + + mr->ib_mr.lkey = mr->qplib_mr.lkey; + mr->ib_mr.rkey = mr->ib_mr.lkey; + + mr->pages = kcalloc(max_num_sg, sizeof(u64), GFP_KERNEL); + if (!mr->pages) { + rc = -ENOMEM; + goto fail; + } + rc = bnxt_qplib_alloc_fast_reg_page_list(&rdev->qplib_res, + &mr->qplib_frpl, max_num_sg); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to allocate HW FR page list"); + goto fail_mr; + } + + atomic_inc(&rdev->mr_count); + return &mr->ib_mr; + +fail_mr: + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); +fail: + kfree(mr->pages); + kfree(mr); + return ERR_PTR(rc); +} + +/* Fast Memory Regions */ +struct ib_fmr *bnxt_re_alloc_fmr(struct ib_pd *ib_pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_fmr *fmr; + int rc; + + if (fmr_attr->max_pages > MAX_PBL_LVL_2_PGS || + fmr_attr->max_maps > rdev->dev_attr.max_map_per_fmr) { + dev_err(rdev_to_dev(rdev), "Allocate FMR exceeded max limit"); + return ERR_PTR(-ENOMEM);
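+	/* The limits checked above: MAX_PBL_LVL_2_PGS bounds how many
+	 * pages one FMR can map with a two-level page buffer list, and
+	 * max_map_per_fmr comes from the queried device attributes.
+	 */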
+ } + fmr = kzalloc(sizeof(*fmr), GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + fmr->rdev = rdev; + fmr->qplib_fmr.pd = &pd->qplib_pd; + fmr->qplib_fmr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &fmr->qplib_fmr); + if (rc) + goto fail; + + fmr->qplib_fmr.flags = __from_ib_access_flags(mr_access_flags); + fmr->ib_fmr.lkey = fmr->qplib_fmr.lkey; + fmr->ib_fmr.rkey = fmr->ib_fmr.lkey; + + atomic_inc(&rdev->mr_count); + return &fmr->ib_fmr; +fail: + kfree(fmr); + return ERR_PTR(rc); +} + +int bnxt_re_map_phys_fmr(struct ib_fmr *ib_fmr, u64 *page_list, int list_len, + u64 iova) +{ + struct bnxt_re_fmr *fmr = container_of(ib_fmr, struct bnxt_re_fmr, + ib_fmr); + struct bnxt_re_dev *rdev = fmr->rdev; + int rc; + + fmr->qplib_fmr.va = iova; + fmr->qplib_fmr.total_size = list_len * PAGE_SIZE; + + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &fmr->qplib_fmr, page_list, + list_len, true); + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to map FMR for lkey = 0x%x!", + fmr->ib_fmr.lkey); + return rc; +} + +int bnxt_re_unmap_fmr(struct list_head *fmr_list) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_fmr *fmr; + struct ib_fmr *ib_fmr; + int rc = 0; + + /* Validate each FMR in the fmr_list */ + list_for_each_entry(ib_fmr, fmr_list, list) { + fmr = container_of(ib_fmr, struct bnxt_re_fmr, ib_fmr); + rdev = fmr->rdev; + + if (rdev) { + rc = bnxt_qplib_dereg_mrw(&rdev->qplib_res, + &fmr->qplib_fmr, true); + if (rc) + break; + } + } + return rc; +} + +int bnxt_re_dealloc_fmr(struct ib_fmr *ib_fmr) +{ + struct bnxt_re_fmr *fmr = container_of(ib_fmr, struct bnxt_re_fmr, + ib_fmr); + struct bnxt_re_dev *rdev = fmr->rdev; + int rc; + + rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &fmr->qplib_fmr); + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to free FMR"); + + kfree(fmr); + atomic_dec(&rdev->mr_count); + return rc; +} + +/* uverbs */ +struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_mr *mr; + struct ib_umem *umem; + u64 *pbl_tbl, *pbl_tbl_orig; + int i, umem_pgs, pages, page_shift, rc; + struct scatterlist *sg; + int entry; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->rdev = rdev; + mr->qplib_mr.pd = &pd->qplib_pd; + mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; + + umem = ib_umem_get(ib_pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) { + dev_err(rdev_to_dev(rdev), "Failed to get umem"); + rc = -EFAULT; + goto free_mr; + } + mr->ib_umem = umem; + + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to allocate MR"); + goto release_umem; + } + /* The fixed portion of the rkey is the same as the lkey */ + mr->ib_mr.rkey = mr->qplib_mr.rkey; + + mr->qplib_mr.va = virt_addr; + umem_pgs = ib_umem_page_count(umem); + if (!umem_pgs) { + dev_err(rdev_to_dev(rdev), "umem is invalid!"); + rc = -EINVAL; + goto free_mrw; + } + mr->qplib_mr.total_size = length; + + pbl_tbl = kcalloc(umem_pgs, sizeof(u64), GFP_KERNEL); + if (!pbl_tbl) { + rc = -ENOMEM; + goto free_mrw; + } + pbl_tbl_orig = pbl_tbl; + + page_shift = ilog2(umem->page_size); + if (umem->hugetlb) { + dev_err(rdev_to_dev(rdev), "umem hugetlb not supported!"); + rc = -EFAULT; + goto
fail; + } + if (umem->page_size != PAGE_SIZE) { + dev_err(rdev_to_dev(rdev), "umem page size unsupported!"); + rc = -EFAULT; + goto fail; + } + /* Map umem buf ptrs to the PBL */ + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> page_shift; + for (i = 0; i < pages; i++, pbl_tbl++) + *pbl_tbl = sg_dma_address(sg) + (i << page_shift); + } + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl_orig, + umem_pgs, false); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to register user MR"); + goto fail; + } + + kfree(pbl_tbl_orig); + + mr->ib_mr.lkey = mr->qplib_mr.lkey; + mr->ib_mr.rkey = mr->qplib_mr.lkey; + atomic_inc(&rdev->mr_count); + + return &mr->ib_mr; +fail: + kfree(pbl_tbl_orig); +free_mrw: + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); +release_umem: + ib_umem_release(umem); +free_mr: + kfree(mr); + return ERR_PTR(rc); +} + +struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_re_uctx_resp resp; + struct bnxt_re_ucontext *uctx; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + int rc; + + dev_dbg(rdev_to_dev(rdev), "ABI version requested %d", + ibdev->uverbs_abi_ver); + + if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) { + dev_dbg(rdev_to_dev(rdev), " is different from the device %d ", + BNXT_RE_ABI_VERSION); + return ERR_PTR(-EPERM); + } + + uctx = kzalloc(sizeof(*uctx), GFP_KERNEL); + if (!uctx) + return ERR_PTR(-ENOMEM); + + uctx->rdev = rdev; + + uctx->shpg = (void *)__get_free_page(GFP_KERNEL); + if (!uctx->shpg) { + rc = -ENOMEM; + goto fail; + } + spin_lock_init(&uctx->sh_lock); + + resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/ + resp.max_qp = rdev->qplib_ctx.qpc_count; + resp.pg_size = PAGE_SIZE; + resp.cqe_sz = sizeof(struct cq_base); + resp.max_cqd = dev_attr->max_cq_wqes; + resp.rsvd = 0; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to copy user context"); + rc = -EFAULT; + goto cfail; + } + + return &uctx->ib_uctx; +cfail: + free_page((unsigned long)uctx->shpg); + uctx->shpg = NULL; +fail: + kfree(uctx); + return ERR_PTR(rc); +} + +int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) +{ + struct bnxt_re_ucontext *uctx = container_of(ib_uctx, + struct bnxt_re_ucontext, + ib_uctx); + if (uctx->shpg) + free_page((unsigned long)uctx->shpg); + kfree(uctx); + return 0; +} + +/* Helper function to mmap the virtual memory from user app */ +int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) +{ + struct bnxt_re_ucontext *uctx = container_of(ib_uctx, + struct bnxt_re_ucontext, + ib_uctx); + struct bnxt_re_dev *rdev = uctx->rdev; + u64 pfn; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_pgoff) { + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + PAGE_SIZE, vma->vm_page_prot)) { + dev_err(rdev_to_dev(rdev), "Failed to map DPI"); + return -EAGAIN; + } + } else { + pfn = virt_to_phys(uctx->shpg) >> PAGE_SHIFT; + if (remap_pfn_range(vma, vma->vm_start, + pfn, PAGE_SIZE, vma->vm_page_prot)) { + dev_err(rdev_to_dev(rdev), + "Failed to map shared page"); + return -EAGAIN; + } + } + + return 0; +} diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h new file mode 100644 index 000000000000..b4084c252f06 --- /dev/null +++ 
b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -0,0 +1,197 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Description: IB Verbs interpreter (header) + */ + +#ifndef __BNXT_RE_IB_VERBS_H__ +#define __BNXT_RE_IB_VERBS_H__ + +struct bnxt_re_gid_ctx { + u32 idx; + u32 refcnt; +}; + +struct bnxt_re_pd { + struct bnxt_re_dev *rdev; + struct ib_pd ib_pd; + struct bnxt_qplib_pd qplib_pd; + struct bnxt_qplib_dpi dpi; +}; + +struct bnxt_re_ah { + struct bnxt_re_dev *rdev; + struct ib_ah ib_ah; + struct bnxt_qplib_ah qplib_ah; +}; + +struct bnxt_re_qp { + struct list_head list; + struct bnxt_re_dev *rdev; + struct ib_qp ib_qp; + spinlock_t sq_lock; /* protect sq */ + struct bnxt_qplib_qp qplib_qp; + struct ib_umem *sumem; + struct ib_umem *rumem; + /* QP1 */ + u32 send_psn; + struct ib_ud_header qp1_hdr; +}; + +struct bnxt_re_cq { + struct bnxt_re_dev *rdev; + spinlock_t cq_lock; /* protect cq */ + u16 cq_count; + u16 cq_period; + struct ib_cq ib_cq; + struct bnxt_qplib_cq qplib_cq; + struct bnxt_qplib_cqe *cql; +#define MAX_CQL_PER_POLL 1024 + u32 max_cql; + struct ib_umem *umem; +}; + +struct bnxt_re_mr { + struct bnxt_re_dev *rdev; + struct ib_mr ib_mr; + struct ib_umem *ib_umem; + struct bnxt_qplib_mrw qplib_mr; + u32 npages; + u64 *pages; + struct bnxt_qplib_frpl qplib_frpl; +}; + +struct bnxt_re_frpl { + struct bnxt_re_dev *rdev; + struct bnxt_qplib_frpl qplib_frpl; + u64 *page_list; +}; + +struct bnxt_re_fmr { + struct bnxt_re_dev *rdev; + struct ib_fmr ib_fmr; + struct bnxt_qplib_mrw qplib_fmr; +}; + +struct bnxt_re_mw { + struct bnxt_re_dev *rdev; + struct ib_mw ib_mw; + struct bnxt_qplib_mrw qplib_mw; +}; + +struct bnxt_re_ucontext { + struct bnxt_re_dev *rdev; + struct ib_ucontext ib_uctx; + struct bnxt_qplib_dpi *dpi; + void *shpg; + spinlock_t sh_lock; /* protect shpg */ +}; + +struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num); + +int bnxt_re_query_device(struct ib_device *ibdev, + struct ib_device_attr *ib_attr, + struct ib_udata *udata); +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify); +int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, + struct ib_port_attr *port_attr); +int bnxt_re_modify_port(struct ib_device *ibdev, u8 port_num, + int port_modify_mask, + struct ib_port_modify *port_modify); +int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable); +int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, + u16 index, u16 *pkey); +int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, + unsigned int index, void **context); +int bnxt_re_add_gid(struct ib_device *ibdev, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context); +int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, + int index, union ib_gid *gid); +enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, + u8 port_num); +struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int bnxt_re_dealloc_pd(struct ib_pd *pd); +struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr, + struct ib_udata *udata); +int bnxt_re_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); +int bnxt_re_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); +int bnxt_re_destroy_ah(struct ib_ah *ah); +struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); +int bnxt_re_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata 
*udata); +int bnxt_re_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int bnxt_re_destroy_qp(struct ib_qp *qp); +int bnxt_re_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr); +int bnxt_re_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); +struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int bnxt_re_destroy_cq(struct ib_cq *cq); +int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); +int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags); + +int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type mr_type, + u32 max_num_sg); +int bnxt_re_dereg_mr(struct ib_mr *mr); +struct ib_fmr *bnxt_re_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); +int bnxt_re_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len, + u64 iova); +int bnxt_re_unmap_fmr(struct list_head *fmr_list); +int bnxt_re_dealloc_fmr(struct ib_fmr *fmr); +struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); +struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +int bnxt_re_dealloc_ucontext(struct ib_ucontext *context); +int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); +#endif /* __BNXT_RE_IB_VERBS_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c new file mode 100644 index 000000000000..6b9f1178050f --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -0,0 +1,1315 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Main component of the bnxt_re driver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bnxt_ulp.h" +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" +#include "bnxt_re.h" +#include "ib_verbs.h" +#include +#include "bnxt.h" +static char version[] = + BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n"; + +MODULE_AUTHOR("Eddie Wai "); +MODULE_DESCRIPTION(BNXT_RE_DESC " Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(ROCE_DRV_MODULE_VERSION); + +/* globals */ +static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list); +/* Mutex to protect the list of bnxt_re devices added */ +static DEFINE_MUTEX(bnxt_re_dev_lock); +static struct workqueue_struct *bnxt_re_wq; + +/* for handling bnxt_en callbacks later */ +static void bnxt_re_stop(void *p) +{ +} + +static void bnxt_re_start(void *p) +{ +} + +static void bnxt_re_sriov_config(void *p, int num_vfs) +{ +} + +static struct bnxt_ulp_ops bnxt_re_ulp_ops = { + .ulp_async_notifier = NULL, + .ulp_stop = bnxt_re_stop, + .ulp_start = bnxt_re_start, + .ulp_sriov_config = bnxt_re_sriov_config +}; + +/* RoCE -> Net driver */ + +/* Driver registration routines used to let the networking driver (bnxt_en) + * to know that the RoCE driver is now installed + */ +static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait) +{ + struct bnxt_en_dev *en_dev; + int rc; + + if (!rdev) + return -EINVAL; + + en_dev = rdev->en_dev; + /* Acquire rtnl lock if it is not invokded from netdev event */ + if (lock_wait) + rtnl_lock(); + + rc = en_dev->en_ops->bnxt_unregister_device(rdev->en_dev, + BNXT_ROCE_ULP); + if (lock_wait) + rtnl_unlock(); + return rc; +} + +static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev; + int rc = 0; + + if (!rdev) + return -EINVAL; + + en_dev = rdev->en_dev; + + rtnl_lock(); + rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP, + &bnxt_re_ulp_ops, rdev); + rtnl_unlock(); + return rc; +} + +static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait) +{ + struct bnxt_en_dev *en_dev; + int rc; + + if (!rdev) + return -EINVAL; + + en_dev = rdev->en_dev; + + if (lock_wait) + rtnl_lock(); + + rc = en_dev->en_ops->bnxt_free_msix(rdev->en_dev, BNXT_ROCE_ULP); + + if (lock_wait) + rtnl_unlock(); + return rc; +} + +static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) +{ + int rc = 0, num_msix_want = BNXT_RE_MIN_MSIX, num_msix_got; + struct bnxt_en_dev *en_dev; + + if (!rdev) + return -EINVAL; + + en_dev = rdev->en_dev; + + rtnl_lock(); + num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP, + rdev->msix_entries, + num_msix_want); + if (num_msix_got < BNXT_RE_MIN_MSIX) { + rc = -EINVAL; + goto done; + } + if (num_msix_got != num_msix_want) { + dev_warn(rdev_to_dev(rdev), + "Requested %d MSI-X 
vectors, got %d\n", + num_msix_want, num_msix_got); + } + rdev->num_msix = num_msix_got; +done: + rtnl_unlock(); + return rc; +} + +static void bnxt_re_init_hwrm_hdr(struct bnxt_re_dev *rdev, struct input *hdr, + u16 opcd, u16 crid, u16 trid) +{ + hdr->req_type = cpu_to_le16(opcd); + hdr->cmpl_ring = cpu_to_le16(crid); + hdr->target_id = cpu_to_le16(trid); +} + +static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, + int msg_len, void *resp, int resp_max_len, + int timeout) +{ + fw_msg->msg = msg; + fw_msg->msg_len = msg_len; + fw_msg->resp = resp; + fw_msg->resp_max_len = resp_max_len; + fw_msg->timeout = timeout; +} + +static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, + bool lock_wait) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_ring_free_input req = {0}; + struct hwrm_ring_free_output resp; + struct bnxt_fw_msg fw_msg; + bool do_unlock = false; + int rc = -EINVAL; + + if (!en_dev) + return rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + if (lock_wait) { + rtnl_lock(); + do_unlock = true; + } + + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1); + req.ring_type = RING_ALLOC_REQ_RING_TYPE_CMPL; + req.ring_id = cpu_to_le16(fw_ring_id); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (rc) + dev_err(rdev_to_dev(rdev), + "Failed to free HW ring:%d :%#x", req.ring_id, rc); + if (do_unlock) + rtnl_unlock(); + return rc; +} + +static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr, + int pages, int type, u32 ring_mask, + u32 map_index, u16 *fw_ring_id) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_ring_alloc_input req = {0}; + struct hwrm_ring_alloc_output resp; + struct bnxt_fw_msg fw_msg; + int rc = -EINVAL; + + if (!en_dev) + return rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + rtnl_lock(); + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1); + req.enables = 0; + req.page_tbl_addr = cpu_to_le64(dma_arr[0]); + if (pages > 1) { + /* Page size is in log2 units */ + req.page_size = BNXT_PAGE_SHIFT; + req.page_tbl_depth = 1; + } + req.fbo = 0; + /* Association of ring index with doorbell index and MSIX number */ + req.logical_id = cpu_to_le16(map_index); + req.length = cpu_to_le32(ring_mask + 1); + req.ring_type = RING_ALLOC_REQ_RING_TYPE_CMPL; + req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX; + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (!rc) + *fw_ring_id = le16_to_cpu(resp.ring_id); + + rtnl_unlock(); + return rc; +} + +static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, + u32 fw_stats_ctx_id, bool lock_wait) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_stat_ctx_free_input req = {0}; + struct bnxt_fw_msg fw_msg; + bool do_unlock = false; + int rc = -EINVAL; + + if (!en_dev) + return rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + if (lock_wait) { + rtnl_lock(); + do_unlock = true; + } + + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1); + req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&req, + sizeof(req), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (rc) + dev_err(rdev_to_dev(rdev), + "Failed to free HW stats 
context %#x", rc); + + if (do_unlock) + rtnl_unlock(); + return rc; +} + +static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, + dma_addr_t dma_map, + u32 *fw_stats_ctx_id) +{ + struct hwrm_stat_ctx_alloc_output resp = {0}; + struct hwrm_stat_ctx_alloc_input req = {0}; + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct bnxt_fw_msg fw_msg; + int rc = -EINVAL; + + *fw_stats_ctx_id = INVALID_STATS_CTX_ID; + + if (!en_dev) + return rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + rtnl_lock(); + + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_ALLOC, -1, -1); + req.update_period_ms = cpu_to_le32(1000); + req.stats_dma_addr = cpu_to_le64(dma_map); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (!rc) + *fw_stats_ctx_id = le32_to_cpu(resp.stat_ctx_id); + + rtnl_unlock(); + return rc; +} + +/* Device */ + +static bool is_bnxt_re_dev(struct net_device *netdev) +{ + struct ethtool_drvinfo drvinfo; + + if (netdev->ethtool_ops && netdev->ethtool_ops->get_drvinfo) { + memset(&drvinfo, 0, sizeof(drvinfo)); + netdev->ethtool_ops->get_drvinfo(netdev, &drvinfo); + + if (strcmp(drvinfo.driver, "bnxt_en")) + return false; + return true; + } + return false; +} + +static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) +{ + struct bnxt_re_dev *rdev; + + rcu_read_lock(); + list_for_each_entry_rcu(rdev, &bnxt_re_dev_list, list) { + if (rdev->netdev == netdev) { + rcu_read_unlock(); + return rdev; + } + } + rcu_read_unlock(); + return NULL; +} + +static void bnxt_re_dev_unprobe(struct net_device *netdev, + struct bnxt_en_dev *en_dev) +{ + dev_put(netdev); + module_put(en_dev->pdev->driver->driver.owner); +} + +static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) +{ + struct bnxt *bp = netdev_priv(netdev); + struct bnxt_en_dev *en_dev; + struct pci_dev *pdev; + + /* Call bnxt_en's RoCE probe via indirect API */ + if (!bp->ulp_probe) + return ERR_PTR(-EINVAL); + + en_dev = bp->ulp_probe(netdev); + if (IS_ERR(en_dev)) + return en_dev; + + pdev = en_dev->pdev; + if (!pdev) + return ERR_PTR(-EINVAL); + + if (!(en_dev->flags & BNXT_EN_FLAG_ROCE_CAP)) { + dev_dbg(&pdev->dev, + "%s: probe error: RoCE is not supported on this device", + ROCE_DRV_MODULE_NAME); + return ERR_PTR(-ENODEV); + } + + /* Bump net device reference count */ + if (!try_module_get(pdev->driver->driver.owner)) + return ERR_PTR(-ENODEV); + + dev_hold(netdev); + + return en_dev; +} + +static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) +{ + ib_unregister_device(&rdev->ibdev); +} + +static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) +{ + struct ib_device *ibdev = &rdev->ibdev; + + /* ib device init */ + ibdev->owner = THIS_MODULE; + ibdev->node_type = RDMA_NODE_IB_CA; + strlcpy(ibdev->name, "bnxt_re%d", IB_DEVICE_NAME_MAX); + strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA", + strlen(BNXT_RE_DESC) + 5); + ibdev->phys_port_cnt = 1; + + bnxt_qplib_get_guid(rdev->netdev->dev_addr, (u8 *)&ibdev->node_guid); + + ibdev->num_comp_vectors = 1; + ibdev->dma_device = &rdev->en_dev->pdev->dev; + ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY; + + /* User space */ + ibdev->uverbs_abi_ver = BNXT_RE_ABI_VERSION; + ibdev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << 
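+	/*
+	 * Each bit set in uverbs_cmd_mask advertises one userspace verb;
+	 * the IB core refuses any command whose bit is clear before it
+	 * ever reaches this driver.
+	 */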
IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH); + /* POLL_CQ and REQ_NOTIFY_CQ is directly handled in libbnxt_re */ + + /* Kernel verbs */ + ibdev->query_device = bnxt_re_query_device; + ibdev->modify_device = bnxt_re_modify_device; + + ibdev->query_port = bnxt_re_query_port; + ibdev->modify_port = bnxt_re_modify_port; + ibdev->get_port_immutable = bnxt_re_get_port_immutable; + ibdev->query_pkey = bnxt_re_query_pkey; + ibdev->query_gid = bnxt_re_query_gid; + ibdev->get_netdev = bnxt_re_get_netdev; + ibdev->add_gid = bnxt_re_add_gid; + ibdev->del_gid = bnxt_re_del_gid; + ibdev->get_link_layer = bnxt_re_get_link_layer; + + ibdev->alloc_pd = bnxt_re_alloc_pd; + ibdev->dealloc_pd = bnxt_re_dealloc_pd; + + ibdev->create_ah = bnxt_re_create_ah; + ibdev->modify_ah = bnxt_re_modify_ah; + ibdev->query_ah = bnxt_re_query_ah; + ibdev->destroy_ah = bnxt_re_destroy_ah; + + ibdev->create_qp = bnxt_re_create_qp; + ibdev->modify_qp = bnxt_re_modify_qp; + ibdev->query_qp = bnxt_re_query_qp; + ibdev->destroy_qp = bnxt_re_destroy_qp; + + ibdev->post_send = bnxt_re_post_send; + ibdev->post_recv = bnxt_re_post_recv; + + ibdev->create_cq = bnxt_re_create_cq; + ibdev->destroy_cq = bnxt_re_destroy_cq; + ibdev->poll_cq = bnxt_re_poll_cq; + ibdev->req_notify_cq = bnxt_re_req_notify_cq; + + ibdev->get_dma_mr = bnxt_re_get_dma_mr; + ibdev->dereg_mr = bnxt_re_dereg_mr; + ibdev->alloc_mr = bnxt_re_alloc_mr; + ibdev->map_mr_sg = bnxt_re_map_mr_sg; + ibdev->alloc_fmr = bnxt_re_alloc_fmr; + ibdev->map_phys_fmr = bnxt_re_map_phys_fmr; + ibdev->unmap_fmr = bnxt_re_unmap_fmr; + ibdev->dealloc_fmr = bnxt_re_dealloc_fmr; + + ibdev->reg_user_mr = bnxt_re_reg_user_mr; + ibdev->alloc_ucontext = bnxt_re_alloc_ucontext; + ibdev->dealloc_ucontext = bnxt_re_dealloc_ucontext; + ibdev->mmap = bnxt_re_mmap; + + return ib_register_device(ibdev, NULL); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->dev_attr.fw_ver); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); +} + +static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL); +static DEVICE_ATTR(fw_rev, 0444, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, 0444, show_hca, NULL); + +static struct device_attribute *bnxt_re_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_rev, 
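+	/*
+	 * Created against ibdev.dev after IB registration, so these
+	 * should surface as hw_rev, fw_rev and hca_type under
+	 * /sys/class/infiniband/bnxt_re<N>/.
+	 */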
+ &dev_attr_hca_type +}; + +static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev) +{ + dev_put(rdev->netdev); + rdev->netdev = NULL; + + mutex_lock(&bnxt_re_dev_lock); + list_del_rcu(&rdev->list); + mutex_unlock(&bnxt_re_dev_lock); + + synchronize_rcu(); + flush_workqueue(bnxt_re_wq); + + ib_dealloc_device(&rdev->ibdev); + /* rdev is gone */ +} + +static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, + struct bnxt_en_dev *en_dev) +{ + struct bnxt_re_dev *rdev; + + /* Allocate bnxt_re_dev instance here */ + rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev)); + if (!rdev) { + dev_err(NULL, "%s: bnxt_re_dev allocation failure!", + ROCE_DRV_MODULE_NAME); + return NULL; + } + /* Default values */ + rdev->netdev = netdev; + dev_hold(rdev->netdev); + rdev->en_dev = en_dev; + rdev->id = rdev->en_dev->pdev->devfn; + INIT_LIST_HEAD(&rdev->qp_list); + mutex_init(&rdev->qp_lock); + atomic_set(&rdev->qp_count, 0); + atomic_set(&rdev->cq_count, 0); + atomic_set(&rdev->srq_count, 0); + atomic_set(&rdev->mr_count, 0); + atomic_set(&rdev->mw_count, 0); + rdev->cosq[0] = 0xFFFF; + rdev->cosq[1] = 0xFFFF; + + mutex_lock(&bnxt_re_dev_lock); + list_add_tail_rcu(&rdev->list, &bnxt_re_dev_list); + mutex_unlock(&bnxt_re_dev_lock); + return rdev; +} + +static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw, + struct creq_func_event *aeqe) +{ + switch (aeqe->event) { + case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CQ_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TQM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TIM_ERROR: + break; + default: + return -EINVAL; + } + return 0; +} + +static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *handle) +{ + struct bnxt_re_cq *cq = container_of(handle, struct bnxt_re_cq, + qplib_cq); + + if (!cq) { + dev_err(NULL, "%s: CQ is NULL, CQN not handled", + ROCE_DRV_MODULE_NAME); + return -EINVAL; + } + if (cq->ib_cq.comp_handler) { + /* Lock comp_handler? 
*/ + (*cq->ib_cq.comp_handler)(&cq->ib_cq, cq->ib_cq.cq_context); + } + + return 0; +} + +static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) +{ + if (rdev->nq.hwq.max_elements) + bnxt_qplib_disable_nq(&rdev->nq); + + if (rdev->qplib_res.rcfw) + bnxt_qplib_cleanup_res(&rdev->qplib_res); +} + +static int bnxt_re_init_res(struct bnxt_re_dev *rdev) +{ + int rc = 0; + + bnxt_qplib_init_res(&rdev->qplib_res); + + if (rdev->msix_entries[BNXT_RE_NQ_IDX].vector <= 0) + return -EINVAL; + + rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq, + rdev->msix_entries[BNXT_RE_NQ_IDX].vector, + rdev->msix_entries[BNXT_RE_NQ_IDX].db_offset, + &bnxt_re_cqn_handler, + NULL); + + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to enable NQ: %#x", rc); + + return rc; +} + +static void bnxt_re_free_res(struct bnxt_re_dev *rdev, bool lock_wait) +{ + if (rdev->nq.hwq.max_elements) { + bnxt_re_net_ring_free(rdev, rdev->nq.ring_id, lock_wait); + bnxt_qplib_free_nq(&rdev->nq); + } + if (rdev->qplib_res.dpi_tbl.max) { + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, + &rdev->qplib_res.dpi_tbl, + &rdev->dpi_privileged); + } + if (rdev->qplib_res.rcfw) { + bnxt_qplib_free_res(&rdev->qplib_res); + rdev->qplib_res.rcfw = NULL; + } +} + +static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) +{ + int rc = 0; + + /* Configure and allocate resources for qplib */ + rdev->qplib_res.rcfw = &rdev->rcfw; + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_res(&rdev->qplib_res, rdev->en_dev->pdev, + rdev->netdev, &rdev->dev_attr); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_dpi(&rdev->qplib_res.dpi_tbl, + &rdev->dpi_privileged, + rdev); + if (rc) + goto fail; + + rdev->nq.hwq.max_elements = BNXT_RE_MAX_CQ_COUNT + + BNXT_RE_MAX_SRQC_COUNT + 2; + rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to allocate NQ memory: %#x", rc); + goto fail; + } + rc = bnxt_re_net_ring_alloc + (rdev, rdev->nq.hwq.pbl[PBL_LVL_0].pg_map_arr, + rdev->nq.hwq.pbl[rdev->nq.hwq.level].pg_count, + HWRM_RING_ALLOC_CMPL, BNXT_QPLIB_NQE_MAX_CNT - 1, + rdev->msix_entries[BNXT_RE_NQ_IDX].ring_idx, + &rdev->nq.ring_id); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to allocate NQ ring: %#x", rc); + goto free_nq; + } + return 0; +free_nq: + bnxt_qplib_free_nq(&rdev->nq); +fail: + rdev->qplib_res.rcfw = NULL; + return rc; +} + +static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp, + u8 port_num, enum ib_event_type event) +{ + struct ib_event ib_event; + + ib_event.device = ibdev; + if (qp) + ib_event.element.qp = qp; + else + ib_event.element.port_num = port_num; + ib_event.event = event; + ib_dispatch_event(&ib_event); +} + +#define HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN 0x02 +static int bnxt_re_query_hwrm_pri2cos(struct bnxt_re_dev *rdev, u8 dir, + u64 *cid_map) +{ + struct hwrm_queue_pri2cos_qcfg_input req = {0}; + struct bnxt *bp = netdev_priv(rdev->netdev); + struct hwrm_queue_pri2cos_qcfg_output resp; + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct bnxt_fw_msg fw_msg; + u32 flags = 0; + u8 *qcfgmap, *tmp_map; + int rc = 0, i; + + if (!cid_map) + return -EINVAL; + + memset(&fw_msg, 0, sizeof(fw_msg)); + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, + HWRM_QUEUE_PRI2COS_QCFG, -1, -1); + flags |= (dir & 0x01); + flags |= HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN; + req.flags = cpu_to_le32(flags); + req.port_id = bp->pf.port_id; + + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + 
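+		/*
+		 * Firmware access throughout this file follows one HWRM
+		 * pattern: bnxt_re_init_hwrm_hdr() stamps the opcode,
+		 * bnxt_re_fill_fw_msg() wraps the request/response buffers,
+		 * and bnxt_en relays the message via bnxt_send_fw_msg().
+		 */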
sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (rc) + return rc; + + if (resp.queue_cfg_info) { + dev_warn(rdev_to_dev(rdev), + "Asymmetric cos queue configuration detected"); + dev_warn(rdev_to_dev(rdev), + " on device, QoS may not be fully functional\n"); + } + qcfgmap = &resp.pri0_cos_queue_id; + tmp_map = (u8 *)cid_map; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + tmp_map[i] = qcfgmap[i]; + + return rc; +} + +static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev, + struct bnxt_re_qp *qp) +{ + return (qp->ib_qp.qp_type == IB_QPT_GSI) || (qp == rdev->qp1_sqp); +} + +static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) +{ + int mask = IB_QP_STATE; + struct ib_qp_attr qp_attr; + struct bnxt_re_qp *qp; + + qp_attr.qp_state = IB_QPS_ERR; + mutex_lock(&rdev->qp_lock); + list_for_each_entry(qp, &rdev->qp_list, list) { + /* Modify the state of all QPs except QP1/Shadow QP */ + if (!bnxt_re_is_qp1_or_shadow_qp(rdev, qp)) { + if (qp->qplib_qp.state != + CMDQ_MODIFY_QP_NEW_STATE_RESET && + qp->qplib_qp.state != + CMDQ_MODIFY_QP_NEW_STATE_ERR) { + bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp, + 1, IB_EVENT_QP_FATAL); + bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask, + NULL); + } + } + } + mutex_unlock(&rdev->qp_lock); +} + +static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev) +{ + u32 prio_map = 0, tmp_map = 0; + struct net_device *netdev; + struct dcb_app app; + + netdev = rdev->netdev; + + memset(&app, 0, sizeof(app)); + app.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE; + app.protocol = ETH_P_IBOE; + tmp_map = dcb_ieee_getapp_mask(netdev, &app); + prio_map = tmp_map; + + app.selector = IEEE_8021QAZ_APP_SEL_DGRAM; + app.protocol = ROCE_V2_UDP_DPORT; + tmp_map = dcb_ieee_getapp_mask(netdev, &app); + prio_map |= tmp_map; + + if (!prio_map) + prio_map = -EFAULT; + return prio_map; +} + +static void bnxt_re_parse_cid_map(u8 prio_map, u8 *cid_map, u16 *cosq) +{ + u16 prio; + u8 id; + + for (prio = 0, id = 0; prio < 8; prio++) { + if (prio_map & (1 << prio)) { + cosq[id] = cid_map[prio]; + id++; + if (id == 2) /* Max 2 tcs supported */ + break; + } + } +} + +static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) +{ + u8 prio_map = 0; + u64 cid_map; + int rc; + + /* Get priority for roce */ + rc = bnxt_re_get_priority_mask(rdev); + if (rc < 0) + return rc; + prio_map = (u8)rc; + + if (prio_map == rdev->cur_prio_map) + return 0; + rdev->cur_prio_map = prio_map; + /* Get cosq id for this priority */ + rc = bnxt_re_query_hwrm_pri2cos(rdev, 0, &cid_map); + if (rc) { + dev_warn(rdev_to_dev(rdev), "no cos for p_mask %x\n", prio_map); + return rc; + } + /* Parse CoS IDs for app priority */ + bnxt_re_parse_cid_map(prio_map, (u8 *)&cid_map, rdev->cosq); + + /* Config BONO. 
*/ + rc = bnxt_qplib_map_tc2cos(&rdev->qplib_res, rdev->cosq); + if (rc) { + dev_warn(rdev_to_dev(rdev), "no tc for cos{%x, %x}\n", + rdev->cosq[0], rdev->cosq[1]); + return rc; + } + + return 0; +} + +static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait) +{ + int i, rc; + + if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) { + for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) + device_remove_file(&rdev->ibdev.dev, + bnxt_re_attributes[i]); + /* Cleanup ib dev */ + bnxt_re_unregister_ib(rdev); + } + if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) + cancel_delayed_work(&rdev->worker); + + bnxt_re_cleanup_res(rdev); + bnxt_re_free_res(rdev, lock_wait); + + if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { + rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw); + if (rc) + dev_warn(rdev_to_dev(rdev), + "Failed to deinitialize RCFW: %#x", rc); + bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, + lock_wait); + bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); + bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, lock_wait); + bnxt_qplib_free_rcfw_channel(&rdev->rcfw); + } + if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) { + rc = bnxt_re_free_msix(rdev, lock_wait); + if (rc) + dev_warn(rdev_to_dev(rdev), + "Failed to free MSI-X vectors: %#x", rc); + } + if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) { + rc = bnxt_re_unregister_netdev(rdev, lock_wait); + if (rc) + dev_warn(rdev_to_dev(rdev), + "Failed to unregister with netdev: %#x", rc); + } +} + +static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) +{ + u32 i; + + rdev->qplib_ctx.qpc_count = BNXT_RE_MAX_QPC_COUNT; + rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT; + rdev->qplib_ctx.srqc_count = BNXT_RE_MAX_SRQC_COUNT; + rdev->qplib_ctx.cq_count = BNXT_RE_MAX_CQ_COUNT; + for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) + rdev->qplib_ctx.tqm_count[i] = + rdev->dev_attr.tqm_alloc_reqs[i]; +} + +/* worker thread for polling periodic events. 
Now used for QoS programming*/ +static void bnxt_re_worker(struct work_struct *work) +{ + struct bnxt_re_dev *rdev = container_of(work, struct bnxt_re_dev, + worker.work); + + bnxt_re_setup_qos(rdev); + schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); +} + +static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) +{ + int i, j, rc; + + /* Registered a new RoCE device instance to netdev */ + rc = bnxt_re_register_netdev(rdev); + if (rc) { + pr_err("Failed to register with netedev: %#x\n", rc); + return -EINVAL; + } + set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + + rc = bnxt_re_request_msix(rdev); + if (rc) { + pr_err("Failed to get MSI-X vectors: %#x\n", rc); + rc = -EINVAL; + goto fail; + } + set_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags); + + /* Establish RCFW Communication Channel to initialize the context + * memory for the function and all child VFs + */ + rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw); + if (rc) + goto fail; + + rc = bnxt_re_net_ring_alloc + (rdev, rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr, + rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count, + HWRM_RING_ALLOC_CMPL, BNXT_QPLIB_CREQE_MAX_CNT - 1, + rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx, + &rdev->rcfw.creq_ring_id); + if (rc) { + pr_err("Failed to allocate CREQ: %#x\n", rc); + goto free_rcfw; + } + rc = bnxt_qplib_enable_rcfw_channel + (rdev->en_dev->pdev, &rdev->rcfw, + rdev->msix_entries[BNXT_RE_AEQ_IDX].vector, + rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset, + 0, &bnxt_re_aeq_handler); + if (rc) { + pr_err("Failed to enable RCFW channel: %#x\n", rc); + goto free_ring; + } + + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + if (rc) + goto disable_rcfw; + bnxt_re_set_resource_limits(rdev); + + rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0); + if (rc) { + pr_err("Failed to allocate QPLIB context: %#x\n", rc); + goto disable_rcfw; + } + rc = bnxt_re_net_stats_ctx_alloc(rdev, + rdev->qplib_ctx.stats.dma_map, + &rdev->qplib_ctx.stats.fw_id); + if (rc) { + pr_err("Failed to allocate stats context: %#x\n", rc); + goto free_ctx; + } + + rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx, 0); + if (rc) { + pr_err("Failed to initialize RCFW: %#x\n", rc); + goto free_sctx; + } + set_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags); + + /* Resources based on the 'new' device caps */ + rc = bnxt_re_alloc_res(rdev); + if (rc) { + pr_err("Failed to allocate resources: %#x\n", rc); + goto fail; + } + rc = bnxt_re_init_res(rdev); + if (rc) { + pr_err("Failed to initialize resources: %#x\n", rc); + goto fail; + } + + rc = bnxt_re_setup_qos(rdev); + if (rc) + pr_info("RoCE priority not yet configured\n"); + + INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); + set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); + schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); + + /* Register ib dev */ + rc = bnxt_re_register_ib(rdev); + if (rc) { + pr_err("Failed to register with IB: %#x\n", rc); + goto fail; + } + dev_info(rdev_to_dev(rdev), "Device registered successfully"); + for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) { + rc = device_create_file(&rdev->ibdev.dev, + bnxt_re_attributes[i]); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to create IB sysfs: %#x", rc); + /* Must clean up all created device files */ + for (j = 0; j < i; j++) + device_remove_file(&rdev->ibdev.dev, + bnxt_re_attributes[j]); + bnxt_re_unregister_ib(rdev); + goto fail; + } + } + set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); + 
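+	/*
+	 * Bring-up above is strictly ordered: netdev registration, MSI-X,
+	 * RCFW channel plus CREQ ring, context and stats memory, NQ
+	 * resources, QoS, IB registration, then sysfs. Each error label
+	 * unwinds only the steps completed so far, and bnxt_re_ib_unreg()
+	 * repeats the same walk in reverse, keyed off the BNXT_RE_FLAG_*
+	 * bits.
+	 */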
bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE); + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_GID_CHANGE); + + return 0; +free_sctx: + bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, true); +free_ctx: + bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); +disable_rcfw: + bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); +free_ring: + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, true); +free_rcfw: + bnxt_qplib_free_rcfw_channel(&rdev->rcfw); +fail: + bnxt_re_ib_unreg(rdev, true); + return rc; +} + +static void bnxt_re_dev_unreg(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct net_device *netdev = rdev->netdev; + + bnxt_re_dev_remove(rdev); + + if (netdev) + bnxt_re_dev_unprobe(netdev, en_dev); +} + +static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev) +{ + struct bnxt_en_dev *en_dev; + int rc = 0; + + if (!is_bnxt_re_dev(netdev)) + return -ENODEV; + + en_dev = bnxt_re_dev_probe(netdev); + if (IS_ERR(en_dev)) { + if (en_dev != ERR_PTR(-ENODEV)) + pr_err("%s: Failed to probe\n", ROCE_DRV_MODULE_NAME); + rc = PTR_ERR(en_dev); + goto exit; + } + *rdev = bnxt_re_dev_add(netdev, en_dev); + if (!*rdev) { + rc = -ENOMEM; + bnxt_re_dev_unprobe(netdev, en_dev); + goto exit; + } +exit: + return rc; +} + +static void bnxt_re_remove_one(struct bnxt_re_dev *rdev) +{ + pci_dev_put(rdev->en_dev->pdev); +} + +/* Handle all deferred netevents tasks */ +static void bnxt_re_task(struct work_struct *work) +{ + struct bnxt_re_work *re_work; + struct bnxt_re_dev *rdev; + int rc = 0; + + re_work = container_of(work, struct bnxt_re_work, work); + rdev = re_work->rdev; + + if (re_work->event != NETDEV_REGISTER && + !test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) + return; + + switch (re_work->event) { + case NETDEV_REGISTER: + rc = bnxt_re_ib_reg(rdev); + if (rc) + dev_err(rdev_to_dev(rdev), + "Failed to register with IB: %#x", rc); + break; + case NETDEV_UP: + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, + IB_EVENT_PORT_ACTIVE); + break; + case NETDEV_DOWN: + bnxt_re_dev_stop(rdev); + break; + case NETDEV_CHANGE: + if (!netif_carrier_ok(rdev->netdev)) + bnxt_re_dev_stop(rdev); + else if (netif_carrier_ok(rdev->netdev)) + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, + IB_EVENT_PORT_ACTIVE); + break; + default: + break; + } + kfree(re_work); +} + +static void bnxt_re_init_one(struct bnxt_re_dev *rdev) +{ + pci_dev_get(rdev->en_dev->pdev); +} + +/* + * "Notifier chain callback can be invoked for the same chain from + * different CPUs at the same time". + * + * For cases when the netdev is already present, our call to the + * register_netdevice_notifier() will actually get the rtnl_lock() + * before sending NETDEV_REGISTER and (if up) NETDEV_UP + * events. + * + * But for cases when the netdev is not already present, the notifier + * chain is subjected to be invoked from different CPUs simultaneously. + * + * This is protected by the netdev_mutex. 
+ */ +static int bnxt_re_netdev_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct net_device *real_dev, *netdev = netdev_notifier_info_to_dev(ptr); + struct bnxt_re_work *re_work; + struct bnxt_re_dev *rdev; + int rc = 0; + bool sch_work = false; + + real_dev = rdma_vlan_dev_real_dev(netdev); + if (!real_dev) + real_dev = netdev; + + rdev = bnxt_re_from_netdev(real_dev); + if (!rdev && event != NETDEV_REGISTER) + goto exit; + if (real_dev != netdev) + goto exit; + + switch (event) { + case NETDEV_REGISTER: + if (rdev) + break; + rc = bnxt_re_dev_reg(&rdev, real_dev); + if (rc == -ENODEV) + break; + if (rc) { + pr_err("Failed to register with the device %s: %#x\n", + real_dev->name, rc); + break; + } + bnxt_re_init_one(rdev); + sch_work = true; + break; + + case NETDEV_UNREGISTER: + bnxt_re_ib_unreg(rdev, false); + bnxt_re_remove_one(rdev); + bnxt_re_dev_unreg(rdev); + break; + + default: + sch_work = true; + break; + } + if (sch_work) { + /* Allocate for the deferred task */ + re_work = kzalloc(sizeof(*re_work), GFP_ATOMIC); + if (re_work) { + re_work->rdev = rdev; + re_work->event = event; + re_work->vlan_dev = (real_dev == netdev ? + NULL : netdev); + INIT_WORK(&re_work->work, bnxt_re_task); + queue_work(bnxt_re_wq, &re_work->work); + } + } + +exit: + return NOTIFY_DONE; +} + +static struct notifier_block bnxt_re_netdev_notifier = { + .notifier_call = bnxt_re_netdev_event +}; + +static int __init bnxt_re_mod_init(void) +{ + int rc = 0; + + pr_info("%s: %s", ROCE_DRV_MODULE_NAME, version); + + bnxt_re_wq = create_singlethread_workqueue("bnxt_re"); + if (!bnxt_re_wq) + return -ENOMEM; + + INIT_LIST_HEAD(&bnxt_re_dev_list); + + rc = register_netdevice_notifier(&bnxt_re_netdev_notifier); + if (rc) { + pr_err("%s: Cannot register to netdevice_notifier", + ROCE_DRV_MODULE_NAME); + goto err_netdev; + } + return 0; + +err_netdev: + destroy_workqueue(bnxt_re_wq); + + return rc; +} + +static void __exit bnxt_re_mod_exit(void) +{ + unregister_netdevice_notifier(&bnxt_re_netdev_notifier); + if (bnxt_re_wq) + destroy_workqueue(bnxt_re_wq); +} + +module_init(bnxt_re_mod_init); +module_exit(bnxt_re_mod_exit); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c new file mode 100644 index 000000000000..43d08b5e9085 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -0,0 +1,2167 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Fast Path Operators + */ + +#include +#include +#include +#include +#include +#include + +#include "roce_hsi.h" + +#include "qplib_res.h" +#include "qplib_rcfw.h" +#include "qplib_sp.h" +#include "qplib_fp.h" + +static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq); + +static void bnxt_qplib_free_qp_hdr_buf(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_q *rq = &qp->rq; + struct bnxt_qplib_q *sq = &qp->sq; + + if (qp->rq_hdr_buf) + dma_free_coherent(&res->pdev->dev, + rq->hwq.max_elements * qp->rq_hdr_buf_size, + qp->rq_hdr_buf, qp->rq_hdr_buf_map); + if (qp->sq_hdr_buf) + dma_free_coherent(&res->pdev->dev, + sq->hwq.max_elements * qp->sq_hdr_buf_size, + qp->sq_hdr_buf, qp->sq_hdr_buf_map); + qp->rq_hdr_buf = NULL; + qp->sq_hdr_buf = NULL; + qp->rq_hdr_buf_map = 0; + qp->sq_hdr_buf_map = 0; + qp->sq_hdr_buf_size = 0; + qp->rq_hdr_buf_size = 0; +} + +static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_q *rq = &qp->rq; + struct bnxt_qplib_q *sq = &qp->rq; + int rc = 0; + + if (qp->sq_hdr_buf_size && sq->hwq.max_elements) { + qp->sq_hdr_buf = dma_alloc_coherent(&res->pdev->dev, + sq->hwq.max_elements * + qp->sq_hdr_buf_size, + &qp->sq_hdr_buf_map, GFP_KERNEL); + if (!qp->sq_hdr_buf) { + rc = -ENOMEM; + dev_err(&res->pdev->dev, + "QPLIB: Failed to create sq_hdr_buf"); + goto fail; + } + } + + if (qp->rq_hdr_buf_size && rq->hwq.max_elements) { + qp->rq_hdr_buf = dma_alloc_coherent(&res->pdev->dev, + rq->hwq.max_elements * + qp->rq_hdr_buf_size, + &qp->rq_hdr_buf_map, + GFP_KERNEL); + if (!qp->rq_hdr_buf) { + rc = -ENOMEM; + dev_err(&res->pdev->dev, + "QPLIB: Failed to create rq_hdr_buf"); + goto fail; + } + } + return 0; + +fail: + bnxt_qplib_free_qp_hdr_buf(res, qp); + return rc; +} + +static void bnxt_qplib_service_nq(unsigned long data) +{ + struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data; + struct bnxt_qplib_hwq *hwq = &nq->hwq; + struct nq_base *nqe, **nq_ptr; + int num_cqne_processed = 0; + u32 sw_cons, raw_cons; + u16 type; + int budget = nq->budget; + u64 q_handle; + + /* Service the NQ until empty */ + raw_cons = hwq->cons; + while (budget--) { + sw_cons = HWQ_CMP(raw_cons, hwq); + nq_ptr = (struct nq_base **)hwq->pbl_ptr; + nqe = &nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)]; + if (!NQE_CMP_VALID(nqe, raw_cons, hwq->max_elements)) + break; + + type = le16_to_cpu(nqe->info10_type) & NQ_BASE_TYPE_MASK; + switch (type) { + case NQ_BASE_TYPE_CQ_NOTIFICATION: + { + struct nq_cn *nqcne = (struct nq_cn *)nqe; + + q_handle = le32_to_cpu(nqcne->cq_handle_low); + q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high) + << 32; + bnxt_qplib_arm_cq_enable((struct bnxt_qplib_cq *) + ((unsigned 
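+			/*
+			 * cq_handle_low/high carry the driver's own
+			 * bnxt_qplib_cq pointer, split into two 32-bit
+			 * words when the CQ was created; reassembled here,
+			 * it is simply cast back to that pointer.
+			 */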
long)q_handle)); + if (!nq->cqn_handler(nq, (struct bnxt_qplib_cq *) + ((unsigned long)q_handle))) + num_cqne_processed++; + else + dev_warn(&nq->pdev->dev, + "QPLIB: cqn - type 0x%x not handled", + type); + break; + } + case NQ_BASE_TYPE_DBQ_EVENT: + break; + default: + dev_warn(&nq->pdev->dev, + "QPLIB: nqe with type = 0x%x not handled", + type); + break; + } + raw_cons++; + } + if (hwq->cons != raw_cons) { + hwq->cons = raw_cons; + NQ_DB_REARM(nq->bar_reg_iomem, hwq->cons, hwq->max_elements); + } +} + +static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance) +{ + struct bnxt_qplib_nq *nq = dev_instance; + struct bnxt_qplib_hwq *hwq = &nq->hwq; + struct nq_base **nq_ptr; + u32 sw_cons; + + /* Prefetch the NQ element */ + sw_cons = HWQ_CMP(hwq->cons, hwq); + nq_ptr = (struct nq_base **)nq->hwq.pbl_ptr; + prefetch(&nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)]); + + /* Fan out to CPU affinitized kthreads? */ + tasklet_schedule(&nq->worker); + + return IRQ_HANDLED; +} + +void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) +{ + /* Make sure the HW is stopped! */ + synchronize_irq(nq->vector); + tasklet_disable(&nq->worker); + tasklet_kill(&nq->worker); + + if (nq->requested) { + free_irq(nq->vector, nq); + nq->requested = false; + } + if (nq->bar_reg_iomem) + iounmap(nq->bar_reg_iomem); + nq->bar_reg_iomem = NULL; + + nq->cqn_handler = NULL; + nq->srqn_handler = NULL; + nq->vector = 0; +} + +int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, + int msix_vector, int bar_reg_offset, + int (*cqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *), + int (*srqn_handler)(struct bnxt_qplib_nq *nq, + void *, u8 event)) +{ + resource_size_t nq_base; + int rc; + + nq->pdev = pdev; + nq->vector = msix_vector; + + nq->cqn_handler = cqn_handler; + + nq->srqn_handler = srqn_handler; + + tasklet_init(&nq->worker, bnxt_qplib_service_nq, (unsigned long)nq); + + nq->requested = false; + rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, "bnxt_qplib_nq", nq); + if (rc) { + dev_err(&nq->pdev->dev, + "Failed to request IRQ for NQ: %#x", rc); + bnxt_qplib_disable_nq(nq); + goto fail; + } + nq->requested = true; + nq->bar_reg = NQ_CONS_PCI_BAR_REGION; + nq->bar_reg_off = bar_reg_offset; + nq_base = pci_resource_start(pdev, nq->bar_reg); + if (!nq_base) { + rc = -ENOMEM; + goto fail; + } + nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 4); + if (!nq->bar_reg_iomem) { + rc = -ENOMEM; + goto fail; + } + NQ_DB_REARM(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements); + + return 0; +fail: + bnxt_qplib_disable_nq(nq); + return rc; +} + +void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq) +{ + if (nq->hwq.max_elements) + bnxt_qplib_free_hwq(nq->pdev, &nq->hwq); +} + +int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq) +{ + nq->pdev = pdev; + if (!nq->hwq.max_elements || + nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT) + nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT; + + if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL, 0, + &nq->hwq.max_elements, + BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_L2_CMPL)) + return -ENOMEM; + + nq->budget = 8; + return 0; +} + +/* QP */ +int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_qp1 req; + struct creq_create_qp1_resp *resp; + struct bnxt_qplib_pbl *pbl; + struct bnxt_qplib_q *sq = &qp->sq; + struct bnxt_qplib_q *rq = &qp->rq; + int rc; + u16 cmd_flags = 0; + u32 qp_flags = 0; + + 
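+	/*
+	 * Every verb in this file talks to firmware the same way:
+	 * RCFW_CMD_PREP() fills the command header (opcode and cookie),
+	 * bnxt_qplib_rcfw_send_message() posts it on the CMDQ, and
+	 * bnxt_qplib_rcfw_wait_for_resp() blocks until a CREQ completion
+	 * carrying the matching cookie arrives or the wait times out.
+	 */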
RCFW_CMD_PREP(req, CREATE_QP1, cmd_flags); + + /* General */ + req.type = qp->type; + req.dpi = cpu_to_le32(qp->dpi->dpi); + req.qp_handle = cpu_to_le64(qp->qp_handle); + + /* SQ */ + sq->hwq.max_elements = sq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, NULL, 0, + &sq->hwq.max_elements, + BNXT_QPLIB_MAX_SQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + sq->swq = kcalloc(sq->hwq.max_elements, sizeof(*sq->swq), GFP_KERNEL); + if (!sq->swq) { + rc = -ENOMEM; + goto fail_sq; + } + pbl = &sq->hwq.pbl[PBL_LVL_0]; + req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.sq_pg_size_sq_lvl = + ((sq->hwq.level & CMDQ_CREATE_QP1_SQ_LVL_MASK) + << CMDQ_CREATE_QP1_SQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_1G : + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K); + + if (qp->scq) + req.scq_cid = cpu_to_le32(qp->scq->id); + + qp_flags |= CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE; + + /* RQ */ + if (rq->max_wqe) { + rq->hwq.max_elements = qp->rq.max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, NULL, 0, + &rq->hwq.max_elements, + BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto fail_sq; + + rq->swq = kcalloc(rq->hwq.max_elements, sizeof(*rq->swq), + GFP_KERNEL); + if (!rq->swq) { + rc = -ENOMEM; + goto fail_rq; + } + pbl = &rq->hwq.pbl[PBL_LVL_0]; + req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.rq_pg_size_rq_lvl = + ((rq->hwq.level & CMDQ_CREATE_QP1_RQ_LVL_MASK) << + CMDQ_CREATE_QP1_RQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? 
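+			/*
+			 * The ?: ladder maps the PBL's backing page size
+			 * (4K/8K/64K/2M/8M/1G) onto its firmware encoding,
+			 * falling back to 4K for anything unrecognized.
+			 */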
+ CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_1G : + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K); + if (qp->rcq) + req.rcq_cid = cpu_to_le32(qp->rcq->id); + } + + /* Header buffer - allow hdr_buf pass in */ + rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp); + if (rc) { + rc = -ENOMEM; + goto fail; + } + req.qp_flags = cpu_to_le32(qp_flags); + req.sq_size = cpu_to_le32(sq->hwq.max_elements); + req.rq_size = cpu_to_le32(rq->hwq.max_elements); + + req.sq_fwo_sq_sge = + cpu_to_le16((sq->max_sge & CMDQ_CREATE_QP1_SQ_SGE_MASK) << + CMDQ_CREATE_QP1_SQ_SGE_SFT); + req.rq_fwo_rq_sge = + cpu_to_le16((rq->max_sge & CMDQ_CREATE_QP1_RQ_SGE_MASK) << + CMDQ_CREATE_QP1_RQ_SGE_SFT); + + req.pd_id = cpu_to_le32(qp->pd->id); + + resp = (struct creq_create_qp1_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&res->pdev->dev, "QPLIB: FP: CREATE_QP1 send failed"); + rc = -EINVAL; + goto fail; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP1 timed out"); + rc = -ETIMEDOUT; + goto fail; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP1 failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + rc = -EINVAL; + goto fail; + } + qp->id = le32_to_cpu(resp->xid); + qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET; + sq->flush_in_progress = false; + rq->flush_in_progress = false; + + return 0; + +fail: + bnxt_qplib_free_qp_hdr_buf(res, qp); +fail_rq: + bnxt_qplib_free_hwq(res->pdev, &rq->hwq); + kfree(rq->swq); +fail_sq: + bnxt_qplib_free_hwq(res->pdev, &sq->hwq); + kfree(sq->swq); +exit: + return rc; +} + +int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr; + struct cmdq_create_qp req; + struct creq_create_qp_resp *resp; + struct bnxt_qplib_pbl *pbl; + struct sq_psn_search **psn_search_ptr; + unsigned long int psn_search, poff = 0; + struct bnxt_qplib_q *sq = &qp->sq; + struct bnxt_qplib_q *rq = &qp->rq; + struct bnxt_qplib_hwq *xrrq; + int i, rc, req_size, psn_sz; + u16 cmd_flags = 0, max_ssge; + u32 sw_prod, qp_flags = 0; + + RCFW_CMD_PREP(req, CREATE_QP, cmd_flags); + + /* General */ + req.type = qp->type; + req.dpi = cpu_to_le32(qp->dpi->dpi); + req.qp_handle = cpu_to_le64(qp->qp_handle); + + /* SQ */ + psn_sz = (qp->type == CMDQ_CREATE_QP_TYPE_RC) ? 
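+	/*
+	 * Only RC QPs get a PSN search area: it lets the hardware find the
+	 * WQE that corresponds to a given PSN on retransmission, which
+	 * connectionless (UD) traffic never needs.
+	 */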
+ sizeof(struct sq_psn_search) : 0; + sq->hwq.max_elements = sq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, sq->sglist, + sq->nmap, &sq->hwq.max_elements, + BNXT_QPLIB_MAX_SQE_ENTRY_SIZE, + psn_sz, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + sq->swq = kcalloc(sq->hwq.max_elements, sizeof(*sq->swq), GFP_KERNEL); + if (!sq->swq) { + rc = -ENOMEM; + goto fail_sq; + } + hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; + if (psn_sz) { + psn_search_ptr = (struct sq_psn_search **) + &hw_sq_send_ptr[get_sqe_pg + (sq->hwq.max_elements)]; + psn_search = (unsigned long int) + &hw_sq_send_ptr[get_sqe_pg(sq->hwq.max_elements)] + [get_sqe_idx(sq->hwq.max_elements)]; + if (psn_search & ~PAGE_MASK) { + /* If the psn_search does not start on a page boundary, + * then calculate the offset + */ + poff = (psn_search & ~PAGE_MASK) / + BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE; + } + for (i = 0; i < sq->hwq.max_elements; i++) + sq->swq[i].psn_search = + &psn_search_ptr[get_psne_pg(i + poff)] + [get_psne_idx(i + poff)]; + } + pbl = &sq->hwq.pbl[PBL_LVL_0]; + req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.sq_pg_size_sq_lvl = + ((sq->hwq.level & CMDQ_CREATE_QP_SQ_LVL_MASK) + << CMDQ_CREATE_QP_SQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G : + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K); + + /* initialize all SQ WQEs to LOCAL_INVALID (sq prep for hw fetch) */ + hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; + for (sw_prod = 0; sw_prod < sq->hwq.max_elements; sw_prod++) { + hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)] + [get_sqe_idx(sw_prod)]; + hw_sq_send_hdr->wqe_type = SQ_BASE_WQE_TYPE_LOCAL_INVALID; + } + + if (qp->scq) + req.scq_cid = cpu_to_le32(qp->scq->id); + + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE; + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED; + if (qp->sig_type) + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION; + + /* RQ */ + if (rq->max_wqe) { + rq->hwq.max_elements = rq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, rq->sglist, + rq->nmap, &rq->hwq.max_elements, + BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto fail_sq; + + rq->swq = kcalloc(rq->hwq.max_elements, sizeof(*rq->swq), + GFP_KERNEL); + if (!rq->swq) { + rc = -ENOMEM; + goto fail_rq; + } + pbl = &rq->hwq.pbl[PBL_LVL_0]; + req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.rq_pg_size_rq_lvl = + ((rq->hwq.level & CMDQ_CREATE_QP_RQ_LVL_MASK) << + CMDQ_CREATE_QP_RQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? 
+ CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G : + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K); + } + + if (qp->rcq) + req.rcq_cid = cpu_to_le32(qp->rcq->id); + req.qp_flags = cpu_to_le32(qp_flags); + req.sq_size = cpu_to_le32(sq->hwq.max_elements); + req.rq_size = cpu_to_le32(rq->hwq.max_elements); + qp->sq_hdr_buf = NULL; + qp->rq_hdr_buf = NULL; + + rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp); + if (rc) + goto fail_rq; + + /* CTRL-22434: Irrespective of the requested SGE count on the SQ + * always create the QP with max send sges possible if the requested + * inline size is greater than 0. + */ + max_ssge = qp->max_inline_data ? 6 : sq->max_sge; + req.sq_fwo_sq_sge = cpu_to_le16( + ((max_ssge & CMDQ_CREATE_QP_SQ_SGE_MASK) + << CMDQ_CREATE_QP_SQ_SGE_SFT) | 0); + req.rq_fwo_rq_sge = cpu_to_le16( + ((rq->max_sge & CMDQ_CREATE_QP_RQ_SGE_MASK) + << CMDQ_CREATE_QP_RQ_SGE_SFT) | 0); + /* ORRQ and IRRQ */ + if (psn_sz) { + xrrq = &qp->orrq; + xrrq->max_elements = + ORD_LIMIT_TO_ORRQ_SLOTS(qp->max_rd_atomic); + req_size = xrrq->max_elements * + BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE + PAGE_SIZE - 1; + req_size &= ~(PAGE_SIZE - 1); + rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL, 0, + &xrrq->max_elements, + BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE, + 0, req_size, HWQ_TYPE_CTX); + if (rc) + goto fail_buf_free; + pbl = &xrrq->pbl[PBL_LVL_0]; + req.orrq_addr = cpu_to_le64(pbl->pg_map_arr[0]); + + xrrq = &qp->irrq; + xrrq->max_elements = IRD_LIMIT_TO_IRRQ_SLOTS( + qp->max_dest_rd_atomic); + req_size = xrrq->max_elements * + BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE + PAGE_SIZE - 1; + req_size &= ~(PAGE_SIZE - 1); + + rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL, 0, + &xrrq->max_elements, + BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE, + 0, req_size, HWQ_TYPE_CTX); + if (rc) + goto fail_orrq; + + pbl = &xrrq->pbl[PBL_LVL_0]; + req.irrq_addr = cpu_to_le64(pbl->pg_map_arr[0]); + } + req.pd_id = cpu_to_le32(qp->pd->id); + + resp = (struct creq_create_qp_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP send failed"); + rc = -EINVAL; + goto fail; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP timed out"); + rc = -ETIMEDOUT; + goto fail; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + rc = -EINVAL; + goto fail; + } + qp->id = le32_to_cpu(resp->xid); + qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET; + sq->flush_in_progress = false; + rq->flush_in_progress = false; + + return 0; + +fail: + if (qp->irrq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &qp->irrq); +fail_orrq: + if (qp->orrq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &qp->orrq); +fail_buf_free: + bnxt_qplib_free_qp_hdr_buf(res, qp); +fail_rq: + bnxt_qplib_free_hwq(res->pdev, &rq->hwq); + kfree(rq->swq); +fail_sq: + bnxt_qplib_free_hwq(res->pdev, &sq->hwq); + kfree(sq->swq); +exit: + return rc; +} + +static void __modify_flags_from_init_state(struct bnxt_qplib_qp *qp) +{ + switch (qp->state) { + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + /* INIT->RTR, configure the path_mtu to the default + * 2048 if not being requested + */ + if (!(qp->modify_flags & + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU)) { + qp->modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->path_mtu = + 
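+			/*
+			 * INIT->RTR fixups: default the path MTU to 2048
+			 * when the caller did not request one; the
+			 * surrounding code likewise clamps
+			 * max_dest_rd_atomic to at least 1 and forces an
+			 * SGID index, both firmware requirements per the
+			 * Bono FW notes below.
+			 */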
CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; + } + qp->modify_flags &= + ~CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID; + /* Bono FW require the max_dest_rd_atomic to be >= 1 */ + if (qp->max_dest_rd_atomic < 1) + qp->max_dest_rd_atomic = 1; + qp->modify_flags &= ~CMDQ_MODIFY_QP_MODIFY_MASK_SRC_MAC; + /* Bono FW 20.6.5 requires SGID_INDEX configuration */ + if (!(qp->modify_flags & + CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX)) { + qp->modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX; + qp->ah.sgid_index = 0; + } + break; + default: + break; + } +} + +static void __modify_flags_from_rtr_state(struct bnxt_qplib_qp *qp) +{ + switch (qp->state) { + case CMDQ_MODIFY_QP_NEW_STATE_RTS: + /* Bono FW requires the max_rd_atomic to be >= 1 */ + if (qp->max_rd_atomic < 1) + qp->max_rd_atomic = 1; + /* Bono FW does not allow PKEY_INDEX, + * DGID, FLOW_LABEL, SGID_INDEX, HOP_LIMIT, + * TRAFFIC_CLASS, DEST_MAC, PATH_MTU, RQ_PSN, + * MIN_RNR_TIMER, MAX_DEST_RD_ATOMIC, DEST_QP_ID + * modification + */ + qp->modify_flags &= + ~(CMDQ_MODIFY_QP_MODIFY_MASK_PKEY | + CMDQ_MODIFY_QP_MODIFY_MASK_DGID | + CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL | + CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX | + CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT | + CMDQ_MODIFY_QP_MODIFY_MASK_TRAFFIC_CLASS | + CMDQ_MODIFY_QP_MODIFY_MASK_DEST_MAC | + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU | + CMDQ_MODIFY_QP_MODIFY_MASK_RQ_PSN | + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER | + CMDQ_MODIFY_QP_MODIFY_MASK_MAX_DEST_RD_ATOMIC | + CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID); + break; + default: + break; + } +} + +static void __filter_modify_flags(struct bnxt_qplib_qp *qp) +{ + switch (qp->cur_qp_state) { + case CMDQ_MODIFY_QP_NEW_STATE_RESET: + break; + case CMDQ_MODIFY_QP_NEW_STATE_INIT: + __modify_flags_from_init_state(qp); + break; + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + __modify_flags_from_rtr_state(qp); + break; + case CMDQ_MODIFY_QP_NEW_STATE_RTS: + break; + case CMDQ_MODIFY_QP_NEW_STATE_SQD: + break; + case CMDQ_MODIFY_QP_NEW_STATE_SQE: + break; + case CMDQ_MODIFY_QP_NEW_STATE_ERR: + break; + default: + break; + } +} + +int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_modify_qp req; + struct creq_modify_qp_resp *resp; + u16 cmd_flags = 0, pkey; + u32 temp32[4]; + u32 bmask; + + RCFW_CMD_PREP(req, MODIFY_QP, cmd_flags); + + /* Filter out the qp_attr_mask based on the state->new transition */ + __filter_modify_flags(qp); + bmask = qp->modify_flags; + req.modify_mask = cpu_to_le32(qp->modify_flags); + req.qp_cid = cpu_to_le32(qp->id); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) { + req.network_type_en_sqd_async_notify_new_state = + (qp->state & CMDQ_MODIFY_QP_NEW_STATE_MASK) | + (qp->en_sqd_async_notify ? 
+ CMDQ_MODIFY_QP_EN_SQD_ASYNC_NOTIFY : 0); + } + req.network_type_en_sqd_async_notify_new_state |= qp->nw_type; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS) + req.access = qp->access; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_PKEY) { + if (!bnxt_qplib_get_pkey(res, &res->pkey_tbl, + qp->pkey_index, &pkey)) + req.pkey = cpu_to_le16(pkey); + } + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_QKEY) + req.qkey = cpu_to_le32(qp->qkey); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DGID) { + memcpy(temp32, qp->ah.dgid.data, sizeof(struct bnxt_qplib_gid)); + req.dgid[0] = cpu_to_le32(temp32[0]); + req.dgid[1] = cpu_to_le32(temp32[1]); + req.dgid[2] = cpu_to_le32(temp32[2]); + req.dgid[3] = cpu_to_le32(temp32[3]); + } + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL) + req.flow_label = cpu_to_le32(qp->ah.flow_label); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX) + req.sgid_index = cpu_to_le16(res->sgid_tbl.hw_id + [qp->ah.sgid_index]); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT) + req.hop_limit = qp->ah.hop_limit; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_TRAFFIC_CLASS) + req.traffic_class = qp->ah.traffic_class; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DEST_MAC) + memcpy(req.dest_mac, qp->ah.dmac, 6); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU) + req.path_mtu = qp->path_mtu; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_TIMEOUT) + req.timeout = qp->timeout; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_RETRY_CNT) + req.retry_cnt = qp->retry_cnt; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_RNR_RETRY) + req.rnr_retry = qp->rnr_retry; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER) + req.min_rnr_timer = qp->min_rnr_timer; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_RQ_PSN) + req.rq_psn = cpu_to_le32(qp->rq.psn); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_SQ_PSN) + req.sq_psn = cpu_to_le32(qp->sq.psn); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_MAX_RD_ATOMIC) + req.max_rd_atomic = + ORD_LIMIT_TO_ORRQ_SLOTS(qp->max_rd_atomic); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_MAX_DEST_RD_ATOMIC) + req.max_dest_rd_atomic = + IRD_LIMIT_TO_IRRQ_SLOTS(qp->max_dest_rd_atomic); + + req.sq_size = cpu_to_le32(qp->sq.hwq.max_elements); + req.rq_size = cpu_to_le32(qp->rq.hwq.max_elements); + req.sq_sge = cpu_to_le16(qp->sq.max_sge); + req.rq_sge = cpu_to_le16(qp->rq.max_sge); + req.max_inline_data = cpu_to_le32(qp->max_inline_data); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID) + req.dest_qp_id = cpu_to_le32(qp->dest_qpn); + + req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(qp->vlan_id); + + resp = (struct creq_modify_qp_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: MODIFY_QP send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: FP: MODIFY_QP timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: MODIFY_QP failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + qp->cur_qp_state = qp->state; + return 0; +} + +int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_query_qp req; + struct creq_query_qp_resp *resp; + struct creq_query_qp_resp_sb *sb; + u16 cmd_flags = 0; + u32 temp32[4]; + 
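+	/*
+	 * QUERY_QP differs from the other commands here: the bulk of the
+	 * reply arrives in a side buffer (sb) rather than in the CREQ
+	 * event itself, so resp_size is expressed in CMDQE-sized units to
+	 * tell firmware how much side-buffer space to fill.
+	 */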
int i; + + RCFW_CMD_PREP(req, QUERY_QP, cmd_flags); + + req.qp_cid = cpu_to_le32(qp->id); + req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS; + resp = (struct creq_query_qp_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void **)&sb, 0); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: QUERY_QP send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: FP: QUERY_QP timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: QUERY_QP failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + /* Extract the context from the side buffer */ + qp->state = sb->en_sqd_async_notify_state & + CREQ_QUERY_QP_RESP_SB_STATE_MASK; + qp->en_sqd_async_notify = sb->en_sqd_async_notify_state & + CREQ_QUERY_QP_RESP_SB_EN_SQD_ASYNC_NOTIFY ? + true : false; + qp->access = sb->access; + qp->pkey_index = le16_to_cpu(sb->pkey); + qp->qkey = le32_to_cpu(sb->qkey); + + temp32[0] = le32_to_cpu(sb->dgid[0]); + temp32[1] = le32_to_cpu(sb->dgid[1]); + temp32[2] = le32_to_cpu(sb->dgid[2]); + temp32[3] = le32_to_cpu(sb->dgid[3]); + memcpy(qp->ah.dgid.data, temp32, sizeof(qp->ah.dgid.data)); + + qp->ah.flow_label = le32_to_cpu(sb->flow_label); + + qp->ah.sgid_index = 0; + for (i = 0; i < res->sgid_tbl.max; i++) { + if (res->sgid_tbl.hw_id[i] == le16_to_cpu(sb->sgid_index)) { + qp->ah.sgid_index = i; + break; + } + } + if (i == res->sgid_tbl.max) + dev_warn(&res->pdev->dev, "QPLIB: SGID not found??"); + + qp->ah.hop_limit = sb->hop_limit; + qp->ah.traffic_class = sb->traffic_class; + memcpy(qp->ah.dmac, sb->dest_mac, 6); + qp->ah.vlan_id = (le16_to_cpu(sb->path_mtu_dest_vlan_id) & + CREQ_QUERY_QP_RESP_SB_VLAN_ID_MASK) >> + CREQ_QUERY_QP_RESP_SB_VLAN_ID_SFT; + qp->path_mtu = (le16_to_cpu(sb->path_mtu_dest_vlan_id) & + CREQ_QUERY_QP_RESP_SB_PATH_MTU_MASK) >> + CREQ_QUERY_QP_RESP_SB_PATH_MTU_SFT; + qp->timeout = sb->timeout; + qp->retry_cnt = sb->retry_cnt; + qp->rnr_retry = sb->rnr_retry; + qp->min_rnr_timer = sb->min_rnr_timer; + qp->rq.psn = le32_to_cpu(sb->rq_psn); + qp->max_rd_atomic = ORRQ_SLOTS_TO_ORD_LIMIT(sb->max_rd_atomic); + qp->sq.psn = le32_to_cpu(sb->sq_psn); + qp->max_dest_rd_atomic = + IRRQ_SLOTS_TO_IRD_LIMIT(sb->max_dest_rd_atomic); + qp->sq.max_wqe = qp->sq.hwq.max_elements; + qp->rq.max_wqe = qp->rq.hwq.max_elements; + qp->sq.max_sge = le16_to_cpu(sb->sq_sge); + qp->rq.max_sge = le16_to_cpu(sb->rq_sge); + qp->max_inline_data = le32_to_cpu(sb->max_inline_data); + qp->dest_qpn = le32_to_cpu(sb->dest_qp_id); + memcpy(qp->smac, sb->src_mac, 6); + qp->vlan_id = le16_to_cpu(sb->vlan_pcp_vlan_dei_vlan_id); + return 0; +} + +static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp) +{ + struct bnxt_qplib_hwq *cq_hwq = &cq->hwq; + struct cq_base *hw_cqe, **hw_cqe_ptr; + int i; + + for (i = 0; i < cq_hwq->max_elements; i++) { + hw_cqe_ptr = (struct cq_base **)cq_hwq->pbl_ptr; + hw_cqe = &hw_cqe_ptr[CQE_PG(i)][CQE_IDX(i)]; + if (!CQE_CMP_VALID(hw_cqe, i, cq_hwq->max_elements)) + continue; + switch (hw_cqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK) { + case CQ_BASE_CQE_TYPE_REQ: + case CQ_BASE_CQE_TYPE_TERMINAL: + { + struct cq_req *cqe = (struct cq_req *)hw_cqe; + + if (qp == le64_to_cpu(cqe->qp_handle)) + cqe->qp_handle = 0; + break; + } + case CQ_BASE_CQE_TYPE_RES_RC: + 
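/* The code relies on qp_handle sitting at the same offset in the RC, + * UD and raweth QP1 response CQE layouts, so the cq_res_rc cast below + * serves all three response types. + */ + 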
case CQ_BASE_CQE_TYPE_RES_UD: + case CQ_BASE_CQE_TYPE_RES_RAWETH_QP1: + { + struct cq_res_rc *cqe = (struct cq_res_rc *)hw_cqe; + + if (qp == le64_to_cpu(cqe->qp_handle)) + cqe->qp_handle = 0; + break; + } + default: + break; + } + } +} + +int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_destroy_qp req; + struct creq_destroy_qp_resp *resp; + unsigned long flags; + u16 cmd_flags = 0; + + RCFW_CMD_PREP(req, DESTROY_QP, cmd_flags); + + req.qp_cid = cpu_to_le32(qp->id); + resp = (struct creq_destroy_qp_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_QP send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_QP timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_QP failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + + /* Must walk the associated CQs to nullify the QP ptr */ + spin_lock_irqsave(&qp->scq->hwq.lock, flags); + + __clean_cq(qp->scq, (u64)(unsigned long)qp); + + if (qp->rcq && qp->rcq != qp->scq) { + spin_lock(&qp->rcq->hwq.lock); + __clean_cq(qp->rcq, (u64)(unsigned long)qp); + spin_unlock(&qp->rcq->hwq.lock); + } + + spin_unlock_irqrestore(&qp->scq->hwq.lock, flags); + + bnxt_qplib_free_qp_hdr_buf(res, qp); + bnxt_qplib_free_hwq(res->pdev, &qp->sq.hwq); + kfree(qp->sq.swq); + + bnxt_qplib_free_hwq(res->pdev, &qp->rq.hwq); + kfree(qp->rq.swq); + + if (qp->irrq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &qp->irrq); + if (qp->orrq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &qp->orrq); + + return 0; +} + +void *bnxt_qplib_get_qp1_sq_buf(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_sge *sge) +{ + struct bnxt_qplib_q *sq = &qp->sq; + u32 sw_prod; + + memset(sge, 0, sizeof(*sge)); + + if (qp->sq_hdr_buf) { + sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); + sge->addr = (dma_addr_t)(qp->sq_hdr_buf_map + + sw_prod * qp->sq_hdr_buf_size); + sge->lkey = 0xFFFFFFFF; + sge->size = qp->sq_hdr_buf_size; + return qp->sq_hdr_buf + sw_prod * sge->size; + } + return NULL; +} + +u32 bnxt_qplib_get_rq_prod_index(struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_q *rq = &qp->rq; + + return HWQ_CMP(rq->hwq.prod, &rq->hwq); +} + +dma_addr_t bnxt_qplib_get_qp_buf_from_index(struct bnxt_qplib_qp *qp, u32 index) +{ + return (qp->rq_hdr_buf_map + index * qp->rq_hdr_buf_size); +} + +void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_sge *sge) +{ + struct bnxt_qplib_q *rq = &qp->rq; + u32 sw_prod; + + memset(sge, 0, sizeof(*sge)); + + if (qp->rq_hdr_buf) { + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + sge->addr = (dma_addr_t)(qp->rq_hdr_buf_map + + sw_prod * qp->rq_hdr_buf_size); + sge->lkey = 0xFFFFFFFF; + sge->size = qp->rq_hdr_buf_size; + return qp->rq_hdr_buf + sw_prod * sge->size; + } + return NULL; +} + +void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_q *sq = &qp->sq; + struct dbr_dbr db_msg = { 0 }; + u32 sw_prod; + + sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); + + db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & + DBR_DBR_INDEX_MASK); + db_msg.type_xid = + cpu_to_le32(((qp->id << 
DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | + DBR_DBR_TYPE_SQ); + /* Flush all the WQE writes to HW */ + wmb(); + __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_qplib_q *sq = &qp->sq; + struct bnxt_qplib_swq *swq; + struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr; + struct sq_sge *hw_sge; + u32 sw_prod; + u8 wqe_size16; + int i, rc = 0, data_len = 0, pkt_num = 0; + __le32 temp32; + + if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) { + rc = -EINVAL; + goto done; + } + if (HWQ_CMP((sq->hwq.prod + 1), &sq->hwq) == + HWQ_CMP(sq->hwq.cons, &sq->hwq)) { + rc = -ENOMEM; + goto done; + } + sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); + swq = &sq->swq[sw_prod]; + swq->wr_id = wqe->wr_id; + swq->type = wqe->type; + swq->flags = wqe->flags; + if (qp->sig_type) + swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP; + swq->start_psn = sq->psn & BTH_PSN_MASK; + + hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; + hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)] + [get_sqe_idx(sw_prod)]; + + memset(hw_sq_send_hdr, 0, BNXT_QPLIB_MAX_SQE_ENTRY_SIZE); + + if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE) { + /* Copy the inline data */ + if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) { + dev_warn(&sq->hwq.pdev->dev, + "QPLIB: Inline data length > 96 detected"); + data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH; + } else { + data_len = wqe->inline_len; + } + memcpy(hw_sq_send_hdr->data, wqe->inline_data, data_len); + wqe_size16 = (data_len + 15) >> 4; + } else { + for (i = 0, hw_sge = (struct sq_sge *)hw_sq_send_hdr->data; + i < wqe->num_sge; i++, hw_sge++) { + hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr); + hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey); + hw_sge->size = cpu_to_le32(wqe->sg_list[i].size); + data_len += wqe->sg_list[i].size; + } + /* Each SGE entry = 1 WQE size16 */ + wqe_size16 = wqe->num_sge; + } + + /* Specifics */ + switch (wqe->type) { + case BNXT_QPLIB_SWQE_TYPE_SEND: + if (qp->type == CMDQ_CREATE_QP1_TYPE_GSI) { + /* Assemble info for Raw Ethertype QPs */ + struct sq_send_raweth_qp1 *sqe = + (struct sq_send_raweth_qp1 *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->wqe_size = wqe_size16 + + ((offsetof(typeof(*sqe), data) + 15) >> 4); + sqe->cfa_action = cpu_to_le16(wqe->rawqp1.cfa_action); + sqe->lflags = cpu_to_le16(wqe->rawqp1.lflags); + sqe->length = cpu_to_le32(data_len); + sqe->cfa_meta = cpu_to_le32((wqe->rawqp1.cfa_meta & + SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_MASK) << + SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_SFT); + + break; + } + /* else, just fall thru */ + case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM: + case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV: + { + struct sq_send *sqe = (struct sq_send *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->wqe_size = wqe_size16 + + ((offsetof(typeof(*sqe), data) + 15) >> 4); + sqe->inv_key_or_imm_data = cpu_to_le32( + wqe->send.inv_key); + if (qp->type == CMDQ_CREATE_QP_TYPE_UD) { + sqe->q_key = cpu_to_le32(wqe->send.q_key); + sqe->dst_qp = cpu_to_le32( + wqe->send.dst_qp & SQ_SEND_DST_QP_MASK); + sqe->length = cpu_to_le32(data_len); + sqe->avid = cpu_to_le32(wqe->send.avid & + SQ_SEND_AVID_MASK); + sq->psn = (sq->psn + 1) & BTH_PSN_MASK; + } else { + sqe->length = cpu_to_le32(data_len); + sqe->dst_qp = 0; + sqe->avid = 0; + if (qp->mtu) + pkt_num = (data_len + qp->mtu - 1) / qp->mtu; + if (!pkt_num) + pkt_num = 1; + sq->psn = (sq->psn + pkt_num) & 
BTH_PSN_MASK; + } + break; + } + case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE: + case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM: + case BNXT_QPLIB_SWQE_TYPE_RDMA_READ: + { + struct sq_rdma *sqe = (struct sq_rdma *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->wqe_size = wqe_size16 + + ((offsetof(typeof(*sqe), data) + 15) >> 4); + sqe->imm_data = cpu_to_le32(wqe->rdma.inv_key); + sqe->length = cpu_to_le32((u32)data_len); + sqe->remote_va = cpu_to_le64(wqe->rdma.remote_va); + sqe->remote_key = cpu_to_le32(wqe->rdma.r_key); + if (qp->mtu) + pkt_num = (data_len + qp->mtu - 1) / qp->mtu; + if (!pkt_num) + pkt_num = 1; + sq->psn = (sq->psn + pkt_num) & BTH_PSN_MASK; + break; + } + case BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP: + case BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD: + { + struct sq_atomic *sqe = (struct sq_atomic *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->remote_key = cpu_to_le32(wqe->atomic.r_key); + sqe->remote_va = cpu_to_le64(wqe->atomic.remote_va); + sqe->swap_data = cpu_to_le64(wqe->atomic.swap_data); + sqe->cmp_data = cpu_to_le64(wqe->atomic.cmp_data); + if (qp->mtu) + pkt_num = (data_len + qp->mtu - 1) / qp->mtu; + if (!pkt_num) + pkt_num = 1; + sq->psn = (sq->psn + pkt_num) & BTH_PSN_MASK; + break; + } + case BNXT_QPLIB_SWQE_TYPE_LOCAL_INV: + { + struct sq_localinvalidate *sqe = + (struct sq_localinvalidate *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->inv_l_key = cpu_to_le32(wqe->local_inv.inv_l_key); + + break; + } + case BNXT_QPLIB_SWQE_TYPE_FAST_REG_MR: + { + struct sq_fr_pmr *sqe = (struct sq_fr_pmr *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->access_cntl = wqe->frmr.access_cntl | + SQ_FR_PMR_ACCESS_CNTL_LOCAL_WRITE; + sqe->zero_based_page_size_log = + (wqe->frmr.pg_sz_log & SQ_FR_PMR_PAGE_SIZE_LOG_MASK) << + SQ_FR_PMR_PAGE_SIZE_LOG_SFT | + (wqe->frmr.zero_based ? SQ_FR_PMR_ZERO_BASED : 0); + sqe->l_key = cpu_to_le32(wqe->frmr.l_key); + temp32 = cpu_to_le32(wqe->frmr.length); + memcpy(sqe->length, &temp32, sizeof(wqe->frmr.length)); + sqe->numlevels_pbl_page_size_log = + ((wqe->frmr.pbl_pg_sz_log << + SQ_FR_PMR_PBL_PAGE_SIZE_LOG_SFT) & + SQ_FR_PMR_PBL_PAGE_SIZE_LOG_MASK) | + ((wqe->frmr.levels << SQ_FR_PMR_NUMLEVELS_SFT) & + SQ_FR_PMR_NUMLEVELS_MASK); + + for (i = 0; i < wqe->frmr.page_list_len; i++) + wqe->frmr.pbl_ptr[i] = cpu_to_le64( + wqe->frmr.page_list[i] | + PTU_PTE_VALID); + sqe->pblptr = cpu_to_le64(wqe->frmr.pbl_dma_ptr); + sqe->va = cpu_to_le64(wqe->frmr.va); + + break; + } + case BNXT_QPLIB_SWQE_TYPE_BIND_MW: + { + struct sq_bind *sqe = (struct sq_bind *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->access_cntl = wqe->bind.access_cntl; + sqe->mw_type_zero_based = wqe->bind.mw_type | + (wqe->bind.zero_based ? 
SQ_BIND_ZERO_BASED : 0); + sqe->parent_l_key = cpu_to_le32(wqe->bind.parent_l_key); + sqe->l_key = cpu_to_le32(wqe->bind.r_key); + sqe->va = cpu_to_le64(wqe->bind.va); + temp32 = cpu_to_le32(wqe->bind.length); + memcpy(&sqe->length, &temp32, sizeof(wqe->bind.length)); + break; + } + default: + /* Bad wqe, return error */ + rc = -EINVAL; + goto done; + } + swq->next_psn = sq->psn & BTH_PSN_MASK; + if (swq->psn_search) { + swq->psn_search->opcode_start_psn = cpu_to_le32( + ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) & + SQ_PSN_SEARCH_START_PSN_MASK) | + ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) & + SQ_PSN_SEARCH_OPCODE_MASK)); + swq->psn_search->flags_next_psn = cpu_to_le32( + ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) & + SQ_PSN_SEARCH_NEXT_PSN_MASK)); + } + + sq->hwq.prod++; +done: + return rc; +} + +void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_q *rq = &qp->rq; + struct dbr_dbr db_msg = { 0 }; + u32 sw_prod; + + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & + DBR_DBR_INDEX_MASK); + db_msg.type_xid = + cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | + DBR_DBR_TYPE_RQ); + + /* Flush the writes to the HW Rx WQE before ringing the Rx DB */ + wmb(); + __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_qplib_q *rq = &qp->rq; + struct rq_wqe *rqe, **rqe_ptr; + struct sq_sge *hw_sge; + u32 sw_prod; + int i, rc = 0; + + if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { + dev_err(&rq->hwq.pdev->dev, + "QPLIB: FP: QP (0x%x) is in the 0x%x state", + qp->id, qp->state); + rc = -EINVAL; + goto done; + } + if (HWQ_CMP((rq->hwq.prod + 1), &rq->hwq) == + HWQ_CMP(rq->hwq.cons, &rq->hwq)) { + dev_err(&rq->hwq.pdev->dev, + "QPLIB: FP: QP (0x%x) RQ is full!", qp->id); + rc = -EINVAL; + goto done; + } + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + rq->swq[sw_prod].wr_id = wqe->wr_id; + + rqe_ptr = (struct rq_wqe **)rq->hwq.pbl_ptr; + rqe = &rqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)]; + + memset(rqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + + /* Fill the SGE list; wqe_size is derived from num_sge below */ + for (i = 0, hw_sge = (struct sq_sge *)rqe->data; + i < wqe->num_sge; i++, hw_sge++) { + hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr); + hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey); + hw_sge->size = cpu_to_le32(wqe->sg_list[i].size); + } + rqe->wqe_type = wqe->type; + rqe->flags = wqe->flags; + rqe->wqe_size = wqe->num_sge + + ((offsetof(typeof(*rqe), data) + 15) >> 4); + + /* Supply the rqe->wr_id index to the wr_id_tbl for now */ + rqe->wr_id[0] = cpu_to_le32(sw_prod); + + rq->hwq.prod++; +done: + return rc; +} + +/* CQ */ + +/* Spinlock must be held */ +static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq) +{ + struct dbr_dbr db_msg = { 0 }; + + db_msg.type_xid = + cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | + DBR_DBR_TYPE_CQ_ARMENA); + /* Flush memory writes before enabling the CQ */ + wmb(); + __iowrite64_copy(cq->dbr_base, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type) +{ + struct bnxt_qplib_hwq *cq_hwq = &cq->hwq; + struct dbr_dbr db_msg = { 0 }; + u32 sw_cons; + + /* Ring DB */ + sw_cons = HWQ_CMP(cq_hwq->cons, cq_hwq); + db_msg.index = cpu_to_le32((sw_cons << DBR_DBR_INDEX_SFT) & + DBR_DBR_INDEX_MASK); + db_msg.type_xid = + cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | + 
arm_type); + /* flush memory writes before arming the CQ */ + wmb(); + __iowrite64_copy(cq->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_cq req; + struct creq_create_cq_resp *resp; + struct bnxt_qplib_pbl *pbl; + u16 cmd_flags = 0; + int rc; + + cq->hwq.max_elements = cq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &cq->hwq, cq->sghead, + cq->nmap, &cq->hwq.max_elements, + BNXT_QPLIB_MAX_CQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + RCFW_CMD_PREP(req, CREATE_CQ, cmd_flags); + + if (!cq->dpi) { + dev_err(&rcfw->pdev->dev, + "QPLIB: FP: CREATE_CQ failed due to NULL DPI"); + rc = -EINVAL; + goto fail; + } + req.dpi = cpu_to_le32(cq->dpi->dpi); + req.cq_handle = cpu_to_le64(cq->cq_handle); + + req.cq_size = cpu_to_le32(cq->hwq.max_elements); + pbl = &cq->hwq.pbl[PBL_LVL_0]; + req.pg_size_lvl = cpu_to_le32( + ((cq->hwq.level & CMDQ_CREATE_CQ_LVL_MASK) << + CMDQ_CREATE_CQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? CMDQ_CREATE_CQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? CMDQ_CREATE_CQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? CMDQ_CREATE_CQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? CMDQ_CREATE_CQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? CMDQ_CREATE_CQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? CMDQ_CREATE_CQ_PG_SIZE_PG_1G : + CMDQ_CREATE_CQ_PG_SIZE_PG_4K)); + + req.pbl = cpu_to_le64(pbl->pg_map_arr[0]); + + req.cq_fco_cnq_id = cpu_to_le32( + (cq->cnq_hw_ring_id & CMDQ_CREATE_CQ_CNQ_ID_MASK) << + CMDQ_CREATE_CQ_CNQ_ID_SFT); + + resp = (struct creq_create_cq_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_CQ send failed"); + rc = -EINVAL; + goto fail; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_CQ timed out"); + rc = -ETIMEDOUT; + goto fail; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_CQ failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + rc = -EINVAL; + goto fail; + } + cq->id = le32_to_cpu(resp->xid); + cq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem; + cq->period = BNXT_QPLIB_QUEUE_START_PERIOD; + init_waitqueue_head(&cq->waitq); + + bnxt_qplib_arm_cq_enable(cq); + return 0; + +fail: + bnxt_qplib_free_hwq(res->pdev, &cq->hwq); +exit: + return rc; +} + +int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_destroy_cq req; + struct creq_destroy_cq_resp *resp; + u16 cmd_flags = 0; + + RCFW_CMD_PREP(req, DESTROY_CQ, cmd_flags); + + req.cq_cid = cpu_to_le32(cq->id); + resp = (struct creq_destroy_cq_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_CQ send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_CQ timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_CQ failed "); + dev_err(&rcfw->pdev->dev, 
+ "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + bnxt_qplib_free_hwq(res->pdev, &cq->hwq); + return 0; +} + +static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp, + struct bnxt_qplib_cqe **pcqe, int *budget) +{ + u32 sw_prod, sw_cons; + struct bnxt_qplib_cqe *cqe; + int rc = 0; + + /* Now complete all outstanding SQEs with FLUSHED_ERR */ + sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); + cqe = *pcqe; + while (*budget) { + sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq); + if (sw_cons == sw_prod) { + sq->flush_in_progress = false; + break; + } + memset(cqe, 0, sizeof(*cqe)); + cqe->status = CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR; + cqe->opcode = CQ_BASE_CQE_TYPE_REQ; + cqe->qp_handle = (u64)(unsigned long)qp; + cqe->wr_id = sq->swq[sw_cons].wr_id; + cqe->src_qp = qp->id; + cqe->type = sq->swq[sw_cons].type; + cqe++; + (*budget)--; + sq->hwq.cons++; + } + *pcqe = cqe; + if (!(*budget) && HWQ_CMP(sq->hwq.cons, &sq->hwq) != sw_prod) + /* Out of budget */ + rc = -EAGAIN; + + return rc; +} + +static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp, + int opcode, struct bnxt_qplib_cqe **pcqe, int *budget) +{ + struct bnxt_qplib_cqe *cqe; + u32 sw_prod, sw_cons; + int rc = 0; + + /* Flush the rest of the RQ */ + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + cqe = *pcqe; + while (*budget) { + sw_cons = HWQ_CMP(rq->hwq.cons, &rq->hwq); + if (sw_cons == sw_prod) + break; + memset(cqe, 0, sizeof(*cqe)); + cqe->status = + CQ_RES_RC_STATUS_WORK_REQUEST_FLUSHED_ERR; + cqe->opcode = opcode; + cqe->qp_handle = (unsigned long)qp; + cqe->wr_id = rq->swq[sw_cons].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + } + *pcqe = cqe; + if (!*budget && HWQ_CMP(rq->hwq.cons, &rq->hwq) != sw_prod) + /* Out of budget */ + rc = -EAGAIN; + + return rc; +} + +static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, + struct cq_req *hwcqe, + struct bnxt_qplib_cqe **pcqe, int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *sq; + struct bnxt_qplib_cqe *cqe; + u32 sw_cons, cqe_cons; + int rc = 0; + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: Process Req qp is NULL"); + return -EINVAL; + } + sq = &qp->sq; + + cqe_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq); + if (cqe_cons > sq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process req reported "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x", + cqe_cons, sq->hwq.max_elements); + return -EINVAL; + } + /* If we were in the middle of flushing the SQ, continue */ + if (sq->flush_in_progress) + goto flush; + + /* Require to walk the sq's swq to fabricate CQEs for all previously + * signaled SWQEs due to CQE aggregation from the current sq cons + * to the cqe_cons + */ + cqe = *pcqe; + while (*budget) { + sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq); + if (sw_cons == cqe_cons) + break; + memset(cqe, 0, sizeof(*cqe)); + cqe->opcode = CQ_BASE_CQE_TYPE_REQ; + cqe->qp_handle = (u64)(unsigned long)qp; + cqe->src_qp = qp->id; + cqe->wr_id = sq->swq[sw_cons].wr_id; + cqe->type = sq->swq[sw_cons].type; + + /* For the last CQE, check for status. 
For errors, regardless + * of the request being signaled or not, it must complete with + * the hwcqe error status + */ + if (HWQ_CMP((sw_cons + 1), &sq->hwq) == cqe_cons && + hwcqe->status != CQ_REQ_STATUS_OK) { + cqe->status = hwcqe->status; + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Processed Req "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id[%d] = 0x%llx with status 0x%x", + sw_cons, cqe->wr_id, cqe->status); + cqe++; + (*budget)--; + sq->flush_in_progress = true; + /* Must block new posting of SQ and RQ */ + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + } else { + if (sq->swq[sw_cons].flags & + SQ_SEND_FLAGS_SIGNAL_COMP) { + cqe->status = CQ_REQ_STATUS_OK; + cqe++; + (*budget)--; + } + } + sq->hwq.cons++; + } + *pcqe = cqe; + if (!*budget && HWQ_CMP(sq->hwq.cons, &sq->hwq) != cqe_cons) { + /* Out of budget */ + rc = -EAGAIN; + goto done; + } + if (!sq->flush_in_progress) + goto done; +flush: + /* We need to walk the sq's swq to fabricate CQEs for all + * previously posted SWQEs, since an error CQE was received + */ + rc = __flush_sq(sq, qp, pcqe, budget); + if (!rc) + sq->flush_in_progress = false; +done: + return rc; +} + +static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, + struct cq_res_rc *hwcqe, + struct bnxt_qplib_cqe **pcqe, + int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *rq; + struct bnxt_qplib_cqe *cqe; + u32 wr_id_idx; + int rc = 0; + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL"); + return -EINVAL; + } + cqe = *pcqe; + cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; + cqe->length = le32_to_cpu(hwcqe->length); + cqe->invrkey = le32_to_cpu(hwcqe->imm_data_or_inv_r_key); + cqe->mr_handle = le64_to_cpu(hwcqe->mr_handle); + cqe->flags = le16_to_cpu(hwcqe->flags); + cqe->status = hwcqe->status; + cqe->qp_handle = (u64)(unsigned long)qp; + + wr_id_idx = le32_to_cpu(hwcqe->srq_or_rq_wr_id) & + CQ_RES_RC_SRQ_OR_RQ_WR_ID_MASK; + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process RC "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + if (rq->flush_in_progress) + goto flush_rq; + + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; + + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + rq->flush_in_progress = true; +flush_rq: + rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_RC, pcqe, budget); + if (!rc) + rq->flush_in_progress = false; + } + return rc; +} + +static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, + struct cq_res_ud *hwcqe, + struct bnxt_qplib_cqe **pcqe, + int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *rq; + struct bnxt_qplib_cqe *cqe; + u32 wr_id_idx; + int rc = 0; + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL"); + return -EINVAL; + } + cqe = *pcqe; + cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; + cqe->length = le32_to_cpu(hwcqe->length); + cqe->invrkey = le32_to_cpu(hwcqe->imm_data); + cqe->flags = le16_to_cpu(hwcqe->flags); + cqe->status = hwcqe->status; + cqe->qp_handle = (u64)(unsigned long)qp; + memcpy(cqe->smac, hwcqe->src_mac, 6); + wr_id_idx = le32_to_cpu(hwcqe->src_qp_high_srq_or_rq_wr_id) + & CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK; + cqe->src_qp = 
le16_to_cpu(hwcqe->src_qp_low) | + ((le32_to_cpu( + hwcqe->src_qp_high_srq_or_rq_wr_id) & + CQ_RES_UD_SRC_QP_HIGH_MASK) >> 8); + + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process UD "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx %#x exceeded RQ max %#x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + if (rq->flush_in_progress) + goto flush_rq; + + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; + + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + rq->flush_in_progress = true; +flush_rq: + rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_UD, pcqe, budget); + if (!rc) + rq->flush_in_progress = false; + } + return rc; +} + +static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, + struct cq_res_raweth_qp1 *hwcqe, + struct bnxt_qplib_cqe **pcqe, + int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *rq; + struct bnxt_qplib_cqe *cqe; + u32 wr_id_idx; + int rc = 0; + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: process_cq Raw/QP1 qp is NULL"); + return -EINVAL; + } + cqe = *pcqe; + cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; + cqe->flags = le16_to_cpu(hwcqe->flags); + cqe->qp_handle = (u64)(unsigned long)qp; + + wr_id_idx = + le32_to_cpu(hwcqe->raweth_qp1_payload_offset_srq_or_rq_wr_id) + & CQ_RES_RAWETH_QP1_SRQ_OR_RQ_WR_ID_MASK; + cqe->src_qp = qp->id; + if (qp->id == 1 && !cqe->length) { + /* Add workaround for the length misdetection */ + cqe->length = 296; + } else { + cqe->length = le16_to_cpu(hwcqe->length); + } + cqe->pkey_index = qp->pkey_index; + memcpy(cqe->smac, qp->smac, 6); + + cqe->raweth_qp1_flags = le16_to_cpu(hwcqe->raweth_qp1_flags); + cqe->raweth_qp1_flags2 = le32_to_cpu(hwcqe->raweth_qp1_flags2); + + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); + dev_err(&cq->hwq.pdev->dev, "QPLIB: ix 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + if (rq->flush_in_progress) + goto flush_rq; + + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; + + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + rq->flush_in_progress = true; +flush_rq: + rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_RAWETH_QP1, pcqe, + budget); + if (!rc) + rq->flush_in_progress = false; + } + return rc; +} + +static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, + struct cq_terminal *hwcqe, + struct bnxt_qplib_cqe **pcqe, + int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *sq, *rq; + struct bnxt_qplib_cqe *cqe; + u32 sw_cons = 0, cqe_cons; + int rc = 0; + u8 opcode = 0; + + /* Check the Status */ + if (hwcqe->status != CQ_TERMINAL_STATUS_OK) + dev_warn(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Terminal Error status = 0x%x", + hwcqe->status); + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process terminal qp is NULL"); + return -EINVAL; + } + /* Must block new posting of SQ and RQ */ + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + + sq = &qp->sq; + rq = &qp->rq; + + cqe_cons = le16_to_cpu(hwcqe->sq_cons_idx); + if (cqe_cons == 0xFFFF) + goto do_rq; + + if (cqe_cons > sq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process terminal reported "); + 
dev_err(&cq->hwq.pdev->dev, + "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x", + cqe_cons, sq->hwq.max_elements); + goto do_rq; + } + /* If we were in the middle of flushing, continue */ + if (sq->flush_in_progress) + goto flush_sq; + + /* A terminal CQE can also aggregate prior successful completions, + * so we must complete all CQEs from the current sq's cons to the + * cq_cons with status OK + */ + cqe = *pcqe; + while (*budget) { + sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq); + if (sw_cons == cqe_cons) + break; + if (sq->swq[sw_cons].flags & SQ_SEND_FLAGS_SIGNAL_COMP) { + memset(cqe, 0, sizeof(*cqe)); + cqe->status = CQ_REQ_STATUS_OK; + cqe->opcode = CQ_BASE_CQE_TYPE_REQ; + cqe->qp_handle = (u64)(unsigned long)qp; + cqe->src_qp = qp->id; + cqe->wr_id = sq->swq[sw_cons].wr_id; + cqe->type = sq->swq[sw_cons].type; + cqe++; + (*budget)--; + } + sq->hwq.cons++; + } + *pcqe = cqe; + if (!(*budget) && sw_cons != cqe_cons) { + /* Out of budget */ + rc = -EAGAIN; + goto sq_done; + } + sq->flush_in_progress = true; +flush_sq: + rc = __flush_sq(sq, qp, pcqe, budget); + if (!rc) + sq->flush_in_progress = false; +sq_done: + if (rc) + return rc; +do_rq: + cqe_cons = le16_to_cpu(hwcqe->rq_cons_idx); + if (cqe_cons == 0xFFFF) { + goto done; + } else if (cqe_cons > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Processed terminal "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: reported rq_cons_idx 0x%x exceeds max 0x%x", + cqe_cons, rq->hwq.max_elements); + goto done; + } + /* A terminal CQE requires all posted RQEs to complete with FLUSHED_ERR + * from the current rq->cons to the rq->prod, regardless of what + * rq->cons the terminal CQE indicates + */ + rq->flush_in_progress = true; + switch (qp->type) { + case CMDQ_CREATE_QP1_TYPE_GSI: + opcode = CQ_BASE_CQE_TYPE_RES_RAWETH_QP1; + break; + case CMDQ_CREATE_QP_TYPE_RC: + opcode = CQ_BASE_CQE_TYPE_RES_RC; + break; + case CMDQ_CREATE_QP_TYPE_UD: + opcode = CQ_BASE_CQE_TYPE_RES_UD; + break; + } + + rc = __flush_rq(rq, qp, opcode, pcqe, budget); + if (!rc) + rq->flush_in_progress = false; +done: + return rc; +} + +static int bnxt_qplib_cq_process_cutoff(struct bnxt_qplib_cq *cq, + struct cq_cutoff *hwcqe) +{ + /* Check the Status */ + if (hwcqe->status != CQ_CUTOFF_STATUS_OK) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Cutoff Error status = 0x%x", + hwcqe->status); + return -EINVAL; + } + clear_bit(CQ_FLAGS_RESIZE_IN_PROG, &cq->flags); + wake_up_interruptible(&cq->waitq); + + return 0; +} + +int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, + int num_cqes) +{ + struct cq_base *hw_cqe, **hw_cqe_ptr; + unsigned long flags; + u32 sw_cons, raw_cons; + int budget, rc = 0; + + spin_lock_irqsave(&cq->hwq.lock, flags); + raw_cons = cq->hwq.cons; + budget = num_cqes; + + while (budget) { + sw_cons = HWQ_CMP(raw_cons, &cq->hwq); + hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr; + hw_cqe = &hw_cqe_ptr[CQE_PG(sw_cons)][CQE_IDX(sw_cons)]; + + /* Check for Valid bit */ + if (!CQE_CMP_VALID(hw_cqe, raw_cons, cq->hwq.max_elements)) + break; + + /* Convert the device's CQE format to qplib_wc */ + switch (hw_cqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK) { + case CQ_BASE_CQE_TYPE_REQ: + rc = bnxt_qplib_cq_process_req(cq, + (struct cq_req *)hw_cqe, + &cqe, &budget); + break; + case CQ_BASE_CQE_TYPE_RES_RC: + rc = bnxt_qplib_cq_process_res_rc(cq, + (struct cq_res_rc *) + hw_cqe, &cqe, + &budget); + break; + case CQ_BASE_CQE_TYPE_RES_UD: + rc = bnxt_qplib_cq_process_res_ud + (cq, (struct cq_res_ud *)hw_cqe, 
&cqe, + &budget); + break; + case CQ_BASE_CQE_TYPE_RES_RAWETH_QP1: + rc = bnxt_qplib_cq_process_res_raweth_qp1 + (cq, (struct cq_res_raweth_qp1 *) + hw_cqe, &cqe, &budget); + break; + case CQ_BASE_CQE_TYPE_TERMINAL: + rc = bnxt_qplib_cq_process_terminal + (cq, (struct cq_terminal *)hw_cqe, + &cqe, &budget); + break; + case CQ_BASE_CQE_TYPE_CUT_OFF: + bnxt_qplib_cq_process_cutoff + (cq, (struct cq_cutoff *)hw_cqe); + /* Done processing this CQ */ + goto exit; + default: + dev_err(&cq->hwq.pdev->dev, + "QPLIB: process_cq unknown type 0x%x", + hw_cqe->cqe_type_toggle & + CQ_BASE_CQE_TYPE_MASK); + rc = -EINVAL; + break; + } + if (rc < 0) { + if (rc == -EAGAIN) + break; + /* Error while processing the CQE, just skip to the + * next one + */ + dev_err(&cq->hwq.pdev->dev, + "QPLIB: process_cqe error rc = 0x%x", rc); + } + raw_cons++; + } + if (cq->hwq.cons != raw_cons) { + cq->hwq.cons = raw_cons; + bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ); + } +exit: + spin_unlock_irqrestore(&cq->hwq.lock, flags); + return num_cqes - budget; +} + +void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type) +{ + unsigned long flags; + + spin_lock_irqsave(&cq->hwq.lock, flags); + if (arm_type) + bnxt_qplib_arm_cq(cq, arm_type); + + spin_unlock_irqrestore(&cq->hwq.lock, flags); +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h new file mode 100644 index 000000000000..f0150f8da1e3 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -0,0 +1,439 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Description: Fast Path Operators (header) + */ + +#ifndef __BNXT_QPLIB_FP_H__ +#define __BNXT_QPLIB_FP_H__ + +struct bnxt_qplib_sge { + u64 addr; + u32 lkey; + u32 size; +}; + +#define BNXT_QPLIB_MAX_SQE_ENTRY_SIZE sizeof(struct sq_send) + +#define SQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_SQE_ENTRY_SIZE) +#define SQE_MAX_IDX_PER_PG (SQE_CNT_PER_PG - 1) + +static inline u32 get_sqe_pg(u32 val) +{ + return ((val & ~SQE_MAX_IDX_PER_PG) / SQE_CNT_PER_PG); +} + +static inline u32 get_sqe_idx(u32 val) +{ + return (val & SQE_MAX_IDX_PER_PG); +} + +#define BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE sizeof(struct sq_psn_search) + +#define PSNE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE) +#define PSNE_MAX_IDX_PER_PG (PSNE_CNT_PER_PG - 1) + +static inline u32 get_psne_pg(u32 val) +{ + return ((val & ~PSNE_MAX_IDX_PER_PG) / PSNE_CNT_PER_PG); +} + +static inline u32 get_psne_idx(u32 val) +{ + return (val & PSNE_MAX_IDX_PER_PG); +} + +#define BNXT_QPLIB_QP_MAX_SGL 6 + +struct bnxt_qplib_swq { + u64 wr_id; + u8 type; + u8 flags; + u32 start_psn; + u32 next_psn; + struct sq_psn_search *psn_search; +}; + +struct bnxt_qplib_swqe { + /* General */ + u64 wr_id; + u8 reqs_type; + u8 type; +#define BNXT_QPLIB_SWQE_TYPE_SEND 0 +#define BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM 1 +#define BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV 2 +#define BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE 4 +#define BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM 5 +#define BNXT_QPLIB_SWQE_TYPE_RDMA_READ 6 +#define BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP 8 +#define BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD 11 +#define BNXT_QPLIB_SWQE_TYPE_LOCAL_INV 12 +#define BNXT_QPLIB_SWQE_TYPE_FAST_REG_MR 13 +#define BNXT_QPLIB_SWQE_TYPE_REG_MR 13 +#define BNXT_QPLIB_SWQE_TYPE_BIND_MW 14 +#define BNXT_QPLIB_SWQE_TYPE_RECV 128 +#define BNXT_QPLIB_SWQE_TYPE_RECV_RDMA_IMM 129 + u8 flags; +#define BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP BIT(0) +#define BNXT_QPLIB_SWQE_FLAGS_RD_ATOMIC_FENCE BIT(1) +#define BNXT_QPLIB_SWQE_FLAGS_UC_FENCE BIT(2) +#define BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT BIT(3) +#define BNXT_QPLIB_SWQE_FLAGS_INLINE BIT(4) + struct bnxt_qplib_sge sg_list[BNXT_QPLIB_QP_MAX_SGL]; + int num_sge; + /* Max inline data is 96 bytes */ + u32 inline_len; +#define BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH 96 + u8 inline_data[BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH]; + + union { + /* Send, with imm, inval key */ + struct { + union { + __be32 imm_data; + u32 inv_key; + }; + u32 q_key; + u32 dst_qp; + u16 avid; + } send; + + /* Send Raw Ethernet and QP1 */ + struct { + u16 lflags; + u16 cfa_action; + u32 cfa_meta; + } rawqp1; + + /* RDMA write, with imm, read */ + struct { + union { + __be32 imm_data; + u32 inv_key; + }; + u64 remote_va; + u32 r_key; + } rdma; + + /* Atomic cmp/swap, fetch/add */ + struct { + u64 remote_va; + u32 r_key; + u64 swap_data; + u64 cmp_data; + } atomic; + + /* Local Invalidate */ + struct { + u32 inv_l_key; + } local_inv; + + /* FR-PMR */ + struct { + u8 access_cntl; + u8 pg_sz_log; + bool zero_based; + u32 l_key; + u32 length; + u8 pbl_pg_sz_log; +#define BNXT_QPLIB_SWQE_PAGE_SIZE_4K 0 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_8K 1 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_64K 4 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_256K 6 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_1M 8 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_2M 9 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_4M 10 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_1G 18 + u8 levels; +#define PAGE_SHIFT_4K 12 + __le64 *pbl_ptr; + dma_addr_t pbl_dma_ptr; + u64 *page_list; + u16 page_list_len; + u64 va; + } frmr; + + /* Bind */ + struct { + u8 access_cntl; +#define 
BNXT_QPLIB_BIND_SWQE_ACCESS_LOCAL_WRITE BIT(0) +#define BNXT_QPLIB_BIND_SWQE_ACCESS_REMOTE_READ BIT(1) +#define BNXT_QPLIB_BIND_SWQE_ACCESS_REMOTE_WRITE BIT(2) +#define BNXT_QPLIB_BIND_SWQE_ACCESS_REMOTE_ATOMIC BIT(3) +#define BNXT_QPLIB_BIND_SWQE_ACCESS_WINDOW_BIND BIT(4) + bool zero_based; + u8 mw_type; + u32 parent_l_key; + u32 r_key; + u64 va; + u32 length; + } bind; + }; +}; + +#define BNXT_QPLIB_MAX_RQE_ENTRY_SIZE sizeof(struct rq_wqe) + +#define RQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_RQE_ENTRY_SIZE) +#define RQE_MAX_IDX_PER_PG (RQE_CNT_PER_PG - 1) +#define RQE_PG(x) (((x) & ~RQE_MAX_IDX_PER_PG) / RQE_CNT_PER_PG) +#define RQE_IDX(x) ((x) & RQE_MAX_IDX_PER_PG) + +struct bnxt_qplib_q { + struct bnxt_qplib_hwq hwq; + struct bnxt_qplib_swq *swq; + struct scatterlist *sglist; + u32 nmap; + u32 max_wqe; + u16 max_sge; + u32 psn; + bool flush_in_progress; +}; + +struct bnxt_qplib_qp { + struct bnxt_qplib_pd *pd; + struct bnxt_qplib_dpi *dpi; + u64 qp_handle; + u32 id; + u8 type; + u8 sig_type; + u32 modify_flags; + u8 state; + u8 cur_qp_state; + u32 max_inline_data; + u32 mtu; + u8 path_mtu; + bool en_sqd_async_notify; + u16 pkey_index; + u32 qkey; + u32 dest_qp_id; + u8 access; + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u32 min_rnr_timer; + u32 max_rd_atomic; + u32 max_dest_rd_atomic; + u32 dest_qpn; + u8 smac[6]; + u16 vlan_id; + u8 nw_type; + struct bnxt_qplib_ah ah; + +#define BTH_PSN_MASK ((1 << 24) - 1) + /* SQ */ + struct bnxt_qplib_q sq; + /* RQ */ + struct bnxt_qplib_q rq; + /* SRQ */ + struct bnxt_qplib_srq *srq; + /* CQ */ + struct bnxt_qplib_cq *scq; + struct bnxt_qplib_cq *rcq; + /* IRRQ and ORRQ */ + struct bnxt_qplib_hwq irrq; + struct bnxt_qplib_hwq orrq; + /* Header buffer for QP1 */ + int sq_hdr_buf_size; + int rq_hdr_buf_size; +/* + * Buffer space for ETH(14), IP or GRH(40), UDP header(8) + * and ib_bth + ib_deth (20). 
+ * Max required is 82 when RoCE V2 is enabled + */ +#define BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2 86 + /* Ethernet header = 14 */ + /* ib_grh = 40 (provided by MAD) */ + /* ib_bth + ib_deth = 20 */ + /* MAD = 256 (provided by MAD) */ + /* iCRC = 4 */ +#define BNXT_QPLIB_MAX_QP1_RQ_ETH_HDR_SIZE 14 +#define BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2 512 +#define BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV4 20 +#define BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV6 40 +#define BNXT_QPLIB_MAX_QP1_RQ_BDETH_HDR_SIZE 20 + void *sq_hdr_buf; + dma_addr_t sq_hdr_buf_map; + void *rq_hdr_buf; + dma_addr_t rq_hdr_buf_map; +}; + +#define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE sizeof(struct cq_base) + +#define CQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_CQE_ENTRY_SIZE) +#define CQE_MAX_IDX_PER_PG (CQE_CNT_PER_PG - 1) +#define CQE_PG(x) (((x) & ~CQE_MAX_IDX_PER_PG) / CQE_CNT_PER_PG) +#define CQE_IDX(x) ((x) & CQE_MAX_IDX_PER_PG) + +#define ROCE_CQE_CMP_V 0 +#define CQE_CMP_VALID(hdr, raw_cons, cp_bit) \ + (!!((hdr)->cqe_type_toggle & CQ_BASE_TOGGLE) == \ + !((raw_cons) & (cp_bit))) + +struct bnxt_qplib_cqe { + u8 status; + u8 type; + u8 opcode; + u32 length; + u64 wr_id; + union { + __be32 immdata; + u32 invrkey; + }; + u64 qp_handle; + u64 mr_handle; + u16 flags; + u8 smac[6]; + u32 src_qp; + u16 raweth_qp1_flags; + u16 raweth_qp1_errors; + u16 raweth_qp1_cfa_code; + u32 raweth_qp1_flags2; + u32 raweth_qp1_metadata; + u8 raweth_qp1_payload_offset; + u16 pkey_index; +}; + +#define BNXT_QPLIB_QUEUE_START_PERIOD 0x01 +struct bnxt_qplib_cq { + struct bnxt_qplib_dpi *dpi; + void __iomem *dbr_base; + u32 max_wqe; + u32 id; + u16 count; + u16 period; + struct bnxt_qplib_hwq hwq; + u32 cnq_hw_ring_id; + bool resize_in_progress; + struct scatterlist *sghead; + u32 nmap; + u64 cq_handle; + +#define CQ_RESIZE_WAIT_TIME_MS 500 + unsigned long flags; +#define CQ_FLAGS_RESIZE_IN_PROG 1 + wait_queue_head_t waitq; +}; + +#define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE sizeof(struct xrrq_irrq) +#define BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE sizeof(struct xrrq_orrq) +#define IRD_LIMIT_TO_IRRQ_SLOTS(x) (2 * (x) + 2) +#define IRRQ_SLOTS_TO_IRD_LIMIT(s) (((s) >> 1) - 1) +#define ORD_LIMIT_TO_ORRQ_SLOTS(x) ((x) + 1) +#define ORRQ_SLOTS_TO_ORD_LIMIT(s) ((s) - 1) + +#define BNXT_QPLIB_MAX_NQE_ENTRY_SIZE sizeof(struct nq_base) + +#define NQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_NQE_ENTRY_SIZE) +#define NQE_MAX_IDX_PER_PG (NQE_CNT_PER_PG - 1) +#define NQE_PG(x) (((x) & ~NQE_MAX_IDX_PER_PG) / NQE_CNT_PER_PG) +#define NQE_IDX(x) ((x) & NQE_MAX_IDX_PER_PG) + +#define NQE_CMP_VALID(hdr, raw_cons, cp_bit) \ + (!!(le32_to_cpu((hdr)->info63_v[0]) & NQ_BASE_V) == \ + !((raw_cons) & (cp_bit))) + +#define BNXT_QPLIB_NQE_MAX_CNT (128 * 1024) + +#define NQ_CONS_PCI_BAR_REGION 2 +#define NQ_DB_KEY_CP (0x2 << CMPL_DOORBELL_KEY_SFT) +#define NQ_DB_IDX_VALID CMPL_DOORBELL_IDX_VALID +#define NQ_DB_IRQ_DIS CMPL_DOORBELL_MASK +#define NQ_DB_CP_FLAGS_REARM (NQ_DB_KEY_CP | \ + NQ_DB_IDX_VALID) +#define NQ_DB_CP_FLAGS (NQ_DB_KEY_CP | \ + NQ_DB_IDX_VALID | \ + NQ_DB_IRQ_DIS) +#define NQ_DB_REARM(db, raw_cons, cp_bit) \ + writel(NQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db) +#define NQ_DB(db, raw_cons, cp_bit) \ + writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db) + +struct bnxt_qplib_nq { + struct pci_dev *pdev; + + int vector; + int budget; + bool requested; + struct tasklet_struct worker; + struct bnxt_qplib_hwq hwq; + + u16 bar_reg; + u16 bar_reg_off; + u16 ring_id; + void __iomem *bar_reg_iomem; + + int (*cqn_handler) + (struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *cq); + int 
(*srqn_handler) + (struct bnxt_qplib_nq *nq, + void *srq, + u8 event); +}; + +void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq); +int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, + int msix_vector, int bar_reg_offset, + int (*cqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *cq), + int (*srqn_handler)(struct bnxt_qplib_nq *nq, + void *srq, + u8 event)); +int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +void *bnxt_qplib_get_qp1_sq_buf(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_sge *sge); +void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_sge *sge); +u32 bnxt_qplib_get_rq_prod_index(struct bnxt_qplib_qp *qp); +dma_addr_t bnxt_qplib_get_qp_buf_from_index(struct bnxt_qplib_qp *qp, + u32 index); +void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp); +int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe); +void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp); +int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe); +int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); +int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); +int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, + int num); +void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type); +void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq); +int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq); +#endif /* __BNXT_QPLIB_FP_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c new file mode 100644 index 000000000000..23fb7260662b --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -0,0 +1,694 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: RDMA Controller HW interface + */ +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/prefetch.h> +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_rcfw.h" +static void bnxt_qplib_service_creq(unsigned long data); + +/* Hardware communication channel */ +int bnxt_qplib_rcfw_wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +{ + u16 cbit; + int rc; + + cookie &= RCFW_MAX_COOKIE_VALUE; + cbit = cookie % RCFW_MAX_OUTSTANDING_CMD; + if (!test_bit(cbit, rcfw->cmdq_bitmap)) + dev_warn(&rcfw->pdev->dev, + "QPLIB: CMD bit %d for cookie 0x%x is not set?", + cbit, cookie); + + rc = wait_event_timeout(rcfw->waitq, + !test_bit(cbit, rcfw->cmdq_bitmap), + msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS)); + if (!rc) { + dev_warn(&rcfw->pdev->dev, + "QPLIB: Bono Error: timeout %d msec, msg {0x%x}\n", + RCFW_CMD_WAIT_TIME_MS, cookie); + } + + return rc; +} + +int bnxt_qplib_rcfw_block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +{ + u32 count = -1; + u16 cbit; + + cookie &= RCFW_MAX_COOKIE_VALUE; + cbit = cookie % RCFW_MAX_OUTSTANDING_CMD; + if (!test_bit(cbit, rcfw->cmdq_bitmap)) + goto done; + do { + bnxt_qplib_service_creq((unsigned long)rcfw); + } while (test_bit(cbit, rcfw->cmdq_bitmap) && --count); +done: + return count; +} + +void *bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, + struct cmdq_base *req, void **crsbe, + u8 is_block) +{ + struct bnxt_qplib_crsq *crsq = &rcfw->crsq; + struct bnxt_qplib_cmdqe *cmdqe, **cmdq_ptr; + struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq; + struct bnxt_qplib_hwq *crsb = &rcfw->crsb; + struct bnxt_qplib_crsqe *crsqe = NULL; + struct bnxt_qplib_crsbe **crsb_ptr; + u32 sw_prod, cmdq_prod; + u8 retry_cnt = 0xFF; + dma_addr_t dma_addr; + unsigned long flags; + u32 size, opcode; + u16 cookie, cbit; + int pg, idx; + u8 *preq; + +retry: + opcode = req->opcode; + if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) && + (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && + opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW not initialized, reject opcode 0x%x", + opcode); + return NULL; + } + + if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) && + opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { + dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!"); + return NULL; + } + + /* CMDQ entries are in 16-byte units; each request can consume one + * or more cmdqe + */ + spin_lock_irqsave(&cmdq->lock, flags); + if (req->cmd_size > cmdq->max_elements - + ((HWQ_CMP(cmdq->prod, cmdq) - HWQ_CMP(cmdq->cons, cmdq)) & + (cmdq->max_elements - 1))) { + dev_err(&rcfw->pdev->dev, "QPLIB: RCFW: CMDQ is full!"); + spin_unlock_irqrestore(&cmdq->lock, flags); + + if (!retry_cnt--) + return NULL; + goto retry; + } + + retry_cnt = 0xFF; + + cookie = atomic_inc_return(&rcfw->seq_num) & RCFW_MAX_COOKIE_VALUE; + cbit = cookie % RCFW_MAX_OUTSTANDING_CMD; + if (is_block) + cookie |= RCFW_CMD_IS_BLOCKING; + req->cookie = cpu_to_le16(cookie); + if (test_and_set_bit(cbit, rcfw->cmdq_bitmap)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: 
RCFW MAX outstanding cmd reached!"); + atomic_dec(&rcfw->seq_num); + spin_unlock_irqrestore(&cmdq->lock, flags); + + if (!retry_cnt--) + return NULL; + goto retry; + } + /* Reserve a resp buffer slot if requested */ + if (req->resp_size && crsbe) { + spin_lock(&crsb->lock); + sw_prod = HWQ_CMP(crsb->prod, crsb); + crsb_ptr = (struct bnxt_qplib_crsbe **)crsb->pbl_ptr; + *crsbe = (void *)&crsb_ptr[get_crsb_pg(sw_prod)] + [get_crsb_idx(sw_prod)]; + bnxt_qplib_crsb_dma_next(crsb->pbl_dma_ptr, sw_prod, &dma_addr); + req->resp_addr = cpu_to_le64(dma_addr); + crsb->prod++; + spin_unlock(&crsb->lock); + + req->resp_size = (sizeof(struct bnxt_qplib_crsbe) + + BNXT_QPLIB_CMDQE_UNITS - 1) / + BNXT_QPLIB_CMDQE_UNITS; + } + cmdq_ptr = (struct bnxt_qplib_cmdqe **)cmdq->pbl_ptr; + preq = (u8 *)req; + size = req->cmd_size * BNXT_QPLIB_CMDQE_UNITS; + do { + pg = 0; + idx = 0; + + /* Locate the next cmdq slot */ + sw_prod = HWQ_CMP(cmdq->prod, cmdq); + cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod)][get_cmdq_idx(sw_prod)]; + if (!cmdqe) { + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW request failed with no cmdqe!"); + goto done; + } + /* Copy a segment of the req cmd to the cmdq */ + memset(cmdqe, 0, sizeof(*cmdqe)); + memcpy(cmdqe, preq, min_t(u32, size, sizeof(*cmdqe))); + preq += min_t(u32, size, sizeof(*cmdqe)); + size -= min_t(u32, size, sizeof(*cmdqe)); + cmdq->prod++; + } while (size > 0); + + cmdq_prod = cmdq->prod; + if (rcfw->flags & FIRMWARE_FIRST_FLAG) { + /* The very first doorbell write is required to set this flag + * which prompts the FW to reset its internal pointers + */ + cmdq_prod |= FIRMWARE_FIRST_FLAG; + rcfw->flags &= ~FIRMWARE_FIRST_FLAG; + } + sw_prod = HWQ_CMP(crsq->prod, crsq); + crsqe = &crsq->crsq[sw_prod]; + memset(crsqe, 0, sizeof(*crsqe)); + crsq->prod++; + crsqe->req_size = req->cmd_size; + + /* ring CMDQ DB */ + writel(cmdq_prod, rcfw->cmdq_bar_reg_iomem + + rcfw->cmdq_bar_reg_prod_off); + writel(RCFW_CMDQ_TRIG_VAL, rcfw->cmdq_bar_reg_iomem + + rcfw->cmdq_bar_reg_trig_off); +done: + spin_unlock_irqrestore(&cmdq->lock, flags); + /* Return the CREQ response pointer */ + return crsqe ? &crsqe->qp_event : NULL; +} + +/* Completions */ +static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw, + struct creq_func_event *func_event) +{ + switch (func_event->event) { + case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CQ_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TQM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR: + /* SRQ ctx error, call srq_handler?? + * But there's no SRQ handle! 
+ */ + break; + case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TIM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST: + break; + case CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED: + break; + default: + return -EINVAL; + } + return 0; +} + +static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, + struct creq_qp_event *qp_event) +{ + struct bnxt_qplib_crsq *crsq = &rcfw->crsq; + struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq; + struct bnxt_qplib_crsqe *crsqe; + u16 cbit, cookie, blocked = 0; + unsigned long flags; + u32 sw_cons; + + switch (qp_event->event) { + case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: + dev_dbg(&rcfw->pdev->dev, + "QPLIB: Received QP error notification"); + break; + default: + /* Command Response */ + spin_lock_irqsave(&cmdq->lock, flags); + sw_cons = HWQ_CMP(crsq->cons, crsq); + crsqe = &crsq->crsq[sw_cons]; + crsq->cons++; + memcpy(&crsqe->qp_event, qp_event, sizeof(crsqe->qp_event)); + + cookie = le16_to_cpu(crsqe->qp_event.cookie); + blocked = cookie & RCFW_CMD_IS_BLOCKING; + cookie &= RCFW_MAX_COOKIE_VALUE; + cbit = cookie % RCFW_MAX_OUTSTANDING_CMD; + if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap)) + dev_warn(&rcfw->pdev->dev, + "QPLIB: CMD bit %d was not requested", cbit); + + cmdq->cons += crsqe->req_size; + spin_unlock_irqrestore(&cmdq->lock, flags); + if (!blocked) + wake_up(&rcfw->waitq); + break; + } + return 0; +} + +/* SP - CREQ Completion handlers */ +static void bnxt_qplib_service_creq(unsigned long data) +{ + struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data; + struct bnxt_qplib_hwq *creq = &rcfw->creq; + struct creq_base *creqe, **creq_ptr; + u32 sw_cons, raw_cons; + unsigned long flags; + u32 type; + + /* Service the CREQ until empty */ + spin_lock_irqsave(&creq->lock, flags); + raw_cons = creq->cons; + while (1) { + sw_cons = HWQ_CMP(raw_cons, creq); + creq_ptr = (struct creq_base **)creq->pbl_ptr; + creqe = &creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)]; + if (!CREQ_CMP_VALID(creqe, raw_cons, creq->max_elements)) + break; + + type = creqe->type & CREQ_BASE_TYPE_MASK; + switch (type) { + case CREQ_BASE_TYPE_QP_EVENT: + if (!bnxt_qplib_process_qp_event + (rcfw, (struct creq_qp_event *)creqe)) + rcfw->creq_qp_event_processed++; + else { + dev_warn(&rcfw->pdev->dev, "QPLIB: crsqe with"); + dev_warn(&rcfw->pdev->dev, + "QPLIB: type = 0x%x not handled", + type); + } + break; + case CREQ_BASE_TYPE_FUNC_EVENT: + if (!bnxt_qplib_process_func_event + (rcfw, (struct creq_func_event *)creqe)) + rcfw->creq_func_event_processed++; + else + dev_warn + (&rcfw->pdev->dev, "QPLIB:aeqe:%#x Not handled", + type); + break; + default: + dev_warn(&rcfw->pdev->dev, "QPLIB: creqe with "); + dev_warn(&rcfw->pdev->dev, + "QPLIB: op_event = 0x%x not handled", type); + break; + } + raw_cons++; + } + if (creq->cons != raw_cons) { + creq->cons = raw_cons; + CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, raw_cons, + creq->max_elements); + } + spin_unlock_irqrestore(&creq->lock, flags); +} + +static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance) +{ + struct bnxt_qplib_rcfw *rcfw = dev_instance; + struct bnxt_qplib_hwq *creq = &rcfw->creq; + struct creq_base **creq_ptr; + u32 sw_cons; + + /* Prefetch the CREQ element */ + sw_cons = HWQ_CMP(creq->cons, creq); + creq_ptr = (struct creq_base **)rcfw->creq.pbl_ptr; + prefetch(&creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)]); + + tasklet_schedule(&rcfw->worker); + + return IRQ_HANDLED; +} + +/* RCFW */ 
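+/* Command/completion handshake: bnxt_qplib_rcfw_send_message() stamps each + * request with a cookie and sets bit (cookie % RCFW_MAX_OUTSTANDING_CMD) in + * cmdq_bitmap before ringing the CMDQ doorbell; when the matching CREQ event + * arrives, bnxt_qplib_process_qp_event() clears that bit and wakes the + * waiter sleeping in bnxt_qplib_rcfw_wait_for_resp(). + */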
+int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw)
+{
+	struct creq_deinitialize_fw_resp *resp;
+	struct cmdq_deinitialize_fw req;
+	u16 cmd_flags = 0;
+
+	RCFW_CMD_PREP(req, DEINITIALIZE_FW, cmd_flags);
+	resp = (struct creq_deinitialize_fw_resp *)
+			bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+						     NULL, 0);
+	if (!resp)
+		return -EINVAL;
+
+	if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie)))
+		return -ETIMEDOUT;
+
+	if (resp->status ||
+	    le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie))
+		return -EFAULT;
+
+	clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags);
+	return 0;
+}
+
+static int __get_pbl_pg_idx(struct bnxt_qplib_pbl *pbl)
+{
+	return (pbl->pg_size == ROCE_PG_SIZE_4K ?
+		CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K :
+		pbl->pg_size == ROCE_PG_SIZE_8K ?
+		CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8K :
+		pbl->pg_size == ROCE_PG_SIZE_64K ?
+		CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_64K :
+		pbl->pg_size == ROCE_PG_SIZE_2M ?
+		CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_2M :
+		pbl->pg_size == ROCE_PG_SIZE_8M ?
+		CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8M :
+		pbl->pg_size == ROCE_PG_SIZE_1G ?
+		CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_1G :
+		CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K);
+}
+
+int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
+			 struct bnxt_qplib_ctx *ctx, int is_virtfn)
+{
+	struct creq_initialize_fw_resp *resp;
+	struct cmdq_initialize_fw req;
+	u16 cmd_flags = 0, level;
+
+	RCFW_CMD_PREP(req, INITIALIZE_FW, cmd_flags);
+
+	/*
+	 * VFs need not set up the HW context area; the PF sets it up on
+	 * the VF's behalf, so skip the HW programming here.
+	 */
+	if (is_virtfn)
+		goto skip_ctx_setup;
+
+	level = ctx->qpc_tbl.level;
+	req.qpc_pg_size_qpc_lvl = (level << CMDQ_INITIALIZE_FW_QPC_LVL_SFT) |
+				__get_pbl_pg_idx(&ctx->qpc_tbl.pbl[level]);
+	level = ctx->mrw_tbl.level;
+	req.mrw_pg_size_mrw_lvl = (level << CMDQ_INITIALIZE_FW_MRW_LVL_SFT) |
+				__get_pbl_pg_idx(&ctx->mrw_tbl.pbl[level]);
+	level = ctx->srqc_tbl.level;
+	req.srq_pg_size_srq_lvl = (level << CMDQ_INITIALIZE_FW_SRQ_LVL_SFT) |
+				__get_pbl_pg_idx(&ctx->srqc_tbl.pbl[level]);
+	level = ctx->cq_tbl.level;
+	req.cq_pg_size_cq_lvl = (level << CMDQ_INITIALIZE_FW_CQ_LVL_SFT) |
+				__get_pbl_pg_idx(&ctx->cq_tbl.pbl[level]);
+	level = ctx->tim_tbl.level;
+	req.tim_pg_size_tim_lvl = (level << CMDQ_INITIALIZE_FW_TIM_LVL_SFT) |
+				__get_pbl_pg_idx(&ctx->tim_tbl.pbl[level]);
+	level = ctx->tqm_pde_level;
+	req.tqm_pg_size_tqm_lvl = (level << CMDQ_INITIALIZE_FW_TQM_LVL_SFT) |
+				__get_pbl_pg_idx(&ctx->tqm_pde.pbl[level]);
+
+	req.qpc_page_dir =
+		cpu_to_le64(ctx->qpc_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
+	req.mrw_page_dir =
+		cpu_to_le64(ctx->mrw_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
+	req.srq_page_dir =
+		cpu_to_le64(ctx->srqc_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
+	req.cq_page_dir =
+		cpu_to_le64(ctx->cq_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
+	req.tim_page_dir =
+		cpu_to_le64(ctx->tim_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
+	req.tqm_page_dir =
+		cpu_to_le64(ctx->tqm_pde.pbl[PBL_LVL_0].pg_map_arr[0]);
+
+	req.number_of_qp = cpu_to_le32(ctx->qpc_tbl.max_elements);
+	req.number_of_mrw = cpu_to_le32(ctx->mrw_tbl.max_elements);
+	req.number_of_srq = cpu_to_le32(ctx->srqc_tbl.max_elements);
+	req.number_of_cq = cpu_to_le32(ctx->cq_tbl.max_elements);
+
+	req.max_qp_per_vf =
cpu_to_le32(ctx->vf_res.max_qp_per_vf); + req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf); + req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf); + req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf); + req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf); + +skip_ctx_setup: + req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id); + resp = (struct creq_initialize_fw_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW: INITIALIZE_FW send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW: INITIALIZE_FW timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW: INITIALIZE_FW failed"); + return -EINVAL; + } + set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags); + return 0; +} + +void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) +{ + bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->crsb); + kfree(rcfw->crsq.crsq); + bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->cmdq); + bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->creq); + + rcfw->pdev = NULL; +} + +int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, + struct bnxt_qplib_rcfw *rcfw) +{ + rcfw->pdev = pdev; + rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT; + if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL, 0, + &rcfw->creq.max_elements, + BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE, + HWQ_TYPE_L2_CMPL)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: HW channel CREQ allocation failed"); + goto fail; + } + rcfw->cmdq.max_elements = BNXT_QPLIB_CMDQE_MAX_CNT; + if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->cmdq, NULL, 0, + &rcfw->cmdq.max_elements, + BNXT_QPLIB_CMDQE_UNITS, 0, PAGE_SIZE, + HWQ_TYPE_CTX)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: HW channel CMDQ allocation failed"); + goto fail; + } + + rcfw->crsq.max_elements = rcfw->cmdq.max_elements; + rcfw->crsq.crsq = kcalloc(rcfw->crsq.max_elements, + sizeof(*rcfw->crsq.crsq), GFP_KERNEL); + if (!rcfw->crsq.crsq) + goto fail; + + rcfw->crsb.max_elements = BNXT_QPLIB_CRSBE_MAX_CNT; + if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->crsb, NULL, 0, + &rcfw->crsb.max_elements, + BNXT_QPLIB_CRSBE_UNITS, 0, PAGE_SIZE, + HWQ_TYPE_CTX)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: HW channel CRSB allocation failed"); + goto fail; + } + return 0; + +fail: + bnxt_qplib_free_rcfw_channel(rcfw); + return -ENOMEM; +} + +void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) +{ + unsigned long indx; + + /* Make sure the HW channel is stopped! 
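+	 * Teardown order matters: quiesce the IRQ and kill the tasklet
+	 * before freeing the vector and unmapping the BARs, so that no
+	 * CREQ processing can run against resources being released.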
*/ + synchronize_irq(rcfw->vector); + tasklet_disable(&rcfw->worker); + tasklet_kill(&rcfw->worker); + + if (rcfw->requested) { + free_irq(rcfw->vector, rcfw); + rcfw->requested = false; + } + if (rcfw->cmdq_bar_reg_iomem) + iounmap(rcfw->cmdq_bar_reg_iomem); + rcfw->cmdq_bar_reg_iomem = NULL; + + if (rcfw->creq_bar_reg_iomem) + iounmap(rcfw->creq_bar_reg_iomem); + rcfw->creq_bar_reg_iomem = NULL; + + indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size); + if (indx != rcfw->bmap_size) + dev_err(&rcfw->pdev->dev, + "QPLIB: disabling RCFW with pending cmd-bit %lx", indx); + kfree(rcfw->cmdq_bitmap); + rcfw->bmap_size = 0; + + rcfw->aeq_handler = NULL; + rcfw->vector = 0; +} + +int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, + struct bnxt_qplib_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off, int virt_fn, + int (*aeq_handler)(struct bnxt_qplib_rcfw *, + struct creq_func_event *)) +{ + resource_size_t res_base; + struct cmdq_init init; + u16 bmap_size; + int rc; + + /* General */ + atomic_set(&rcfw->seq_num, 0); + rcfw->flags = FIRMWARE_FIRST_FLAG; + bmap_size = BITS_TO_LONGS(RCFW_MAX_OUTSTANDING_CMD * + sizeof(unsigned long)); + rcfw->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL); + if (!rcfw->cmdq_bitmap) + return -ENOMEM; + rcfw->bmap_size = bmap_size; + + /* CMDQ */ + rcfw->cmdq_bar_reg = RCFW_COMM_PCI_BAR_REGION; + res_base = pci_resource_start(pdev, rcfw->cmdq_bar_reg); + if (!res_base) + return -ENOMEM; + + rcfw->cmdq_bar_reg_iomem = ioremap_nocache(res_base + + RCFW_COMM_BASE_OFFSET, + RCFW_COMM_SIZE); + if (!rcfw->cmdq_bar_reg_iomem) { + dev_err(&rcfw->pdev->dev, + "QPLIB: CMDQ BAR region %d mapping failed", + rcfw->cmdq_bar_reg); + return -ENOMEM; + } + + rcfw->cmdq_bar_reg_prod_off = virt_fn ? RCFW_VF_COMM_PROD_OFFSET : + RCFW_PF_COMM_PROD_OFFSET; + + rcfw->cmdq_bar_reg_trig_off = RCFW_COMM_TRIG_OFFSET; + + /* CRSQ */ + rcfw->crsq.prod = 0; + rcfw->crsq.cons = 0; + + /* CREQ */ + rcfw->creq_bar_reg = RCFW_COMM_CONS_PCI_BAR_REGION; + res_base = pci_resource_start(pdev, rcfw->creq_bar_reg); + if (!res_base) + dev_err(&rcfw->pdev->dev, + "QPLIB: CREQ BAR region %d resc start is 0!", + rcfw->creq_bar_reg); + rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off, + 4); + if (!rcfw->creq_bar_reg_iomem) { + dev_err(&rcfw->pdev->dev, + "QPLIB: CREQ BAR region %d mapping failed", + rcfw->creq_bar_reg); + return -ENOMEM; + } + rcfw->creq_qp_event_processed = 0; + rcfw->creq_func_event_processed = 0; + + rcfw->vector = msix_vector; + if (aeq_handler) + rcfw->aeq_handler = aeq_handler; + + tasklet_init(&rcfw->worker, bnxt_qplib_service_creq, + (unsigned long)rcfw); + + rcfw->requested = false; + rc = request_irq(rcfw->vector, bnxt_qplib_creq_irq, 0, + "bnxt_qplib_creq", rcfw); + if (rc) { + dev_err(&rcfw->pdev->dev, + "QPLIB: Failed to request IRQ for CREQ rc = 0x%x", rc); + bnxt_qplib_disable_rcfw_channel(rcfw); + return rc; + } + rcfw->requested = true; + + init_waitqueue_head(&rcfw->waitq); + + CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, 0, rcfw->creq.max_elements); + + init.cmdq_pbl = cpu_to_le64(rcfw->cmdq.pbl[PBL_LVL_0].pg_map_arr[0]); + init.cmdq_size_cmdq_lvl = cpu_to_le16( + ((BNXT_QPLIB_CMDQE_MAX_CNT << CMDQ_INIT_CMDQ_SIZE_SFT) & + CMDQ_INIT_CMDQ_SIZE_MASK) | + ((rcfw->cmdq.level << CMDQ_INIT_CMDQ_LVL_SFT) & + CMDQ_INIT_CMDQ_LVL_MASK)); + init.creq_ring_id = cpu_to_le16(rcfw->creq_ring_id); + + /* Write to the Bono mailbox register */ + __iowrite32_copy(rcfw->cmdq_bar_reg_iomem, &init, sizeof(init) / 4); + return 0; +} diff --git 
a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h new file mode 100644 index 000000000000..d3567d75bf58 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -0,0 +1,231 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Description: RDMA Controller HW interface (header)
+ */
+
+#ifndef __BNXT_QPLIB_RCFW_H__
+#define __BNXT_QPLIB_RCFW_H__
+
+#define RCFW_CMDQ_TRIG_VAL		1
+#define RCFW_COMM_PCI_BAR_REGION	0
+#define RCFW_COMM_CONS_PCI_BAR_REGION	2
+#define RCFW_COMM_BASE_OFFSET		0x600
+#define RCFW_PF_COMM_PROD_OFFSET	0xc
+#define RCFW_VF_COMM_PROD_OFFSET	0xc
+#define RCFW_COMM_TRIG_OFFSET		0x100
+#define RCFW_COMM_SIZE			0x104
+
+#define RCFW_DBR_PCI_BAR_REGION		2
+
+#define RCFW_CMD_PREP(req, CMD, cmd_flags)				\
+	do {								\
+		memset(&(req), 0, sizeof((req)));			\
+		(req).opcode = CMDQ_BASE_OPCODE_##CMD;			\
+		(req).cmd_size = (sizeof((req)) +			\
+				  BNXT_QPLIB_CMDQE_UNITS - 1) /		\
+				  BNXT_QPLIB_CMDQE_UNITS;		\
+		(req).flags = cpu_to_le16(cmd_flags);			\
+	} while (0)
+
+#define RCFW_CMD_WAIT_TIME_MS		20000 /* 20 second timeout */
+
+/* CMDQ elements */
+#define BNXT_QPLIB_CMDQE_MAX_CNT	256
+#define BNXT_QPLIB_CMDQE_UNITS		sizeof(struct bnxt_qplib_cmdqe)
+#define BNXT_QPLIB_CMDQE_CNT_PER_PG	(PAGE_SIZE / BNXT_QPLIB_CMDQE_UNITS)
+
+#define MAX_CMDQ_IDX			(BNXT_QPLIB_CMDQE_MAX_CNT - 1)
+#define MAX_CMDQ_IDX_PER_PG		(BNXT_QPLIB_CMDQE_CNT_PER_PG - 1)
+
+#define RCFW_MAX_OUTSTANDING_CMD	BNXT_QPLIB_CMDQE_MAX_CNT
+#define RCFW_MAX_COOKIE_VALUE		0x7FFF
+#define RCFW_CMD_IS_BLOCKING		0x8000
+
+/* The CMDQ consists of a fixed number of 16-byte slots */
+struct bnxt_qplib_cmdqe {
+	u8		data[16];
+};
+
+static inline u32 get_cmdq_pg(u32 val)
+{
+	return (val & ~MAX_CMDQ_IDX_PER_PG) / BNXT_QPLIB_CMDQE_CNT_PER_PG;
+}
+
+static inline u32 get_cmdq_idx(u32 val)
+{
+	return val & MAX_CMDQ_IDX_PER_PG;
+}
+
+/* Each CRSQ side buffer is 1024 bytes */
+struct bnxt_qplib_crsbe {
+	u8			data[1024];
+};
+
+/* CRSQ SB */
+#define BNXT_QPLIB_CRSBE_MAX_CNT	4
+#define BNXT_QPLIB_CRSBE_UNITS		sizeof(struct bnxt_qplib_crsbe)
+#define BNXT_QPLIB_CRSBE_CNT_PER_PG	(PAGE_SIZE / BNXT_QPLIB_CRSBE_UNITS)
+
+#define MAX_CRSB_IDX			(BNXT_QPLIB_CRSBE_MAX_CNT - 1)
+#define MAX_CRSB_IDX_PER_PG		(BNXT_QPLIB_CRSBE_CNT_PER_PG - 1)
+
+static inline u32 get_crsb_pg(u32 val)
+{
+	return (val & ~MAX_CRSB_IDX_PER_PG) / BNXT_QPLIB_CRSBE_CNT_PER_PG;
+}
+
+static inline u32 get_crsb_idx(u32 val)
+{
+	return val & MAX_CRSB_IDX_PER_PG;
+}
+
+static inline void bnxt_qplib_crsb_dma_next(dma_addr_t *pg_map_arr,
+					    u32 prod, dma_addr_t *dma_addr)
+{
+	*dma_addr = pg_map_arr[(prod) / BNXT_QPLIB_CRSBE_CNT_PER_PG];
+	*dma_addr += ((prod) % BNXT_QPLIB_CRSBE_CNT_PER_PG) *
+		     BNXT_QPLIB_CRSBE_UNITS;
+}
+
+/* CREQ */
+/* Allocate 1 per QP for async error notification for now */
+#define BNXT_QPLIB_CREQE_MAX_CNT	(64 * 1024)
+#define BNXT_QPLIB_CREQE_UNITS		16	/* 16-Bytes per prod unit */
+#define BNXT_QPLIB_CREQE_CNT_PER_PG	(PAGE_SIZE / BNXT_QPLIB_CREQE_UNITS)
+
+#define MAX_CREQ_IDX			(BNXT_QPLIB_CREQE_MAX_CNT - 1)
+#define MAX_CREQ_IDX_PER_PG		(BNXT_QPLIB_CREQE_CNT_PER_PG - 1)
+
+static inline u32 get_creq_pg(u32 val)
+{
+	return (val & ~MAX_CREQ_IDX_PER_PG) / BNXT_QPLIB_CREQE_CNT_PER_PG;
+}
+
+static inline u32 get_creq_idx(u32 val)
+{
+	return val & MAX_CREQ_IDX_PER_PG;
+}
+
+#define BNXT_QPLIB_CREQE_PER_PG	(PAGE_SIZE / sizeof(struct creq_base))
+
+#define CREQ_CMP_VALID(hdr, raw_cons, cp_bit)			\
+	(!!((hdr)->v & CREQ_BASE_V) ==				\
+	   !((raw_cons) & (cp_bit)))
+
+#define CREQ_DB_KEY_CP			(0x2 << CMPL_DOORBELL_KEY_SFT)
+#define CREQ_DB_IDX_VALID		CMPL_DOORBELL_IDX_VALID
+#define CREQ_DB_IRQ_DIS			CMPL_DOORBELL_MASK
+#define CREQ_DB_CP_FLAGS_REARM		(CREQ_DB_KEY_CP |	\
+					 CREQ_DB_IDX_VALID)
+#define CREQ_DB_CP_FLAGS		(CREQ_DB_KEY_CP |	\
+					 CREQ_DB_IDX_VALID |	\
+					 CREQ_DB_IRQ_DIS)
+#define CREQ_DB_REARM(db, raw_cons, cp_bit)			\
writel(CREQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db) +#define CREQ_DB(db, raw_cons, cp_bit) \ + writel(CREQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db) + +/* HWQ */ +struct bnxt_qplib_crsqe { + struct creq_qp_event qp_event; + u32 req_size; +}; + +struct bnxt_qplib_crsq { + struct bnxt_qplib_crsqe *crsq; + u32 prod; + u32 cons; + u32 max_elements; +}; + +/* RCFW Communication Channels */ +struct bnxt_qplib_rcfw { + struct pci_dev *pdev; + int vector; + struct tasklet_struct worker; + bool requested; + unsigned long *cmdq_bitmap; + u32 bmap_size; + unsigned long flags; +#define FIRMWARE_INITIALIZED_FLAG 1 +#define FIRMWARE_FIRST_FLAG BIT(31) + wait_queue_head_t waitq; + int (*aeq_handler)(struct bnxt_qplib_rcfw *, + struct creq_func_event *); + atomic_t seq_num; + + /* Bar region info */ + void __iomem *cmdq_bar_reg_iomem; + u16 cmdq_bar_reg; + u16 cmdq_bar_reg_prod_off; + u16 cmdq_bar_reg_trig_off; + u16 creq_ring_id; + u16 creq_bar_reg; + void __iomem *creq_bar_reg_iomem; + + /* Cmd-Resp and Async Event notification queue */ + struct bnxt_qplib_hwq creq; + u64 creq_qp_event_processed; + u64 creq_func_event_processed; + + /* Actual Cmd and Resp Queues */ + struct bnxt_qplib_hwq cmdq; + struct bnxt_qplib_crsq crsq; + struct bnxt_qplib_hwq crsb; +}; + +void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); +int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, + struct bnxt_qplib_rcfw *rcfw); +void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); +int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, + struct bnxt_qplib_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off, int virt_fn, + int (*aeq_handler) + (struct bnxt_qplib_rcfw *, + struct creq_func_event *)); + +int bnxt_qplib_rcfw_block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie); +int bnxt_qplib_rcfw_wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie); +void *bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, + struct cmdq_base *req, void **crsbe, + u8 is_block); + +int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw); +int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_ctx *ctx, int is_virtfn); +#endif /* __BNXT_QPLIB_RCFW_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c new file mode 100644 index 000000000000..62447b3badec --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -0,0 +1,825 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Description: QPLib resource manager
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/inetdevice.h>
+#include <linux/dma-mapping.h>
+#include <linux/if_vlan.h>
+#include "roce_hsi.h"
+#include "qplib_res.h"
+#include "qplib_sp.h"
+#include "qplib_rcfw.h"
+
+static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev,
+				      struct bnxt_qplib_stats *stats);
+static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev,
+				      struct bnxt_qplib_stats *stats);
+
+/* PBL */
+static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
+		       bool is_umem)
+{
+	int i;
+
+	if (!is_umem) {
+		for (i = 0; i < pbl->pg_count; i++) {
+			if (pbl->pg_arr[i])
+				dma_free_coherent(&pdev->dev, pbl->pg_size,
+						  (void *)((unsigned long)
+							   pbl->pg_arr[i] &
+							   PAGE_MASK),
+						  pbl->pg_map_arr[i]);
+			else
+				dev_warn(&pdev->dev,
+					 "QPLIB: PBL free pg_arr[%d] empty?!",
+					 i);
+			pbl->pg_arr[i] = NULL;
+		}
+	}
+	kfree(pbl->pg_arr);
+	pbl->pg_arr = NULL;
+	kfree(pbl->pg_map_arr);
+	pbl->pg_map_arr = NULL;
+	pbl->pg_count = 0;
+	pbl->pg_size = 0;
+}
+
+static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
+		       struct scatterlist *sghead, u32 pages, u32 pg_size)
+{
+	struct scatterlist *sg;
+	bool is_umem = false;
+	int i;
+
+	/* page ptr arrays */
+	pbl->pg_arr = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+	if (!pbl->pg_arr)
+		return -ENOMEM;
+
+	pbl->pg_map_arr = kcalloc(pages, sizeof(dma_addr_t), GFP_KERNEL);
+	if (!pbl->pg_map_arr) {
+		kfree(pbl->pg_arr);
+		pbl->pg_arr = NULL;
+		return -ENOMEM;
+	}
+	pbl->pg_count = 0;
+	pbl->pg_size = pg_size;
+
+	if (!sghead) {
+		for (i = 0; i < pages; i++) {
+			pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev,
+							    pbl->pg_size,
+							    &pbl->pg_map_arr[i],
+							    GFP_KERNEL);
+			if (!pbl->pg_arr[i])
+				goto fail;
+			memset(pbl->pg_arr[i], 0, pbl->pg_size);
+			pbl->pg_count++;
+		}
+	} else {
+		i = 0;
+		is_umem = true;
+		for_each_sg(sghead, sg, pages, i) {
+			pbl->pg_map_arr[i] = sg_dma_address(sg);
+			pbl->pg_arr[i] = sg_virt(sg);
+			if (!pbl->pg_arr[i])
+				goto fail;
+
+			pbl->pg_count++;
+		}
+	}
+
+	return 0;
+
+fail:
+	__free_pbl(pdev, pbl, is_umem);
+	return -ENOMEM;
+}
+
+/* HWQ */
+void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq)
+{
+	int i;
+
+	if (!hwq->max_elements)
+		return;
+	if (hwq->level >= PBL_LVL_MAX)
+		return;
+
+	for (i = 0; i < hwq->level + 1; i++) {
+		if (i == hwq->level)
+			__free_pbl(pdev, &hwq->pbl[i], hwq->is_user);
+		else
+			__free_pbl(pdev, &hwq->pbl[i], false);
+	}
+
+	hwq->level = PBL_LVL_MAX;
+	hwq->max_elements = 0;
+	hwq->element_size = 0;
+	hwq->prod = 0;
+	hwq->cons = 0;
+	hwq->cp_bit = 0;
+}
+
+/* All HWQs are power of 2 in size */
+int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
+			      struct scatterlist *sghead, int nmap,
+			      u32 *elements, u32 element_size, u32 aux,
+			      u32 pg_size, enum
bnxt_qplib_hwq_type hwq_type) +{ + u32 pages, slots, size, aux_pages = 0, aux_size = 0; + dma_addr_t *src_phys_ptr, **dst_virt_ptr; + int i, rc; + + hwq->level = PBL_LVL_MAX; + + slots = roundup_pow_of_two(*elements); + if (aux) { + aux_size = roundup_pow_of_two(aux); + aux_pages = (slots * aux_size) / pg_size; + if ((slots * aux_size) % pg_size) + aux_pages++; + } + size = roundup_pow_of_two(element_size); + + if (!sghead) { + hwq->is_user = false; + pages = (slots * size) / pg_size + aux_pages; + if ((slots * size) % pg_size) + pages++; + if (!pages) + return -EINVAL; + } else { + hwq->is_user = true; + pages = nmap; + } + + /* Alloc the 1st memory block; can be a PDL/PTL/PBL */ + if (sghead && (pages == MAX_PBL_LVL_0_PGS)) + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], sghead, + pages, pg_size); + else + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], NULL, 1, pg_size); + if (rc) + goto fail; + + hwq->level = PBL_LVL_0; + + if (pages > MAX_PBL_LVL_0_PGS) { + if (pages > MAX_PBL_LVL_1_PGS) { + /* 2 levels of indirection */ + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], NULL, + MAX_PBL_LVL_1_PGS_FOR_LVL_2, pg_size); + if (rc) + goto fail; + /* Fill in lvl0 PBL */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | PTU_PDE_VALID; + hwq->level = PBL_LVL_1; + + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_2], sghead, + pages, pg_size); + if (rc) + goto fail; + + /* Fill in lvl1 PBL */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[PBL_LVL_1].pg_arr; + src_phys_ptr = hwq->pbl[PBL_LVL_2].pg_map_arr; + for (i = 0; i < hwq->pbl[PBL_LVL_2].pg_count; i++) { + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | PTU_PTE_VALID; + } + if (hwq_type == HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[PBL_LVL_2].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + hwq->level = PBL_LVL_2; + } else { + u32 flag = hwq_type == HWQ_TYPE_L2_CMPL ? 
0 : + PTU_PTE_VALID; + + /* 1 level of indirection */ + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], sghead, + pages, pg_size); + if (rc) + goto fail; + /* Fill in lvl0 PBL */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) { + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | flag; + } + if (hwq_type == HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[PBL_LVL_1].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + hwq->level = PBL_LVL_1; + } + } + hwq->pdev = pdev; + spin_lock_init(&hwq->lock); + hwq->prod = 0; + hwq->cons = 0; + *elements = hwq->max_elements = slots; + hwq->element_size = size; + + /* For direct access to the elements */ + hwq->pbl_ptr = hwq->pbl[hwq->level].pg_arr; + hwq->pbl_dma_ptr = hwq->pbl[hwq->level].pg_map_arr; + + return 0; + +fail: + bnxt_qplib_free_hwq(pdev, hwq); + return -ENOMEM; +} + +/* Context Tables */ +void bnxt_qplib_free_ctx(struct pci_dev *pdev, + struct bnxt_qplib_ctx *ctx) +{ + int i; + + bnxt_qplib_free_hwq(pdev, &ctx->qpc_tbl); + bnxt_qplib_free_hwq(pdev, &ctx->mrw_tbl); + bnxt_qplib_free_hwq(pdev, &ctx->srqc_tbl); + bnxt_qplib_free_hwq(pdev, &ctx->cq_tbl); + bnxt_qplib_free_hwq(pdev, &ctx->tim_tbl); + for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) + bnxt_qplib_free_hwq(pdev, &ctx->tqm_tbl[i]); + bnxt_qplib_free_hwq(pdev, &ctx->tqm_pde); + bnxt_qplib_free_stats_ctx(pdev, &ctx->stats); +} + +/* + * Routine: bnxt_qplib_alloc_ctx + * Description: + * Context tables are memories which are used by the chip fw. + * The 6 tables defined are: + * QPC ctx - holds QP states + * MRW ctx - holds memory region and window + * SRQ ctx - holds shared RQ states + * CQ ctx - holds completion queue states + * TQM ctx - holds Tx Queue Manager context + * TIM ctx - holds timer context + * Depending on the size of the tbl requested, either a 1 Page Buffer List + * or a 1-to-2-stage indirection Page Directory List + 1 PBL is used + * instead. 
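+ * For example (illustrative numbers only): with 4K pages and the
+ * 448-byte QPC entry size defined in qplib_res.h, 64K QP contexts
+ * need 64K * 448B = 28MB of backing store, i.e. 7168 pages; a single
+ * 4K PBL page maps only 512 pages, so the 2-level PDL -> PBL -> pages
+ * layout is selected.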
+ * Table might be employed as follows: + * For 0 < ctx size <= 1 PAGE, 0 level of ind is used + * For 1 PAGE < ctx size <= 512 entries size, 1 level of ind is used + * For 512 < ctx size <= MAX, 2 levels of ind is used + * Returns: + * 0 if success, else -ERRORS + */ +int bnxt_qplib_alloc_ctx(struct pci_dev *pdev, + struct bnxt_qplib_ctx *ctx, + bool virt_fn) +{ + int i, j, k, rc = 0; + int fnz_idx = -1; + __le64 **pbl_ptr; + + if (virt_fn) + goto stats_alloc; + + /* QPC Tables */ + ctx->qpc_tbl.max_elements = ctx->qpc_count; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->qpc_tbl, NULL, 0, + &ctx->qpc_tbl.max_elements, + BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + /* MRW Tables */ + ctx->mrw_tbl.max_elements = ctx->mrw_count; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->mrw_tbl, NULL, 0, + &ctx->mrw_tbl.max_elements, + BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + /* SRQ Tables */ + ctx->srqc_tbl.max_elements = ctx->srqc_count; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->srqc_tbl, NULL, 0, + &ctx->srqc_tbl.max_elements, + BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + /* CQ Tables */ + ctx->cq_tbl.max_elements = ctx->cq_count; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->cq_tbl, NULL, 0, + &ctx->cq_tbl.max_elements, + BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + /* TQM Buffer */ + ctx->tqm_pde.max_elements = 512; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_pde, NULL, 0, + &ctx->tqm_pde.max_elements, sizeof(u64), + 0, PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) { + if (!ctx->tqm_count[i]) + continue; + ctx->tqm_tbl[i].max_elements = ctx->qpc_count * + ctx->tqm_count[i]; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_tbl[i], NULL, 0, + &ctx->tqm_tbl[i].max_elements, 1, + 0, PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + } + pbl_ptr = (__le64 **)ctx->tqm_pde.pbl_ptr; + for (i = 0, j = 0; i < MAX_TQM_ALLOC_REQ; + i++, j += MAX_TQM_ALLOC_BLK_SIZE) { + if (!ctx->tqm_tbl[i].max_elements) + continue; + if (fnz_idx == -1) + fnz_idx = i; + switch (ctx->tqm_tbl[i].level) { + case PBL_LVL_2: + for (k = 0; k < ctx->tqm_tbl[i].pbl[PBL_LVL_1].pg_count; + k++) + pbl_ptr[PTR_PG(j + k)][PTR_IDX(j + k)] = + cpu_to_le64( + ctx->tqm_tbl[i].pbl[PBL_LVL_1].pg_map_arr[k] + | PTU_PTE_VALID); + break; + case PBL_LVL_1: + case PBL_LVL_0: + default: + pbl_ptr[PTR_PG(j)][PTR_IDX(j)] = cpu_to_le64( + ctx->tqm_tbl[i].pbl[PBL_LVL_0].pg_map_arr[0] | + PTU_PTE_VALID); + break; + } + } + if (fnz_idx == -1) + fnz_idx = 0; + ctx->tqm_pde_level = ctx->tqm_tbl[fnz_idx].level == PBL_LVL_2 ? 
+ PBL_LVL_2 : ctx->tqm_tbl[fnz_idx].level + 1; + + /* TIM Buffer */ + ctx->tim_tbl.max_elements = ctx->qpc_count * 16; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tim_tbl, NULL, 0, + &ctx->tim_tbl.max_elements, 1, + 0, PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + +stats_alloc: + /* Stats */ + rc = bnxt_qplib_alloc_stats_ctx(pdev, &ctx->stats); + if (rc) + goto fail; + + return 0; + +fail: + bnxt_qplib_free_ctx(pdev, ctx); + return rc; +} + +/* GUID */ +void bnxt_qplib_get_guid(u8 *dev_addr, u8 *guid) +{ + u8 mac[ETH_ALEN]; + + /* MAC-48 to EUI-64 mapping */ + memcpy(mac, dev_addr, ETH_ALEN); + guid[0] = mac[0] ^ 2; + guid[1] = mac[1]; + guid[2] = mac[2]; + guid[3] = 0xff; + guid[4] = 0xfe; + guid[5] = mac[3]; + guid[6] = mac[4]; + guid[7] = mac[5]; +} + +static void bnxt_qplib_free_sgid_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl) +{ + kfree(sgid_tbl->tbl); + kfree(sgid_tbl->hw_id); + kfree(sgid_tbl->ctx); + sgid_tbl->tbl = NULL; + sgid_tbl->hw_id = NULL; + sgid_tbl->ctx = NULL; + sgid_tbl->max = 0; + sgid_tbl->active = 0; +} + +static int bnxt_qplib_alloc_sgid_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl, + u16 max) +{ + sgid_tbl->tbl = kcalloc(max, sizeof(struct bnxt_qplib_gid), GFP_KERNEL); + if (!sgid_tbl->tbl) + return -ENOMEM; + + sgid_tbl->hw_id = kcalloc(max, sizeof(u16), GFP_KERNEL); + if (!sgid_tbl->hw_id) + goto out_free1; + + sgid_tbl->ctx = kcalloc(max, sizeof(void *), GFP_KERNEL); + if (!sgid_tbl->ctx) + goto out_free2; + + sgid_tbl->max = max; + return 0; +out_free2: + kfree(sgid_tbl->hw_id); + sgid_tbl->hw_id = NULL; +out_free1: + kfree(sgid_tbl->tbl); + sgid_tbl->tbl = NULL; + return -ENOMEM; +}; + +static void bnxt_qplib_cleanup_sgid_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl) +{ + int i; + + for (i = 0; i < sgid_tbl->max; i++) { + if (memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero, + sizeof(bnxt_qplib_gid_zero))) + bnxt_qplib_del_sgid(sgid_tbl, &sgid_tbl->tbl[i], true); + } + memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max); + memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max); + sgid_tbl->active = 0; +} + +static void bnxt_qplib_init_sgid_tbl(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct net_device *netdev) +{ + memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max); + memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max); +} + +static void bnxt_qplib_free_pkey_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl) +{ + if (!pkey_tbl->tbl) + dev_dbg(&res->pdev->dev, "QPLIB: PKEY tbl not present"); + else + kfree(pkey_tbl->tbl); + + pkey_tbl->tbl = NULL; + pkey_tbl->max = 0; + pkey_tbl->active = 0; +} + +static int bnxt_qplib_alloc_pkey_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, + u16 max) +{ + pkey_tbl->tbl = kcalloc(max, sizeof(u16), GFP_KERNEL); + if (!pkey_tbl->tbl) + return -ENOMEM; + + pkey_tbl->max = max; + return 0; +}; + +/* PDs */ +int bnxt_qplib_alloc_pd(struct bnxt_qplib_pd_tbl *pdt, struct bnxt_qplib_pd *pd) +{ + u32 bit_num; + + bit_num = find_first_bit(pdt->tbl, pdt->max); + if (bit_num == pdt->max) + return -ENOMEM; + + /* Found unused PD */ + clear_bit(bit_num, pdt->tbl); + pd->id = bit_num; + return 0; +} + +int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res, + struct bnxt_qplib_pd_tbl *pdt, + struct bnxt_qplib_pd *pd) +{ + if (test_and_set_bit(pd->id, pdt->tbl)) { + dev_warn(&res->pdev->dev, "Freeing an unused PD? 
pdn = %d", + pd->id); + return -EINVAL; + } + pd->id = 0; + return 0; +} + +static void bnxt_qplib_free_pd_tbl(struct bnxt_qplib_pd_tbl *pdt) +{ + kfree(pdt->tbl); + pdt->tbl = NULL; + pdt->max = 0; +} + +static int bnxt_qplib_alloc_pd_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_pd_tbl *pdt, + u32 max) +{ + u32 bytes; + + bytes = max >> 3; + if (!bytes) + bytes = 1; + pdt->tbl = kmalloc(bytes, GFP_KERNEL); + if (!pdt->tbl) + return -ENOMEM; + + pdt->max = max; + memset((u8 *)pdt->tbl, 0xFF, bytes); + + return 0; +} + +/* DPIs */ +int bnxt_qplib_alloc_dpi(struct bnxt_qplib_dpi_tbl *dpit, + struct bnxt_qplib_dpi *dpi, + void *app) +{ + u32 bit_num; + + bit_num = find_first_bit(dpit->tbl, dpit->max); + if (bit_num == dpit->max) + return -ENOMEM; + + /* Found unused DPI */ + clear_bit(bit_num, dpit->tbl); + dpit->app_tbl[bit_num] = app; + + dpi->dpi = bit_num; + dpi->dbr = dpit->dbr_bar_reg_iomem + (bit_num * PAGE_SIZE); + dpi->umdbr = dpit->unmapped_dbr + (bit_num * PAGE_SIZE); + + return 0; +} + +int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi_tbl *dpit, + struct bnxt_qplib_dpi *dpi) +{ + if (dpi->dpi >= dpit->max) { + dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d", dpi->dpi); + return -EINVAL; + } + if (test_and_set_bit(dpi->dpi, dpit->tbl)) { + dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d", + dpi->dpi); + return -EINVAL; + } + if (dpit->app_tbl) + dpit->app_tbl[dpi->dpi] = NULL; + memset(dpi, 0, sizeof(*dpi)); + + return 0; +} + +static void bnxt_qplib_free_dpi_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi_tbl *dpit) +{ + kfree(dpit->tbl); + kfree(dpit->app_tbl); + if (dpit->dbr_bar_reg_iomem) + pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); + memset(dpit, 0, sizeof(*dpit)); +} + +static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi_tbl *dpit, + u32 dbr_offset) +{ + u32 dbr_bar_reg = RCFW_DBR_PCI_BAR_REGION; + resource_size_t bar_reg_base; + u32 dbr_len, bytes; + + if (dpit->dbr_bar_reg_iomem) { + dev_err(&res->pdev->dev, + "QPLIB: DBR BAR region %d already mapped", dbr_bar_reg); + return -EALREADY; + } + + bar_reg_base = pci_resource_start(res->pdev, dbr_bar_reg); + if (!bar_reg_base) { + dev_err(&res->pdev->dev, + "QPLIB: BAR region %d resc start failed", dbr_bar_reg); + return -ENOMEM; + } + + dbr_len = pci_resource_len(res->pdev, dbr_bar_reg) - dbr_offset; + if (!dbr_len || ((dbr_len & (PAGE_SIZE - 1)) != 0)) { + dev_err(&res->pdev->dev, "QPLIB: Invalid DBR length %d", + dbr_len); + return -ENOMEM; + } + + dpit->dbr_bar_reg_iomem = ioremap_nocache(bar_reg_base + dbr_offset, + dbr_len); + if (!dpit->dbr_bar_reg_iomem) { + dev_err(&res->pdev->dev, + "QPLIB: FP: DBR BAR region %d mapping failed", + dbr_bar_reg); + return -ENOMEM; + } + + dpit->unmapped_dbr = bar_reg_base + dbr_offset; + dpit->max = dbr_len / PAGE_SIZE; + + dpit->app_tbl = kcalloc(dpit->max, sizeof(void *), GFP_KERNEL); + if (!dpit->app_tbl) { + pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); + dev_err(&res->pdev->dev, + "QPLIB: DPI app tbl allocation failed"); + return -ENOMEM; + } + + bytes = dpit->max >> 3; + if (!bytes) + bytes = 1; + + dpit->tbl = kmalloc(bytes, GFP_KERNEL); + if (!dpit->tbl) { + pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); + kfree(dpit->app_tbl); + dpit->app_tbl = NULL; + dev_err(&res->pdev->dev, + "QPLIB: DPI tbl allocation failed for size = %d", + bytes); + return -ENOMEM; + } + + memset((u8 *)dpit->tbl, 0xFF, bytes); + + return 0; +} + +/* PKEYs */ +static void 
bnxt_qplib_cleanup_pkey_tbl(struct bnxt_qplib_pkey_tbl *pkey_tbl) +{ + memset(pkey_tbl->tbl, 0, sizeof(u16) * pkey_tbl->max); + pkey_tbl->active = 0; +} + +static void bnxt_qplib_init_pkey_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl) +{ + u16 pkey = 0xFFFF; + + memset(pkey_tbl->tbl, 0, sizeof(u16) * pkey_tbl->max); + + /* pkey default = 0xFFFF */ + bnxt_qplib_add_pkey(res, pkey_tbl, &pkey, false); +} + +/* Stats */ +static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats) +{ + if (stats->dma) { + dma_free_coherent(&pdev->dev, stats->size, + stats->dma, stats->dma_map); + } + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; +} + +static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; + stats->size = sizeof(struct ctx_hw_stats); + stats->dma = dma_alloc_coherent(&pdev->dev, stats->size, + &stats->dma_map, GFP_KERNEL); + if (!stats->dma) { + dev_err(&pdev->dev, "QPLIB: Stats DMA allocation failed"); + return -ENOMEM; + } + return 0; +} + +void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res) +{ + bnxt_qplib_cleanup_pkey_tbl(&res->pkey_tbl); + bnxt_qplib_cleanup_sgid_tbl(res, &res->sgid_tbl); +} + +int bnxt_qplib_init_res(struct bnxt_qplib_res *res) +{ + bnxt_qplib_init_sgid_tbl(&res->sgid_tbl, res->netdev); + bnxt_qplib_init_pkey_tbl(res, &res->pkey_tbl); + + return 0; +} + +void bnxt_qplib_free_res(struct bnxt_qplib_res *res) +{ + bnxt_qplib_free_pkey_tbl(res, &res->pkey_tbl); + bnxt_qplib_free_sgid_tbl(res, &res->sgid_tbl); + bnxt_qplib_free_pd_tbl(&res->pd_tbl); + bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl); + + res->netdev = NULL; + res->pdev = NULL; +} + +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, + struct net_device *netdev, + struct bnxt_qplib_dev_attr *dev_attr) +{ + int rc = 0; + + res->pdev = pdev; + res->netdev = netdev; + + rc = bnxt_qplib_alloc_sgid_tbl(res, &res->sgid_tbl, dev_attr->max_sgid); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_pkey_tbl(res, &res->pkey_tbl, dev_attr->max_pkey); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_pd_tbl(res, &res->pd_tbl, dev_attr->max_pd); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_dpi_tbl(res, &res->dpi_tbl, dev_attr->l2_db_size); + if (rc) + goto fail; + + return 0; +fail: + bnxt_qplib_free_res(res); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h new file mode 100644 index 000000000000..6277d802ca4b --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -0,0 +1,223 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: QPLib resource manager (header) + */ + +#ifndef __BNXT_QPLIB_RES_H__ +#define __BNXT_QPLIB_RES_H__ + +extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; + +#define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) +#define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1) +#define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG) +#define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG) + +#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1)) + +enum bnxt_qplib_hwq_type { + HWQ_TYPE_CTX, + HWQ_TYPE_QUEUE, + HWQ_TYPE_L2_CMPL +}; + +#define MAX_PBL_LVL_0_PGS 1 +#define MAX_PBL_LVL_1_PGS 512 +#define MAX_PBL_LVL_1_PGS_SHIFT 9 +#define MAX_PBL_LVL_1_PGS_FOR_LVL_2 256 +#define MAX_PBL_LVL_2_PGS (256 * 512) + +enum bnxt_qplib_pbl_lvl { + PBL_LVL_0, + PBL_LVL_1, + PBL_LVL_2, + PBL_LVL_MAX +}; + +#define ROCE_PG_SIZE_4K (4 * 1024) +#define ROCE_PG_SIZE_8K (8 * 1024) +#define ROCE_PG_SIZE_64K (64 * 1024) +#define ROCE_PG_SIZE_2M (2 * 1024 * 1024) +#define ROCE_PG_SIZE_8M (8 * 1024 * 1024) +#define ROCE_PG_SIZE_1G (1024 * 1024 * 1024) + +struct bnxt_qplib_pbl { + u32 pg_count; + u32 pg_size; + void **pg_arr; + dma_addr_t *pg_map_arr; +}; + +struct bnxt_qplib_hwq { + struct pci_dev *pdev; + /* lock to protect qplib_hwq */ + spinlock_t lock; + struct bnxt_qplib_pbl pbl[PBL_LVL_MAX]; + enum bnxt_qplib_pbl_lvl level; /* 0, 1, or 2 */ + /* ptr for easy access to the PBL entries */ + void **pbl_ptr; + /* ptr for easy access to the dma_addr */ + dma_addr_t *pbl_dma_ptr; + u32 max_elements; + u16 element_size; /* Size of each entry */ + + u32 prod; /* raw */ + u32 cons; /* raw */ + u8 cp_bit; + u8 is_user; +}; + +/* Tables */ +struct bnxt_qplib_pd_tbl { + unsigned long *tbl; + u32 max; +}; + +struct bnxt_qplib_sgid_tbl { + struct bnxt_qplib_gid *tbl; + u16 *hw_id; + u16 max; + u16 active; + void *ctx; +}; + +struct bnxt_qplib_pkey_tbl { + u16 *tbl; + u16 max; + u16 active; +}; + +struct bnxt_qplib_dpi { + u32 dpi; + void __iomem *dbr; + u64 umdbr; +}; + +struct bnxt_qplib_dpi_tbl { + void **app_tbl; + unsigned long *tbl; + u16 max; + void __iomem *dbr_bar_reg_iomem; + u64 unmapped_dbr; +}; + +struct bnxt_qplib_stats { + dma_addr_t dma_map; + void *dma; + u32 size; + u32 fw_id; +}; + +struct bnxt_qplib_vf_res { + u32 max_qp_per_vf; + u32 max_mrw_per_vf; + u32 max_srq_per_vf; + u32 max_cq_per_vf; + u32 max_gid_per_vf; +}; + +#define BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE 448 +#define BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE 64 +#define BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE 64 +#define BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE 128 + +struct 
bnxt_qplib_ctx { + u32 qpc_count; + struct bnxt_qplib_hwq qpc_tbl; + u32 mrw_count; + struct bnxt_qplib_hwq mrw_tbl; + u32 srqc_count; + struct bnxt_qplib_hwq srqc_tbl; + u32 cq_count; + struct bnxt_qplib_hwq cq_tbl; + struct bnxt_qplib_hwq tim_tbl; +#define MAX_TQM_ALLOC_REQ 32 +#define MAX_TQM_ALLOC_BLK_SIZE 8 + u8 tqm_count[MAX_TQM_ALLOC_REQ]; + struct bnxt_qplib_hwq tqm_pde; + u32 tqm_pde_level; + struct bnxt_qplib_hwq tqm_tbl[MAX_TQM_ALLOC_REQ]; + struct bnxt_qplib_stats stats; + struct bnxt_qplib_vf_res vf_res; +}; + +struct bnxt_qplib_res { + struct pci_dev *pdev; + struct net_device *netdev; + + struct bnxt_qplib_rcfw *rcfw; + + struct bnxt_qplib_pd_tbl pd_tbl; + struct bnxt_qplib_sgid_tbl sgid_tbl; + struct bnxt_qplib_pkey_tbl pkey_tbl; + struct bnxt_qplib_dpi_tbl dpi_tbl; +}; + +#define to_bnxt_qplib(ptr, type, member) \ + container_of(ptr, type, member) + +struct bnxt_qplib_pd; +struct bnxt_qplib_dev_attr; + +void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq); +int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq, + struct scatterlist *sl, int nmap, u32 *elements, + u32 elements_per_page, u32 aux, u32 pg_size, + enum bnxt_qplib_hwq_type hwq_type); +void bnxt_qplib_get_guid(u8 *dev_addr, u8 *guid); +int bnxt_qplib_alloc_pd(struct bnxt_qplib_pd_tbl *pd_tbl, + struct bnxt_qplib_pd *pd); +int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res, + struct bnxt_qplib_pd_tbl *pd_tbl, + struct bnxt_qplib_pd *pd); +int bnxt_qplib_alloc_dpi(struct bnxt_qplib_dpi_tbl *dpit, + struct bnxt_qplib_dpi *dpi, + void *app); +int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi_tbl *dpi_tbl, + struct bnxt_qplib_dpi *dpi); +void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); +int bnxt_qplib_init_res(struct bnxt_qplib_res *res); +void bnxt_qplib_free_res(struct bnxt_qplib_res *res); +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, + struct net_device *netdev, + struct bnxt_qplib_dev_attr *dev_attr); +void bnxt_qplib_free_ctx(struct pci_dev *pdev, + struct bnxt_qplib_ctx *ctx); +int bnxt_qplib_alloc_ctx(struct pci_dev *pdev, + struct bnxt_qplib_ctx *ctx, + bool virt_fn); +#endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c new file mode 100644 index 000000000000..7b31eccedf11 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -0,0 +1,838 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Description: Slow Path Operators
+ */
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+
+#include "roce_hsi.h"
+
+#include "qplib_res.h"
+#include "qplib_rcfw.h"
+#include "qplib_sp.h"
+
+const struct bnxt_qplib_gid bnxt_qplib_gid_zero = {{ 0, 0, 0, 0, 0, 0, 0, 0,
+						     0, 0, 0, 0, 0, 0, 0, 0 } };
+
+/* Device */
+int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
+			    struct bnxt_qplib_dev_attr *attr)
+{
+	struct cmdq_query_func req;
+	struct creq_query_func_resp *resp;
+	struct creq_query_func_resp_sb *sb;
+	u16 cmd_flags = 0;
+	u32 temp;
+	u8 *tqm_alloc;
+	int i;
+
+	RCFW_CMD_PREP(req, QUERY_FUNC, cmd_flags);
+
+	req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS;
+	resp = (struct creq_query_func_resp *)
+		bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void **)&sb,
+					     0);
+	if (!resp) {
+		dev_err(&rcfw->pdev->dev, "QPLIB: SP: QUERY_FUNC send failed");
+		return -EINVAL;
+	}
+	if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
+		/* Cmd timed out */
+		dev_err(&rcfw->pdev->dev, "QPLIB: SP: QUERY_FUNC timed out");
+		return -ETIMEDOUT;
+	}
+	if (resp->status ||
+	    le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+		dev_err(&rcfw->pdev->dev, "QPLIB: SP: QUERY_FUNC failed ");
+		dev_err(&rcfw->pdev->dev,
+			"QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
+			resp->status, le16_to_cpu(req.cookie),
+			le16_to_cpu(resp->cookie));
+		return -EINVAL;
+	}
+	/* Extract the context from the side buffer */
+	attr->max_qp = le32_to_cpu(sb->max_qp);
+	attr->max_qp_rd_atom =
+		sb->max_qp_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ?
+		BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom;
+	attr->max_qp_init_rd_atom =
+		sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ?
+ BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr); + attr->max_qp_sges = sb->max_sge; + attr->max_cq = le32_to_cpu(sb->max_cq); + attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); + attr->max_cq_sges = attr->max_qp_sges; + attr->max_mr = le32_to_cpu(sb->max_mr); + attr->max_mw = le32_to_cpu(sb->max_mw); + + attr->max_mr_size = le64_to_cpu(sb->max_mr_size); + attr->max_pd = 64 * 1024; + attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp); + attr->max_ah = le32_to_cpu(sb->max_ah); + + attr->max_fmr = le32_to_cpu(sb->max_fmr); + attr->max_map_per_fmr = sb->max_map_per_fmr; + + attr->max_srq = le16_to_cpu(sb->max_srq); + attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1; + attr->max_srq_sges = sb->max_srq_sge; + /* Bono only reports 1 PKEY for now, but it can support > 1 */ + attr->max_pkey = le32_to_cpu(sb->max_pkeys); + + attr->max_inline_data = le32_to_cpu(sb->max_inline_data); + attr->l2_db_size = (sb->l2_db_space_size + 1) * PAGE_SIZE; + attr->max_sgid = le32_to_cpu(sb->max_gid); + + strlcpy(attr->fw_ver, "20.6.28.0", sizeof(attr->fw_ver)); + + for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) { + temp = le32_to_cpu(sb->tqm_alloc_reqs[i]); + tqm_alloc = (u8 *)&temp; + attr->tqm_alloc_reqs[i * 4] = *tqm_alloc; + attr->tqm_alloc_reqs[i * 4 + 1] = *(++tqm_alloc); + attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc); + attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc); + } + return 0; +} + +/* SGID */ +int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl, int index, + struct bnxt_qplib_gid *gid) +{ + if (index > sgid_tbl->max) { + dev_err(&res->pdev->dev, + "QPLIB: Index %d exceeded SGID table max (%d)", + index, sgid_tbl->max); + return -EINVAL; + } + memcpy(gid, &sgid_tbl->tbl[index], sizeof(*gid)); + return 0; +} + +int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, bool update) +{ + struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, + struct bnxt_qplib_res, + sgid_tbl); + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + int index; + + if (!sgid_tbl) { + dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated"); + return -EINVAL; + } + /* Do we need a sgid_lock here? 
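+	 * (Probably not at present: SGID add/del are driven by ib_core's
+	 * GID table updates, which appear to be serialized per device;
+	 * this is an assumption worth re-checking if new callers appear.)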
*/ + if (!sgid_tbl->active) { + dev_err(&res->pdev->dev, + "QPLIB: SGID table has no active entries"); + return -ENOMEM; + } + for (index = 0; index < sgid_tbl->max; index++) { + if (!memcmp(&sgid_tbl->tbl[index], gid, sizeof(*gid))) + break; + } + if (index == sgid_tbl->max) { + dev_warn(&res->pdev->dev, "GID not found in the SGID table"); + return 0; + } + /* Remove GID from the SGID table */ + if (update) { + struct cmdq_delete_gid req; + struct creq_delete_gid_resp *resp; + u16 cmd_flags = 0; + + RCFW_CMD_PREP(req, DELETE_GID, cmd_flags); + if (sgid_tbl->hw_id[index] == 0xFFFF) { + dev_err(&res->pdev->dev, + "QPLIB: GID entry contains an invalid HW id"); + return -EINVAL; + } + req.gid_index = cpu_to_le16(sgid_tbl->hw_id[index]); + resp = (struct creq_delete_gid_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, NULL, + 0); + if (!resp) { + dev_err(&res->pdev->dev, + "QPLIB: SP: DELETE_GID send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, + le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&res->pdev->dev, + "QPLIB: SP: DELETE_GID timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&res->pdev->dev, + "QPLIB: SP: DELETE_GID failed "); + dev_err(&res->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + } + memcpy(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero, + sizeof(bnxt_qplib_gid_zero)); + sgid_tbl->active--; + dev_dbg(&res->pdev->dev, + "QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x", + index, sgid_tbl->hw_id[index], sgid_tbl->active); + sgid_tbl->hw_id[index] = (u16)-1; + + /* unlock */ + return 0; +} + +int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, u8 *smac, u16 vlan_id, + bool update, u32 *index) +{ + struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, + struct bnxt_qplib_res, + sgid_tbl); + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + int i, free_idx, rc = 0; + + if (!sgid_tbl) { + dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated"); + return -EINVAL; + } + /* Do we need a sgid_lock here? 
 */
+	if (sgid_tbl->active == sgid_tbl->max) {
+		dev_err(&res->pdev->dev, "QPLIB: SGID table is full");
+		return -ENOMEM;
+	}
+	free_idx = sgid_tbl->max;
+	for (i = 0; i < sgid_tbl->max; i++) {
+		if (!memcmp(&sgid_tbl->tbl[i], gid, sizeof(*gid))) {
+			dev_dbg(&res->pdev->dev,
+				"QPLIB: SGID entry already exists at index %d!",
+				i);
+			*index = i;
+			return -EALREADY;
+		} else if (!memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero,
+				   sizeof(bnxt_qplib_gid_zero)) &&
+			   free_idx == sgid_tbl->max) {
+			free_idx = i;
+		}
+	}
+	if (free_idx == sgid_tbl->max) {
+		dev_err(&res->pdev->dev,
+			"QPLIB: SGID table is FULL but count is not MAX??");
+		return -ENOMEM;
+	}
+	if (update) {
+		struct cmdq_add_gid req;
+		struct creq_add_gid_resp *resp;
+		u16 cmd_flags = 0;
+		u32 temp32[4];
+		u16 temp16[3];
+
+		RCFW_CMD_PREP(req, ADD_GID, cmd_flags);
+
+		memcpy(temp32, gid->data, sizeof(struct bnxt_qplib_gid));
+		req.gid[0] = cpu_to_be32(temp32[3]);
+		req.gid[1] = cpu_to_be32(temp32[2]);
+		req.gid[2] = cpu_to_be32(temp32[1]);
+		req.gid[3] = cpu_to_be32(temp32[0]);
+		if (vlan_id != 0xFFFF)
+			req.vlan = cpu_to_le16((vlan_id &
+					CMDQ_ADD_GID_VLAN_VLAN_ID_MASK) |
+					CMDQ_ADD_GID_VLAN_TPID_TPID_8100 |
+					CMDQ_ADD_GID_VLAN_VLAN_EN);
+
+		/* MAC in network format */
+		memcpy(temp16, smac, 6);
+		req.src_mac[0] = cpu_to_be16(temp16[0]);
+		req.src_mac[1] = cpu_to_be16(temp16[1]);
+		req.src_mac[2] = cpu_to_be16(temp16[2]);
+
+		resp = (struct creq_add_gid_resp *)
+			bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+						     NULL, 0);
+		if (!resp) {
+			dev_err(&res->pdev->dev,
+				"QPLIB: SP: ADD_GID send failed");
+			return -EINVAL;
+		}
+		if (!bnxt_qplib_rcfw_wait_for_resp(rcfw,
+						   le16_to_cpu(req.cookie))) {
+			/* Cmd timed out */
+			dev_err(&res->pdev->dev,
+				"QPLIB: SP: ADD_GID timed out");
+			return -ETIMEDOUT;
+		}
+		if (resp->status ||
+		    le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
+			dev_err(&res->pdev->dev, "QPLIB: SP: ADD_GID failed ");
+			dev_err(&res->pdev->dev,
+				"QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
+				resp->status, le16_to_cpu(req.cookie),
+				le16_to_cpu(resp->cookie));
+			return -EINVAL;
+		}
+		sgid_tbl->hw_id[free_idx] = le32_to_cpu(resp->xid);
+	}
+	/* Add GID to the sgid_tbl */
+	memcpy(&sgid_tbl->tbl[free_idx], gid, sizeof(*gid));
+	sgid_tbl->active++;
+	dev_dbg(&res->pdev->dev,
+		"QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x",
+		free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active);
+
+	*index = free_idx;
+	/* unlock */
+	return rc;
+}
+
+/* pkeys */
+int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res,
+			struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index,
+			u16 *pkey)
+{
+	if (index == 0xFFFF) {
+		*pkey = 0xFFFF;
+		return 0;
+	}
+	if (index > pkey_tbl->max) {
+		dev_err(&res->pdev->dev,
+			"QPLIB: Index %d exceeded PKEY table max (%d)",
+			index, pkey_tbl->max);
+		return -EINVAL;
+	}
+	memcpy(pkey, &pkey_tbl->tbl[index], sizeof(*pkey));
+	return 0;
+}
+
+int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res,
+			struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey,
+			bool update)
+{
+	int i, rc = 0;
+
+	if (!pkey_tbl) {
+		dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated");
+		return -EINVAL;
+	}
+
+	/* Do we need a pkey_lock here?
*/ + if (!pkey_tbl->active) { + dev_err(&res->pdev->dev, + "QPLIB: PKEY table has no active entries"); + return -ENOMEM; + } + for (i = 0; i < pkey_tbl->max; i++) { + if (!memcmp(&pkey_tbl->tbl[i], pkey, sizeof(*pkey))) + break; + } + if (i == pkey_tbl->max) { + dev_err(&res->pdev->dev, + "QPLIB: PKEY 0x%04x not found in the pkey table", + *pkey); + return -ENOMEM; + } + memset(&pkey_tbl->tbl[i], 0, sizeof(*pkey)); + pkey_tbl->active--; + + /* unlock */ + return rc; +} + +int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey, + bool update) +{ + int i, free_idx, rc = 0; + + if (!pkey_tbl) { + dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated"); + return -EINVAL; + } + + /* Do we need a pkey_lock here? */ + if (pkey_tbl->active == pkey_tbl->max) { + dev_err(&res->pdev->dev, "QPLIB: PKEY table is full"); + return -ENOMEM; + } + free_idx = pkey_tbl->max; + for (i = 0; i < pkey_tbl->max; i++) { + if (!memcmp(&pkey_tbl->tbl[i], pkey, sizeof(*pkey))) + return -EALREADY; + else if (!pkey_tbl->tbl[i] && free_idx == pkey_tbl->max) + free_idx = i; + } + if (free_idx == pkey_tbl->max) { + dev_err(&res->pdev->dev, + "QPLIB: PKEY table is FULL but count is not MAX??"); + return -ENOMEM; + } + /* Add PKEY to the pkey_tbl */ + memcpy(&pkey_tbl->tbl[free_idx], pkey, sizeof(*pkey)); + pkey_tbl->active++; + + /* unlock */ + return rc; +} + +/* AH */ +int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_ah req; + struct creq_create_ah_resp *resp; + u16 cmd_flags = 0; + u32 temp32[4]; + u16 temp16[3]; + + RCFW_CMD_PREP(req, CREATE_AH, cmd_flags); + + memcpy(temp32, ah->dgid.data, sizeof(struct bnxt_qplib_gid)); + req.dgid[0] = cpu_to_le32(temp32[0]); + req.dgid[1] = cpu_to_le32(temp32[1]); + req.dgid[2] = cpu_to_le32(temp32[2]); + req.dgid[3] = cpu_to_le32(temp32[3]); + + req.type = ah->nw_type; + req.hop_limit = ah->hop_limit; + req.sgid_index = cpu_to_le16(res->sgid_tbl.hw_id[ah->sgid_index]); + req.dest_vlan_id_flow_label = cpu_to_le32((ah->flow_label & + CMDQ_CREATE_AH_FLOW_LABEL_MASK) | + CMDQ_CREATE_AH_DEST_VLAN_ID_MASK); + req.pd_id = cpu_to_le32(ah->pd->id); + req.traffic_class = ah->traffic_class; + + /* MAC in network format */ + memcpy(temp16, ah->dmac, 6); + req.dest_mac[0] = cpu_to_le16(temp16[0]); + req.dest_mac[1] = cpu_to_le16(temp16[1]); + req.dest_mac[2] = cpu_to_le16(temp16[2]); + + resp = (struct creq_create_ah_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 1); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: SP: CREATE_AH send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_block_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: SP: CREATE_AH timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: SP: CREATE_AH failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + ah->id = le32_to_cpu(resp->xid); + return 0; +} + +int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_destroy_ah req; + struct creq_destroy_ah_resp *resp; + u16 cmd_flags = 0; + + /* Clean up the AH table in the device */ + RCFW_CMD_PREP(req, DESTROY_AH, cmd_flags); + + req.ah_cid = 
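+	/* ah->id is the xid the firmware handed back in the CREATE_AH
+	 * response (see bnxt_qplib_create_ah() above).
+	 */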
cpu_to_le32(ah->id); + + resp = (struct creq_destroy_ah_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 1); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: SP: DESTROY_AH send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_block_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: SP: DESTROY_AH timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: SP: DESTROY_AH failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + return 0; +} + +/* MRW */ +int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_deallocate_key req; + struct creq_deallocate_key_resp *resp; + u16 cmd_flags = 0; + + if (mrw->lkey == 0xFFFFFFFF) { + dev_info(&res->pdev->dev, + "QPLIB: SP: Free a reserved lkey MRW"); + return 0; + } + + RCFW_CMD_PREP(req, DEALLOCATE_KEY, cmd_flags); + + req.mrw_flags = mrw->type; + + if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1) || + (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A) || + (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B)) + req.key = cpu_to_le32(mrw->rkey); + else + req.key = cpu_to_le32(mrw->lkey); + + resp = (struct creq_deallocate_key_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR failed "); + dev_err(&res->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + /* Free the qplib's MRW memory */ + if (mrw->hwq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &mrw->hwq); + + return 0; +} + +int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_allocate_mrw req; + struct creq_allocate_mrw_resp *resp; + u16 cmd_flags = 0; + unsigned long tmp; + + RCFW_CMD_PREP(req, ALLOCATE_MRW, cmd_flags); + + req.pd_id = cpu_to_le32(mrw->pd->id); + req.mrw_flags = mrw->type; + if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR && + mrw->flags & BNXT_QPLIB_FR_PMR) || + mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A || + mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B) + req.access = CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY; + tmp = (unsigned long)mrw; + req.mrw_handle = cpu_to_le64(tmp); + + resp = (struct creq_allocate_mrw_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, 0); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW send failed"); + return -EINVAL; + } + if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) { + /* Cmd timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 
0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1) || + (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A) || + (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B)) + mrw->rkey = le32_to_cpu(resp->xid); + else + mrw->lkey = le32_to_cpu(resp->xid); + return 0; +} + +int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, + bool block) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_deregister_mr req; + struct creq_deregister_mr_resp *resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, DEREGISTER_MR, cmd_flags); + + req.lkey = cpu_to_le32(mrw->lkey); + resp = (struct creq_deregister_mr_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, block); + if (!resp) { + dev_err(&rcfw->pdev->dev, "QPLIB: SP: DEREG_MR send failed"); + return -EINVAL; + } + if (block) + rc = bnxt_qplib_rcfw_block_for_resp(rcfw, + le16_to_cpu(req.cookie)); + else + rc = bnxt_qplib_rcfw_wait_for_resp(rcfw, + le16_to_cpu(req.cookie)); + if (!rc) { + /* Cmd timed out */ + dev_err(&res->pdev->dev, "QPLIB: SP: DEREG_MR timed out"); + return -ETIMEDOUT; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&rcfw->pdev->dev, "QPLIB: SP: DEREG_MR failed "); + dev_err(&rcfw->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + + /* Free the qplib's MR memory */ + if (mrw->hwq.max_elements) { + mrw->va = 0; + mrw->total_size = 0; + bnxt_qplib_free_hwq(res->pdev, &mrw->hwq); + } + + return 0; +} + +int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, + u64 *pbl_tbl, int num_pbls, bool block) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_register_mr req; + struct creq_register_mr_resp *resp; + u16 cmd_flags = 0, level; + int pg_ptrs, pages, i, rc; + dma_addr_t **pbl_ptr; + u32 pg_size; + + if (num_pbls) { + pg_ptrs = roundup_pow_of_two(num_pbls); + pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT; + if (!pages) + pages++; + + if (pages > MAX_PBL_LVL_1_PGS) { + dev_err(&res->pdev->dev, "QPLIB: SP: Reg MR pages "); + dev_err(&res->pdev->dev, + "requested (0x%x) exceeded max (0x%x)", + pages, MAX_PBL_LVL_1_PGS); + return -ENOMEM; + } + /* Free the hwq if it already exist, must be a rereg */ + if (mr->hwq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &mr->hwq); + + mr->hwq.max_elements = pages; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL, 0, + &mr->hwq.max_elements, + PAGE_SIZE, 0, PAGE_SIZE, + HWQ_TYPE_CTX); + if (rc) { + dev_err(&res->pdev->dev, + "SP: Reg MR memory allocation failed"); + return -ENOMEM; + } + /* Write to the hwq */ + pbl_ptr = (dma_addr_t **)mr->hwq.pbl_ptr; + for (i = 0; i < num_pbls; i++) + pbl_ptr[PTR_PG(i)][PTR_IDX(i)] = + (pbl_tbl[i] & PAGE_MASK) | PTU_PTE_VALID; + } + + RCFW_CMD_PREP(req, REGISTER_MR, cmd_flags); + + /* Configure the request */ + if (mr->hwq.level == PBL_LVL_MAX) { + level = 0; + req.pbl = 0; + pg_size = PAGE_SIZE; + } else { + level = mr->hwq.level + 1; + req.pbl = cpu_to_le64(mr->hwq.pbl[PBL_LVL_0].pg_map_arr[0]); + pg_size = mr->hwq.pbl[PBL_LVL_0].pg_size; + } + req.log2_pg_size_lvl = (level << CMDQ_REGISTER_MR_LVL_SFT) | + ((ilog2(pg_size) << + CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT) & + CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK); + req.access = (mr->flags & 0xFFFF); + req.va = cpu_to_le64(mr->va); + req.key = cpu_to_le32(mr->lkey); + req.mr_size 
= cpu_to_le64(mr->total_size); + + resp = (struct creq_register_mr_resp *) + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + NULL, block); + if (!resp) { + dev_err(&res->pdev->dev, "SP: REG_MR send failed"); + rc = -EINVAL; + goto fail; + } + if (block) + rc = bnxt_qplib_rcfw_block_for_resp(rcfw, + le16_to_cpu(req.cookie)); + else + rc = bnxt_qplib_rcfw_wait_for_resp(rcfw, + le16_to_cpu(req.cookie)); + if (!rc) { + /* Cmd timed out */ + dev_err(&res->pdev->dev, "SP: REG_MR timed out"); + rc = -ETIMEDOUT; + goto fail; + } + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&res->pdev->dev, "QPLIB: SP: REG_MR failed "); + dev_err(&res->pdev->dev, + "QPLIB: SP: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + rc = -EINVAL; + goto fail; + } + return 0; + +fail: + if (mr->hwq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &mr->hwq); + return rc; +} + +int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res, + struct bnxt_qplib_frpl *frpl, + int max_pg_ptrs) +{ + int pg_ptrs, pages, rc; + + /* Re-calculate the max to fit the HWQ allocation model */ + pg_ptrs = roundup_pow_of_two(max_pg_ptrs); + pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT; + if (!pages) + pages++; + + if (pages > MAX_PBL_LVL_1_PGS) + return -ENOMEM; + + frpl->hwq.max_elements = pages; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL, 0, + &frpl->hwq.max_elements, PAGE_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (!rc) + frpl->max_pg_ptrs = pg_ptrs; + + return rc; +} + +int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res, + struct bnxt_qplib_frpl *frpl) +{ + bnxt_qplib_free_hwq(res->pdev, &frpl->hwq); + return 0; +} + +int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_map_tc_to_cos req; + struct creq_map_tc_to_cos_resp *resp; + u16 cmd_flags = 0; + int tleft; + + RCFW_CMD_PREP(req, MAP_TC_TO_COS, cmd_flags); + req.cos0 = cpu_to_le16(cids[0]); + req.cos1 = cpu_to_le16(cids[1]); + + resp = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, NULL, 0); + if (!resp) { + dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS send failed"); + return -EINVAL; + } + + tleft = bnxt_qplib_rcfw_block_for_resp(rcfw, le16_to_cpu(req.cookie)); + if (!tleft) { + dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS timed out"); + return -ETIMEDOUT; + } + + if (resp->status || + le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) { + dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS failed "); + dev_err(&res->pdev->dev, + "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x", + resp->status, le16_to_cpu(req.cookie), + le16_to_cpu(resp->cookie)); + return -EINVAL; + } + + return 0; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h new file mode 100644 index 000000000000..1442a617e968 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -0,0 +1,160 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Description: Slow Path Operators (header)
+ *
+ */
+
+#ifndef __BNXT_QPLIB_SP_H__
+#define __BNXT_QPLIB_SP_H__
+
+struct bnxt_qplib_dev_attr {
+	char fw_ver[32];
+	u16 max_sgid;
+	u16 max_mrw;
+	u32 max_qp;
+#define BNXT_QPLIB_MAX_OUT_RD_ATOM	126
+	u32 max_qp_rd_atom;
+	u32 max_qp_init_rd_atom;
+	u32 max_qp_wqes;
+	u32 max_qp_sges;
+	u32 max_cq;
+	u32 max_cq_wqes;
+	u32 max_cq_sges;
+	u32 max_mr;
+	u64 max_mr_size;
+	u32 max_pd;
+	u32 max_mw;
+	u32 max_raw_ethy_qp;
+	u32 max_ah;
+	u32 max_fmr;
+	u32 max_map_per_fmr;
+	u32 max_srq;
+	u32 max_srq_wqes;
+	u32 max_srq_sges;
+	u32 max_pkey;
+	u32 max_inline_data;
+	u32 l2_db_size;
+	u8 tqm_alloc_reqs[MAX_TQM_ALLOC_REQ];
+};
+
+struct bnxt_qplib_pd {
+	u32 id;
+};
+
+struct bnxt_qplib_gid {
+	u8 data[16];
+};
+
+struct bnxt_qplib_ah {
+	struct bnxt_qplib_gid dgid;
+	struct bnxt_qplib_pd *pd;
+	u32 id;
+	u8 sgid_index;
+	/* For Query AH if the hw table and SW table are different */
+	u8 host_sgid_index;
+	u8 traffic_class;
+	u32 flow_label;
+	u8 hop_limit;
+	u8 sl;
+	u8 dmac[6];
+	u16 vlan_id;
+	u8 nw_type;
+};
+
+struct bnxt_qplib_mrw {
+	struct bnxt_qplib_pd *pd;
+	int type;
+	u32 flags;
+#define BNXT_QPLIB_FR_PMR	0x80000000
+	u32 lkey;
+	u32 rkey;
+#define BNXT_QPLIB_RSVD_LKEY	0xFFFFFFFF
+	u64 va;
+	u64 total_size;
+	u32 npages;
+	u64 mr_handle;
+	struct bnxt_qplib_hwq hwq;
+};
+
+struct bnxt_qplib_frpl {
+	int max_pg_ptrs;
+	struct bnxt_qplib_hwq hwq;
+};
+
+#define BNXT_QPLIB_ACCESS_LOCAL_WRITE BIT(0)
+#define BNXT_QPLIB_ACCESS_REMOTE_READ BIT(1)
+#define BNXT_QPLIB_ACCESS_REMOTE_WRITE BIT(2)
+#define BNXT_QPLIB_ACCESS_REMOTE_ATOMIC BIT(3)
+#define BNXT_QPLIB_ACCESS_MW_BIND BIT(4)
+#define BNXT_QPLIB_ACCESS_ZERO_BASED BIT(5)
+#define BNXT_QPLIB_ACCESS_ON_DEMAND BIT(6)
+
+int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
+			struct bnxt_qplib_sgid_tbl *sgid_tbl, int index,
+			struct bnxt_qplib_gid *gid);
+int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
+			struct bnxt_qplib_gid *gid, bool update);
+int bnxt_qplib_add_sgid(struct
bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, u8 *mac, u16 vlan_id, + bool update, u32 *index); +int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index, + u16 *pkey); +int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey, + bool update); +int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey, + bool update); +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_dev_attr *attr); +int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah); +int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah); +int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, + struct bnxt_qplib_mrw *mrw); +int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, + bool block); +int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, + u64 *pbl_tbl, int num_pbls, bool block); +int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr); +int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res, + struct bnxt_qplib_mrw *mr, int max); +int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res, + struct bnxt_qplib_frpl *frpl, int max); +int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res, + struct bnxt_qplib_frpl *frpl); +int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids); +#endif /* __BNXT_QPLIB_SP_H__*/ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h new file mode 100644 index 000000000000..fc23477ac52f --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -0,0 +1,2821 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Description: RoCE HSI File - Autogenerated + */ + +#ifndef __BNXT_RE_HSI_H__ +#define __BNXT_RE_HSI_H__ + +/* include bnxt_hsi.h from bnxt_en driver */ +#include "bnxt_hsi.h" + +/* CMP Door Bell Format (4 bytes) */ +struct cmpl_doorbell { + __le32 key_mask_valid_idx; + #define CMPL_DOORBELL_IDX_MASK 0xffffffUL + #define CMPL_DOORBELL_IDX_SFT 0 + #define CMPL_DOORBELL_RESERVED_MASK 0x3000000UL + #define CMPL_DOORBELL_RESERVED_SFT 24 + #define CMPL_DOORBELL_IDX_VALID 0x4000000UL + #define CMPL_DOORBELL_MASK 0x8000000UL + #define CMPL_DOORBELL_KEY_MASK 0xf0000000UL + #define CMPL_DOORBELL_KEY_SFT 28 + #define CMPL_DOORBELL_KEY_CMPL (0x2UL << 28) +}; + +/* Status Door Bell Format (4 bytes) */ +struct status_doorbell { + __le32 key_idx; + #define STATUS_DOORBELL_IDX_MASK 0xffffffUL + #define STATUS_DOORBELL_IDX_SFT 0 + #define STATUS_DOORBELL_RESERVED_MASK 0xf000000UL + #define STATUS_DOORBELL_RESERVED_SFT 24 + #define STATUS_DOORBELL_KEY_MASK 0xf0000000UL + #define STATUS_DOORBELL_KEY_SFT 28 + #define STATUS_DOORBELL_KEY_STAT (0x3UL << 28) +}; + +/* RoCE Host Structures */ + +/* Doorbell Structures */ +/* 64b Doorbell Format (8 bytes) */ +struct dbr_dbr { + __le32 index; + #define DBR_DBR_INDEX_MASK 0xfffffUL + #define DBR_DBR_INDEX_SFT 0 + #define DBR_DBR_RESERVED12_MASK 0xfff00000UL + #define DBR_DBR_RESERVED12_SFT 20 + __le32 type_xid; + #define DBR_DBR_XID_MASK 0xfffffUL + #define DBR_DBR_XID_SFT 0 + #define DBR_DBR_RESERVED8_MASK 0xff00000UL + #define DBR_DBR_RESERVED8_SFT 20 + #define DBR_DBR_TYPE_MASK 0xf0000000UL + #define DBR_DBR_TYPE_SFT 28 + #define DBR_DBR_TYPE_SQ (0x0UL << 28) + #define DBR_DBR_TYPE_RQ (0x1UL << 28) + #define DBR_DBR_TYPE_SRQ (0x2UL << 28) + #define DBR_DBR_TYPE_SRQ_ARM (0x3UL << 28) + #define DBR_DBR_TYPE_CQ (0x4UL << 28) + #define DBR_DBR_TYPE_CQ_ARMSE (0x5UL << 28) + #define DBR_DBR_TYPE_CQ_ARMALL (0x6UL << 28) + #define DBR_DBR_TYPE_CQ_ARMENA (0x7UL << 28) + #define DBR_DBR_TYPE_SRQ_ARMENA (0x8UL << 28) + #define DBR_DBR_TYPE_CQ_CUTOFF_ACK (0x9UL << 28) + #define DBR_DBR_TYPE_NULL (0xfUL << 28) +}; + +/* 32b Doorbell Format (4 bytes) */ +struct dbr_dbr32 { + __le32 type_abs_incr_xid; + #define DBR_DBR32_XID_MASK 0xfffffUL + #define DBR_DBR32_XID_SFT 0 + #define DBR_DBR32_RESERVED4_MASK 0xf00000UL + #define DBR_DBR32_RESERVED4_SFT 20 + #define DBR_DBR32_INCR_MASK 0xf000000UL + #define DBR_DBR32_INCR_SFT 24 + #define DBR_DBR32_ABS 0x10000000UL + #define DBR_DBR32_TYPE_MASK 0xe0000000UL + #define DBR_DBR32_TYPE_SFT 29 + #define DBR_DBR32_TYPE_SQ (0x0UL << 29) +}; + +/* SQ WQE Structures */ +/* Base SQ WQE (8 bytes) */ +struct sq_base { + u8 wqe_type; + #define SQ_BASE_WQE_TYPE_SEND 0x0UL + #define SQ_BASE_WQE_TYPE_SEND_W_IMMEAD 0x1UL + #define SQ_BASE_WQE_TYPE_SEND_W_INVALID 0x2UL + #define SQ_BASE_WQE_TYPE_WRITE_WQE 0x4UL + #define SQ_BASE_WQE_TYPE_WRITE_W_IMMEAD 0x5UL + #define SQ_BASE_WQE_TYPE_READ_WQE 0x6UL + #define SQ_BASE_WQE_TYPE_ATOMIC_CS 0x8UL + #define SQ_BASE_WQE_TYPE_ATOMIC_FA 0xbUL + #define SQ_BASE_WQE_TYPE_LOCAL_INVALID 0xcUL + #define SQ_BASE_WQE_TYPE_FR_PMR 0xdUL + #define SQ_BASE_WQE_TYPE_BIND 0xeUL + u8 unused_0[7]; +}; + +/* WQE SGE (16 bytes) */ +struct sq_sge { + __le64 va_or_pa; + __le32 l_key; + __le32 size; +}; + +/* PSN Search Structure (8 bytes) */ +struct sq_psn_search { + __le32 opcode_start_psn; + #define SQ_PSN_SEARCH_START_PSN_MASK 0xffffffUL + #define SQ_PSN_SEARCH_START_PSN_SFT 0 + #define SQ_PSN_SEARCH_OPCODE_MASK 0xff000000UL + #define SQ_PSN_SEARCH_OPCODE_SFT 24 + __le32 flags_next_psn; + #define 
SQ_PSN_SEARCH_NEXT_PSN_MASK 0xffffffUL + #define SQ_PSN_SEARCH_NEXT_PSN_SFT 0 + #define SQ_PSN_SEARCH_FLAGS_MASK 0xff000000UL + #define SQ_PSN_SEARCH_FLAGS_SFT 24 +}; + +/* Send SQ WQE (40 bytes) */ +struct sq_send { + u8 wqe_type; + #define SQ_SEND_WQE_TYPE_SEND 0x0UL + #define SQ_SEND_WQE_TYPE_SEND_W_IMMEAD 0x1UL + #define SQ_SEND_WQE_TYPE_SEND_W_INVALID 0x2UL + u8 flags; + #define SQ_SEND_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_SEND_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_SEND_FLAGS_UC_FENCE 0x4UL + #define SQ_SEND_FLAGS_SE 0x8UL + #define SQ_SEND_FLAGS_INLINE 0x10UL + u8 wqe_size; + u8 reserved8_1; + __le32 inv_key_or_imm_data; + __le32 length; + __le32 q_key; + __le32 dst_qp; + #define SQ_SEND_DST_QP_MASK 0xffffffUL + #define SQ_SEND_DST_QP_SFT 0 + #define SQ_SEND_RESERVED8_2_MASK 0xff000000UL + #define SQ_SEND_RESERVED8_2_SFT 24 + __le32 avid; + #define SQ_SEND_AVID_MASK 0xfffffUL + #define SQ_SEND_AVID_SFT 0 + #define SQ_SEND_RESERVED_AVID_MASK 0xfff00000UL + #define SQ_SEND_RESERVED_AVID_SFT 20 + __le64 reserved64; + __le32 data[24]; +}; + +/* Send Raw Ethernet and QP1 SQ WQE (40 bytes) */ +struct sq_send_raweth_qp1 { + u8 wqe_type; + #define SQ_SEND_RAWETH_QP1_WQE_TYPE_SEND 0x0UL + u8 flags; + #define SQ_SEND_RAWETH_QP1_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_SEND_RAWETH_QP1_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_SEND_RAWETH_QP1_FLAGS_UC_FENCE 0x4UL + #define SQ_SEND_RAWETH_QP1_FLAGS_SE 0x8UL + #define SQ_SEND_RAWETH_QP1_FLAGS_INLINE 0x10UL + u8 wqe_size; + u8 reserved8; + __le16 lflags; + #define SQ_SEND_RAWETH_QP1_LFLAGS_TCP_UDP_CHKSUM 0x1UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_IP_CHKSUM 0x2UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_NOCRC 0x4UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_STAMP 0x8UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_T_IP_CHKSUM 0x10UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_RESERVED1_1 0x20UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_RESERVED1_2 0x40UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_RESERVED1_3 0x80UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_ROCE_CRC 0x100UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_FCOE_CRC 0x200UL + __le16 cfa_action; + __le32 length; + __le32 reserved32_1; + __le32 cfa_meta; + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_MASK 0xfffUL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_SFT 0 + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_DE 0x1000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_PRI_MASK 0xe000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_PRI_SFT 13 + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_MASK 0x70000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_SFT 16 + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID88A8 (0x0UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID8100 (0x1UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID9100 (0x2UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID9200 (0x3UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID9300 (0x4UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPIDCFG (0x5UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_LAST \ + SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPIDCFG + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_RESERVED_MASK 0xff80000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_RESERVED_SFT 19 + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_MASK 0xf0000000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_SFT 28 + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_NONE (0x0UL << 28) + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_VLAN_TAG (0x1UL << 28) + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_LAST \ + SQ_SEND_RAWETH_QP1_CFA_META_KEY_VLAN_TAG + __le32 reserved32_2; + __le64 
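+	/* Example: to have the NIC insert VLAN 100 at priority 3 on the
+	 * wire, cfa_meta above would carry
+	 * (100 & SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_MASK) |
+	 * (3 << SQ_SEND_RAWETH_QP1_CFA_META_VLAN_PRI_SFT) |
+	 * SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID8100 |
+	 * SQ_SEND_RAWETH_QP1_CFA_META_KEY_VLAN_TAG.
+	 */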
reserved64; + __le32 data[24]; +}; + +/* RDMA SQ WQE (40 bytes) */ +struct sq_rdma { + u8 wqe_type; + #define SQ_RDMA_WQE_TYPE_WRITE_WQE 0x4UL + #define SQ_RDMA_WQE_TYPE_WRITE_W_IMMEAD 0x5UL + #define SQ_RDMA_WQE_TYPE_READ_WQE 0x6UL + u8 flags; + #define SQ_RDMA_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_RDMA_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_RDMA_FLAGS_UC_FENCE 0x4UL + #define SQ_RDMA_FLAGS_SE 0x8UL + #define SQ_RDMA_FLAGS_INLINE 0x10UL + u8 wqe_size; + u8 reserved8; + __le32 imm_data; + __le32 length; + __le32 reserved32_1; + __le64 remote_va; + __le32 remote_key; + __le32 reserved32_2; + __le32 data[24]; +}; + +/* Atomic SQ WQE (40 bytes) */ +struct sq_atomic { + u8 wqe_type; + #define SQ_ATOMIC_WQE_TYPE_ATOMIC_CS 0x8UL + #define SQ_ATOMIC_WQE_TYPE_ATOMIC_FA 0xbUL + u8 flags; + #define SQ_ATOMIC_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_ATOMIC_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_ATOMIC_FLAGS_UC_FENCE 0x4UL + #define SQ_ATOMIC_FLAGS_SE 0x8UL + #define SQ_ATOMIC_FLAGS_INLINE 0x10UL + __le16 reserved16; + __le32 remote_key; + __le64 remote_va; + __le64 swap_data; + __le64 cmp_data; + __le32 data[24]; +}; + +/* Local Invalidate SQ WQE (40 bytes) */ +struct sq_localinvalidate { + u8 wqe_type; + #define SQ_LOCALINVALIDATE_WQE_TYPE_LOCAL_INVALID 0xcUL + u8 flags; + #define SQ_LOCALINVALIDATE_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_LOCALINVALIDATE_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_LOCALINVALIDATE_FLAGS_UC_FENCE 0x4UL + #define SQ_LOCALINVALIDATE_FLAGS_SE 0x8UL + #define SQ_LOCALINVALIDATE_FLAGS_INLINE 0x10UL + __le16 reserved16; + __le32 inv_l_key; + __le64 reserved64; + __le32 reserved128[4]; + __le32 data[24]; +}; + +/* FR-PMR SQ WQE (40 bytes) */ +struct sq_fr_pmr { + u8 wqe_type; + #define SQ_FR_PMR_WQE_TYPE_FR_PMR 0xdUL + u8 flags; + #define SQ_FR_PMR_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_FR_PMR_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_FR_PMR_FLAGS_UC_FENCE 0x4UL + #define SQ_FR_PMR_FLAGS_SE 0x8UL + #define SQ_FR_PMR_FLAGS_INLINE 0x10UL + u8 access_cntl; + #define SQ_FR_PMR_ACCESS_CNTL_LOCAL_WRITE 0x1UL + #define SQ_FR_PMR_ACCESS_CNTL_REMOTE_READ 0x2UL + #define SQ_FR_PMR_ACCESS_CNTL_REMOTE_WRITE 0x4UL + #define SQ_FR_PMR_ACCESS_CNTL_REMOTE_ATOMIC 0x8UL + #define SQ_FR_PMR_ACCESS_CNTL_WINDOW_BIND 0x10UL + u8 zero_based_page_size_log; + #define SQ_FR_PMR_PAGE_SIZE_LOG_MASK 0x1fUL + #define SQ_FR_PMR_PAGE_SIZE_LOG_SFT 0 + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_4K 0x0UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_8K 0x1UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_64K 0x4UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_256K 0x6UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_1M 0x8UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_2M 0x9UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_4M 0xaUL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_1G 0x12UL + #define SQ_FR_PMR_ZERO_BASED 0x20UL + #define SQ_FR_PMR_RESERVED2_MASK 0xc0UL + #define SQ_FR_PMR_RESERVED2_SFT 6 + __le32 l_key; + u8 length[5]; + u8 reserved8_1; + u8 reserved8_2; + u8 numlevels_pbl_page_size_log; + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_MASK 0x1fUL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_SFT 0 + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_4K 0x0UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_8K 0x1UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_64K 0x4UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_256K 0x6UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_1M 0x8UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_2M 0x9UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_4M 0xaUL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_1G 0x12UL + #define SQ_FR_PMR_RESERVED1 0x20UL + #define 
SQ_FR_PMR_NUMLEVELS_MASK 0xc0UL + #define SQ_FR_PMR_NUMLEVELS_SFT 6 + #define SQ_FR_PMR_NUMLEVELS_PHYSICAL (0x0UL << 6) + #define SQ_FR_PMR_NUMLEVELS_LAYER1 (0x1UL << 6) + #define SQ_FR_PMR_NUMLEVELS_LAYER2 (0x2UL << 6) + __le64 pblptr; + __le64 va; + __le32 data[24]; +}; + +/* Bind SQ WQE (40 bytes) */ +struct sq_bind { + u8 wqe_type; + #define SQ_BIND_WQE_TYPE_BIND 0xeUL + u8 flags; + #define SQ_BIND_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_BIND_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_BIND_FLAGS_UC_FENCE 0x4UL + #define SQ_BIND_FLAGS_SE 0x8UL + #define SQ_BIND_FLAGS_INLINE 0x10UL + u8 access_cntl; + #define SQ_BIND_ACCESS_CNTL_LOCAL_WRITE 0x1UL + #define SQ_BIND_ACCESS_CNTL_REMOTE_READ 0x2UL + #define SQ_BIND_ACCESS_CNTL_REMOTE_WRITE 0x4UL + #define SQ_BIND_ACCESS_CNTL_REMOTE_ATOMIC 0x8UL + #define SQ_BIND_ACCESS_CNTL_WINDOW_BIND 0x10UL + u8 reserved8_1; + u8 mw_type_zero_based; + #define SQ_BIND_ZERO_BASED 0x1UL + #define SQ_BIND_MW_TYPE 0x2UL + #define SQ_BIND_MW_TYPE_TYPE1 (0x0UL << 1) + #define SQ_BIND_MW_TYPE_TYPE2 (0x1UL << 1) + #define SQ_BIND_RESERVED6_MASK 0xfcUL + #define SQ_BIND_RESERVED6_SFT 2 + u8 reserved8_2; + __le16 reserved16; + __le32 parent_l_key; + __le32 l_key; + __le64 va; + u8 length[5]; + u8 data_reserved24[99]; + #define SQ_BIND_RESERVED24_MASK 0xffffff00UL + #define SQ_BIND_RESERVED24_SFT 8 + #define SQ_BIND_DATA_MASK 0xffffffffUL + #define SQ_BIND_DATA_SFT 0 +}; + +/* RQ/SRQ WQE Structures */ +/* RQ/SRQ WQE (40 bytes) */ +struct rq_wqe { + u8 wqe_type; + #define RQ_WQE_WQE_TYPE_RCV 0x80UL + u8 flags; + u8 wqe_size; + u8 reserved8; + __le32 reserved32; + __le32 wr_id[2]; + #define RQ_WQE_WR_ID_MASK 0xfffffUL + #define RQ_WQE_WR_ID_SFT 0 + #define RQ_WQE_RESERVED44_MASK 0xfff00000UL + #define RQ_WQE_RESERVED44_SFT 20 + __le32 reserved128[4]; + __le32 data[24]; +}; + +/* CQ CQE Structures */ +/* Base CQE (32 bytes) */ +struct cq_base { + __le64 reserved64_1; + __le64 reserved64_2; + __le64 reserved64_3; + u8 cqe_type_toggle; + #define CQ_BASE_TOGGLE 0x1UL + #define CQ_BASE_CQE_TYPE_MASK 0x1eUL + #define CQ_BASE_CQE_TYPE_SFT 1 + #define CQ_BASE_CQE_TYPE_REQ (0x0UL << 1) + #define CQ_BASE_CQE_TYPE_RES_RC (0x1UL << 1) + #define CQ_BASE_CQE_TYPE_RES_UD (0x2UL << 1) + #define CQ_BASE_CQE_TYPE_RES_RAWETH_QP1 (0x3UL << 1) + #define CQ_BASE_CQE_TYPE_TERMINAL (0xeUL << 1) + #define CQ_BASE_CQE_TYPE_CUT_OFF (0xfUL << 1) + #define CQ_BASE_RESERVED3_MASK 0xe0UL + #define CQ_BASE_RESERVED3_SFT 5 + u8 status; + __le16 reserved16; + __le32 reserved32; +}; + +/* Requester CQ CQE (32 bytes) */ +struct cq_req { + __le64 qp_handle; + __le16 sq_cons_idx; + __le16 reserved16_1; + __le32 reserved32_2; + __le64 reserved64; + u8 cqe_type_toggle; + #define CQ_REQ_TOGGLE 0x1UL + #define CQ_REQ_CQE_TYPE_MASK 0x1eUL + #define CQ_REQ_CQE_TYPE_SFT 1 + #define CQ_REQ_CQE_TYPE_REQ (0x0UL << 1) + #define CQ_REQ_RESERVED3_MASK 0xe0UL + #define CQ_REQ_RESERVED3_SFT 5 + u8 status; + #define CQ_REQ_STATUS_OK 0x0UL + #define CQ_REQ_STATUS_BAD_RESPONSE_ERR 0x1UL + #define CQ_REQ_STATUS_LOCAL_LENGTH_ERR 0x2UL + #define CQ_REQ_STATUS_LOCAL_QP_OPERATION_ERR 0x3UL + #define CQ_REQ_STATUS_LOCAL_PROTECTION_ERR 0x4UL + #define CQ_REQ_STATUS_MEMORY_MGT_OPERATION_ERR 0x5UL + #define CQ_REQ_STATUS_REMOTE_INVALID_REQUEST_ERR 0x6UL + #define CQ_REQ_STATUS_REMOTE_ACCESS_ERR 0x7UL + #define CQ_REQ_STATUS_REMOTE_OPERATION_ERR 0x8UL + #define CQ_REQ_STATUS_RNR_NAK_RETRY_CNT_ERR 0x9UL + #define CQ_REQ_STATUS_TRANSPORT_RETRY_CNT_ERR 0xaUL + #define CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR 0xbUL + __le16 reserved16_2; + 
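+	/* cqe_type_toggle above doubles as the validity marker: the toggle
+	 * bit flips on every pass around the CQ ring, and the consumer
+	 * accepts a CQE only when the bit matches its expected phase, so
+	 * no separate valid flag is needed.
+	 */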
__le32 reserved32_1; +}; + +/* Responder RC CQE (32 bytes) */ +struct cq_res_rc { + __le32 length; + __le32 imm_data_or_inv_r_key; + __le64 qp_handle; + __le64 mr_handle; + u8 cqe_type_toggle; + #define CQ_RES_RC_TOGGLE 0x1UL + #define CQ_RES_RC_CQE_TYPE_MASK 0x1eUL + #define CQ_RES_RC_CQE_TYPE_SFT 1 + #define CQ_RES_RC_CQE_TYPE_RES_RC (0x1UL << 1) + #define CQ_RES_RC_RESERVED3_MASK 0xe0UL + #define CQ_RES_RC_RESERVED3_SFT 5 + u8 status; + #define CQ_RES_RC_STATUS_OK 0x0UL + #define CQ_RES_RC_STATUS_LOCAL_ACCESS_ERROR 0x1UL + #define CQ_RES_RC_STATUS_LOCAL_LENGTH_ERR 0x2UL + #define CQ_RES_RC_STATUS_LOCAL_PROTECTION_ERR 0x3UL + #define CQ_RES_RC_STATUS_LOCAL_QP_OPERATION_ERR 0x4UL + #define CQ_RES_RC_STATUS_MEMORY_MGT_OPERATION_ERR 0x5UL + #define CQ_RES_RC_STATUS_REMOTE_INVALID_REQUEST_ERR 0x6UL + #define CQ_RES_RC_STATUS_WORK_REQUEST_FLUSHED_ERR 0x7UL + #define CQ_RES_RC_STATUS_HW_FLUSH_ERR 0x8UL + __le16 flags; + #define CQ_RES_RC_FLAGS_SRQ 0x1UL + #define CQ_RES_RC_FLAGS_SRQ_RQ (0x0UL << 0) + #define CQ_RES_RC_FLAGS_SRQ_SRQ (0x1UL << 0) + #define CQ_RES_RC_FLAGS_SRQ_LAST CQ_RES_RC_FLAGS_SRQ_SRQ + #define CQ_RES_RC_FLAGS_IMM 0x2UL + #define CQ_RES_RC_FLAGS_INV 0x4UL + #define CQ_RES_RC_FLAGS_RDMA 0x8UL + #define CQ_RES_RC_FLAGS_RDMA_SEND (0x0UL << 3) + #define CQ_RES_RC_FLAGS_RDMA_RDMA_WRITE (0x1UL << 3) + #define CQ_RES_RC_FLAGS_RDMA_LAST CQ_RES_RC_FLAGS_RDMA_RDMA_WRITE + __le32 srq_or_rq_wr_id; + #define CQ_RES_RC_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL + #define CQ_RES_RC_SRQ_OR_RQ_WR_ID_SFT 0 + #define CQ_RES_RC_RESERVED12_MASK 0xfff00000UL + #define CQ_RES_RC_RESERVED12_SFT 20 +}; + +/* Responder UD CQE (32 bytes) */ +struct cq_res_ud { + __le32 length; + #define CQ_RES_UD_LENGTH_MASK 0x3fffUL + #define CQ_RES_UD_LENGTH_SFT 0 + #define CQ_RES_UD_RESERVED18_MASK 0xffffc000UL + #define CQ_RES_UD_RESERVED18_SFT 14 + __le32 imm_data; + __le64 qp_handle; + __le16 src_mac[3]; + __le16 src_qp_low; + u8 cqe_type_toggle; + #define CQ_RES_UD_TOGGLE 0x1UL + #define CQ_RES_UD_CQE_TYPE_MASK 0x1eUL + #define CQ_RES_UD_CQE_TYPE_SFT 1 + #define CQ_RES_UD_CQE_TYPE_RES_UD (0x2UL << 1) + #define CQ_RES_UD_RESERVED3_MASK 0xe0UL + #define CQ_RES_UD_RESERVED3_SFT 5 + u8 status; + #define CQ_RES_UD_STATUS_OK 0x0UL + #define CQ_RES_UD_STATUS_LOCAL_ACCESS_ERROR 0x1UL + #define CQ_RES_UD_STATUS_HW_LOCAL_LENGTH_ERR 0x2UL + #define CQ_RES_UD_STATUS_LOCAL_PROTECTION_ERR 0x3UL + #define CQ_RES_UD_STATUS_LOCAL_QP_OPERATION_ERR 0x4UL + #define CQ_RES_UD_STATUS_MEMORY_MGT_OPERATION_ERR 0x5UL + #define CQ_RES_UD_STATUS_WORK_REQUEST_FLUSHED_ERR 0x7UL + #define CQ_RES_UD_STATUS_HW_FLUSH_ERR 0x8UL + __le16 flags; + #define CQ_RES_UD_FLAGS_SRQ 0x1UL + #define CQ_RES_UD_FLAGS_SRQ_RQ (0x0UL << 0) + #define CQ_RES_UD_FLAGS_SRQ_SRQ (0x1UL << 0) + #define CQ_RES_UD_FLAGS_SRQ_LAST CQ_RES_UD_FLAGS_SRQ_SRQ + #define CQ_RES_UD_FLAGS_IMM 0x2UL + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK 0xcUL + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT 2 + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1 (0x0UL << 2) + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4 (0x2UL << 2) + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 (0x3UL << 2) + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_LAST \ + CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 + __le32 src_qp_high_srq_or_rq_wr_id; + #define CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL + #define CQ_RES_UD_SRQ_OR_RQ_WR_ID_SFT 0 + #define CQ_RES_UD_RESERVED4_MASK 0xf00000UL + #define CQ_RES_UD_RESERVED4_SFT 20 + #define CQ_RES_UD_SRC_QP_HIGH_MASK 0xff000000UL + #define CQ_RES_UD_SRC_QP_HIGH_SFT 24 +}; + +/* Responder RawEth and QP1 CQE (32 bytes) */ 
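+/* Consumed on the raw-Ethernet/QP1 (GSI) receive path: the itype bits
+ * report how far the hardware parsed the frame, and raweth_qp1_errors
+ * carries the L3/L4 checksum verdicts, much like the L2 completion
+ * records in bnxt_hsi.h.
+ */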
+struct cq_res_raweth_qp1 { + __le16 length; + #define CQ_RES_RAWETH_QP1_LENGTH_MASK 0x3fffUL + #define CQ_RES_RAWETH_QP1_LENGTH_SFT 0 + #define CQ_RES_RAWETH_QP1_RESERVED2_MASK 0xc000UL + #define CQ_RES_RAWETH_QP1_RESERVED2_SFT 14 + __le16 raweth_qp1_flags; + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ERROR 0x1UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_RESERVED5_1_MASK 0x3eUL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_RESERVED5_1_SFT 1 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_MASK 0x3c0UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_SFT 6 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_NOT_KNOWN (0x0UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_IP (0x1UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_TCP (0x2UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_UDP (0x3UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_FCOE (0x4UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_ROCE (0x5UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_ICMP (0x7UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_PTP_WO_TIMESTAMP \ + (0x8UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_PTP_W_TIMESTAMP \ + (0x9UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_LAST \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_PTP_W_TIMESTAMP + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_MASK 0x3ffUL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_SFT 0 + #define CQ_RES_RAWETH_QP1_RESERVED6_MASK 0xfc00UL + #define CQ_RES_RAWETH_QP1_RESERVED6_SFT 10 + __le16 raweth_qp1_errors; + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_RESERVED4_MASK 0xfUL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_RESERVED4_SFT 0 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_IP_CS_ERROR 0x10UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_L4_CS_ERROR 0x20UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_IP_CS_ERROR 0x40UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_L4_CS_ERROR 0x80UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_CRC_ERROR 0x100UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_MASK 0xe00UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_SFT 9 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_NO_ERROR \ + (0x0UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_L3_BAD_VERSION \ + (0x1UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_L3_BAD_HDR_LEN \ + (0x2UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_TUNNEL_TOTAL_ERROR \ + (0x3UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_IP_TOTAL_ERROR \ + (0x4UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_UDP_TOTAL_ERROR \ + (0x5UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_L3_BAD_TTL \ + (0x6UL << 9) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_LAST \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_L3_BAD_TTL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_MASK 0xf000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_SFT 12 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_NO_ERROR \ + (0x0UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L3_BAD_VERSION \ + (0x1UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L3_BAD_HDR_LEN \ + (0x2UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L3_BAD_TTL \ + (0x3UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_IP_TOTAL_ERROR \ + (0x4UL << 12) + #define 
CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_UDP_TOTAL_ERROR \ + (0x5UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L4_BAD_HDR_LEN \ + (0x6UL << 12) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L4_BAD_HDR_LEN_TOO_SMALL\ + (0x7UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L4_BAD_OPT_LEN \ + (0x8UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_LAST \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L4_BAD_OPT_LEN + __le16 raweth_qp1_cfa_code; + __le64 qp_handle; + __le32 raweth_qp1_flags2; + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_IP_CS_CALC 0x1UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_L4_CS_CALC 0x2UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_T_IP_CS_CALC 0x4UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_T_L4_CS_CALC 0x8UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_MASK 0xf0UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_SFT 4 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_NONE \ + (0x0UL << 4) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_VLAN \ + (0x1UL << 4) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_LAST\ + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_VLAN + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_IP_TYPE 0x100UL + __le32 raweth_qp1_metadata; + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_VID_MASK 0xfffUL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_VID_SFT 0 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_DE 0x1000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_PRI_MASK 0xe000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_PRI_SFT 13 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_TPID_MASK 0xffff0000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_TPID_SFT 16 + u8 cqe_type_toggle; + #define CQ_RES_RAWETH_QP1_TOGGLE 0x1UL + #define CQ_RES_RAWETH_QP1_CQE_TYPE_MASK 0x1eUL + #define CQ_RES_RAWETH_QP1_CQE_TYPE_SFT 1 + #define CQ_RES_RAWETH_QP1_CQE_TYPE_RES_RAWETH_QP1 (0x3UL << 1) + #define CQ_RES_RAWETH_QP1_RESERVED3_MASK 0xe0UL + #define CQ_RES_RAWETH_QP1_RESERVED3_SFT 5 + u8 status; + #define CQ_RES_RAWETH_QP1_STATUS_OK 0x0UL + #define CQ_RES_RAWETH_QP1_STATUS_LOCAL_ACCESS_ERROR 0x1UL + #define CQ_RES_RAWETH_QP1_STATUS_HW_LOCAL_LENGTH_ERR 0x2UL + #define CQ_RES_RAWETH_QP1_STATUS_LOCAL_PROTECTION_ERR 0x3UL + #define CQ_RES_RAWETH_QP1_STATUS_LOCAL_QP_OPERATION_ERR 0x4UL + #define CQ_RES_RAWETH_QP1_STATUS_MEMORY_MGT_OPERATION_ERR 0x5UL + #define CQ_RES_RAWETH_QP1_STATUS_WORK_REQUEST_FLUSHED_ERR 0x7UL + #define CQ_RES_RAWETH_QP1_STATUS_HW_FLUSH_ERR 0x8UL + __le16 flags; + #define CQ_RES_RAWETH_QP1_FLAGS_SRQ 0x1UL + #define CQ_RES_RAWETH_QP1_FLAGS_SRQ_RQ 0x0UL + #define CQ_RES_RAWETH_QP1_FLAGS_SRQ_SRQ 0x1UL + #define CQ_RES_RAWETH_QP1_FLAGS_SRQ_LAST \ + CQ_RES_RAWETH_QP1_FLAGS_SRQ_SRQ + __le32 raweth_qp1_payload_offset_srq_or_rq_wr_id; + #define CQ_RES_RAWETH_QP1_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL + #define CQ_RES_RAWETH_QP1_SRQ_OR_RQ_WR_ID_SFT 0 + #define CQ_RES_RAWETH_QP1_RESERVED4_MASK 0xf00000UL + #define CQ_RES_RAWETH_QP1_RESERVED4_SFT 20 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_PAYLOAD_OFFSET_MASK 0xff000000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_PAYLOAD_OFFSET_SFT 24 +}; + +/* Terminal CQE (32 bytes) */ +struct cq_terminal { + __le64 qp_handle; + __le16 sq_cons_idx; + __le16 rq_cons_idx; + __le32 reserved32_1; + __le64 reserved64_3; + u8 cqe_type_toggle; + #define CQ_TERMINAL_TOGGLE 0x1UL + #define CQ_TERMINAL_CQE_TYPE_MASK 0x1eUL + #define CQ_TERMINAL_CQE_TYPE_SFT 1 + #define CQ_TERMINAL_CQE_TYPE_TERMINAL (0xeUL << 1) 
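+	/* A terminal CQE is the last entry firmware posts for a QP that has
+	 * entered the error state; the consumer indices above mark how far
+	 * the SQ and RQ progressed, i.e. from where the remaining work
+	 * requests have to be flushed.
+	 */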
+ #define CQ_TERMINAL_RESERVED3_MASK 0xe0UL + #define CQ_TERMINAL_RESERVED3_SFT 5 + u8 status; + #define CQ_TERMINAL_STATUS_OK 0x0UL + __le16 reserved16; + __le32 reserved32_2; +}; + +/* Cutoff CQE (32 bytes) */ +struct cq_cutoff { + __le64 reserved64_1; + __le64 reserved64_2; + __le64 reserved64_3; + u8 cqe_type_toggle; + #define CQ_CUTOFF_TOGGLE 0x1UL + #define CQ_CUTOFF_CQE_TYPE_MASK 0x1eUL + #define CQ_CUTOFF_CQE_TYPE_SFT 1 + #define CQ_CUTOFF_CQE_TYPE_CUT_OFF (0xfUL << 1) + #define CQ_CUTOFF_RESERVED3_MASK 0xe0UL + #define CQ_CUTOFF_RESERVED3_SFT 5 + u8 status; + #define CQ_CUTOFF_STATUS_OK 0x0UL + __le16 reserved16; + __le32 reserved32; +}; + +/* Notification Queue (NQ) Structures */ +/* Base NQ Record (16 bytes) */ +struct nq_base { + __le16 info10_type; + #define NQ_BASE_TYPE_MASK 0x3fUL + #define NQ_BASE_TYPE_SFT 0 + #define NQ_BASE_TYPE_CQ_NOTIFICATION 0x30UL + #define NQ_BASE_TYPE_SRQ_EVENT 0x32UL + #define NQ_BASE_TYPE_DBQ_EVENT 0x34UL + #define NQ_BASE_TYPE_QP_EVENT 0x38UL + #define NQ_BASE_TYPE_FUNC_EVENT 0x3aUL + #define NQ_BASE_INFO10_MASK 0xffc0UL + #define NQ_BASE_INFO10_SFT 6 + __le16 info16; + __le32 info32; + __le32 info63_v[2]; + #define NQ_BASE_V 0x1UL + #define NQ_BASE_INFO63_MASK 0xfffffffeUL + #define NQ_BASE_INFO63_SFT 1 +}; + +/* Completion Queue Notification (16 bytes) */ +struct nq_cn { + __le16 type; + #define NQ_CN_TYPE_MASK 0x3fUL + #define NQ_CN_TYPE_SFT 0 + #define NQ_CN_TYPE_CQ_NOTIFICATION 0x30UL + #define NQ_CN_RESERVED9_MASK 0xffc0UL + #define NQ_CN_RESERVED9_SFT 6 + __le16 reserved16; + __le32 cq_handle_low; + __le32 v; + #define NQ_CN_V 0x1UL + #define NQ_CN_RESERVED31_MASK 0xfffffffeUL + #define NQ_CN_RESERVED31_SFT 1 + __le32 cq_handle_high; +}; + +/* SRQ Event Notification (16 bytes) */ +struct nq_srq_event { + u8 type; + #define NQ_SRQ_EVENT_TYPE_MASK 0x3fUL + #define NQ_SRQ_EVENT_TYPE_SFT 0 + #define NQ_SRQ_EVENT_TYPE_SRQ_EVENT 0x32UL + #define NQ_SRQ_EVENT_RESERVED1_MASK 0xc0UL + #define NQ_SRQ_EVENT_RESERVED1_SFT 6 + u8 event; + #define NQ_SRQ_EVENT_EVENT_SRQ_THRESHOLD_EVENT 0x1UL + __le16 reserved16; + __le32 srq_handle_low; + __le32 v; + #define NQ_SRQ_EVENT_V 0x1UL + #define NQ_SRQ_EVENT_RESERVED31_MASK 0xfffffffeUL + #define NQ_SRQ_EVENT_RESERVED31_SFT 1 + __le32 srq_handle_high; +}; + +/* DBQ Async Event Notification (16 bytes) */ +struct nq_dbq_event { + u8 type; + #define NQ_DBQ_EVENT_TYPE_MASK 0x3fUL + #define NQ_DBQ_EVENT_TYPE_SFT 0 + #define NQ_DBQ_EVENT_TYPE_DBQ_EVENT 0x34UL + #define NQ_DBQ_EVENT_RESERVED1_MASK 0xc0UL + #define NQ_DBQ_EVENT_RESERVED1_SFT 6 + u8 event; + #define NQ_DBQ_EVENT_EVENT_DBQ_THRESHOLD_EVENT 0x1UL + __le16 db_pfid; + #define NQ_DBQ_EVENT_DB_PFID_MASK 0xfUL + #define NQ_DBQ_EVENT_DB_PFID_SFT 0 + #define NQ_DBQ_EVENT_RESERVED12_MASK 0xfff0UL + #define NQ_DBQ_EVENT_RESERVED12_SFT 4 + __le32 db_dpi; + #define NQ_DBQ_EVENT_DB_DPI_MASK 0xfffffUL + #define NQ_DBQ_EVENT_DB_DPI_SFT 0 + #define NQ_DBQ_EVENT_RESERVED12_2_MASK 0xfff00000UL + #define NQ_DBQ_EVENT_RESERVED12_2_SFT 20 + __le32 v; + #define NQ_DBQ_EVENT_V 0x1UL + #define NQ_DBQ_EVENT_RESERVED32_MASK 0xfffffffeUL + #define NQ_DBQ_EVENT_RESERVED32_SFT 1 + __le32 db_type_db_xid; + #define NQ_DBQ_EVENT_DB_XID_MASK 0xfffffUL + #define NQ_DBQ_EVENT_DB_XID_SFT 0 + #define NQ_DBQ_EVENT_RESERVED8_MASK 0xff00000UL + #define NQ_DBQ_EVENT_RESERVED8_SFT 20 + #define NQ_DBQ_EVENT_DB_TYPE_MASK 0xf0000000UL + #define NQ_DBQ_EVENT_DB_TYPE_SFT 28 +}; + +/* Read Request/Response Queue Structures */ +/* Input Read Request Queue (IRRQ) Message (32 bytes) */ +struct xrrq_irrq 
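+/* Per-QP shadow queues for the read/atomic engine: the IRRQ tracks
+ * inbound and the ORRQ outbound RDMA read and atomic state. They are
+ * backed by the irrq_addr/orrq_addr buffers passed in cmdq_create_qp,
+ * presumably sized from the rd_atomic limits negotiated for the QP.
+ */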
{ + __le16 credits_type; + #define XRRQ_IRRQ_TYPE 0x1UL + #define XRRQ_IRRQ_TYPE_READ_REQ 0x0UL + #define XRRQ_IRRQ_TYPE_ATOMIC_REQ 0x1UL + #define XRRQ_IRRQ_RESERVED10_MASK 0x7feUL + #define XRRQ_IRRQ_RESERVED10_SFT 1 + #define XRRQ_IRRQ_CREDITS_MASK 0xf800UL + #define XRRQ_IRRQ_CREDITS_SFT 11 + __le16 reserved16; + __le32 reserved32; + __le32 psn; + #define XRRQ_IRRQ_PSN_MASK 0xffffffUL + #define XRRQ_IRRQ_PSN_SFT 0 + #define XRRQ_IRRQ_RESERVED8_1_MASK 0xff000000UL + #define XRRQ_IRRQ_RESERVED8_1_SFT 24 + __le32 msn; + #define XRRQ_IRRQ_MSN_MASK 0xffffffUL + #define XRRQ_IRRQ_MSN_SFT 0 + #define XRRQ_IRRQ_RESERVED8_2_MASK 0xff000000UL + #define XRRQ_IRRQ_RESERVED8_2_SFT 24 + __le64 va_or_atomic_result; + __le32 rdma_r_key; + __le32 length; +}; + +/* Output Read Request Queue (ORRQ) Message (32 bytes) */ +struct xrrq_orrq { + __le16 num_sges_type; + #define XRRQ_ORRQ_TYPE 0x1UL + #define XRRQ_ORRQ_TYPE_READ_REQ 0x0UL + #define XRRQ_ORRQ_TYPE_ATOMIC_REQ 0x1UL + #define XRRQ_ORRQ_RESERVED10_MASK 0x7feUL + #define XRRQ_ORRQ_RESERVED10_SFT 1 + #define XRRQ_ORRQ_NUM_SGES_MASK 0xf800UL + #define XRRQ_ORRQ_NUM_SGES_SFT 11 + __le16 reserved16; + __le32 length; + __le32 psn; + #define XRRQ_ORRQ_PSN_MASK 0xffffffUL + #define XRRQ_ORRQ_PSN_SFT 0 + #define XRRQ_ORRQ_RESERVED8_1_MASK 0xff000000UL + #define XRRQ_ORRQ_RESERVED8_1_SFT 24 + __le32 end_psn; + #define XRRQ_ORRQ_END_PSN_MASK 0xffffffUL + #define XRRQ_ORRQ_END_PSN_SFT 0 + #define XRRQ_ORRQ_RESERVED8_2_MASK 0xff000000UL + #define XRRQ_ORRQ_RESERVED8_2_SFT 24 + __le64 first_sge_phy_or_sing_sge_va; + __le32 single_sge_l_key; + __le32 single_sge_size; +}; + +/* Page Buffer List Memory Structures (PBL) */ +/* Page Table Entry (PTE) (8 bytes) */ +struct ptu_pte { + __le32 page_next_to_last_last_valid[2]; + #define PTU_PTE_VALID 0x1UL + #define PTU_PTE_LAST 0x2UL + #define PTU_PTE_NEXT_TO_LAST 0x4UL + #define PTU_PTE_PAGE_MASK 0xfffff000UL + #define PTU_PTE_PAGE_SFT 12 +}; + +/* Page Directory Entry (PDE) (8 bytes) */ +struct ptu_pde { + __le32 page_valid[2]; + #define PTU_PDE_VALID 0x1UL + #define PTU_PDE_PAGE_MASK 0xfffff000UL + #define PTU_PDE_PAGE_SFT 12 +}; + +/* RoCE Fastpath Host Structures */ +/* Command Queue (CMDQ) Interface */ +/* Init CMDQ (16 bytes) */ +struct cmdq_init { + __le64 cmdq_pbl; + __le16 cmdq_size_cmdq_lvl; + #define CMDQ_INIT_CMDQ_LVL_MASK 0x3UL + #define CMDQ_INIT_CMDQ_LVL_SFT 0 + #define CMDQ_INIT_CMDQ_SIZE_MASK 0xfffcUL + #define CMDQ_INIT_CMDQ_SIZE_SFT 2 + __le16 creq_ring_id; + __le32 prod_idx; +}; + +/* Update CMDQ producer index (16 bytes) */ +struct cmdq_update { + __le64 reserved64; + __le32 reserved32; + __le32 prod_idx; +}; + +/* CMDQ common header structure (16 bytes) */ +struct cmdq_base { + u8 opcode; + #define CMDQ_BASE_OPCODE_CREATE_QP 0x1UL + #define CMDQ_BASE_OPCODE_DESTROY_QP 0x2UL + #define CMDQ_BASE_OPCODE_MODIFY_QP 0x3UL + #define CMDQ_BASE_OPCODE_QUERY_QP 0x4UL + #define CMDQ_BASE_OPCODE_CREATE_SRQ 0x5UL + #define CMDQ_BASE_OPCODE_DESTROY_SRQ 0x6UL + #define CMDQ_BASE_OPCODE_QUERY_SRQ 0x8UL + #define CMDQ_BASE_OPCODE_CREATE_CQ 0x9UL + #define CMDQ_BASE_OPCODE_DESTROY_CQ 0xaUL + #define CMDQ_BASE_OPCODE_RESIZE_CQ 0xcUL + #define CMDQ_BASE_OPCODE_ALLOCATE_MRW 0xdUL + #define CMDQ_BASE_OPCODE_DEALLOCATE_KEY 0xeUL + #define CMDQ_BASE_OPCODE_REGISTER_MR 0xfUL + #define CMDQ_BASE_OPCODE_DEREGISTER_MR 0x10UL + #define CMDQ_BASE_OPCODE_ADD_GID 0x11UL + #define CMDQ_BASE_OPCODE_DELETE_GID 0x12UL + #define CMDQ_BASE_OPCODE_MODIFY_GID 0x17UL + #define CMDQ_BASE_OPCODE_QUERY_GID 0x18UL + #define 
CMDQ_BASE_OPCODE_CREATE_QP1 0x13UL + #define CMDQ_BASE_OPCODE_DESTROY_QP1 0x14UL + #define CMDQ_BASE_OPCODE_CREATE_AH 0x15UL + #define CMDQ_BASE_OPCODE_DESTROY_AH 0x16UL + #define CMDQ_BASE_OPCODE_INITIALIZE_FW 0x80UL + #define CMDQ_BASE_OPCODE_DEINITIALIZE_FW 0x81UL + #define CMDQ_BASE_OPCODE_STOP_FUNC 0x82UL + #define CMDQ_BASE_OPCODE_QUERY_FUNC 0x83UL + #define CMDQ_BASE_OPCODE_SET_FUNC_RESOURCES 0x84UL + #define CMDQ_BASE_OPCODE_READ_CONTEXT 0x85UL + #define CMDQ_BASE_OPCODE_VF_BACKCHANNEL_REQUEST 0x86UL + #define CMDQ_BASE_OPCODE_READ_VF_MEMORY 0x87UL + #define CMDQ_BASE_OPCODE_COMPLETE_VF_REQUEST 0x88UL + #define CMDQ_BASE_OPCODE_EXTEND_CONTEXT_ARRRAY 0x89UL + #define CMDQ_BASE_OPCODE_MAP_TC_TO_COS 0x8aUL + #define CMDQ_BASE_OPCODE_QUERY_VERSION 0x8bUL + #define CMDQ_BASE_OPCODE_MODIFY_CC 0x8cUL + #define CMDQ_BASE_OPCODE_QUERY_CC 0x8dUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Create QP command (96 bytes) */ +struct cmdq_create_qp { + u8 opcode; + #define CMDQ_CREATE_QP_OPCODE_CREATE_QP 0x1UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 qp_handle; + __le32 qp_flags; + #define CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED 0x1UL + #define CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION 0x2UL + #define CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE 0x4UL + #define CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED 0x8UL + u8 type; + #define CMDQ_CREATE_QP_TYPE_RC 0x2UL + #define CMDQ_CREATE_QP_TYPE_UD 0x4UL + #define CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE 0x6UL + u8 sq_pg_size_sq_lvl; + #define CMDQ_CREATE_QP_SQ_LVL_MASK 0xfUL + #define CMDQ_CREATE_QP_SQ_LVL_SFT 0 + #define CMDQ_CREATE_QP_SQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_QP_SQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_QP_SQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_QP_SQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_CREATE_QP_SQ_PG_SIZE_SFT 4 + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 rq_pg_size_rq_lvl; + #define CMDQ_CREATE_QP_RQ_LVL_MASK 0xfUL + #define CMDQ_CREATE_QP_RQ_LVL_SFT 0 + #define CMDQ_CREATE_QP_RQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_QP_RQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_QP_RQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_QP_RQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_CREATE_QP_RQ_PG_SIZE_SFT 4 + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 unused_0; + __le32 dpi; + __le32 sq_size; + __le32 rq_size; + __le16 sq_fwo_sq_sge; + #define CMDQ_CREATE_QP_SQ_SGE_MASK 0xfUL + #define CMDQ_CREATE_QP_SQ_SGE_SFT 0 + #define CMDQ_CREATE_QP_SQ_FWO_MASK 0xfff0UL + #define CMDQ_CREATE_QP_SQ_FWO_SFT 4 + __le16 rq_fwo_rq_sge; + #define CMDQ_CREATE_QP_RQ_SGE_MASK 0xfUL + #define CMDQ_CREATE_QP_RQ_SGE_SFT 0 + #define CMDQ_CREATE_QP_RQ_FWO_MASK 0xfff0UL + #define CMDQ_CREATE_QP_RQ_FWO_SFT 4 + __le32 scq_cid; + __le32 rcq_cid; + __le32 srq_cid; + __le32 pd_id; + __le64 sq_pbl; + __le64 rq_pbl; + __le64 irrq_addr; + __le64 orrq_addr; +}; + +/* Destroy QP command (24 bytes) */ +struct cmdq_destroy_qp 
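+/* Every command starts with the cmdq_base layout above; the driver
+ * stamps a cookie into each request and matches it against the cookie
+ * echoed in the CREQ completion (see the req.cookie checks in
+ * qplib_sp.c).
+ */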
{ + u8 opcode; + #define CMDQ_DESTROY_QP_OPCODE_DESTROY_QP 0x2UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 qp_cid; + __le32 unused_0; +}; + +/* Modify QP command (112 bytes) */ +struct cmdq_modify_qp { + u8 opcode; + #define CMDQ_MODIFY_QP_OPCODE_MODIFY_QP 0x3UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 modify_mask; + #define CMDQ_MODIFY_QP_MODIFY_MASK_STATE 0x1UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_EN_SQD_ASYNC_NOTIFY 0x2UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS 0x4UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_PKEY 0x8UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_QKEY 0x10UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_DGID 0x20UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL 0x40UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX 0x80UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT 0x100UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_TRAFFIC_CLASS 0x200UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_DEST_MAC 0x400UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU 0x1000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_TIMEOUT 0x2000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RETRY_CNT 0x4000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RNR_RETRY 0x8000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RQ_PSN 0x10000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_MAX_RD_ATOMIC 0x20000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER 0x40000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SQ_PSN 0x80000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_MAX_DEST_RD_ATOMIC 0x100000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SQ_SIZE 0x200000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RQ_SIZE 0x400000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SQ_SGE 0x800000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RQ_SGE 0x1000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_MAX_INLINE_DATA 0x2000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID 0x4000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SRC_MAC 0x8000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID 0x10000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_ENABLE_CC 0x20000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_TOS_ECN 0x40000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_TOS_DSCP 0x80000000UL + __le32 qp_cid; + u8 network_type_en_sqd_async_notify_new_state; + #define CMDQ_MODIFY_QP_NEW_STATE_MASK 0xfUL + #define CMDQ_MODIFY_QP_NEW_STATE_SFT 0 + #define CMDQ_MODIFY_QP_NEW_STATE_RESET 0x0UL + #define CMDQ_MODIFY_QP_NEW_STATE_INIT 0x1UL + #define CMDQ_MODIFY_QP_NEW_STATE_RTR 0x2UL + #define CMDQ_MODIFY_QP_NEW_STATE_RTS 0x3UL + #define CMDQ_MODIFY_QP_NEW_STATE_SQD 0x4UL + #define CMDQ_MODIFY_QP_NEW_STATE_SQE 0x5UL + #define CMDQ_MODIFY_QP_NEW_STATE_ERR 0x6UL + #define CMDQ_MODIFY_QP_EN_SQD_ASYNC_NOTIFY 0x10UL + #define CMDQ_MODIFY_QP_NETWORK_TYPE_MASK 0xc0UL + #define CMDQ_MODIFY_QP_NETWORK_TYPE_SFT 6 + #define CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV1 (0x0UL << 6) + #define CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV4 (0x2UL << 6) + #define CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV6 (0x3UL << 6) + u8 access; + #define CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE 0x1UL + #define CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE 0x2UL + #define CMDQ_MODIFY_QP_ACCESS_REMOTE_READ 0x4UL + #define CMDQ_MODIFY_QP_ACCESS_REMOTE_ATOMIC 0x8UL + __le16 pkey; + __le32 qkey; + __le32 dgid[4]; + __le32 flow_label; + __le16 sgid_index; + u8 hop_limit; + u8 traffic_class; + __le16 dest_mac[3]; + u8 tos_dscp_tos_ecn; + #define CMDQ_MODIFY_QP_TOS_ECN_MASK 0x3UL + #define CMDQ_MODIFY_QP_TOS_ECN_SFT 0 + #define CMDQ_MODIFY_QP_TOS_DSCP_MASK 0xfcUL + #define CMDQ_MODIFY_QP_TOS_DSCP_SFT 2 + 
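+	/* modify_mask mirrors ib_qp_attr_mask: firmware applies only the
+	 * fields whose bit is set, so an INIT->RTR transition would set
+	 * STATE | PKEY | DGID | FLOW_LABEL | SGID_INDEX | ... and leave
+	 * the remaining attributes untouched.
+	 */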
u8 path_mtu; + #define CMDQ_MODIFY_QP_PATH_MTU_MASK 0xf0UL + #define CMDQ_MODIFY_QP_PATH_MTU_SFT 4 + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_256 (0x0UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_512 (0x1UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_1024 (0x2UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_2048 (0x3UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_4096 (0x4UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_8192 (0x5UL << 4) + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 min_rnr_timer; + __le32 rq_psn; + __le32 sq_psn; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + __le16 enable_cc; + #define CMDQ_MODIFY_QP_ENABLE_CC 0x1UL + __le32 sq_size; + __le32 rq_size; + __le16 sq_sge; + __le16 rq_sge; + __le32 max_inline_data; + __le32 dest_qp_id; + __le32 unused_3; + __le16 src_mac[3]; + __le16 vlan_pcp_vlan_dei_vlan_id; + #define CMDQ_MODIFY_QP_VLAN_ID_MASK 0xfffUL + #define CMDQ_MODIFY_QP_VLAN_ID_SFT 0 + #define CMDQ_MODIFY_QP_VLAN_DEI 0x1000UL + #define CMDQ_MODIFY_QP_VLAN_PCP_MASK 0xe000UL + #define CMDQ_MODIFY_QP_VLAN_PCP_SFT 13 +}; + +/* Query QP command (24 bytes) */ +struct cmdq_query_qp { + u8 opcode; + #define CMDQ_QUERY_QP_OPCODE_QUERY_QP 0x4UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 qp_cid; + __le32 unused_0; +}; + +/* Create SRQ command (48 bytes) */ +struct cmdq_create_srq { + u8 opcode; + #define CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ 0x5UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 srq_handle; + __le16 pg_size_lvl; + #define CMDQ_CREATE_SRQ_LVL_MASK 0x3UL + #define CMDQ_CREATE_SRQ_LVL_SFT 0 + #define CMDQ_CREATE_SRQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_SRQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_SRQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_SRQ_PG_SIZE_MASK 0x1cUL + #define CMDQ_CREATE_SRQ_PG_SIZE_SFT 2 + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_4K (0x0UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_8K (0x1UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_64K (0x2UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_2M (0x3UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_8M (0x4UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_1G (0x5UL << 2) + __le16 eventq_id; + #define CMDQ_CREATE_SRQ_EVENTQ_ID_MASK 0xfffUL + #define CMDQ_CREATE_SRQ_EVENTQ_ID_SFT 0 + __le16 srq_size; + __le16 srq_fwo; + __le32 dpi; + __le32 pd_id; + __le64 pbl; +}; + +/* Destroy SRQ command (24 bytes) */ +struct cmdq_destroy_srq { + u8 opcode; + #define CMDQ_DESTROY_SRQ_OPCODE_DESTROY_SRQ 0x6UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 srq_cid; + __le32 unused_0; +}; + +/* Query SRQ command (24 bytes) */ +struct cmdq_query_srq { + u8 opcode; + #define CMDQ_QUERY_SRQ_OPCODE_QUERY_SRQ 0x8UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 srq_cid; + __le32 unused_0; +}; + +/* Create CQ command (48 bytes) */ +struct cmdq_create_cq { + u8 opcode; + #define CMDQ_CREATE_CQ_OPCODE_CREATE_CQ 0x9UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 cq_handle; + __le32 pg_size_lvl; + #define CMDQ_CREATE_CQ_LVL_MASK 0x3UL + #define CMDQ_CREATE_CQ_LVL_SFT 0 + #define CMDQ_CREATE_CQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_CQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_CQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_CQ_PG_SIZE_MASK 0x1cUL + #define CMDQ_CREATE_CQ_PG_SIZE_SFT 2 + #define CMDQ_CREATE_CQ_PG_SIZE_PG_4K (0x0UL << 2) + #define 
CMDQ_CREATE_CQ_PG_SIZE_PG_8K (0x1UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_64K (0x2UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_2M (0x3UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_8M (0x4UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_1G (0x5UL << 2) + __le32 cq_fco_cnq_id; + #define CMDQ_CREATE_CQ_CNQ_ID_MASK 0xfffUL + #define CMDQ_CREATE_CQ_CNQ_ID_SFT 0 + #define CMDQ_CREATE_CQ_CQ_FCO_MASK 0xfffff000UL + #define CMDQ_CREATE_CQ_CQ_FCO_SFT 12 + __le32 dpi; + __le32 cq_size; + __le64 pbl; +}; + +/* Destroy CQ command (24 bytes) */ +struct cmdq_destroy_cq { + u8 opcode; + #define CMDQ_DESTROY_CQ_OPCODE_DESTROY_CQ 0xaUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 cq_cid; + __le32 unused_0; +}; + +/* Resize CQ command (40 bytes) */ +struct cmdq_resize_cq { + u8 opcode; + #define CMDQ_RESIZE_CQ_OPCODE_RESIZE_CQ 0xcUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 cq_cid; + __le32 new_cq_size_pg_size_lvl; + #define CMDQ_RESIZE_CQ_LVL_MASK 0x3UL + #define CMDQ_RESIZE_CQ_LVL_SFT 0 + #define CMDQ_RESIZE_CQ_LVL_LVL_0 0x0UL + #define CMDQ_RESIZE_CQ_LVL_LVL_1 0x1UL + #define CMDQ_RESIZE_CQ_LVL_LVL_2 0x2UL + #define CMDQ_RESIZE_CQ_PG_SIZE_MASK 0x1cUL + #define CMDQ_RESIZE_CQ_PG_SIZE_SFT 2 + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_4K (0x0UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_8K (0x1UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_64K (0x2UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_2M (0x3UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_8M (0x4UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_1G (0x5UL << 2) + #define CMDQ_RESIZE_CQ_NEW_CQ_SIZE_MASK 0x1fffe0UL + #define CMDQ_RESIZE_CQ_NEW_CQ_SIZE_SFT 5 + __le64 new_pbl; + __le32 new_cq_fco; + __le32 unused_2; +}; + +/* Allocate MRW command (32 bytes) */ +struct cmdq_allocate_mrw { + u8 opcode; + #define CMDQ_ALLOCATE_MRW_OPCODE_ALLOCATE_MRW 0xdUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 mrw_handle; + u8 mrw_flags; + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MASK 0xfUL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_SFT 0 + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR 0x0UL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR 0x1UL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1 0x2UL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A 0x3UL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B 0x4UL + u8 access; + #define CMDQ_ALLOCATE_MRW_ACCESS_RESERVED_MASK 0x1fUL + #define CMDQ_ALLOCATE_MRW_ACCESS_RESERVED_SFT 0 + #define CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY 0x20UL + __le16 unused_1; + __le32 pd_id; +}; + +/* De-allocate key command (24 bytes) */ +struct cmdq_deallocate_key { + u8 opcode; + #define CMDQ_DEALLOCATE_KEY_OPCODE_DEALLOCATE_KEY 0xeUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + u8 mrw_flags; + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MASK 0xfUL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_SFT 0 + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MR 0x0UL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_PMR 0x1UL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MW_TYPE1 0x2UL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MW_TYPE2A 0x3UL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MW_TYPE2B 0x4UL + u8 unused_1[3]; + __le32 key; +}; + +/* Register MR command (48 bytes) */ +struct cmdq_register_mr { + u8 opcode; + #define CMDQ_REGISTER_MR_OPCODE_REGISTER_MR 0xfUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + u8 log2_pg_size_lvl; + 
#define CMDQ_REGISTER_MR_LVL_MASK 0x3UL + #define CMDQ_REGISTER_MR_LVL_SFT 0 + #define CMDQ_REGISTER_MR_LVL_LVL_0 0x0UL + #define CMDQ_REGISTER_MR_LVL_LVL_1 0x1UL + #define CMDQ_REGISTER_MR_LVL_LVL_2 0x2UL + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK 0x7cUL + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT 2 + u8 access; + #define CMDQ_REGISTER_MR_ACCESS_LOCAL_WRITE 0x1UL + #define CMDQ_REGISTER_MR_ACCESS_REMOTE_READ 0x2UL + #define CMDQ_REGISTER_MR_ACCESS_REMOTE_WRITE 0x4UL + #define CMDQ_REGISTER_MR_ACCESS_REMOTE_ATOMIC 0x8UL + #define CMDQ_REGISTER_MR_ACCESS_MW_BIND 0x10UL + #define CMDQ_REGISTER_MR_ACCESS_ZERO_BASED 0x20UL + __le16 unused_1; + __le32 key; + __le64 pbl; + __le64 va; + __le64 mr_size; +}; + +/* Deregister MR command (24 bytes) */ +struct cmdq_deregister_mr { + u8 opcode; + #define CMDQ_DEREGISTER_MR_OPCODE_DEREGISTER_MR 0x10UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 lkey; + __le32 unused_0; +}; + +/* Add GID command (48 bytes) */ +struct cmdq_add_gid { + u8 opcode; + #define CMDQ_ADD_GID_OPCODE_ADD_GID 0x11UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __be32 gid[4]; + __be16 src_mac[3]; + __le16 vlan; + #define CMDQ_ADD_GID_VLAN_VLAN_ID_MASK 0xfffUL + #define CMDQ_ADD_GID_VLAN_VLAN_ID_SFT 0 + #define CMDQ_ADD_GID_VLAN_TPID_MASK 0x7000UL + #define CMDQ_ADD_GID_VLAN_TPID_SFT 12 + #define CMDQ_ADD_GID_VLAN_TPID_TPID_88A8 (0x0UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_8100 (0x1UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_9100 (0x2UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_9200 (0x3UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_9300 (0x4UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_CFG1 (0x5UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_CFG2 (0x6UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_CFG3 (0x7UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_LAST CMDQ_ADD_GID_VLAN_TPID_TPID_CFG3 + #define CMDQ_ADD_GID_VLAN_VLAN_EN 0x8000UL + __le16 ipid; + __le16 stats_ctx; + #define CMDQ_ADD_GID_STATS_CTX_STATS_CTX_ID_MASK 0x7fffUL + #define CMDQ_ADD_GID_STATS_CTX_STATS_CTX_ID_SFT 0 + #define CMDQ_ADD_GID_STATS_CTX_STATS_CTX_VALID 0x8000UL + __le32 unused_0; +}; + +/* Delete GID command (24 bytes) */ +struct cmdq_delete_gid { + u8 opcode; + #define CMDQ_DELETE_GID_OPCODE_DELETE_GID 0x12UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le16 gid_index; + __le16 unused_0; + __le32 unused_1; +}; + +/* Modify GID command (48 bytes) */ +struct cmdq_modify_gid { + u8 opcode; + #define CMDQ_MODIFY_GID_OPCODE_MODIFY_GID 0x17UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 gid[4]; + __le16 src_mac[3]; + __le16 vlan; + #define CMDQ_MODIFY_GID_VLAN_VLAN_ID_MASK 0xfffUL + #define CMDQ_MODIFY_GID_VLAN_VLAN_ID_SFT 0 + #define CMDQ_MODIFY_GID_VLAN_TPID_MASK 0x7000UL + #define CMDQ_MODIFY_GID_VLAN_TPID_SFT 12 + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_88A8 (0x0UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_8100 (0x1UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_9100 (0x2UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_9200 (0x3UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_9300 (0x4UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_CFG1 (0x5UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_CFG2 (0x6UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_CFG3 (0x7UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_LAST \ + 
CMDQ_MODIFY_GID_VLAN_TPID_TPID_CFG3 + #define CMDQ_MODIFY_GID_VLAN_VLAN_EN 0x8000UL + __le16 ipid; + __le16 gid_index; + __le16 stats_ctx; + #define CMDQ_MODIFY_GID_STATS_CTX_STATS_CTX_ID_MASK 0x7fffUL + #define CMDQ_MODIFY_GID_STATS_CTX_STATS_CTX_ID_SFT 0 + #define CMDQ_MODIFY_GID_STATS_CTX_STATS_CTX_VALID 0x8000UL + __le16 unused_0; +}; + +/* Query GID command (24 bytes) */ +struct cmdq_query_gid { + u8 opcode; + #define CMDQ_QUERY_GID_OPCODE_QUERY_GID 0x18UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le16 gid_index; + __le16 unused_0; + __le32 unused_1; +}; + +/* Create QP1 command (80 bytes) */ +struct cmdq_create_qp1 { + u8 opcode; + #define CMDQ_CREATE_QP1_OPCODE_CREATE_QP1 0x13UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 qp_handle; + __le32 qp_flags; + #define CMDQ_CREATE_QP1_QP_FLAGS_SRQ_USED 0x1UL + #define CMDQ_CREATE_QP1_QP_FLAGS_FORCE_COMPLETION 0x2UL + #define CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE 0x4UL + u8 type; + #define CMDQ_CREATE_QP1_TYPE_GSI 0x1UL + u8 sq_pg_size_sq_lvl; + #define CMDQ_CREATE_QP1_SQ_LVL_MASK 0xfUL + #define CMDQ_CREATE_QP1_SQ_LVL_SFT 0 + #define CMDQ_CREATE_QP1_SQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_QP1_SQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_QP1_SQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_SFT 4 + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 rq_pg_size_rq_lvl; + #define CMDQ_CREATE_QP1_RQ_LVL_MASK 0xfUL + #define CMDQ_CREATE_QP1_RQ_LVL_SFT 0 + #define CMDQ_CREATE_QP1_RQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_QP1_RQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_QP1_RQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_SFT 4 + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 unused_0; + __le32 dpi; + __le32 sq_size; + __le32 rq_size; + __le16 sq_fwo_sq_sge; + #define CMDQ_CREATE_QP1_SQ_SGE_MASK 0xfUL + #define CMDQ_CREATE_QP1_SQ_SGE_SFT 0 + #define CMDQ_CREATE_QP1_SQ_FWO_MASK 0xfff0UL + #define CMDQ_CREATE_QP1_SQ_FWO_SFT 4 + __le16 rq_fwo_rq_sge; + #define CMDQ_CREATE_QP1_RQ_SGE_MASK 0xfUL + #define CMDQ_CREATE_QP1_RQ_SGE_SFT 0 + #define CMDQ_CREATE_QP1_RQ_FWO_MASK 0xfff0UL + #define CMDQ_CREATE_QP1_RQ_FWO_SFT 4 + __le32 scq_cid; + __le32 rcq_cid; + __le32 srq_cid; + __le32 pd_id; + __le64 sq_pbl; + __le64 rq_pbl; +}; + +/* Destroy QP1 command (24 bytes) */ +struct cmdq_destroy_qp1 { + u8 opcode; + #define CMDQ_DESTROY_QP1_OPCODE_DESTROY_QP1 0x14UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 qp1_cid; + __le32 unused_0; +}; + +/* Create AH command (64 bytes) */ +struct cmdq_create_ah { + u8 opcode; + #define CMDQ_CREATE_AH_OPCODE_CREATE_AH 0x15UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 ah_handle; + __le32 
dgid[4]; + u8 type; + #define CMDQ_CREATE_AH_TYPE_V1 0x0UL + #define CMDQ_CREATE_AH_TYPE_V2IPV4 0x2UL + #define CMDQ_CREATE_AH_TYPE_V2IPV6 0x3UL + u8 hop_limit; + __le16 sgid_index; + __le32 dest_vlan_id_flow_label; + #define CMDQ_CREATE_AH_FLOW_LABEL_MASK 0xfffffUL + #define CMDQ_CREATE_AH_FLOW_LABEL_SFT 0 + #define CMDQ_CREATE_AH_DEST_VLAN_ID_MASK 0xfff00000UL + #define CMDQ_CREATE_AH_DEST_VLAN_ID_SFT 20 + __le32 pd_id; + __le32 unused_0; + __le16 dest_mac[3]; + u8 traffic_class; + u8 unused_1; +}; + +/* Destroy AH command (24 bytes) */ +struct cmdq_destroy_ah { + u8 opcode; + #define CMDQ_DESTROY_AH_OPCODE_DESTROY_AH 0x16UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 ah_cid; + __le32 unused_0; +}; + +/* Initialize Firmware command (112 bytes) */ +struct cmdq_initialize_fw { + u8 opcode; + #define CMDQ_INITIALIZE_FW_OPCODE_INITIALIZE_FW 0x80UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + u8 qpc_pg_size_qpc_lvl; + #define CMDQ_INITIALIZE_FW_QPC_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_QPC_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_QPC_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_QPC_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_QPC_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_1G (0x5UL << 4) + u8 mrw_pg_size_mrw_lvl; + #define CMDQ_INITIALIZE_FW_MRW_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_MRW_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_MRW_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_MRW_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_MRW_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_1G (0x5UL << 4) + u8 srq_pg_size_srq_lvl; + #define CMDQ_INITIALIZE_FW_SRQ_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_SRQ_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_SRQ_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_SRQ_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_SRQ_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 cq_pg_size_cq_lvl; + #define CMDQ_INITIALIZE_FW_CQ_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_CQ_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_CQ_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_CQ_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_CQ_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_SFT 4 + 
#define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 tqm_pg_size_tqm_lvl; + #define CMDQ_INITIALIZE_FW_TQM_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_TQM_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_TQM_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_TQM_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_TQM_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_1G (0x5UL << 4) + u8 tim_pg_size_tim_lvl; + #define CMDQ_INITIALIZE_FW_TIM_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_TIM_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_TIM_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_TIM_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_TIM_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_1G (0x5UL << 4) + __le16 reserved16; + __le64 qpc_page_dir; + __le64 mrw_page_dir; + __le64 srq_page_dir; + __le64 cq_page_dir; + __le64 tqm_page_dir; + __le64 tim_page_dir; + __le32 number_of_qp; + __le32 number_of_mrw; + __le32 number_of_srq; + __le32 number_of_cq; + __le32 max_qp_per_vf; + __le32 max_mrw_per_vf; + __le32 max_srq_per_vf; + __le32 max_cq_per_vf; + __le32 max_gid_per_vf; + __le32 stat_ctx_id; +}; + +/* De-initialize Firmware command (16 bytes) */ +struct cmdq_deinitialize_fw { + u8 opcode; + #define CMDQ_DEINITIALIZE_FW_OPCODE_DEINITIALIZE_FW 0x81UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Stop function command (16 bytes) */ +struct cmdq_stop_func { + u8 opcode; + #define CMDQ_STOP_FUNC_OPCODE_STOP_FUNC 0x82UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Query function command (16 bytes) */ +struct cmdq_query_func { + u8 opcode; + #define CMDQ_QUERY_FUNC_OPCODE_QUERY_FUNC 0x83UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Set function resources command (16 bytes) */ +struct cmdq_set_func_resources { + u8 opcode; + #define CMDQ_SET_FUNC_RESOURCES_OPCODE_SET_FUNC_RESOURCES 0x84UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Read hardware resource context command (24 bytes) */ +struct cmdq_read_context { + u8 opcode; + #define CMDQ_READ_CONTEXT_OPCODE_READ_CONTEXT 0x85UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 type_xid; + #define CMDQ_READ_CONTEXT_XID_MASK 0xffffffUL + #define 
CMDQ_READ_CONTEXT_XID_SFT 0 + #define CMDQ_READ_CONTEXT_TYPE_MASK 0xff000000UL + #define CMDQ_READ_CONTEXT_TYPE_SFT 24 + #define CMDQ_READ_CONTEXT_TYPE_QPC (0x0UL << 24) + #define CMDQ_READ_CONTEXT_TYPE_CQ (0x1UL << 24) + #define CMDQ_READ_CONTEXT_TYPE_MRW (0x2UL << 24) + #define CMDQ_READ_CONTEXT_TYPE_SRQ (0x3UL << 24) + __le32 unused_0; +}; + +/* Map TC to COS. Can only be issued from a PF (24 bytes) */ +struct cmdq_map_tc_to_cos { + u8 opcode; + #define CMDQ_MAP_TC_TO_COS_OPCODE_MAP_TC_TO_COS 0x8aUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le16 cos0; + #define CMDQ_MAP_TC_TO_COS_COS0_NO_CHANGE 0xffffUL + __le16 cos1; + #define CMDQ_MAP_TC_TO_COS_COS1_DISABLE 0x8000UL + #define CMDQ_MAP_TC_TO_COS_COS1_NO_CHANGE 0xffffUL + __le32 unused_0; +}; + +/* Query version command (16 bytes) */ +struct cmdq_query_version { + u8 opcode; + #define CMDQ_QUERY_VERSION_OPCODE_QUERY_VERSION 0x8bUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Command-Response Event Queue (CREQ) Structures */ +/* Base CREQ Record (16 bytes) */ +struct creq_base { + u8 type; + #define CREQ_BASE_TYPE_MASK 0x3fUL + #define CREQ_BASE_TYPE_SFT 0 + #define CREQ_BASE_TYPE_QP_EVENT 0x38UL + #define CREQ_BASE_TYPE_FUNC_EVENT 0x3aUL + #define CREQ_BASE_RESERVED2_MASK 0xc0UL + #define CREQ_BASE_RESERVED2_SFT 6 + u8 reserved56[7]; + u8 v; + #define CREQ_BASE_V 0x1UL + #define CREQ_BASE_RESERVED7_MASK 0xfeUL + #define CREQ_BASE_RESERVED7_SFT 1 + u8 event; + __le16 reserved48[3]; +}; + +/* RoCE Function Async Event Notification (16 bytes) */ +struct creq_func_event { + u8 type; + #define CREQ_FUNC_EVENT_TYPE_MASK 0x3fUL + #define CREQ_FUNC_EVENT_TYPE_SFT 0 + #define CREQ_FUNC_EVENT_TYPE_FUNC_EVENT 0x3aUL + #define CREQ_FUNC_EVENT_RESERVED2_MASK 0xc0UL + #define CREQ_FUNC_EVENT_RESERVED2_SFT 6 + u8 reserved56[7]; + u8 v; + #define CREQ_FUNC_EVENT_V 0x1UL + #define CREQ_FUNC_EVENT_RESERVED7_MASK 0xfeUL + #define CREQ_FUNC_EVENT_RESERVED7_SFT 1 + u8 event; + #define CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR 0x1UL + #define CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR 0x2UL + #define CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR 0x3UL + #define CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR 0x4UL + #define CREQ_FUNC_EVENT_EVENT_CQ_ERROR 0x5UL + #define CREQ_FUNC_EVENT_EVENT_TQM_ERROR 0x6UL + #define CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR 0x7UL + #define CREQ_FUNC_EVENT_EVENT_CFCS_ERROR 0x8UL + #define CREQ_FUNC_EVENT_EVENT_CFCC_ERROR 0x9UL + #define CREQ_FUNC_EVENT_EVENT_CFCM_ERROR 0xaUL + #define CREQ_FUNC_EVENT_EVENT_TIM_ERROR 0xbUL + #define CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST 0x80UL + #define CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED 0x81UL + __le16 reserved48[3]; +}; + +/* RoCE Slowpath Command Completion (16 bytes) */ +struct creq_qp_event { + u8 type; + #define CREQ_QP_EVENT_TYPE_MASK 0x3fUL + #define CREQ_QP_EVENT_TYPE_SFT 0 + #define CREQ_QP_EVENT_TYPE_QP_EVENT 0x38UL + #define CREQ_QP_EVENT_RESERVED2_MASK 0xc0UL + #define CREQ_QP_EVENT_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_QP_EVENT_V 0x1UL + #define CREQ_QP_EVENT_RESERVED7_MASK 0xfeUL + #define CREQ_QP_EVENT_RESERVED7_SFT 1 + u8 event; + #define CREQ_QP_EVENT_EVENT_CREATE_QP 0x1UL + #define CREQ_QP_EVENT_EVENT_DESTROY_QP 0x2UL + #define CREQ_QP_EVENT_EVENT_MODIFY_QP 0x3UL + #define CREQ_QP_EVENT_EVENT_QUERY_QP 0x4UL + #define CREQ_QP_EVENT_EVENT_CREATE_SRQ 0x5UL + #define CREQ_QP_EVENT_EVENT_DESTROY_SRQ 0x6UL + #define 
CREQ_QP_EVENT_EVENT_QUERY_SRQ 0x8UL + #define CREQ_QP_EVENT_EVENT_CREATE_CQ 0x9UL + #define CREQ_QP_EVENT_EVENT_DESTROY_CQ 0xaUL + #define CREQ_QP_EVENT_EVENT_RESIZE_CQ 0xcUL + #define CREQ_QP_EVENT_EVENT_ALLOCATE_MRW 0xdUL + #define CREQ_QP_EVENT_EVENT_DEALLOCATE_KEY 0xeUL + #define CREQ_QP_EVENT_EVENT_REGISTER_MR 0xfUL + #define CREQ_QP_EVENT_EVENT_DEREGISTER_MR 0x10UL + #define CREQ_QP_EVENT_EVENT_ADD_GID 0x11UL + #define CREQ_QP_EVENT_EVENT_DELETE_GID 0x12UL + #define CREQ_QP_EVENT_EVENT_MODIFY_GID 0x17UL + #define CREQ_QP_EVENT_EVENT_QUERY_GID 0x18UL + #define CREQ_QP_EVENT_EVENT_CREATE_QP1 0x13UL + #define CREQ_QP_EVENT_EVENT_DESTROY_QP1 0x14UL + #define CREQ_QP_EVENT_EVENT_CREATE_AH 0x15UL + #define CREQ_QP_EVENT_EVENT_DESTROY_AH 0x16UL + #define CREQ_QP_EVENT_EVENT_INITIALIZE_FW 0x80UL + #define CREQ_QP_EVENT_EVENT_DEINITIALIZE_FW 0x81UL + #define CREQ_QP_EVENT_EVENT_STOP_FUNC 0x82UL + #define CREQ_QP_EVENT_EVENT_QUERY_FUNC 0x83UL + #define CREQ_QP_EVENT_EVENT_SET_FUNC_RESOURCES 0x84UL + #define CREQ_QP_EVENT_EVENT_MAP_TC_TO_COS 0x8aUL + #define CREQ_QP_EVENT_EVENT_QUERY_VERSION 0x8bUL + #define CREQ_QP_EVENT_EVENT_MODIFY_CC 0x8cUL + #define CREQ_QP_EVENT_EVENT_QUERY_CC 0x8dUL + #define CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION 0xc0UL + __le16 reserved48[3]; +}; + +/* Create QP command response (16 bytes) */ +struct creq_create_qp_resp { + u8 type; + #define CREQ_CREATE_QP_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_QP_RESP_TYPE_SFT 0 + #define CREQ_CREATE_QP_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_QP_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_QP_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_QP_RESP_V 0x1UL + #define CREQ_CREATE_QP_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_QP_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_QP_RESP_EVENT_CREATE_QP 0x1UL + __le16 reserved48[3]; +}; + +/* Destroy QP command response (16 bytes) */ +struct creq_destroy_qp_resp { + u8 type; + #define CREQ_DESTROY_QP_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_QP_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_QP_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_QP_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_QP_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_QP_RESP_V 0x1UL + #define CREQ_DESTROY_QP_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_QP_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_QP_RESP_EVENT_DESTROY_QP 0x2UL + __le16 reserved48[3]; +}; + +/* Modify QP command response (16 bytes) */ +struct creq_modify_qp_resp { + u8 type; + #define CREQ_MODIFY_QP_RESP_TYPE_MASK 0x3fUL + #define CREQ_MODIFY_QP_RESP_TYPE_SFT 0 + #define CREQ_MODIFY_QP_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_MODIFY_QP_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_MODIFY_QP_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_MODIFY_QP_RESP_V 0x1UL + #define CREQ_MODIFY_QP_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_MODIFY_QP_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_MODIFY_QP_RESP_EVENT_MODIFY_QP 0x3UL + __le16 reserved48[3]; +}; + +/* Query QP command response (16 bytes) */ +struct creq_query_qp_resp { + u8 type; + #define CREQ_QUERY_QP_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_QP_RESP_TYPE_SFT 0 + #define CREQ_QUERY_QP_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_QP_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_QP_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_QP_RESP_V 0x1UL + #define 
CREQ_QUERY_QP_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_QP_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_QP_RESP_EVENT_QUERY_QP 0x4UL + __le16 reserved48[3]; +}; + +/* Query QP command response side buffer structure (104 bytes) */ +struct creq_query_qp_resp_sb { + u8 opcode; + #define CREQ_QUERY_QP_RESP_SB_OPCODE_QUERY_QP 0x4UL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + __le32 xid; + u8 en_sqd_async_notify_state; + #define CREQ_QUERY_QP_RESP_SB_STATE_MASK 0xfUL + #define CREQ_QUERY_QP_RESP_SB_STATE_SFT 0 + #define CREQ_QUERY_QP_RESP_SB_STATE_RESET 0x0UL + #define CREQ_QUERY_QP_RESP_SB_STATE_INIT 0x1UL + #define CREQ_QUERY_QP_RESP_SB_STATE_RTR 0x2UL + #define CREQ_QUERY_QP_RESP_SB_STATE_RTS 0x3UL + #define CREQ_QUERY_QP_RESP_SB_STATE_SQD 0x4UL + #define CREQ_QUERY_QP_RESP_SB_STATE_SQE 0x5UL + #define CREQ_QUERY_QP_RESP_SB_STATE_ERR 0x6UL + #define CREQ_QUERY_QP_RESP_SB_EN_SQD_ASYNC_NOTIFY 0x10UL + u8 access; + #define CREQ_QUERY_QP_RESP_SB_ACCESS_LOCAL_WRITE 0x1UL + #define CREQ_QUERY_QP_RESP_SB_ACCESS_REMOTE_WRITE 0x2UL + #define CREQ_QUERY_QP_RESP_SB_ACCESS_REMOTE_READ 0x4UL + #define CREQ_QUERY_QP_RESP_SB_ACCESS_REMOTE_ATOMIC 0x8UL + __le16 pkey; + __le32 qkey; + __le32 reserved32; + __le32 dgid[4]; + __le32 flow_label; + __le16 sgid_index; + u8 hop_limit; + u8 traffic_class; + __le16 dest_mac[3]; + __le16 path_mtu_dest_vlan_id; + #define CREQ_QUERY_QP_RESP_SB_DEST_VLAN_ID_MASK 0xfffUL + #define CREQ_QUERY_QP_RESP_SB_DEST_VLAN_ID_SFT 0 + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MASK 0xf000UL + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_SFT 12 + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_256 (0x0UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_512 (0x1UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_1024 (0x2UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_2048 (0x3UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_4096 (0x4UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_8192 (0x5UL << 12) + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 min_rnr_timer; + __le32 rq_psn; + __le32 sq_psn; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + u8 tos_dscp_tos_ecn; + #define CREQ_QUERY_QP_RESP_SB_TOS_ECN_MASK 0x3UL + #define CREQ_QUERY_QP_RESP_SB_TOS_ECN_SFT 0 + #define CREQ_QUERY_QP_RESP_SB_TOS_DSCP_MASK 0xfcUL + #define CREQ_QUERY_QP_RESP_SB_TOS_DSCP_SFT 2 + u8 enable_cc; + #define CREQ_QUERY_QP_RESP_SB_ENABLE_CC 0x1UL + #define CREQ_QUERY_QP_RESP_SB_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_QP_RESP_SB_RESERVED7_SFT 1 + __le32 sq_size; + __le32 rq_size; + __le16 sq_sge; + __le16 rq_sge; + __le32 max_inline_data; + __le32 dest_qp_id; + __le32 unused_1; + __le16 src_mac[3]; + __le16 vlan_pcp_vlan_dei_vlan_id; + #define CREQ_QUERY_QP_RESP_SB_VLAN_ID_MASK 0xfffUL + #define CREQ_QUERY_QP_RESP_SB_VLAN_ID_SFT 0 + #define CREQ_QUERY_QP_RESP_SB_VLAN_DEI 0x1000UL + #define CREQ_QUERY_QP_RESP_SB_VLAN_PCP_MASK 0xe000UL + #define CREQ_QUERY_QP_RESP_SB_VLAN_PCP_SFT 13 +}; + +/* Create SRQ command response (16 bytes) */ +struct creq_create_srq_resp { + u8 type; + #define CREQ_CREATE_SRQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_SRQ_RESP_TYPE_SFT 0 + #define CREQ_CREATE_SRQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_SRQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_SRQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_SRQ_RESP_V 0x1UL + #define CREQ_CREATE_SRQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_SRQ_RESP_RESERVED7_SFT 1 + u8 event; + #define 
CREQ_CREATE_SRQ_RESP_EVENT_CREATE_SRQ 0x5UL + __le16 reserved48[3]; +}; + +/* Destroy SRQ command response (16 bytes) */ +struct creq_destroy_srq_resp { + u8 type; + #define CREQ_DESTROY_SRQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_SRQ_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_SRQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_SRQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_SRQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_SRQ_RESP_V 0x1UL + #define CREQ_DESTROY_SRQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_SRQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_SRQ_RESP_EVENT_DESTROY_SRQ 0x6UL + __le16 enable_for_arm[3]; + #define CREQ_DESTROY_SRQ_RESP_ENABLE_FOR_ARM_MASK 0x30000UL + #define CREQ_DESTROY_SRQ_RESP_ENABLE_FOR_ARM_SFT 16 + #define CREQ_DESTROY_SRQ_RESP_RESERVED46_MASK 0xfffc0000UL + #define CREQ_DESTROY_SRQ_RESP_RESERVED46_SFT 18 +}; + +/* Query SRQ command response (16 bytes) */ +struct creq_query_srq_resp { + u8 type; + #define CREQ_QUERY_SRQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_SRQ_RESP_TYPE_SFT 0 + #define CREQ_QUERY_SRQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_SRQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_SRQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_SRQ_RESP_V 0x1UL + #define CREQ_QUERY_SRQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_SRQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_SRQ_RESP_EVENT_QUERY_SRQ 0x8UL + __le16 reserved48[3]; +}; + +/* Query SRQ command response side buffer structure (24 bytes) */ +struct creq_query_srq_resp_sb { + u8 opcode; + #define CREQ_QUERY_SRQ_RESP_SB_OPCODE_QUERY_SRQ 0x8UL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + __le32 xid; + __le16 srq_limit; + __le16 reserved16; + __le32 data[4]; +}; + +/* Create CQ command Response (16 bytes) */ +struct creq_create_cq_resp { + u8 type; + #define CREQ_CREATE_CQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_CQ_RESP_TYPE_SFT 0 + #define CREQ_CREATE_CQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_CQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_CQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_CQ_RESP_V 0x1UL + #define CREQ_CREATE_CQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_CQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_CQ_RESP_EVENT_CREATE_CQ 0x9UL + __le16 reserved48[3]; +}; + +/* Destroy CQ command response (16 bytes) */ +struct creq_destroy_cq_resp { + u8 type; + #define CREQ_DESTROY_CQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_CQ_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_CQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_CQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_CQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_CQ_RESP_V 0x1UL + #define CREQ_DESTROY_CQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_CQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_CQ_RESP_EVENT_DESTROY_CQ 0xaUL + __le16 cq_arm_lvl; + #define CREQ_DESTROY_CQ_RESP_CQ_ARM_LVL_MASK 0x3UL + #define CREQ_DESTROY_CQ_RESP_CQ_ARM_LVL_SFT 0 + #define CREQ_DESTROY_CQ_RESP_RESERVED14_MASK 0xfffcUL + #define CREQ_DESTROY_CQ_RESP_RESERVED14_SFT 2 + __le16 total_cnq_events; + __le16 reserved16; +}; + +/* Resize CQ command response (16 bytes) */ +struct creq_resize_cq_resp { + u8 type; + #define CREQ_RESIZE_CQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_RESIZE_CQ_RESP_TYPE_SFT 0 + #define 
CREQ_RESIZE_CQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_RESIZE_CQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_RESIZE_CQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_RESIZE_CQ_RESP_V 0x1UL + #define CREQ_RESIZE_CQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_RESIZE_CQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_RESIZE_CQ_RESP_EVENT_RESIZE_CQ 0xcUL + __le16 reserved48[3]; +}; + +/* Allocate MRW command response (16 bytes) */ +struct creq_allocate_mrw_resp { + u8 type; + #define CREQ_ALLOCATE_MRW_RESP_TYPE_MASK 0x3fUL + #define CREQ_ALLOCATE_MRW_RESP_TYPE_SFT 0 + #define CREQ_ALLOCATE_MRW_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_ALLOCATE_MRW_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_ALLOCATE_MRW_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_ALLOCATE_MRW_RESP_V 0x1UL + #define CREQ_ALLOCATE_MRW_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_ALLOCATE_MRW_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_ALLOCATE_MRW_RESP_EVENT_ALLOCATE_MRW 0xdUL + __le16 reserved48[3]; +}; + +/* De-allocate key command response (16 bytes) */ +struct creq_deallocate_key_resp { + u8 type; + #define CREQ_DEALLOCATE_KEY_RESP_TYPE_MASK 0x3fUL + #define CREQ_DEALLOCATE_KEY_RESP_TYPE_SFT 0 + #define CREQ_DEALLOCATE_KEY_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DEALLOCATE_KEY_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DEALLOCATE_KEY_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DEALLOCATE_KEY_RESP_V 0x1UL + #define CREQ_DEALLOCATE_KEY_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DEALLOCATE_KEY_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DEALLOCATE_KEY_RESP_EVENT_DEALLOCATE_KEY 0xeUL + __le16 reserved16; + __le32 bound_window_info; +}; + +/* Register MR command response (16 bytes) */ +struct creq_register_mr_resp { + u8 type; + #define CREQ_REGISTER_MR_RESP_TYPE_MASK 0x3fUL + #define CREQ_REGISTER_MR_RESP_TYPE_SFT 0 + #define CREQ_REGISTER_MR_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_REGISTER_MR_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_REGISTER_MR_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_REGISTER_MR_RESP_V 0x1UL + #define CREQ_REGISTER_MR_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_REGISTER_MR_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_REGISTER_MR_RESP_EVENT_REGISTER_MR 0xfUL + __le16 reserved48[3]; +}; + +/* Deregister MR command response (16 bytes) */ +struct creq_deregister_mr_resp { + u8 type; + #define CREQ_DEREGISTER_MR_RESP_TYPE_MASK 0x3fUL + #define CREQ_DEREGISTER_MR_RESP_TYPE_SFT 0 + #define CREQ_DEREGISTER_MR_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DEREGISTER_MR_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DEREGISTER_MR_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DEREGISTER_MR_RESP_V 0x1UL + #define CREQ_DEREGISTER_MR_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DEREGISTER_MR_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DEREGISTER_MR_RESP_EVENT_DEREGISTER_MR 0x10UL + __le16 reserved16; + __le32 bound_windows; +}; + +/* Add GID command response (16 bytes) */ +struct creq_add_gid_resp { + u8 type; + #define CREQ_ADD_GID_RESP_TYPE_MASK 0x3fUL + #define CREQ_ADD_GID_RESP_TYPE_SFT 0 + #define CREQ_ADD_GID_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_ADD_GID_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_ADD_GID_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_ADD_GID_RESP_V 0x1UL + #define CREQ_ADD_GID_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_ADD_GID_RESP_RESERVED7_SFT 1 + u8 
event; + #define CREQ_ADD_GID_RESP_EVENT_ADD_GID 0x11UL + __le16 reserved48[3]; +}; + +/* Delete GID command response (16 bytes) */ +struct creq_delete_gid_resp { + u8 type; + #define CREQ_DELETE_GID_RESP_TYPE_MASK 0x3fUL + #define CREQ_DELETE_GID_RESP_TYPE_SFT 0 + #define CREQ_DELETE_GID_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DELETE_GID_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DELETE_GID_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DELETE_GID_RESP_V 0x1UL + #define CREQ_DELETE_GID_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DELETE_GID_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DELETE_GID_RESP_EVENT_DELETE_GID 0x12UL + __le16 reserved48[3]; +}; + +/* Modify GID command response (16 bytes) */ +struct creq_modify_gid_resp { + u8 type; + #define CREQ_MODIFY_GID_RESP_TYPE_MASK 0x3fUL + #define CREQ_MODIFY_GID_RESP_TYPE_SFT 0 + #define CREQ_MODIFY_GID_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_MODIFY_GID_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_MODIFY_GID_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_MODIFY_GID_RESP_V 0x1UL + #define CREQ_MODIFY_GID_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_MODIFY_GID_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_MODIFY_GID_RESP_EVENT_ADD_GID 0x11UL + __le16 reserved48[3]; +}; + +/* Query GID command response (16 bytes) */ +struct creq_query_gid_resp { + u8 type; + #define CREQ_QUERY_GID_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_GID_RESP_TYPE_SFT 0 + #define CREQ_QUERY_GID_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_GID_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_GID_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_GID_RESP_V 0x1UL + #define CREQ_QUERY_GID_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_GID_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_GID_RESP_EVENT_QUERY_GID 0x18UL + __le16 reserved48[3]; +}; + +/* Query GID command response side buffer structure (40 bytes) */ +struct creq_query_gid_resp_sb { + u8 opcode; + #define CREQ_QUERY_GID_RESP_SB_OPCODE_QUERY_GID 0x18UL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + __le32 gid[4]; + __le16 src_mac[3]; + __le16 vlan; + #define CREQ_QUERY_GID_RESP_SB_VLAN_VLAN_ID_MASK 0xfffUL + #define CREQ_QUERY_GID_RESP_SB_VLAN_VLAN_ID_SFT 0 + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_MASK 0x7000UL + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_SFT 12 + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_88A8 (0x0UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_8100 (0x1UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_9100 (0x2UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_9200 (0x3UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_9300 (0x4UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_CFG1 (0x5UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_CFG2 (0x6UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_CFG3 (0x7UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_LAST \ + CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_CFG3 + #define CREQ_QUERY_GID_RESP_SB_VLAN_VLAN_EN 0x8000UL + __le16 ipid; + __le16 gid_index; + __le32 unused_0; +}; + +/* Create QP1 command response (16 bytes) */ +struct creq_create_qp1_resp { + u8 type; + #define CREQ_CREATE_QP1_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_QP1_RESP_TYPE_SFT 0 + #define CREQ_CREATE_QP1_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_QP1_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_QP1_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + 
__le32 xid; + u8 v; + #define CREQ_CREATE_QP1_RESP_V 0x1UL + #define CREQ_CREATE_QP1_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_QP1_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_QP1_RESP_EVENT_CREATE_QP1 0x13UL + __le16 reserved48[3]; +}; + +/* Destroy QP1 command response (16 bytes) */ +struct creq_destroy_qp1_resp { + u8 type; + #define CREQ_DESTROY_QP1_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_QP1_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_QP1_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_QP1_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_QP1_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_QP1_RESP_V 0x1UL + #define CREQ_DESTROY_QP1_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_QP1_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_QP1_RESP_EVENT_DESTROY_QP1 0x14UL + __le16 reserved48[3]; +}; + +/* Create AH command response (16 bytes) */ +struct creq_create_ah_resp { + u8 type; + #define CREQ_CREATE_AH_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_AH_RESP_TYPE_SFT 0 + #define CREQ_CREATE_AH_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_AH_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_AH_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_AH_RESP_V 0x1UL + #define CREQ_CREATE_AH_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_AH_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_AH_RESP_EVENT_CREATE_AH 0x15UL + __le16 reserved48[3]; +}; + +/* Destroy AH command response (16 bytes) */ +struct creq_destroy_ah_resp { + u8 type; + #define CREQ_DESTROY_AH_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_AH_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_AH_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_AH_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_AH_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_AH_RESP_V 0x1UL + #define CREQ_DESTROY_AH_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_AH_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_AH_RESP_EVENT_DESTROY_AH 0x16UL + __le16 reserved48[3]; +}; + +/* Initialize Firmware command response (16 bytes) */ +struct creq_initialize_fw_resp { + u8 type; + #define CREQ_INITIALIZE_FW_RESP_TYPE_MASK 0x3fUL + #define CREQ_INITIALIZE_FW_RESP_TYPE_SFT 0 + #define CREQ_INITIALIZE_FW_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_INITIALIZE_FW_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_INITIALIZE_FW_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_INITIALIZE_FW_RESP_V 0x1UL + #define CREQ_INITIALIZE_FW_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_INITIALIZE_FW_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_INITIALIZE_FW_RESP_EVENT_INITIALIZE_FW 0x80UL + __le16 reserved48[3]; +}; + +/* De-initialize Firmware command response (16 bytes) */ +struct creq_deinitialize_fw_resp { + u8 type; + #define CREQ_DEINITIALIZE_FW_RESP_TYPE_MASK 0x3fUL + #define CREQ_DEINITIALIZE_FW_RESP_TYPE_SFT 0 + #define CREQ_DEINITIALIZE_FW_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DEINITIALIZE_FW_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DEINITIALIZE_FW_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_DEINITIALIZE_FW_RESP_V 0x1UL + #define CREQ_DEINITIALIZE_FW_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DEINITIALIZE_FW_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DEINITIALIZE_FW_RESP_EVENT_DEINITIALIZE_FW 0x81UL + __le16 reserved48[3]; +}; + +/* Stop function command response (16 bytes) */ +struct creq_stop_func_resp { + u8 
type; + #define CREQ_STOP_FUNC_RESP_TYPE_MASK 0x3fUL + #define CREQ_STOP_FUNC_RESP_TYPE_SFT 0 + #define CREQ_STOP_FUNC_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_STOP_FUNC_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_STOP_FUNC_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_STOP_FUNC_RESP_V 0x1UL + #define CREQ_STOP_FUNC_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_STOP_FUNC_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_STOP_FUNC_RESP_EVENT_STOP_FUNC 0x82UL + __le16 reserved48[3]; +}; + +/* Query function command response (16 bytes) */ +struct creq_query_func_resp { + u8 type; + #define CREQ_QUERY_FUNC_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_FUNC_RESP_TYPE_SFT 0 + #define CREQ_QUERY_FUNC_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_FUNC_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_FUNC_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_FUNC_RESP_V 0x1UL + #define CREQ_QUERY_FUNC_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_FUNC_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_FUNC_RESP_EVENT_QUERY_FUNC 0x83UL + __le16 reserved48[3]; +}; + +/* Query function command response side buffer structure (88 bytes) */ +struct creq_query_func_resp_sb { + u8 opcode; + #define CREQ_QUERY_FUNC_RESP_SB_OPCODE_QUERY_FUNC 0x83UL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + __le64 max_mr_size; + __le32 max_qp; + __le16 max_qp_wr; + __le16 dev_cap_flags; + #define CREQ_QUERY_FUNC_RESP_SB_DEV_CAP_FLAGS_RESIZE_QP 0x1UL + __le32 max_cq; + __le32 max_cqe; + __le32 max_pd; + u8 max_sge; + u8 max_srq_sge; + u8 max_qp_rd_atom; + u8 max_qp_init_rd_atom; + __le32 max_mr; + __le32 max_mw; + __le32 max_raw_eth_qp; + __le32 max_ah; + __le32 max_fmr; + __le32 max_srq_wr; + __le32 max_pkeys; + __le32 max_inline_data; + u8 max_map_per_fmr; + u8 l2_db_space_size; + __le16 max_srq; + __le32 max_gid; + __le32 tqm_alloc_reqs[8]; +}; + +/* Set resources command response (16 bytes) */ +struct creq_set_func_resources_resp { + u8 type; + #define CREQ_SET_FUNC_RESOURCES_RESP_TYPE_MASK 0x3fUL + #define CREQ_SET_FUNC_RESOURCES_RESP_TYPE_SFT 0 + #define CREQ_SET_FUNC_RESOURCES_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_SET_FUNC_RESOURCES_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_SET_FUNC_RESOURCES_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_SET_FUNC_RESOURCES_RESP_V 0x1UL + #define CREQ_SET_FUNC_RESOURCES_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_SET_FUNC_RESOURCES_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_SET_FUNC_RESOURCES_RESP_EVENT_SET_FUNC_RESOURCES 0x84UL + __le16 reserved48[3]; +}; + +/* Map TC to COS response (16 bytes) */ +struct creq_map_tc_to_cos_resp { + u8 type; + #define CREQ_MAP_TC_TO_COS_RESP_TYPE_MASK 0x3fUL + #define CREQ_MAP_TC_TO_COS_RESP_TYPE_SFT 0 + #define CREQ_MAP_TC_TO_COS_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_MAP_TC_TO_COS_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_MAP_TC_TO_COS_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_MAP_TC_TO_COS_RESP_V 0x1UL + #define CREQ_MAP_TC_TO_COS_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_MAP_TC_TO_COS_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_MAP_TC_TO_COS_RESP_EVENT_MAP_TC_TO_COS 0x8aUL + __le16 reserved48[3]; +}; + +/* Query version response (16 bytes) */ +struct creq_query_version_resp { + u8 type; + #define CREQ_QUERY_VERSION_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_VERSION_RESP_TYPE_SFT 0 + #define 
CREQ_QUERY_VERSION_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_VERSION_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_VERSION_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + u8 fw_maj; + u8 fw_minor; + u8 fw_bld; + u8 fw_rsvd; + u8 v; + #define CREQ_QUERY_VERSION_RESP_V 0x1UL + #define CREQ_QUERY_VERSION_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_VERSION_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_VERSION_RESP_EVENT_QUERY_VERSION 0x8bUL + __le16 reserved16; + u8 intf_maj; + u8 intf_minor; + u8 intf_bld; + u8 intf_rsvd; +}; + +/* Modify congestion control command response (16 bytes) */ +struct creq_modify_cc_resp { + u8 type; + #define CREQ_MODIFY_CC_RESP_TYPE_MASK 0x3fUL + #define CREQ_MODIFY_CC_RESP_TYPE_SFT 0 + #define CREQ_MODIFY_CC_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_MODIFY_CC_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_MODIFY_CC_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_MODIFY_CC_RESP_V 0x1UL + #define CREQ_MODIFY_CC_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_MODIFY_CC_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_MODIFY_CC_RESP_EVENT_MODIFY_CC 0x8cUL + __le16 reserved48[3]; +}; + +/* Query congestion control command response (16 bytes) */ +struct creq_query_cc_resp { + u8 type; + #define CREQ_QUERY_CC_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_CC_RESP_TYPE_SFT 0 + #define CREQ_QUERY_CC_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_CC_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_CC_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_CC_RESP_V 0x1UL + #define CREQ_QUERY_CC_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_CC_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_CC_RESP_EVENT_QUERY_CC 0x8dUL + __le16 reserved48[3]; +}; + +/* Query congestion control command response side buffer structure (32 bytes) */ +struct creq_query_cc_resp_sb { + u8 opcode; + #define CREQ_QUERY_CC_RESP_SB_OPCODE_QUERY_CC 0x8dUL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + u8 enable_cc; + #define CREQ_QUERY_CC_RESP_SB_ENABLE_CC 0x1UL + u8 g; + #define CREQ_QUERY_CC_RESP_SB_G_MASK 0x7UL + #define CREQ_QUERY_CC_RESP_SB_G_SFT 0 + u8 num_phases_per_state; + __le16 init_cr; + u8 unused_2; + __le16 unused_3; + u8 unused_4; + __le16 init_tr; + u8 tos_dscp_tos_ecn; + #define CREQ_QUERY_CC_RESP_SB_TOS_ECN_MASK 0x3UL + #define CREQ_QUERY_CC_RESP_SB_TOS_ECN_SFT 0 + #define CREQ_QUERY_CC_RESP_SB_TOS_DSCP_MASK 0xfcUL + #define CREQ_QUERY_CC_RESP_SB_TOS_DSCP_SFT 2 + __le64 reserved64; + __le64 reserved64_1; +}; + +/* QP error notification event (16 bytes) */ +struct creq_qp_error_notification { + u8 type; + #define CREQ_QP_ERROR_NOTIFICATION_TYPE_MASK 0x3fUL + #define CREQ_QP_ERROR_NOTIFICATION_TYPE_SFT 0 + #define CREQ_QP_ERROR_NOTIFICATION_TYPE_QP_EVENT 0x38UL + #define CREQ_QP_ERROR_NOTIFICATION_RESERVED2_MASK 0xc0UL + #define CREQ_QP_ERROR_NOTIFICATION_RESERVED2_SFT 6 + u8 status; + u8 req_slow_path_state; + u8 req_err_state_reason; + __le32 xid; + u8 v; + #define CREQ_QP_ERROR_NOTIFICATION_V 0x1UL + #define CREQ_QP_ERROR_NOTIFICATION_RESERVED7_MASK 0xfeUL + #define CREQ_QP_ERROR_NOTIFICATION_RESERVED7_SFT 1 + u8 event; + #define CREQ_QP_ERROR_NOTIFICATION_EVENT_QP_ERROR_NOTIFICATION 0xc0UL + u8 res_slow_path_state; + u8 res_err_state_reason; + __le16 sq_cons_idx; + __le16 rq_cons_idx; +}; + +/* RoCE Slowpath HSI Specification 1.6.0 */ +#define ROCE_SP_HSI_VERSION_MAJOR 1 +#define ROCE_SP_HSI_VERSION_MINOR 6 +#define ROCE_SP_HSI_VERSION_UPDATE 0 + 
+#define ROCE_SP_HSI_VERSION_STR "1.6.0" +/* + * Following is the signature for ROCE_SP_HSI message field that indicates not + * applicable (All F's). Need to cast it to the size of the field if needed. + */ +#define ROCE_SP_HSI_NA_SIGNATURE ((__le32)(-1)) +#endif /* __BNXT_RE_HSI_H__ */ diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h new file mode 100644 index 000000000000..e2c8a3f0ccec --- /dev/null +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -0,0 +1,89 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Uverbs ABI header file + */ + +#ifndef __BNXT_RE_UVERBS_ABI_H__ +#define __BNXT_RE_UVERBS_ABI_H__ + +#define BNXT_RE_ABI_VERSION 1 + +struct bnxt_re_uctx_resp { + __u32 dev_id; + __u32 max_qp; + __u32 pg_size; + __u32 cqe_sz; + __u32 max_cqd; + __u32 rsvd; +}; + +struct bnxt_re_pd_resp { + __u32 pdid; + __u32 dpi; + __u64 dbr; +}; + +struct bnxt_re_cq_req { + __u64 cq_va; + __u64 cq_handle; +}; + +struct bnxt_re_cq_resp { + __u32 cqid; + __u32 tail; + __u32 phase; + __u32 rsvd; +}; + +struct bnxt_re_qp_req { + __u64 qpsva; + __u64 qprva; + __u64 qp_handle; +}; + +struct bnxt_re_qp_resp { + __u32 qpid; + __u32 rsvd; +}; + +enum bnxt_re_shpg_offt { + BNXT_RE_BEG_RESV_OFFT = 0x00, + BNXT_RE_AVID_OFFT = 0x10, + BNXT_RE_AVID_SIZE = 0x04, + BNXT_RE_END_RESV_OFFT = 0xFF0 +}; + +#endif /* __BNXT_RE_UVERBS_ABI_H__*/ -- cgit v1.2.3
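The bnxt_re_shpg_offt enum above fixes a small shared-page layout: a reserved header, a 4-byte word at offset 0x10 that, judging by the field names, carries an address-vector id (AVID), and reserved space up to 0xFF0. A minimal sketch of how a user-space provider might consume that layout, assuming it has already obtained a mapping of the shared page (the mmap mechanics are a driver detail, not defined by this header):

/*
 * Hedged sketch, not part of the patch: read the 4-byte AVID that the
 * driver is assumed to publish in the shared page. "shpg" is assumed
 * to be a valid mapping of that page; the helper name is illustrative.
 */
#include <stdint.h>
#include <string.h>

static uint32_t bnxt_re_shpg_read_avid(const void *shpg)
{
	uint32_t avid;

	/* BNXT_RE_AVID_OFFT = 0x10, BNXT_RE_AVID_SIZE = 0x04 */
	memcpy(&avid, (const uint8_t *)shpg + 0x10, sizeof(avid));
	return avid;
}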
From 94e03f11ad1f8c947b69fa187412ff04783b2a96 Mon Sep 17 00:00:00 2001 From: Moses Reuben Date: Wed, 18 Jan 2017 14:59:49 +0200 Subject: IB/uverbs: Add support for flow tag The struct ib_uverbs_flow_spec_action_tag associates a tag_id with the flow defined by any number of other flow_spec entries which can reference L2, L3, and L4 packet contents. Use of ib_uverbs_flow_spec_action_tag allows the consumer to identify the set of rules which were matched by the packet, by examining the tag_id in the CQE. Signed-off-by: Moses Reuben Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 35 +++++++++++++++++++++++++++++++++-- include/uapi/rdma/ib_user_verbs.h | 13 +++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 455034ac994e..e1bedf0bac04 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -228,6 +228,7 @@ struct ib_uverbs_flow_spec { struct ib_uverbs_flow_spec_ipv4 ipv4; struct ib_uverbs_flow_spec_tcp_udp tcp_udp; struct ib_uverbs_flow_spec_ipv6 ipv6; + struct ib_uverbs_flow_spec_action_tag flow_tag; }; }; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 700782203483..0834dce0a490 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3143,6 +3143,25 @@ out_put: return ret ? ret : in_len; } +static int kern_spec_to_ib_spec_action(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + ib_spec->type = kern_spec->type; + switch (ib_spec->type) { + case IB_FLOW_SPEC_ACTION_TAG: + if (kern_spec->flow_tag.size != + sizeof(struct ib_uverbs_flow_spec_action_tag)) + return -EINVAL; + + ib_spec->flow_tag.size = sizeof(struct ib_flow_spec_action_tag); + ib_spec->flow_tag.tag_id = kern_spec->flow_tag.tag_id; + break; + default: + return -EINVAL; + } + return 0; +} + static size_t kern_spec_filter_sz(struct ib_uverbs_flow_spec_hdr *spec) { /* Returns user space filter size, includes padding */ @@ -3167,8 +3186,8 @@ static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size, return kern_filter_size; } -static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, - union ib_flow_spec *ib_spec) +static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) { ssize_t actual_filter_sz; ssize_t kern_filter_sz; @@ -3263,6 +3282,18 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, return 0; } +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + if (kern_spec->reserved) + return -EINVAL; + + if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG) + return kern_spec_to_ib_spec_action(kern_spec, ib_spec); + else + return kern_spec_to_ib_spec_filter(kern_spec, ib_spec); +} + int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index dfdfe4e92d31..b458fea590b6 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -929,6 +929,19 @@ struct ib_uverbs_flow_spec_ipv6 { struct ib_uverbs_flow_ipv6_filter mask; }; +struct ib_uverbs_flow_spec_action_tag { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 tag_id; + __u32 reserved1; +}; + struct ib_uverbs_flow_tunnel_filter { __be32 tunnel_id; }; -- cgit v1.2.3
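Two details of kern_spec_to_ib_spec_action() above shape the user-space side: a spec whose size field is not exactly sizeof(struct ib_uverbs_flow_spec_action_tag) is rejected with -EINVAL, and kern_spec_to_ib_spec() routes any type at or above IB_FLOW_SPEC_ACTION_TAG to the action path while rejecting a nonzero reserved field. A minimal user-space sketch of shaping such a spec, assuming the kernel's enum value for IB_FLOW_SPEC_ACTION_TAG (0x1000 here) and a hand-rolled struct mirror; real applications would let libibverbs build this:

/*
 * Hedged sketch: fill a flow-tag action spec as the kernel's size
 * check expects. The enum value and the mirrored layout are
 * assumptions for illustration only.
 */
#include <stdint.h>
#include <string.h>

#define HYP_IB_FLOW_SPEC_ACTION_TAG 0x1000 /* assumed enum value */

struct hyp_flow_spec_action_tag { /* mirrors ib_uverbs_flow_spec_action_tag */
	uint32_t type;
	uint16_t size;
	uint16_t reserved;
	uint32_t tag_id;
	uint32_t reserved1;
};

static void fill_flow_tag_spec(struct hyp_flow_spec_action_tag *spec,
			       uint32_t tag)
{
	memset(spec, 0, sizeof(*spec)); /* reserved fields must be zero */
	spec->type = HYP_IB_FLOW_SPEC_ACTION_TAG;
	spec->size = sizeof(*spec); /* must equal the uapi struct size */
	spec->tag_id = tag;	    /* reported back in the CQE on a match */
}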
as part of query device. Signed-off-by: Noa Osherovich Reviewed-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 6 ++++++ include/uapi/rdma/ib_user_verbs.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 0834dce0a490..e4ae0c62df36 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -4354,6 +4354,12 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.max_wq_type_rq = attr.max_wq_type_rq; resp.response_length += sizeof(resp.max_wq_type_rq); + + if (ucore->outlen < resp.response_length + sizeof(resp.raw_packet_caps)) + goto end; + + resp.raw_packet_caps = attr.raw_packet_caps; + resp.response_length += sizeof(resp.raw_packet_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index b458fea590b6..0db9e646edd3 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -247,7 +247,7 @@ struct ib_uverbs_ex_query_device_resp { __u64 device_cap_flags_ex; struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; - __u32 reserved; + __u32 raw_packet_caps; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From af1cb95d2e34133e0cf7f48d6045da888414b867 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Wed, 18 Jan 2017 15:39:59 +0200 Subject: IB/uverbs: Enable WQ creation and modification with cvlan offload Enable user space applications to turn cvlan offload on and off via WQ creation and modification. Signed-off-by: Noa Osherovich Reviewed-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 9 ++++++++- include/uapi/rdma/ib_user_verbs.h | 4 ++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e4ae0c62df36..0eb204380bff 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3356,6 +3356,9 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, wq_init_attr.wq_context = file; wq_init_attr.wq_type = cmd.wq_type; wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + if (ucore->inlen >= (offsetof(typeof(cmd), create_flags) + + sizeof(cmd.create_flags))) + wq_init_attr.create_flags = cmd.create_flags; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); wq = pd->device->create_wq(pd, &wq_init_attr, uhw); @@ -3511,7 +3514,7 @@ int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, if (!cmd.attr_mask) return -EINVAL; - if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE)) + if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS)) return -EINVAL; wq = idr_read_wq(cmd.wq_handle, file->ucontext); @@ -3520,6 +3523,10 @@ int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, wq_attr.curr_wq_state = cmd.curr_wq_state; wq_attr.wq_state = cmd.wq_state; + if (cmd.attr_mask & IB_WQ_FLAGS) { + wq_attr.flags = cmd.flags; + wq_attr.flags_mask = cmd.flags_mask; + } ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw); put_wq_read(wq); return ret; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 0db9e646edd3..f8723580ffed 100644 ---
a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -1061,6 +1061,8 @@ struct ib_uverbs_ex_create_wq { __u32 cq_handle; __u32 max_wr; __u32 max_sge; + __u32 create_flags; /* Use enum ib_wq_flags */ + __u32 reserved; }; struct ib_uverbs_ex_create_wq_resp { @@ -1089,6 +1091,8 @@ struct ib_uverbs_ex_modify_wq { __u32 wq_handle; __u32 wq_state; __u32 curr_wq_state; + __u32 flags; /* Use enum ib_wq_flags */ + __u32 flags_mask; /* Use enum ib_wq_flags */ }; /* Prevent memory allocation rather than max expected size */ -- cgit v1.2.3 From ab520be8cd5d56867fc95cfbc34b90880faf1f9d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Mon, 13 Feb 2017 17:03:23 +0000 Subject: xen/privcmd: Add IOCTL_PRIVCMD_DM_OP Recently a new dm_op[1] hypercall was added to Xen to provide a mechanism for restricting device emulators (such as QEMU) to a limited set of hypervisor operations, and to allow those operations to be audited in the kernel of the domain in which they run. This patch adds IOCTL_PRIVCMD_DM_OP as a gateway for __HYPERVISOR_dm_op. NOTE: There is no requirement for user-space code to bounce data through locked memory buffers (as with IOCTL_PRIVCMD_HYPERCALL) since privcmd has enough information to lock the original buffers directly. [1] http://xenbits.xen.org/gitweb/?p=xen.git;a=commit;h=524a98c2 Signed-off-by: Paul Durrant Acked-by: Stefano Stabellini Signed-off-by: Boris Ostrovsky --- arch/arm/xen/enlighten.c | 1 + arch/arm/xen/hypercall.S | 1 + arch/arm64/xen/hypercall.S | 1 + arch/x86/include/asm/xen/hypercall.h | 7 ++ drivers/xen/privcmd.c | 139 +++++++++++++++++++++++++++++++++++ include/uapi/xen/privcmd.h | 13 ++++ include/xen/arm/hypercall.h | 1 + include/xen/interface/hvm/dm_op.h | 32 ++++++++ include/xen/interface/xen.h | 1 + 9 files changed, 196 insertions(+) create mode 100644 include/xen/interface/hvm/dm_op.h (limited to 'include/uapi') diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 11d9f2898b16..81e3217b12d3 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -457,4 +457,5 @@ EXPORT_SYMBOL_GPL(HYPERVISOR_tmem_op); EXPORT_SYMBOL_GPL(HYPERVISOR_platform_op); EXPORT_SYMBOL_GPL(HYPERVISOR_multicall); EXPORT_SYMBOL_GPL(HYPERVISOR_vm_assist); +EXPORT_SYMBOL_GPL(HYPERVISOR_dm_op); EXPORT_SYMBOL_GPL(privcmd_call); diff --git a/arch/arm/xen/hypercall.S b/arch/arm/xen/hypercall.S index a648dfc3be30..b0b80c0f09f3 100644 --- a/arch/arm/xen/hypercall.S +++ b/arch/arm/xen/hypercall.S @@ -92,6 +92,7 @@ HYPERCALL1(tmem_op); HYPERCALL1(platform_op_raw); HYPERCALL2(multicall); HYPERCALL2(vm_assist); +HYPERCALL3(dm_op); ENTRY(privcmd_call) stmdb sp!, {r4} diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 947830a459d2..401ceb71540c 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -84,6 +84,7 @@ HYPERCALL1(tmem_op); HYPERCALL1(platform_op_raw); HYPERCALL2(multicall); HYPERCALL2(vm_assist); +HYPERCALL3(dm_op); ENTRY(privcmd_call) mov x16, x0 diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a12a047184ee..f6d20f6cca12 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -472,6 +472,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg) return _hypercall2(int, xenpmu_op, op, arg); } +static inline int +HYPERVISOR_dm_op( + domid_t dom, unsigned int nr_bufs, void *bufs) +{ + return _hypercall3(int, dm_op, dom, nr_bufs, bufs); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) {
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 5e5c7aef0c9f..1a6f1860e008 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -22,6 +22,7 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/miscdevice.h> +#include <linux/moduleparam.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> @@ -32,6 +33,7 @@ #include <xen/xen.h> #include <xen/privcmd.h> #include <xen/interface/xen.h> +#include <xen/interface/hvm/dm_op.h> #include <xen/features.h> #include <xen/page.h> #include <xen/xen-ops.h> @@ -43,6 +45,17 @@ MODULE_LICENSE("GPL"); #define PRIV_VMA_LOCKED ((void *)1) +static unsigned int privcmd_dm_op_max_num = 16; +module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644); +MODULE_PARM_DESC(dm_op_max_nr_bufs, + "Maximum number of buffers per dm_op hypercall"); + +static unsigned int privcmd_dm_op_buf_max_size = 4096; +module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, + 0644); +MODULE_PARM_DESC(dm_op_buf_max_size, + "Maximum size of a dm_op hypercall buffer"); + static int privcmd_vma_range_is_mapped( struct vm_area_struct *vma, unsigned long addr, @@ -548,6 +561,128 @@ out_unlock: goto out; } +static int lock_pages( + struct privcmd_dm_op_buf kbufs[], unsigned int num, + struct page *pages[], unsigned int nr_pages) +{ + unsigned int i; + + for (i = 0; i < num; i++) { + unsigned int requested; + int pinned; + + requested = DIV_ROUND_UP( + offset_in_page(kbufs[i].uptr) + kbufs[i].size, + PAGE_SIZE); + if (requested > nr_pages) + return -ENOSPC; + + pinned = get_user_pages_fast( + (unsigned long) kbufs[i].uptr, + requested, FOLL_WRITE, pages); + if (pinned < 0) + return pinned; + + nr_pages -= pinned; + pages += pinned; + } + + return 0; +} + +static void unlock_pages(struct page *pages[], unsigned int nr_pages) +{ + unsigned int i; + + if (!pages) + return; + + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + put_page(pages[i]); + } +} + +static long privcmd_ioctl_dm_op(void __user *udata) +{ + struct privcmd_dm_op kdata; + struct privcmd_dm_op_buf *kbufs; + unsigned int nr_pages = 0; + struct page **pages = NULL; + struct xen_dm_op_buf *xbufs = NULL; + unsigned int i; + long rc; + + if (copy_from_user(&kdata, udata, sizeof(kdata))) + return -EFAULT; + + if (kdata.num == 0) + return 0; + + if (kdata.num > privcmd_dm_op_max_num) + return -E2BIG; + + kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL); + if (!kbufs) + return -ENOMEM; + + if (copy_from_user(kbufs, kdata.ubufs, + sizeof(*kbufs) * kdata.num)) { + rc = -EFAULT; + goto out; + } + + for (i = 0; i < kdata.num; i++) { + if (kbufs[i].size > privcmd_dm_op_buf_max_size) { + rc = -E2BIG; + goto out; + } + + if (!access_ok(VERIFY_WRITE, kbufs[i].uptr, + kbufs[i].size)) { + rc = -EFAULT; + goto out; + } + + nr_pages += DIV_ROUND_UP( + offset_in_page(kbufs[i].uptr) + kbufs[i].size, + PAGE_SIZE); + } + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + rc = -ENOMEM; + goto out; + } + + xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL); + if (!xbufs) { + rc = -ENOMEM; + goto out; + } + + rc = lock_pages(kbufs, kdata.num, pages, nr_pages); + if (rc) + goto out; + + for (i = 0; i < kdata.num; i++) { + set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr); + xbufs[i].size = kbufs[i].size; + } + + xen_preemptible_hcall_begin(); + rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs); + xen_preemptible_hcall_end(); + +out: + unlock_pages(pages, nr_pages); + kfree(xbufs); + kfree(pages); + kfree(kbufs); + + return rc; +} + static long privcmd_ioctl(struct file *file, unsigned int cmd, unsigned long data) { @@ -571,6 +706,10 @@ static long privcmd_ioctl(struct file *file, ret = privcmd_ioctl_mmap_batch(udata, 2); break; + case IOCTL_PRIVCMD_DM_OP: + ret =
privcmd_ioctl_dm_op(udata); + break; + default: break; } diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h index 7ddeeda93809..f8c5d75b99e1 100644 --- a/include/uapi/xen/privcmd.h +++ b/include/uapi/xen/privcmd.h @@ -77,6 +77,17 @@ struct privcmd_mmapbatch_v2 { int __user *err; /* array of error codes */ }; +struct privcmd_dm_op_buf { + void __user *uptr; + size_t size; +}; + +struct privcmd_dm_op { + domid_t dom; + __u16 num; + const struct privcmd_dm_op_buf __user *ubufs; +}; + /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t @@ -98,5 +109,7 @@ struct privcmd_mmapbatch_v2 { _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch)) #define IOCTL_PRIVCMD_MMAPBATCH_V2 \ _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2)) +#define IOCTL_PRIVCMD_DM_OP \ + _IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_dm_op)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ diff --git a/include/xen/arm/hypercall.h b/include/xen/arm/hypercall.h index 9d874db13c0e..73db4b2eeb89 100644 --- a/include/xen/arm/hypercall.h +++ b/include/xen/arm/hypercall.h @@ -53,6 +53,7 @@ int HYPERVISOR_physdev_op(int cmd, void *arg); int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args); int HYPERVISOR_tmem_op(void *arg); int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type); +int HYPERVISOR_dm_op(domid_t domid, unsigned int nr_bufs, void *bufs); int HYPERVISOR_platform_op_raw(void *arg); static inline int HYPERVISOR_platform_op(struct xen_platform_op *op) { diff --git a/include/xen/interface/hvm/dm_op.h b/include/xen/interface/hvm/dm_op.h new file mode 100644 index 000000000000..ee9e480bc559 --- /dev/null +++ b/include/xen/interface/hvm/dm_op.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Citrix Systems Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_HVM_DM_OP_H__ +#define __XEN_PUBLIC_HVM_DM_OP_H__ + +struct xen_dm_op_buf { + GUEST_HANDLE(void) h; + xen_ulong_t size; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_dm_op_buf); + +#endif /* __XEN_PUBLIC_HVM_DM_OP_H__ */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 1b0d189cd3d3..4f4830ef8f93 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -81,6 +81,7 @@ #define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ #define __HYPERVISOR_xenpmu_op 40 +#define __HYPERVISOR_dm_op 41 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 -- cgit v1.2.3
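[For illustration only; this sketch is not part of the patch. A device model holding an open privcmd file descriptor (conventionally /dev/xen/privcmd) could issue a dm_op with a single caller-supplied buffer as below, relying on privcmd to pin the buffer pages itself; domid_t and the uapi structures come from the exported privcmd header.]

#include <stddef.h>
#include <sys/ioctl.h>
#include <xen/privcmd.h>	/* exported copy of the uapi header above */

/* Illustrative only: wrap IOCTL_PRIVCMD_DM_OP for one opaque buffer. */
static int dm_op_one_buf(int privcmd_fd, domid_t dom, void *buf, size_t size)
{
	struct privcmd_dm_op_buf ubuf = { .uptr = buf, .size = size };
	struct privcmd_dm_op op = { .dom = dom, .num = 1, .ubufs = &ubuf };

	/* No bounce buffer: privcmd locks the original pages directly. */
	return ioctl(privcmd_fd, IOCTL_PRIVCMD_DM_OP, &op);
}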
From 4610d240d691768203fdd210a5da0a2e02eddb76 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Mon, 13 Feb 2017 17:03:24 +0000 Subject: xen/privcmd: add IOCTL_PRIVCMD_RESTRICT The purpose of this ioctl is to allow a user of privcmd to restrict its operation such that it will no longer service arbitrary hypercalls via IOCTL_PRIVCMD_HYPERCALL, and will check for a matching domid when servicing IOCTL_PRIVCMD_DM_OP or IOCTL_PRIVCMD_MMAP*. The aim of this is to limit the attack surface for a compromised device model. Signed-off-by: Paul Durrant Signed-off-by: Boris Ostrovsky --- drivers/xen/privcmd.c | 88 +++++++++++++++++++++++++++++++++++++++++----- include/uapi/xen/privcmd.h | 2 ++ 2 files changed, 81 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1a6f1860e008..2077a3ac7c0c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -56,16 +56,25 @@ module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, MODULE_PARM_DESC(dm_op_buf_max_size, "Maximum size of a dm_op hypercall buffer"); +struct privcmd_data { + domid_t domid; +}; + static int privcmd_vma_range_is_mapped( struct vm_area_struct *vma, unsigned long addr, unsigned long nr_pages); -static long privcmd_ioctl_hypercall(void __user *udata) +static long privcmd_ioctl_hypercall(struct file *file, void __user *udata) { + struct privcmd_data *data = file->private_data; struct privcmd_hypercall hypercall; long ret; + /* Disallow arbitrary hypercalls if restricted */ + if (data->domid != DOMID_INVALID) + return -EPERM; + if (copy_from_user(&hypercall, udata, sizeof(hypercall))) return -EFAULT; @@ -242,8 +251,9 @@ static int mmap_gfn_range(void *data, void *state) return 0; } -static long privcmd_ioctl_mmap(void __user *udata) +static long privcmd_ioctl_mmap(struct file *file, void __user *udata) { + struct privcmd_data *data = file->private_data; struct privcmd_mmap mmapcmd; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -258,6 +268,10 @@ static long privcmd_ioctl_mmap(void __user *udata) if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) return -EFAULT; + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom) + return -EPERM; + rc = gather_array(&pagelist, mmapcmd.num, sizeof(struct privcmd_mmap_entry), mmapcmd.entry); @@ -429,8 +443,10 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) static const struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata, int version) +static long privcmd_ioctl_mmap_batch( + struct file *file, void __user *udata, int version) { + struct privcmd_data *data = file->private_data; int ret; struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; @@ -459,6 +475,10 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) return -EINVAL; } + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != m.dom) + return -EPERM; + nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE); if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; @@ -603,8 +623,9 @@ static void unlock_pages(struct page *pages[], unsigned int nr_pages) } } -static long privcmd_ioctl_dm_op(void __user *udata) +static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) { + struct privcmd_data *data = file->private_data; struct privcmd_dm_op
kdata; struct privcmd_dm_op_buf *kbufs; unsigned int nr_pages = 0; @@ -616,6 +637,10 @@ static long privcmd_ioctl_dm_op(void __user *udata) if (copy_from_user(&kdata, udata, sizeof(kdata))) return -EFAULT; + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != kdata.dom) + return -EPERM; + if (kdata.num == 0) return 0; @@ -683,6 +708,23 @@ out: return rc; } +static long privcmd_ioctl_restrict(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + domid_t dom; + + if (copy_from_user(&dom, udata, sizeof(dom))) + return -EFAULT; + + /* Set restriction to the specified domain, or check it matches */ + if (data->domid == DOMID_INVALID) + data->domid = dom; + else if (data->domid != dom) + return -EINVAL; + + return 0; +} + static long privcmd_ioctl(struct file *file, unsigned int cmd, unsigned long data) { @@ -691,23 +733,27 @@ static long privcmd_ioctl(struct file *file, switch (cmd) { case IOCTL_PRIVCMD_HYPERCALL: - ret = privcmd_ioctl_hypercall(udata); + ret = privcmd_ioctl_hypercall(file, udata); break; case IOCTL_PRIVCMD_MMAP: - ret = privcmd_ioctl_mmap(udata); + ret = privcmd_ioctl_mmap(file, udata); break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata, 1); + ret = privcmd_ioctl_mmap_batch(file, udata, 1); break; case IOCTL_PRIVCMD_MMAPBATCH_V2: - ret = privcmd_ioctl_mmap_batch(udata, 2); + ret = privcmd_ioctl_mmap_batch(file, udata, 2); break; case IOCTL_PRIVCMD_DM_OP: - ret = privcmd_ioctl_dm_op(udata); + ret = privcmd_ioctl_dm_op(file, udata); + break; + + case IOCTL_PRIVCMD_RESTRICT: + ret = privcmd_ioctl_restrict(file, udata); break; default: @@ -717,6 +763,28 @@ static long privcmd_ioctl(struct file *file, return ret; } +static int privcmd_open(struct inode *ino, struct file *file) +{ + struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + /* DOMID_INVALID implies no restriction */ + data->domid = DOMID_INVALID; + + file->private_data = data; + return 0; +} + +static int privcmd_release(struct inode *ino, struct file *file) +{ + struct privcmd_data *data = file->private_data; + + kfree(data); + return 0; +} + static void privcmd_close(struct vm_area_struct *vma) { struct page **pages = vma->vm_private_data; @@ -785,6 +853,8 @@ static int privcmd_vma_range_is_mapped( const struct file_operations xen_privcmd_fops = { .owner = THIS_MODULE, .unlocked_ioctl = privcmd_ioctl, + .open = privcmd_open, + .release = privcmd_release, .mmap = privcmd_mmap, }; EXPORT_SYMBOL_GPL(xen_privcmd_fops); diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h index f8c5d75b99e1..63ee95c9dabb 100644 --- a/include/uapi/xen/privcmd.h +++ b/include/uapi/xen/privcmd.h @@ -111,5 +111,7 @@ struct privcmd_dm_op { _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2)) #define IOCTL_PRIVCMD_DM_OP \ _IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_dm_op)) +#define IOCTL_PRIVCMD_RESTRICT \ + _IOC(_IOC_NONE, 'P', 6, sizeof(domid_t)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ -- cgit v1.2.3
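[Again purely as a sketch, not part of the patch: a toolstack would typically restrict the file descriptor before handing it to the device model; once applied, the restriction cannot be relaxed on that descriptor.]

#include <sys/ioctl.h>
#include <xen/privcmd.h>

/* Illustrative only: pin an open privcmd fd to a single domain. */
static int restrict_privcmd(int privcmd_fd, domid_t dom)
{
	/* Afterwards, arbitrary hypercalls fail with EPERM, and the
	 * dm_op/mmap paths insist that their target domid equals 'dom'. */
	return ioctl(privcmd_fd, IOCTL_PRIVCMD_RESTRICT, &dom);
}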
From 24bff4d78a572d25fe2a0818f55bebda8a2d4709 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 14 Feb 2017 17:29:35 -0700 Subject: uapi: sed-opal fix IOW for activate lsp to use correct struct The IOC_OPAL_ACTIVATE_LSP ioctl took the wrong structure, which would give us the wrong size when using _IOC_SIZE; switch it to the right structure. Fixes: 058f8a2 ("Include: Uapi: Add user ABI for Sed/Opal") Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index fc06e3a20a51..c72e0735532d 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -106,7 +106,7 @@ struct opal_mbr_data { #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) -#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_key) +#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_lr_act) #define IOC_OPAL_SET_PW _IOW('p', 224, struct opal_new_pw) #define IOC_OPAL_ACTIVATE_USR _IOW('p', 225, struct opal_session_info) #define IOC_OPAL_REVERT_TPR _IOW('p', 226, struct opal_key) -- cgit v1.2.3 From a725eb15db80643a160310ed6bcfd6c5a6c907f2 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 15 Feb 2017 05:23:26 +0300 Subject: uapi: fix linux/if_pppol2tp.h userspace compilation errors Because of interface limitations, <netinet/in.h> provided by libc cannot be included after <linux/in.h>, therefore any header that includes <linux/in.h> cannot be included after <netinet/in.h>. Change uapi/linux/l2tp.h, the last uapi header that includes <netinet/in.h>, to include <linux/in.h> and <linux/in6.h> instead of <netinet/in.h> and use __SOCK_SIZE__ instead of sizeof(struct sockaddr) the same way as uapi/linux/in.h does, to fix linux/if_pppol2tp.h userspace compilation errors like this: In file included from /usr/include/linux/l2tp.h:12:0, from /usr/include/linux/if_pppol2tp.h:21, /usr/include/netinet/in.h:31:8: error: redefinition of 'struct in_addr' Fixes: 47c3e7783be4 ("net: l2tp: deprecate PPPOL2TP_MSG_* in favour of L2TP_MSG_*") Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 85ddb74fcd1c..b23c1914a182 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -9,9 +9,8 @@ #include <linux/types.h> #include <linux/socket.h> -#ifndef __KERNEL__ -#include <netinet/in.h> -#endif +#include <linux/in.h> +#include <linux/in6.h> #define IPPROTO_L2TP 115 @@ -31,7 +30,7 @@ struct sockaddr_l2tpip { __u32 l2tp_conn_id; /* Connection ID of tunnel */ /* Pad to size of `struct sockaddr'. */ - unsigned char __pad[sizeof(struct sockaddr) - + unsigned char __pad[__SOCK_SIZE__ - sizeof(__kernel_sa_family_t) - sizeof(__be16) - sizeof(struct in_addr) - sizeof(__u32)]; -- cgit v1.2.3 From 460df4c1fc7c00829050c08d6368dc6e6beef307 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 11:50:15 +0100 Subject: KVM: race-free exit from KVM_RUN without POSIX signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick" a VCPU out of KVM_RUN through a POSIX signal. A signal is attached to a dummy signal handler; by blocking the signal outside KVM_RUN and unblocking it inside, this possible race is closed: VCPU thread service thread -------------------------------------------------------------- check flag set flag raise signal (signal handler does nothing) KVM_RUN However, one issue with KVM_SET_SIGNAL_MASK is that it has to take tsk->sighand->siglock on every KVM_RUN. This lock is often on a remote NUMA node, because it is on the node of a thread's creator.
Taking this lock can be very expensive if there are many userspace exits (as is the case for SMP Windows VMs without Hyper-V reference time counter). As an alternative, we can put the flag directly in kvm_run so that KVM can see it: VCPU thread service thread -------------------------------------------------------------- raise signal signal handler set run->immediate_exit KVM_RUN check run->immediate_exit Reviewed-by: Radim Krčmář Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 13 ++++++++++++- arch/arm/kvm/arm.c | 4 ++++ arch/mips/kvm/mips.c | 7 ++++++- arch/powerpc/kvm/powerpc.c | 6 +++++- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 6 +++++- include/uapi/linux/kvm.h | 4 +++- 7 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e4f2cdcf78eb..069450938b79 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3389,7 +3389,18 @@ struct kvm_run { Request that KVM_RUN return when it becomes possible to inject external interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - __u8 padding1[7]; + __u8 immediate_exit; + +This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN +exits immediately, returning -EINTR. In the common scenario where a +signal is used to "kick" a VCPU out of KVM_RUN, this field can be used +to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. +Rather than blocking the signal outside KVM_RUN, userspace can set up +a signal handler that sets run->immediate_exit to a non-zero value. + +This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. + + __u8 padding1[6]; /* out */ __u32 exit_reason; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 21c493a9e5c9..c9a2103faeb9 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PSCI_0_2: case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; } + if (run->immediate_exit) + return -EINTR; + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 31ee5ee0010b..ed81e5ac1426 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -397,7 +397,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { - int r = 0; + int r = -EINTR; sigset_t sigsaved; if (vcpu->sigset_active) @@ -409,6 +409,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->mmio_needed = 0; } + if (run->immediate_exit) + goto out; + lose_fpu(1); local_irq_disable(); @@ -429,6 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) guest_exit_irqoff(); local_irq_enable(); +out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -1021,6 +1025,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_READONLY_MEM: case KVM_CAP_SYNC_MMU: + case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2b3e4e620078..1fe1391ba2c2 100644 --- a/arch/powerpc/kvm/powerpc.c 
+++ b/arch/powerpc/kvm/powerpc.c @@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: + case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; case KVM_CAP_PPC_PAIRED_SINGLES: @@ -1117,7 +1118,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) #endif } - r = kvmppc_vcpu_run(run, vcpu); + if (run->immediate_exit) + r = -EINTR; + else + r = kvmppc_vcpu_run(run, vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 502de74ea984..99e35fe0dea8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -370,6 +370,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_S390_INJECT_IRQ: case KVM_CAP_S390_USER_SIGP: case KVM_CAP_S390_USER_STSI: @@ -2798,6 +2799,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int rc; sigset_t sigsaved; + if (kvm_run->immediate_exit) + return -EINTR; + if (guestdbg_exit_pending(vcpu)) { kvm_s390_prepare_debug_exit(vcpu); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0aa8db229e0a..8d3047c8cce7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2672,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: + case KVM_CAP_IMMEDIATE_EXIT: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -7202,7 +7203,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - r = vcpu_run(vcpu); + if (kvm_run->immediate_exit) + r = -EINTR; + else + r = vcpu_run(vcpu); out: post_kvm_run_save(vcpu); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7964b970b9ad..f51d5082a377 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -218,7 +218,8 @@ struct kvm_hyperv_exit { struct kvm_run { /* in */ __u8 request_interrupt_window; - __u8 padding1[7]; + __u8 immediate_exit; + __u8 padding1[6]; /* out */ __u32 exit_reason; @@ -881,6 +882,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SPAPR_RESIZE_HPT 133 #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 +#define KVM_CAP_IMMEDIATE_EXIT 136 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3
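[To make the intended userspace pattern concrete, here is a hypothetical VCPU-thread sketch; the names are invented, the kick handler is assumed to be installed with sigaction() elsewhere, and real code would pair this with its own exit-request bookkeeping before clearing the flag.]

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_run *run;	/* this VCPU's mmap()ed kvm_run region */
static int vcpu_fd;		/* the VCPU file descriptor */

static void kick_handler(int sig)
{
	/* Interrupts an in-progress KVM_RUN, or arms the next entry. */
	run->immediate_exit = 1;
}

static int run_vcpu_once(void)
{
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno == EINTR)
		return -1;	/* kicked; clear immediate_exit only after
				 * the service thread's request is handled */
	return 0;		/* otherwise inspect run->exit_reason */
}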
From e696028acc458aa3d43ad899371a963eb28336d8 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 16 Feb 2017 10:31:12 +0200 Subject: net/sched: Reflect HW offload status Currently there is no way of querying whether a filter is offloaded to HW or not when using the "both" policy (where neither the skip_sw nor the skip_hw flag is set by user-space). Add two new flags, "in hw" and "not in hw", such that user space can determine whether a filter is actually offloaded to hw or not. The "in hw" UAPI semantics were chosen to be similar to the "skip hw" flag logic. If neither of these two flags is set, this signals running on an older kernel. Signed-off-by: Or Gerlitz Reviewed-by: Amir Vadai Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 +++++ include/uapi/linux/pkt_cls.h | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index be5c12a5c375..269fd78bb0ae 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -481,6 +481,11 @@ static inline bool tc_flags_valid(u32 flags) return true; } +static inline bool tc_in_hw(u32 flags) +{ + return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; +} + enum tc_fl_command { TC_CLSFLOWER_REPLACE, TC_CLSFLOWER_DESTROY, diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 345551e71410..7a69f2a4ca0c 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -103,8 +103,10 @@ enum { #define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1) /* tca flags definitions */ -#define TCA_CLS_FLAGS_SKIP_HW (1 << 0) -#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) +#define TCA_CLS_FLAGS_SKIP_HW (1 << 0) /* don't offload filter to HW */ +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ +#define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ +#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ /* U32 filters */ -- cgit v1.2.3 From 762b6f00a995863afa274d6b5ffa3880dac1714b Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 15 Feb 2017 23:04:54 +0300 Subject: uapi: fix linux/target_core_user.h userspace compilation errors Consistently use types from linux/types.h to fix the following linux/target_core_user.h userspace compilation errors: /usr/include/linux/target_core_user.h:108:4: error: unknown type name 'uint32_t' uint32_t iov_cnt; /usr/include/linux/target_core_user.h:109:4: error: unknown type name 'uint32_t' uint32_t iov_bidi_cnt; /usr/include/linux/target_core_user.h:110:4: error: unknown type name 'uint32_t' uint32_t iov_dif_cnt; /usr/include/linux/target_core_user.h:111:4: error: unknown type name 'uint64_t' uint64_t cdb_off; /usr/include/linux/target_core_user.h:112:4: error: unknown type name 'uint64_t' uint64_t __pad1; /usr/include/linux/target_core_user.h:113:4: error: unknown type name 'uint64_t' uint64_t __pad2; /usr/include/linux/target_core_user.h:117:4: error: unknown type name 'uint8_t' uint8_t scsi_status; /usr/include/linux/target_core_user.h:118:4: error: unknown type name 'uint8_t' uint8_t __pad1; /usr/include/linux/target_core_user.h:119:4: error: unknown type name 'uint16_t' uint16_t __pad2; /usr/include/linux/target_core_user.h:120:4: error: unknown type name 'uint32_t' uint32_t __pad3; Signed-off-by: Dmitry V.
Levin Signed-off-by: Nicholas Bellinger --- include/uapi/linux/target_core_user.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index c506cddb8165..af17b4154ef6 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -105,26 +105,26 @@ struct tcmu_cmd_entry { union { struct { - uint32_t iov_cnt; - uint32_t iov_bidi_cnt; - uint32_t iov_dif_cnt; - uint64_t cdb_off; - uint64_t __pad1; - uint64_t __pad2; + __u32 iov_cnt; + __u32 iov_bidi_cnt; + __u32 iov_dif_cnt; + __u64 cdb_off; + __u64 __pad1; + __u64 __pad2; struct iovec iov[0]; } req; struct { - uint8_t scsi_status; - uint8_t __pad1; - uint16_t __pad2; - uint32_t __pad3; + __u8 scsi_status; + __u8 __pad1; + __u16 __pad2; + __u32 __pad3; char sense_buffer[TCMU_SENSE_BUFFERSIZE]; } rsp; }; } __packed; -#define TCMU_OP_ALIGN_SIZE sizeof(uint64_t) +#define TCMU_OP_ALIGN_SIZE sizeof(__u64) enum tcmu_genl_cmd { TCMU_CMD_UNSPEC, -- cgit v1.2.3 From 6c07ec0fa5712b01d0967cf74129fa9b4d234af8 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 16 Feb 2017 18:04:14 +0300 Subject: uapi: fix linux/ipv6_route.h userspace compilation errors Include <linux/in6.h> to fix the following linux/ipv6_route.h userspace compilation errors: /usr/include/linux/ipv6_route.h:42:19: error: field 'rtmsg_dst' has incomplete type struct in6_addr rtmsg_dst; /usr/include/linux/ipv6_route.h:43:19: error: field 'rtmsg_src' has incomplete type struct in6_addr rtmsg_src; /usr/include/linux/ipv6_route.h:44:19: error: field 'rtmsg_gateway' has incomplete type struct in6_addr rtmsg_gateway; Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index f6598d1c886e..85bbb1799df3 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -14,6 +14,7 @@ #define _UAPI_LINUX_IPV6_ROUTE_H #include <linux/types.h> +#include <linux/in6.h> /* For struct in6_addr. */ #define RTF_DEFAULT 0x00010000 /* default - learned via ND */ #define RTF_ALLONLINK 0x00020000 /* (deprecated and will be removed) -- cgit v1.2.3 From 72aa107df6a275cf03359934ca5799a2be7a1bf7 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 16 Feb 2017 18:04:29 +0300 Subject: uapi: fix linux/mroute6.h userspace compilation errors Include <linux/in6.h> to fix the following linux/mroute6.h userspace compilation errors: /usr/include/linux/mroute6.h:80:22: error: field 'mf6cc_origin' has incomplete type struct sockaddr_in6 mf6cc_origin; /* Origin of mcast */ /usr/include/linux/mroute6.h:81:22: error: field 'mf6cc_mcastgrp' has incomplete type struct sockaddr_in6 mf6cc_mcastgrp; /* Group in question */ /usr/include/linux/mroute6.h:91:22: error: field 'src' has incomplete type struct sockaddr_in6 src; /usr/include/linux/mroute6.h:92:22: error: field 'grp' has incomplete type struct sockaddr_in6 grp; /usr/include/linux/mroute6.h:132:18: error: field 'im6_src' has incomplete type struct in6_addr im6_src, im6_dst; /usr/include/linux/mroute6.h:132:27: error: field 'im6_dst' has incomplete type struct in6_addr im6_src, im6_dst; Signed-off-by: Dmitry V. Levin Signed-off-by: David S.
Miller --- include/uapi/linux/mroute6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h index 5062fb5751e1..ed5721148768 100644 --- a/include/uapi/linux/mroute6.h +++ b/include/uapi/linux/mroute6.h @@ -4,6 +4,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/sockios.h> +#include <linux/in6.h> /* For struct sockaddr_in6. */ /* * Based on the MROUTING 3.5 defines primarily to keep -- cgit v1.2.3 From bcb41c6bced1ee778d23c53a6b4807fb08cf5540 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 16 Feb 2017 18:04:46 +0300 Subject: uapi: fix linux/mroute.h userspace compilation errors Include <linux/in.h> to fix the following linux/mroute.h userspace compilation errors: /usr/include/linux/mroute.h:58:18: error: field 'vifc_lcl_addr' has incomplete type struct in_addr vifc_lcl_addr; /* Local interface address */ /usr/include/linux/mroute.h:61:17: error: field 'vifc_rmt_addr' has incomplete type struct in_addr vifc_rmt_addr; /* IPIP tunnel addr */ /usr/include/linux/mroute.h:72:17: error: field 'mfcc_origin' has incomplete type struct in_addr mfcc_origin; /* Origin of mcast */ /usr/include/linux/mroute.h:73:17: error: field 'mfcc_mcastgrp' has incomplete type struct in_addr mfcc_mcastgrp; /* Group in question */ /usr/include/linux/mroute.h:84:17: error: field 'src' has incomplete type struct in_addr src; /usr/include/linux/mroute.h:85:17: error: field 'grp' has incomplete type struct in_addr grp; /usr/include/linux/mroute.h:109:17: error: field 'im_src' has incomplete type struct in_addr im_src,im_dst; /usr/include/linux/mroute.h:109:24: error: field 'im_dst' has incomplete type struct in_addr im_src,im_dst; Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/mroute.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index cf943016930f..1fe4c1e7d66e 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -3,6 +3,7 @@ #include <linux/sockios.h> #include <linux/types.h> +#include <linux/in.h> /* For struct in_addr. */ /* Based on the MROUTING 3.5 defines primarily to keep * source compatibility with BSD. -- cgit v1.2.3 From feb0869d90e51ce8b6fd8a46588465b1b5a26d09 Mon Sep 17 00:00:00 2001 From: "Dmitry V.
Levin" Date: Thu, 16 Feb 2017 18:05:13 +0300 Subject: uapi: fix linux/rds.h userspace compilation errors Consistently use types from linux/types.h to fix the following linux/rds.h userspace compilation errors: /usr/include/linux/rds.h:106:2: error: unknown type name 'uint8_t' uint8_t name[32]; /usr/include/linux/rds.h:107:2: error: unknown type name 'uint64_t' uint64_t value; /usr/include/linux/rds.h:117:2: error: unknown type name 'uint64_t' uint64_t next_tx_seq; /usr/include/linux/rds.h:118:2: error: unknown type name 'uint64_t' uint64_t next_rx_seq; /usr/include/linux/rds.h:121:2: error: unknown type name 'uint8_t' uint8_t transport[TRANSNAMSIZ]; /* null term ascii */ /usr/include/linux/rds.h:122:2: error: unknown type name 'uint8_t' uint8_t flags; /usr/include/linux/rds.h:129:2: error: unknown type name 'uint64_t' uint64_t seq; /usr/include/linux/rds.h:130:2: error: unknown type name 'uint32_t' uint32_t len; /usr/include/linux/rds.h:135:2: error: unknown type name 'uint8_t' uint8_t flags; /usr/include/linux/rds.h:139:2: error: unknown type name 'uint32_t' uint32_t sndbuf; /usr/include/linux/rds.h:144:2: error: unknown type name 'uint32_t' uint32_t rcvbuf; /usr/include/linux/rds.h:145:2: error: unknown type name 'uint64_t' uint64_t inum; /usr/include/linux/rds.h:153:2: error: unknown type name 'uint64_t' uint64_t hdr_rem; /usr/include/linux/rds.h:154:2: error: unknown type name 'uint64_t' uint64_t data_rem; /usr/include/linux/rds.h:155:2: error: unknown type name 'uint32_t' uint32_t last_sent_nxt; /usr/include/linux/rds.h:156:2: error: unknown type name 'uint32_t' uint32_t last_expected_una; /usr/include/linux/rds.h:157:2: error: unknown type name 'uint32_t' uint32_t last_seen_una; /usr/include/linux/rds.h:164:2: error: unknown type name 'uint8_t' uint8_t src_gid[RDS_IB_GID_LEN]; /usr/include/linux/rds.h:165:2: error: unknown type name 'uint8_t' uint8_t dst_gid[RDS_IB_GID_LEN]; /usr/include/linux/rds.h:167:2: error: unknown type name 'uint32_t' uint32_t max_send_wr; /usr/include/linux/rds.h:168:2: error: unknown type name 'uint32_t' uint32_t max_recv_wr; /usr/include/linux/rds.h:169:2: error: unknown type name 'uint32_t' uint32_t max_send_sge; /usr/include/linux/rds.h:170:2: error: unknown type name 'uint32_t' uint32_t rdma_mr_max; /usr/include/linux/rds.h:171:2: error: unknown type name 'uint32_t' uint32_t rdma_mr_size; /usr/include/linux/rds.h:212:9: error: unknown type name 'uint64_t' typedef uint64_t rds_rdma_cookie_t; /usr/include/linux/rds.h:215:2: error: unknown type name 'uint64_t' uint64_t addr; /usr/include/linux/rds.h:216:2: error: unknown type name 'uint64_t' uint64_t bytes; /usr/include/linux/rds.h:221:2: error: unknown type name 'uint64_t' uint64_t cookie_addr; /usr/include/linux/rds.h:222:2: error: unknown type name 'uint64_t' uint64_t flags; /usr/include/linux/rds.h:228:2: error: unknown type name 'uint64_t' uint64_t cookie_addr; /usr/include/linux/rds.h:229:2: error: unknown type name 'uint64_t' uint64_t flags; /usr/include/linux/rds.h:234:2: error: unknown type name 'uint64_t' uint64_t flags; /usr/include/linux/rds.h:240:2: error: unknown type name 'uint64_t' uint64_t local_vec_addr; /usr/include/linux/rds.h:241:2: error: unknown type name 'uint64_t' uint64_t nr_local; /usr/include/linux/rds.h:242:2: error: unknown type name 'uint64_t' uint64_t flags; /usr/include/linux/rds.h:243:2: error: unknown type name 'uint64_t' uint64_t user_token; /usr/include/linux/rds.h:248:2: error: unknown type name 'uint64_t' uint64_t local_addr; /usr/include/linux/rds.h:249:2: error: 
unknown type name 'uint64_t' uint64_t remote_addr; /usr/include/linux/rds.h:252:4: error: unknown type name 'uint64_t' uint64_t compare; /usr/include/linux/rds.h:253:4: error: unknown type name 'uint64_t' uint64_t swap; /usr/include/linux/rds.h:256:4: error: unknown type name 'uint64_t' uint64_t add; /usr/include/linux/rds.h:259:4: error: unknown type name 'uint64_t' uint64_t compare; /usr/include/linux/rds.h:260:4: error: unknown type name 'uint64_t' uint64_t swap; /usr/include/linux/rds.h:261:4: error: unknown type name 'uint64_t' uint64_t compare_mask; /usr/include/linux/rds.h:262:4: error: unknown type name 'uint64_t' uint64_t swap_mask; /usr/include/linux/rds.h:265:4: error: unknown type name 'uint64_t' uint64_t add; /usr/include/linux/rds.h:266:4: error: unknown type name 'uint64_t' uint64_t nocarry_mask; /usr/include/linux/rds.h:269:2: error: unknown type name 'uint64_t' uint64_t flags; /usr/include/linux/rds.h:270:2: error: unknown type name 'uint64_t' uint64_t user_token; /usr/include/linux/rds.h:274:2: error: unknown type name 'uint64_t' uint64_t user_token; /usr/include/linux/rds.h:275:2: error: unknown type name 'int32_t' int32_t status; Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 104 +++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 52 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 3833113ab2c0..f42d2112ccee 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -117,8 +117,8 @@ #define RDS_INFO_LAST 10010 struct rds_info_counter { - uint8_t name[32]; - uint64_t value; + __u8 name[32]; + __u64 value; } __attribute__((packed)); #define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 @@ -128,35 +128,35 @@ struct rds_info_counter { #define TRANSNAMSIZ 16 struct rds_info_connection { - uint64_t next_tx_seq; - uint64_t next_rx_seq; + __u64 next_tx_seq; + __u64 next_rx_seq; __be32 laddr; __be32 faddr; - uint8_t transport[TRANSNAMSIZ]; /* null term ascii */ - uint8_t flags; + __u8 transport[TRANSNAMSIZ]; /* null term ascii */ + __u8 flags; } __attribute__((packed)); #define RDS_INFO_MESSAGE_FLAG_ACK 0x01 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 struct rds_info_message { - uint64_t seq; - uint32_t len; + __u64 seq; + __u32 len; __be32 laddr; __be32 faddr; __be16 lport; __be16 fport; - uint8_t flags; + __u8 flags; } __attribute__((packed)); struct rds_info_socket { - uint32_t sndbuf; + __u32 sndbuf; __be32 bound_addr; __be32 connected_addr; __be16 bound_port; __be16 connected_port; - uint32_t rcvbuf; - uint64_t inum; + __u32 rcvbuf; + __u64 inum; } __attribute__((packed)); struct rds_info_tcp_socket { @@ -164,25 +164,25 @@ struct rds_info_tcp_socket { __be16 local_port; __be32 peer_addr; __be16 peer_port; - uint64_t hdr_rem; - uint64_t data_rem; - uint32_t last_sent_nxt; - uint32_t last_expected_una; - uint32_t last_seen_una; + __u64 hdr_rem; + __u64 data_rem; + __u32 last_sent_nxt; + __u32 last_expected_una; + __u32 last_seen_una; } __attribute__((packed)); #define RDS_IB_GID_LEN 16 struct rds_info_rdma_connection { __be32 src_addr; __be32 dst_addr; - uint8_t src_gid[RDS_IB_GID_LEN]; - uint8_t dst_gid[RDS_IB_GID_LEN]; - - uint32_t max_send_wr; - uint32_t max_recv_wr; - uint32_t max_send_sge; - uint32_t rdma_mr_max; - uint32_t rdma_mr_size; + __u8 src_gid[RDS_IB_GID_LEN]; + __u8 dst_gid[RDS_IB_GID_LEN]; + + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 rdma_mr_max; + __u32 rdma_mr_size; }; /* RDS message 
Receive Path Latency points */ @@ -242,70 +242,70 @@ struct rds_cmsg_rx_trace { * (so that the application does not have to worry about * alignment). */ -typedef uint64_t rds_rdma_cookie_t; +typedef __u64 rds_rdma_cookie_t; struct rds_iovec { - uint64_t addr; - uint64_t bytes; + __u64 addr; + __u64 bytes; }; struct rds_get_mr_args { struct rds_iovec vec; - uint64_t cookie_addr; - uint64_t flags; + __u64 cookie_addr; + __u64 flags; }; struct rds_get_mr_for_dest_args { struct sockaddr_storage dest_addr; struct rds_iovec vec; - uint64_t cookie_addr; - uint64_t flags; + __u64 cookie_addr; + __u64 flags; }; struct rds_free_mr_args { rds_rdma_cookie_t cookie; - uint64_t flags; + __u64 flags; }; struct rds_rdma_args { rds_rdma_cookie_t cookie; struct rds_iovec remote_vec; - uint64_t local_vec_addr; - uint64_t nr_local; - uint64_t flags; - uint64_t user_token; + __u64 local_vec_addr; + __u64 nr_local; + __u64 flags; + __u64 user_token; }; struct rds_atomic_args { rds_rdma_cookie_t cookie; - uint64_t local_addr; - uint64_t remote_addr; + __u64 local_addr; + __u64 remote_addr; union { struct { - uint64_t compare; - uint64_t swap; + __u64 compare; + __u64 swap; } cswp; struct { - uint64_t add; + __u64 add; } fadd; struct { - uint64_t compare; - uint64_t swap; - uint64_t compare_mask; - uint64_t swap_mask; + __u64 compare; + __u64 swap; + __u64 compare_mask; + __u64 swap_mask; } m_cswp; struct { - uint64_t add; - uint64_t nocarry_mask; + __u64 add; + __u64 nocarry_mask; } m_fadd; }; - uint64_t flags; - uint64_t user_token; + __u64 flags; + __u64 user_token; }; struct rds_rdma_notify { - uint64_t user_token; - int32_t status; + __u64 user_token; + __s32 status; }; #define RDS_RDMA_SUCCESS 0 -- cgit v1.2.3 From 1786dbf3702e33ce3afd2d3dbe630bd04b1d2e58 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 16 Feb 2017 18:05:45 +0300 Subject: uapi: fix linux/rds.h userspace compilation error On the kernel side, sockaddr_storage is #define'd to __kernel_sockaddr_storage. Replacing struct sockaddr_storage with struct __kernel_sockaddr_storage defined by <linux/socket.h> fixes the following linux/rds.h userspace compilation error: /usr/include/linux/rds.h:226:26: error: field 'dest_addr' has incomplete type struct sockaddr_storage dest_addr; Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index f42d2112ccee..47c03ca5c404 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -35,6 +35,7 @@ #define _LINUX_RDS_H #include <linux/types.h> +#include <linux/socket.h> /* For __kernel_sockaddr_storage. */ #define RDS_IB_ABI_VERSION 0x301 @@ -256,7 +257,7 @@ struct rds_get_mr_args { }; struct rds_get_mr_for_dest_args { - struct sockaddr_storage dest_addr; + struct __kernel_sockaddr_storage dest_addr; struct rds_iovec vec; __u64 cookie_addr; __u64 flags; -- cgit v1.2.3 From 35ea82d611da59f8bea44a37996b3b11bb1d3fd7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 17 Feb 2017 12:45:38 +0800 Subject: sctp: add support for generating stream ssn reset event notification This patch is to add Stream Reset Event described in rfc6525 section 6.1.1. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/net/sctp/ulpevent.h | 4 ++++ include/uapi/linux/sctp.h | 16 ++++++++++++++++ net/sctp/ulpevent.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) (limited to 'include/uapi') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 2c098cd7e7e2..324b5965fc4d 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -128,6 +128,10 @@ struct sctp_ulpevent *sctp_ulpevent_make_authkey( struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( const struct sctp_association *asoc, gfp_t gfp); +struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( + const struct sctp_association *asoc, __u16 flags, + __u16 stream_num, __u16 *stream_list, gfp_t gfp); + void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *); void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a91a9cccbae6..d3ae381fcf33 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -490,6 +490,18 @@ struct sctp_sender_dry_event { sctp_assoc_t sender_dry_assoc_id; }; +#define SCTP_STREAM_RESET_INCOMING_SSN 0x0001 +#define SCTP_STREAM_RESET_OUTGOING_SSN 0x0002 +#define SCTP_STREAM_RESET_DENIED 0x0004 +#define SCTP_STREAM_RESET_FAILED 0x0008 +struct sctp_stream_reset_event { + __u16 strreset_type; + __u16 strreset_flags; + __u32 strreset_length; + sctp_assoc_t strreset_assoc_id; + __u16 strreset_stream_list[]; +}; + /* * Described in Section 7.3 * Ancillary Data and Notification Interest Options @@ -505,6 +517,7 @@ struct sctp_event_subscribe { __u8 sctp_adaptation_layer_event; __u8 sctp_authentication_event; __u8 sctp_sender_dry_event; + __u8 sctp_stream_reset_event; }; /* @@ -529,6 +542,7 @@ union sctp_notification { struct sctp_pdapi_event sn_pdapi_event; struct sctp_authkey_event sn_authkey_event; struct sctp_sender_dry_event sn_sender_dry_event; + struct sctp_stream_reset_event sn_strreset_event; }; /* Section 5.3.1 @@ -556,6 +570,8 @@ enum sctp_sn_type { #define SCTP_AUTHENTICATION_INDICATION SCTP_AUTHENTICATION_EVENT SCTP_SENDER_DRY_EVENT, #define SCTP_SENDER_DRY_EVENT SCTP_SENDER_DRY_EVENT + SCTP_STREAM_RESET_EVENT, +#define SCTP_STREAM_RESET_EVENT SCTP_STREAM_RESET_EVENT }; /* Notification error codes used to fill up the error fields in some diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index bea00058ce35..c8881bc542a0 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -854,6 +854,35 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( return event; } +struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( + const struct sctp_association *asoc, __u16 flags, __u16 stream_num, + __u16 *stream_list, gfp_t gfp) +{ + struct sctp_stream_reset_event *sreset; + struct sctp_ulpevent *event; + struct sk_buff *skb; + int length, i; + + length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num; + event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp); + if (!event) + return NULL; + + skb = sctp_event2skb(event); + sreset = (struct sctp_stream_reset_event *)skb_put(skb, length); + + sreset->strreset_type = SCTP_STREAM_RESET_EVENT; + sreset->strreset_flags = flags; + sreset->strreset_length = length; + sctp_ulpevent_set_owner(event, asoc); + sreset->strreset_assoc_id = sctp_assoc2id(asoc); + + for (i = 0; i < stream_num; i++) + sreset->strreset_stream_list[i] = ntohs(stream_list[i]); + + return event; +} + /* Return the notification type, assuming this is a notification * event. */ -- cgit v1.2.3
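[On the consumer side, an application would opt in through the new sctp_event_subscribe bit and match the event in its notification data. The following is an illustrative sketch only; the subscription mechanism itself is the long-standing SCTP_EVENTS socket option.]

#include <string.h>
#include <netinet/in.h>		/* IPPROTO_SCTP */
#include <sys/socket.h>
#include <linux/sctp.h>

static int enable_stream_reset_events(int fd)
{
	struct sctp_event_subscribe ev;

	memset(&ev, 0, sizeof(ev));
	ev.sctp_stream_reset_event = 1;
	return setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &ev, sizeof(ev));
}

/* Called on a recvmsg() buffer that was flagged MSG_NOTIFICATION. */
static void handle_notification(void *buf)
{
	union sctp_notification *sn = buf;

	if (sn->sn_header.sn_type == SCTP_STREAM_RESET_EVENT) {
		struct sctp_stream_reset_event *e = &sn->sn_strreset_event;
		unsigned int n = (e->strreset_length - sizeof(*e)) /
				 sizeof(__u16);

		/* e->strreset_stream_list[0..n-1]: affected stream ids */
	}
}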
From 24045a03b8796e3e1ddb370dfe4bc592a9f5f301 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 20 Feb 2017 08:03:30 -0800 Subject: net: mpls: Add support for netconf Add netconf support to MPLS. Allows userspace to learn and be notified of changes to the 'input' enable setting per interface. Acked-by: Nicolas Dichtel Signed-off-by: David Ahern Acked-by: Robert Shearman Signed-off-by: David S. Miller --- include/uapi/linux/netconf.h | 1 + include/uapi/linux/rtnetlink.h | 2 + net/mpls/af_mpls.c | 212 ++++++++++++++++++++++++++++++++++++++++- net/mpls/internal.h | 2 +- 4 files changed, 214 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h index 45dfad509c4d..7e5f0f3e31bf 100644 --- a/include/uapi/linux/netconf.h +++ b/include/uapi/linux/netconf.h @@ -16,6 +16,7 @@ enum { NETCONFA_MC_FORWARDING, NETCONFA_PROXY_NEIGH, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_INPUT, __NETCONFA_MAX }; #define NETCONFA_MAX (__NETCONFA_MAX - 1) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 8c93ad1ef9ab..6546917d605a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -658,6 +658,8 @@ enum rtnetlink_groups { #define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE RTNLGRP_NSID, #define RTNLGRP_NSID RTNLGRP_NSID + RTNLGRP_MPLS_NETCONF, +#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 64d3bf269a26..3818686182b2 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -7,6 +7,7 @@ #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/mpls.h> +#include <linux/netconf.h> #include <linux/vmalloc.h> #include <linux/percpu.h> #include <net/ip.h> @@ -960,15 +961,215 @@ static size_t mpls_get_stats_af_size(const struct net_device *dev) return nla_total_size_64bit(sizeof(struct mpls_link_stats)); } +static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev, + u32 portid, u32 seq, int event, + unsigned int flags, int type) +{ + struct nlmsghdr *nlh; + struct netconfmsg *ncm; + bool all = false; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), + flags); + if (!nlh) + return -EMSGSIZE; + + if (type == NETCONFA_ALL) + all = true; + + ncm = nlmsg_data(nlh); + ncm->ncm_family = AF_MPLS; + + if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0) + goto nla_put_failure; + + if ((all || type == NETCONFA_INPUT) && + nla_put_s32(skb, NETCONFA_INPUT, + mdev->input_enabled) < 0) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mpls_netconf_msgsize_devconf(int type) +{ + int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + + nla_total_size(4); /* NETCONFA_IFINDEX */ + bool all = false; + + if (type == NETCONFA_ALL) + all = true; + + if (all || type == NETCONFA_INPUT) + size += nla_total_size(4); + + return size; +} + +static void mpls_netconf_notify_devconf(struct net *net, int type, + struct mpls_dev *mdev) +{ + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL); + if (!skb) + goto errout; + + err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, RTM_NEWNETCONF, + 0, type); + if (err < 0) { + /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net,
RTNLGRP_MPLS_NETCONF, err); +} + +static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = { + [NETCONFA_IFINDEX] = { .len = sizeof(int) }, +}; + +static int mpls_netconf_get_devconf(struct sk_buff *in_skb, + struct nlmsghdr *nlh) +{ + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NETCONFA_MAX + 1]; + struct netconfmsg *ncm; + struct net_device *dev; + struct mpls_dev *mdev; + struct sk_buff *skb; + int ifindex; + int err; + + err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, + devconf_mpls_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + if (!tb[NETCONFA_IFINDEX]) + goto errout; + + ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); + dev = __dev_get_by_index(net, ifindex); + if (!dev) + goto errout; + + mdev = mpls_dev_get(dev); + if (!mdev) + goto errout; + + err = -ENOBUFS; + skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); + if (!skb) + goto errout; + + err = mpls_netconf_fill_devconf(skb, mdev, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, RTM_NEWNETCONF, 0, + NETCONFA_ALL); + if (err < 0) { + /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +errout: + return err; +} + +static int mpls_netconf_dump_devconf(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + struct net_device *dev; + struct mpls_dev *mdev; + int idx, s_idx; + int h, s_h; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + cb->seq = net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + mdev = mpls_dev_get(dev); + if (!mdev) + goto cont; + if (mpls_netconf_fill_devconf(skb, mdev, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + NETCONFA_ALL) < 0) { + rcu_read_unlock(); + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + rcu_read_unlock(); + } +done: + cb->args[0] = h; + cb->args[1] = idx; + + return skb->len; +} + #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ (&((struct mpls_dev *)0)->field) +static int mpls_conf_proc(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int oval = *(int *)ctl->data; + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write) { + struct mpls_dev *mdev = ctl->extra1; + int i = (int *)ctl->data - (int *)mdev; + struct net *net = ctl->extra2; + int val = *(int *)ctl->data; + + if (i == offsetof(struct mpls_dev, input_enabled) && + val != oval) { + mpls_netconf_notify_devconf(net, + NETCONFA_INPUT, + mdev); + } + } + + return ret; +} + static const struct ctl_table mpls_dev_table[] = { { .procname = "input", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, { } @@ -978,6 +1179,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, struct mpls_dev *mdev) { char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + struct net *net = dev_net(dev); struct ctl_table *table; int i; @@ -988,8 +1190,11 @@ static int mpls_dev_sysctl_register(struct net_device *dev, /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. 
*/ - for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) + for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) { table[i].data = (char *)mdev + (uintptr_t)table[i].data; + table[i].extra1 = mdev; + table[i].extra2 = net; + } snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); @@ -1041,6 +1246,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) if (err) goto free; + mdev->dev = dev; rcu_assign_pointer(dev->mpls_ptr, mdev); return mdev; @@ -1861,6 +2067,8 @@ static int __init mpls_init(void) rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); + rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, + mpls_netconf_dump_devconf, NULL); err = 0; out: return err; diff --git a/net/mpls/internal.h b/net/mpls/internal.h index d97243034605..76360d8b9579 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -16,7 +16,7 @@ struct mpls_pcpu_stats { struct mpls_dev { int input_enabled; - + struct net_device *dev; struct mpls_pcpu_stats __percpu *stats; struct ctl_table_header *sysctl; -- cgit v1.2.3 From ba896a05ad9375912ccebdac9623aab97845600f Mon Sep 17 00:00:00 2001 From: Ken-ichirou MATSUZAWA Date: Thu, 16 Feb 2017 18:20:33 +0900 Subject: netfilter: nfnetlink_queue: fix NFQA_VLAN_MAX definition Should be - 1 as in other _MAX definitions. Signed-off-by: Ken-ichirou MATSUZAWA Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index ae30841ff94e..d42f0396fe30 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -36,7 +36,7 @@ enum nfqnl_vlan_attr { NFQA_VLAN_TCI, /* __be16 skb htons(vlan_tci) */ __NFQA_VLAN_MAX, }; -#define NFQA_VLAN_MAX (__NFQA_VLAN_MAX + 1) +#define NFQA_VLAN_MAX (__NFQA_VLAN_MAX - 1) enum nfqnl_attr_type { NFQA_UNSPEC, -- cgit v1.2.3 From 2618be7dccf8739b89e1906b64bd8d551af351e6 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 20 Feb 2017 14:58:41 +0300 Subject: uapi: fix linux/if.h userspace compilation errors Include <sys/socket.h> (guarded by ifndef __KERNEL__) to fix the following linux/if.h userspace compilation errors: /usr/include/linux/if.h:234:19: error: field 'ifru_addr' has incomplete type struct sockaddr ifru_addr; /usr/include/linux/if.h:235:19: error: field 'ifru_dstaddr' has incomplete type struct sockaddr ifru_dstaddr; /usr/include/linux/if.h:236:19: error: field 'ifru_broadaddr' has incomplete type struct sockaddr ifru_broadaddr; /usr/include/linux/if.h:237:19: error: field 'ifru_netmask' has incomplete type struct sockaddr ifru_netmask; /usr/include/linux/if.h:238:20: error: field 'ifru_hwaddr' has incomplete type struct sockaddr ifru_hwaddr; This also fixes userspace compilation of the following uapi headers: linux/atmbr2684.h linux/gsmmux.h linux/if_arp.h linux/if_bonding.h linux/if_frad.h linux/if_pppox.h linux/if_tunnel.h linux/netdevice.h linux/route.h linux/wireless.h As no uapi header provides a definition of struct sockaddr, inclusion of <sys/socket.h> seems to be the most conservative and the only safe fix available. 
All current users of <linux/if.h> are very likely to be including <sys/socket.h> already, because the latter is the sole provider of the struct sockaddr definition in libc, so adding a uapi header with a definition of struct sockaddr would create a potential conflict with <sys/socket.h>. Replacing struct sockaddr in the definition of struct ifreq with a different type would create a potential incompatibility with current users of struct ifreq who might rely on ifru_addr et al members being of type struct sockaddr. Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/if.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 1158a043342a..259617a551f2 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -24,6 +24,10 @@ #include <linux/socket.h> /* for "struct sockaddr" et al */ #include <linux/compiler.h> /* for "__user" et al */ +#ifndef __KERNEL__ +#include <sys/socket.h> /* for struct sockaddr. */ +#endif + #if __UAPI_DEF_IF_IFNAMSIZ #define IFNAMSIZ 16 #endif /* __UAPI_DEF_IF_IFNAMSIZ */ -- cgit v1.2.3 From e067eba5871c6922539dc1728699c14e6b22590f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:06 -0800 Subject: userfaultfd: document _IOR/_IOW Patch series "userfaultfd tmpfs/hugetlbfs/non-cooperative", v2 These userfaultfd features are finished and are ready for larger exposure in -mm and upstream merging. 1) tmpfs non present userfault 2) hugetlbfs non present userfault 3) non cooperative userfault for fork/madvise/mremap qemu development code is already exercising 2) and container postcopy live migration needs 3). 1) is not currently used, but there's a self test, and we know some qemu users for various reasons use tmpfs as backing for KVM, so they'll need it too to use postcopy live migration with tmpfs memory. All review feedback from the previous submit has been handled and the fixes are included. There's no outstanding issue AFAIK. Upstream code just did a s/fe/vmf/ conversion in the page faults and this has been converted as well incrementally. In addition to the previous submits, this also wakes up stuck userfaults during UFFDIO_UNREGISTER. The non cooperative testcase actually reproduced this problem by getting stuck instead of quitting cleanly in some rare cases, as it could call UFFDIO_UNREGISTER while some userfault could still be in flight. The other option would have been to keep leaving it up to userland to serialize itself and to patch the testcase instead, but the wakeup during unregister I think is preferable. David also asked for the UFFD_FEATURE_MISSING_HUGETLBFS and UFFD_FEATURE_MISSING_SHMEM feature flags to be added, so QEMU can avoid probing whether the hugetlbfs/shmem missing support is available by calling UFFDIO_REGISTER. QEMU already checks HUGETLBFS_MAGIC with fstatfs, so if UFFD_FEATURE_MISSING_HUGETLBFS is also set, it knows UFFDIO_REGISTER will succeed (or if it fails, it's for some other more concerning reason). There's no reason to worry about adding too many feature flags. There are 64 available and worst case we'd have to bump the API if someday we're really going to run out of them. 
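To make that probing flow concrete, here is a minimal userspace sketch (an illustration only, not part of this series; it assumes the whole series is applied, error handling is trimmed, and the printed strings are arbitrary):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	int main(void)
	{
		/* Request no features: the kernel fills in what it supports. */
		struct uffdio_api api = { .api = UFFD_API, .features = 0 };
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) == -1)
			return 1;

		/* No UFFDIO_REGISTER probe needed; the feature bits answer it. */
		if (api.features & UFFD_FEATURE_MISSING_HUGETLBFS)
			printf("hugetlbfs missing-mode userfaults supported\n");
		if (api.features & UFFD_FEATURE_MISSING_SHMEM)
			printf("shmem missing-mode userfaults supported\n");
		return 0;
	}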
The round-trip network latency of hugetlbfs userfaults during postcopy live migration is still of the order of dozen milliseconds on 10GBit if at 2MB hugepage granularity so it's working perfectly and it should provide for higher bandwidth or lower CPU usage (which makes it interesting to add an option in the future to support THP granularity too for anonymous memory, UFFDIO_COPY would then have to create THP if alignment/len allows for it). 1GB hugetlbfs granularity will require big changes in hugetlbfs to work so it's deferred for later. This patch (of 42): This adds proper documentation (inline) to avoid the risk of further misunderstandings about the semantics of _IOW/_IOR and it also reminds whoever will bump the UFFDIO_API in the future, to change the two ioctl to _IOW. This was found while implementing strace support for those ioctl, otherwise we could have never found it by just reviewing kernel code and testing it. _IOC_READ or _IOC_WRITE alters nothing but the ioctl number itself, so it's only worth fixing if the UFFDIO_API is bumped someday. Link: http://lkml.kernel.org/r/20161216144821.5183-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: "Dmitry V. Levin" Cc: Michael Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/ioctl.h | 10 +++++++++- include/uapi/linux/userfaultfd.h | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/ioctl.h b/include/uapi/asm-generic/ioctl.h index 7e7c11b52143..749b32fe5623 100644 --- a/include/uapi/asm-generic/ioctl.h +++ b/include/uapi/asm-generic/ioctl.h @@ -48,6 +48,9 @@ /* * Direction bits, which any architecture can choose to override * before including this file. + * + * NOTE: _IOC_WRITE means userland is writing and kernel is + * reading. _IOC_READ means userland is reading and kernel is writing. */ #ifndef _IOC_NONE @@ -72,7 +75,12 @@ #define _IOC_TYPECHECK(t) (sizeof(t)) #endif -/* used to create numbers */ +/* + * Used to create numbers. + * + * NOTE: _IOW means userland is writing and kernel is reading. _IOR + * means userland is reading and kernel is writing. + */ #define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) #define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size))) #define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9057d7af3ae1..94046b8aa6ad 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,6 +11,12 @@ #include +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). + */ #define UFFD_API ((__u64)0xAA) /* * After implementing the respective features it will become: -- cgit v1.2.3 From 893e26e61d04eac974ded0c11e1647b335c8cb7b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:27 -0800 Subject: userfaultfd: non-cooperative: Add fork() event When the mm with uffd-ed vmas fork()-s the respective vmas notify their uffds with the event which contains a descriptor with new uffd. This new descriptor can then be used to get events from the child and populate its mm with data. 
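As a monitor-side illustration, a sketch of picking up the new descriptor (the function name is mine, not part of the patch; it assumes UFFD_FEATURE_EVENT_FORK was negotiated at UFFDIO_API time):

	#include <unistd.h>
	#include <linux/userfaultfd.h>

	/* Sketch: returns the child's new userfaultfd when a fork event is
	 * read, 0 for any other event, -1 on a short read. The caller would
	 * add the returned descriptor to its poll set.
	 */
	static int maybe_pick_up_child_uffd(int uffd)
	{
		struct uffd_msg msg;

		if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
			return -1;
		if (msg.event == UFFD_EVENT_FORK)
			return (int)msg.arg.fork.ufd;
		return 0;
	}

The reason the descriptor can be handed over ready to use like this is the deferred file creation discussed below.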
Note, that there can be different uffd-s controlling different vmas within one mm, so first we should collect all those uffds (and ctx-s) in a list and then notify them all one by one but only once per fork(). The context is created at fork() time but the descriptor, file struct and anon inode object is created at event read time. So some trickery is added to the userfaultfd_ctx_read() to handle the ctx queues' locking vs file creation. Another thing worth noticing is that the task that fork()-s waits for the uffd event to get processed WITHOUT the mmap sem. [aarcange@redhat.com: build warning fix] Link: http://lkml.kernel.org/r/20161216144821.5183-10-aarcange@redhat.com Link: http://lkml.kernel.org/r/20161216144821.5183-9-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 148 ++++++++++++++++++++++++++++++++++++++- include/linux/userfaultfd_k.h | 13 ++++ include/uapi/linux/userfaultfd.h | 15 ++-- kernel/fork.c | 10 ++- 4 files changed, 170 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 87d31921b66c..6046e0b552b2 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -64,6 +64,12 @@ struct userfaultfd_ctx { struct mm_struct *mm; }; +struct userfaultfd_fork_ctx { + struct userfaultfd_ctx *orig; + struct userfaultfd_ctx *new; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; @@ -465,9 +471,8 @@ out: return ret; } -static int __maybe_unused userfaultfd_event_wait_completion( - struct userfaultfd_ctx *ctx, - struct userfaultfd_wait_queue *ewq) +static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) { int ret = 0; @@ -518,6 +523,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, __remove_wait_queue(&ctx->event_wqh, &ewq->wq); } +int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) +{ + struct userfaultfd_ctx *ctx = NULL, *octx; + struct userfaultfd_fork_ctx *fctx; + + octx = vma->vm_userfaultfd_ctx.ctx; + if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + return 0; + } + + list_for_each_entry(fctx, fcs, list) + if (fctx->orig == octx) { + ctx = fctx->new; + break; + } + + if (!ctx) { + fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); + if (!fctx) + return -ENOMEM; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) { + kfree(fctx); + return -ENOMEM; + } + + atomic_set(&ctx->refcount, 1); + ctx->flags = octx->flags; + ctx->state = UFFD_STATE_RUNNING; + ctx->features = octx->features; + ctx->released = false; + ctx->mm = vma->vm_mm; + atomic_inc(&ctx->mm->mm_users); + + userfaultfd_ctx_get(octx); + fctx->orig = octx; + fctx->new = ctx; + list_add_tail(&fctx->list, fcs); + } + + vma->vm_userfaultfd_ctx.ctx = ctx; + return 0; +} + +static int dup_fctx(struct userfaultfd_fork_ctx *fctx) +{ + struct userfaultfd_ctx *ctx = fctx->orig; + struct userfaultfd_wait_queue ewq; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_FORK; + ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; + + return userfaultfd_event_wait_completion(ctx, &ewq); +} + +void dup_userfaultfd_complete(struct list_head *fcs) +{ + int ret = 0; + struct 
userfaultfd_fork_ctx *fctx, *n; + + list_for_each_entry_safe(fctx, n, fcs, list) { + if (!ret) + ret = dup_fctx(fctx); + list_del(&fctx->list); + kfree(fctx); + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; @@ -653,12 +731,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) } } +static const struct file_operations userfaultfd_fops; + +static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, + struct userfaultfd_ctx *new, + struct uffd_msg *msg) +{ + int fd; + struct file *file; + unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; + + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | flags); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + + fd_install(fd, file); + msg->arg.reserved.reserved1 = 0; + msg->arg.fork.ufd = fd; + + return 0; +} + static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct uffd_msg *msg) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); struct userfaultfd_wait_queue *uwq; + /* + * Handling fork event requires sleeping operations, so + * we drop the event_wqh lock, then do these ops, then + * lock it back and wake up the waiter. While the lock is + * dropped the ewq may go away so we keep track of it + * carefully. + */ + LIST_HEAD(fork_event); + struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock(&ctx->fd_wqh.lock); @@ -716,6 +831,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (uwq) { *msg = uwq->msg; + if (uwq->msg.event == UFFD_EVENT_FORK) { + fork_nctx = (struct userfaultfd_ctx *) + (unsigned long) + uwq->msg.arg.reserved.reserved1; + list_move(&uwq->wq.task_list, &fork_event); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + userfaultfd_event_complete(ctx, uwq); spin_unlock(&ctx->event_wqh.lock); ret = 0; @@ -739,6 +864,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->fd_wqh.lock); + if (!ret && msg->event == UFFD_EVENT_FORK) { + ret = resolve_userfault_fork(ctx, fork_nctx, msg); + + if (!ret) { + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.task_list); + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + userfaultfd_event_complete(ctx, uwq); + } + spin_unlock(&ctx->event_wqh.lock); + } + } + return ret; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 11b92b047a1e..79002bca1f43 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); } +extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); +extern void dup_userfaultfd_complete(struct list_head *); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -76,6 +79,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return false; } +static inline int dup_userfaultfd(struct vm_area_struct *vma, + struct list_head *l) +{ + return 0; +} + +static inline void dup_userfaultfd_complete(struct list_head *l) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git 
a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 94046b8aa6ad..c8953c84fdcc 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,12 +18,7 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -/* - * After implementing the respective features it will become: - * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ - * UFFD_FEATURE_EVENT_FORK) - */ -#define UFFD_API_FEATURES (0) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -77,6 +72,10 @@ struct uffd_msg { __u64 address; } pagefault; + struct { + __u32 ufd; + } fork; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,9 +89,7 @@ struct uffd_msg { * Start at 0x12 and not at 0 to be more strict against bugs. */ #define UFFD_EVENT_PAGEFAULT 0x12 -#if 0 /* not available yet */ #define UFFD_EVENT_FORK 0x13 -#endif /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -111,10 +108,8 @@ struct uffdio_api { * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. */ -#if 0 /* not available yet */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) -#endif __u64 features; __u64 ioctls; diff --git a/kernel/fork.c b/kernel/fork.c index ff82e24573b6..d12fcc4db8a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; + LIST_HEAD(uf); uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= - ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; - tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -678,6 +681,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -- cgit v1.2.3 From 72f87654c69690ff4721bd9b4a39983f971de9a5 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:34 -0800 Subject: userfaultfd: non-cooperative: add mremap() event The event denotes that an area [start:end] moves to different location. Length change isn't reported as "new" addresses, if they appear on the uffd reader side they will not contain any data and the latter can just zeromap them. Waiting for the event ACK is also done outside of mmap sem, as for fork event. Link: http://lkml.kernel.org/r/20161216144821.5183-12-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 17 +++++++++++++++++ include/uapi/linux/userfaultfd.h | 11 ++++++++++- mm/mremap.c | 17 ++++++++++++----- 4 files changed, 76 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 27978f249016..68f978beefac 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -596,6 +596,43 @@ void dup_userfaultfd_complete(struct list_head *fcs) } } +void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + vm_ctx->ctx = ctx; + userfaultfd_ctx_get(ctx); + } +} + +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx vm_ctx, + unsigned long from, unsigned long to, + unsigned long len) +{ + struct userfaultfd_ctx *ctx = vm_ctx.ctx; + struct userfaultfd_wait_queue ewq; + + if (!ctx) + return; + + if (to & ~PAGE_MASK) { + userfaultfd_ctx_put(ctx); + return; + } + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMAP; + ewq.msg.arg.remap.from = from; + ewq.msg.arg.remap.to = to; + ewq.msg.arg.remap.len = len; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 79002bca1f43..7f318a46044b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -55,6 +55,12 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); +extern void mremap_userfaultfd_prep(struct vm_area_struct *, + struct vm_userfaultfd_ctx *); +extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx, + unsigned long from, unsigned long to, + unsigned long len); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -89,6 +95,17 @@ static inline void dup_userfaultfd_complete(struct list_head *l) { } +static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *ctx) +{ +} + +static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx ctx, + unsigned long from, + unsigned long to, + unsigned long len) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index c8953c84fdcc..79a85e5bd388 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). 
*/ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -76,6 +77,12 @@ struct uffd_msg { __u32 ufd; } fork; + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,6 +97,7 @@ struct uffd_msg { */ #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -110,6 +118,7 @@ struct uffdio_api { */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) __u64 features; __u64 ioctls; diff --git a/mm/mremap.c b/mm/mremap.c index 30d7d2482eea..504b560c013c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -250,7 +251,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr, bool *locked) + unsigned long new_len, unsigned long new_addr, + bool *locked, struct vm_userfaultfd_ctx *uf) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -309,6 +311,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_addr = new_addr; new_addr = err; } else { + mremap_userfaultfd_prep(new_vma, uf); arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } @@ -413,7 +416,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, - unsigned long new_addr, unsigned long new_len, bool *locked) + unsigned long new_addr, unsigned long new_len, bool *locked, + struct vm_userfaultfd_ctx *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -458,7 +462,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); if (!(offset_in_page(ret))) goto out; out1: @@ -497,6 +501,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -523,7 +528,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + &locked, &uf); goto out; } @@ -592,7 +597,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, + &locked, &uf); } out: if (offset_in_page(ret)) { @@ -602,5 +608,6 @@ out: up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); + mremap_userfaultfd_complete(uf, addr, new_addr, old_len); return ret; } -- cgit v1.2.3 From 05ce77249d5068b057082d24ec22d3824f4816ac Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:40 -0800 Subject: userfaultfd: 
non-cooperative: add madvise() event for MADV_DONTNEED request If the page is punched out of the address space the uffd reader should know this and zeromap the respective area in case of the #PF event. Link: http://lkml.kernel.org/r/20161216144821.5183-14-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 28 ++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 12 ++++++++++++ include/uapi/linux/userfaultfd.h | 10 +++++++++- mm/madvise.c | 2 ++ 4 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5d37c37854b0..ea9008254df4 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -633,6 +633,34 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } +void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue ewq; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED)) + return; + + userfaultfd_ctx_get(ctx); + up_read(&mm->mmap_sem); + + *prev = NULL; /* We wait for ACK w/o the mmap semaphore */ + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_MADVDONTNEED; + ewq.msg.arg.madv_dn.start = start; + ewq.msg.arg.madv_dn.end = end; + + userfaultfd_event_wait_completion(ctx, &ewq); + + down_read(&mm->mmap_sem); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 78ec197e8b47..f431861f22f1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,6 +61,11 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); +extern void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -106,6 +111,13 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long len) { } + +static inline void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 79a85e5bd388..2bbf32319cf5 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,7 +19,8 @@ */ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP) + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -83,6 +84,11 @@ struct uffd_msg { __u64 len; } remap; + struct { + __u64 start; + __u64 end; + } madv_dn; + struct { /* unused reserved fields */ __u64 reserved1; @@ -98,6 +104,7 @@ struct uffd_msg { #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 +#define 
UFFD_EVENT_MADVDONTNEED 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -119,6 +126,7 @@ struct uffdio_api { #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) __u64 features; __u64 ioctls; diff --git a/mm/madvise.c b/mm/madvise.c index 0e3828eae9f8..06ffb5a170de 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -477,6 +478,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, return -EINVAL; zap_page_range(vma, start, end - start, NULL); + madvise_userfault_dontneed(vma, prev, start, end); return 0; } -- cgit v1.2.3 From cab350afcbc9c8a744e0d164d1c26560568f770b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:04 -0800 Subject: userfaultfd: hugetlbfs: allow registration of ranges containing huge pages Expand the userfaultfd_register/unregister routines to allow VM_HUGETLB vmas. huge page alignment checking is performed after a VM_HUGETLB vma is encountered. Also, since there is no UFFDIO_ZEROPAGE support for huge pages do not return that as a valid ioctl method for huge page ranges. Link: http://lkml.kernel.org/r/20161216144821.5183-22-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 55 ++++++++++++++++++++++++++++++++++++---- include/uapi/linux/userfaultfd.h | 3 +++ 2 files changed, 53 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 26e1ef00b63c..5139d05f80e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -27,6 +27,7 @@ #include #include #include +#include static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -1058,6 +1059,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; + bool huge_pages; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1108,6 +1110,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (vma->vm_start >= end) goto out_unlock; + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + /* * Search for not compatible vmas. * @@ -1116,6 +1129,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * on anonymous vmas). */ found = false; + huge_pages = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -1124,8 +1138,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_is_anonymous(cur)) + if (!vma_is_anonymous(cur) && !is_vm_hugetlb_page(cur)) goto out_unlock; + /* + * If this vma contains ending address, and huge pages + * check alignment. 
+ */ + if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && + end > cur->vm_start) { + unsigned long vma_hpagesize = vma_kernel_pagesize(cur); + + ret = -EINVAL; + + if (end & (vma_hpagesize - 1)) + goto out_unlock; + } /* * Check that this vma isn't already owned by a @@ -1138,6 +1165,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, cur->vm_userfaultfd_ctx.ctx != ctx) goto out_unlock; + /* + * Note vmas containing huge pages + */ + if (is_vm_hugetlb_page(cur)) + huge_pages = true; + found = true; } BUG_ON(!found); @@ -1149,7 +1182,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_is_anonymous(vma)); + BUG_ON(!vma_is_anonymous(vma) && !is_vm_hugetlb_page(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); @@ -1207,7 +1240,8 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(UFFD_API_RANGE_IOCTLS, + if (put_user(huge_pages ? UFFD_API_RANGE_IOCTLS_HPAGE : + UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; } @@ -1253,6 +1287,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (vma->vm_start >= end) goto out_unlock; + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + /* * Search for not compatible vmas. * @@ -1275,7 +1320,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_is_anonymous(cur)) + if (!vma_is_anonymous(cur) && !is_vm_hugetlb_page(cur)) goto out_unlock; found = true; @@ -1289,7 +1334,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_is_anonymous(vma)); + BUG_ON(!vma_is_anonymous(vma) && !is_vm_hugetlb_page(vma)); /* * Nothing to do: this vma is already registered into this diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 2bbf32319cf5..a3828a9bc16e 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -29,6 +29,9 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) +#define UFFD_API_RANGE_IOCTLS_HPAGE \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY) /* * Valid ioctl command number range with this API is from 0x00 to -- cgit v1.2.3 From 163e11bc4f6ebbfcfdf751c108bd212a26e492ee Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:19 -0800 Subject: userfaultfd: hugetlbfs: UFFD_FEATURE_MISSING_HUGETLBFS Userland developers asked to be notified immediately by the UFFDIO_API ioctl if hugetlbfs missing mode is supported by userfaultfd in the running kernel. This avoids the need to run UFFDIO_REGISTER on a hugetlbfs virtual memory range to find out. Link: http://lkml.kernel.org/r/20161216144821.5183-27-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index a3828a9bc16e..7293321abdfb 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,9 +18,10 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_MADVDONTNEED) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_MISSING_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -125,11 +126,32 @@ struct uffdio_api { * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) __u64 features; __u64 ioctls; -- cgit v1.2.3 From cac673292b9b39493bb0ff526b96c83ace6fdcd0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:40 -0800 Subject: userfaultfd: shmem: allow registration of shared memory ranges Expand the userfaultfd_register/unregister routines to allow shared memory VMAs. Currently, there is no UFFDIO_ZEROPAGE and write-protection support for shared memory VMAs, which is reflected in ioctl methods supported by uffdio_register. Link: http://lkml.kernel.org/r/20161216144821.5183-34-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 21 +++++++-------------- include/uapi/linux/userfaultfd.h | 2 +- tools/testing/selftests/vm/userfaultfd.c | 2 +- 3 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 593135c296bc..18406158e13f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1098,7 +1098,8 @@ static __always_inline int validate_range(struct mm_struct *mm, static inline bool vma_can_userfault(struct vm_area_struct *vma) { - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma); + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1111,7 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; - bool huge_pages; + bool non_anon_pages; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1175,13 +1176,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). */ found = false; - huge_pages = false; + non_anon_pages = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -1220,8 +1217,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* * Note vmas containing huge pages */ - if (is_vm_hugetlb_page(cur)) - huge_pages = true; + if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) + non_anon_pages = true; found = true; } @@ -1292,7 +1289,7 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(huge_pages ? UFFD_API_RANGE_IOCTLS_HPAGE : + if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; @@ -1352,10 +1349,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). 
*/ found = false; ret = -EINVAL; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 7293321abdfb..10631a4cdb24 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -30,7 +30,7 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) -#define UFFD_API_RANGE_IOCTLS_HPAGE \ +#define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 3011711212ca..d753a9161411 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -129,7 +129,7 @@ static void allocate_area(void **alloc_area) #else /* HUGETLB_TEST */ -#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_HPAGE +#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC static int release_pages(char *rel_area) { -- cgit v1.2.3 From 47dd924508f5fb10480afc69de04539fa3d14034 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:58 -0800 Subject: userfaultfd: hugetlbfs: UFFD_FEATURE_MISSING_SHMEM Userland developers asked to be notified immediately by the UFFDIO_API ioctl if shmem missing mode is supported by userfaultfd in the running kernel. This avoids the need to run UFFDIO_REGISTER on a shmem virtual memory range to find out. Link: http://lkml.kernel.org/r/20161216144821.5183-38-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 10631a4cdb24..9ac4b68c54d1 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -21,7 +21,8 @@ #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_MADVDONTNEED | \ - UFFD_FEATURE_MISSING_HUGETLBFS) + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -146,12 +147,17 @@ struct uffdio_api { * it, so userland can later check if the feature flag is * present in uffdio_api.features after UFFDIO_API * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; __u64 ioctls; -- cgit v1.2.3 From 557d7acd754543ca6f7166b9abde0a1af01ed848 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 23 Feb 2017 01:38:03 +0300 Subject: uapi: fix linux/ip6_tunnel.h userspace compilation errors Include <linux/if.h> and <linux/in6.h> to fix the following linux/ip6_tunnel.h userspace compilation errors: /usr/include/linux/ip6_tunnel.h:23:12: error: 'IFNAMSIZ' undeclared here (not in a function) char name[IFNAMSIZ]; /* name of tunnel device */ /usr/include/linux/ip6_tunnel.h:30:18: error: field 'laddr' has incomplete type struct in6_addr laddr; /* local tunnel end-point address */ Signed-off-by: Dmitry V. 
Levin Signed-off-by: David S. Miller --- include/uapi/linux/ip6_tunnel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 48af63c9a48d..425926c467d7 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -2,6 +2,8 @@ #define _IP6_TUNNEL_H #include <linux/types.h> +#include <linux/if.h> /* For IFNAMSIZ. */ +#include <linux/in6.h> /* For struct in6_addr. */ #define IPV6_TLV_TNL_ENCAP_LIMIT 4 #define IPV6_DEFAULT_TNL_ENCAP_LIMIT 4 -- cgit v1.2.3 From 40df93be6a7084e09688e1ddc45615d13df133fc Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 23 Feb 2017 01:38:26 +0300 Subject: uapi: fix linux/llc.h userspace compilation error Include <linux/if.h> to fix the following linux/llc.h userspace compilation error: /usr/include/linux/llc.h:26:27: error: 'IFHWADDRLEN' undeclared here (not in a function) unsigned char sllc_mac[IFHWADDRLEN]; Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/llc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/llc.h b/include/uapi/linux/llc.h index 9c987a402473..a6c17f66ee94 100644 --- a/include/uapi/linux/llc.h +++ b/include/uapi/linux/llc.h @@ -14,6 +14,7 @@ #define _UAPI__LINUX_LLC_H #include <linux/socket.h> +#include <linux/if.h> /* For IFHWADDRLEN. */ #define __LLC_SOCK_SIZE__ 16 /* sizeof(sockaddr_llc), word align. */ struct sockaddr_llc { -- cgit v1.2.3 From ea3ebc73b46fbdb049dafd47543bb22efaa09c8e Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 23 Feb 2017 14:30:34 +0300 Subject: uapi: fix linux/seg6.h and linux/seg6_iptunnel.h userspace compilation errors Include <linux/in6.h> in uapi/linux/seg6.h to fix the following linux/seg6.h userspace compilation error: /usr/include/linux/seg6.h:31:18: error: array type has incomplete element type 'struct in6_addr' struct in6_addr segments[0]; Include <linux/seg6.h> in uapi/linux/seg6_iptunnel.h to fix the following linux/seg6_iptunnel.h userspace compilation error: /usr/include/linux/seg6_iptunnel.h:26:21: error: array type has incomplete element type 'struct ipv6_sr_hdr' struct ipv6_sr_hdr srh[0]; Fixes: a50a05f497a2 ("ipv6: sr: add missing Kbuild export for header files") Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/seg6.h | 1 + include/uapi/linux/seg6_iptunnel.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h index 61df8d392f41..7278511d339e 100644 --- a/include/uapi/linux/seg6.h +++ b/include/uapi/linux/seg6.h @@ -15,6 +15,7 @@ #define _UAPI_LINUX_SEG6_H #include <linux/types.h> +#include <linux/in6.h> /* For struct in6_addr. */ /* * SRH diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index 7a7183d4062a..b6e5a0a1afd7 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -14,6 +14,8 @@ #ifndef _UAPI_LINUX_SEG6_IPTUNNEL_H #define _UAPI_LINUX_SEG6_IPTUNNEL_H +#include <linux/seg6.h> /* For struct ipv6_sr_hdr. */ + enum { SEG6_IPTUNNEL_UNSPEC, SEG6_IPTUNNEL_SRH, -- cgit v1.2.3 From c12f4d761dd2313ae4f457041df3ec0c603aa76a Mon Sep 17 00:00:00 2001 From: "Dmitry V. 
Levin" Date: Thu, 23 Feb 2017 14:35:23 +0300 Subject: uapi: fix linux/rds.h userspace compilation errors Consistently use types from linux/types.h to fix the following linux/rds.h userspace compilation errors: /usr/include/linux/rds.h:198:2: error: unknown type name 'u8' u8 rx_traces; /usr/include/linux/rds.h:199:2: error: unknown type name 'u8' u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; /usr/include/linux/rds.h:203:2: error: unknown type name 'u8' u8 rx_traces; /usr/include/linux/rds.h:204:2: error: unknown type name 'u8' u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; /usr/include/linux/rds.h:205:2: error: unknown type name 'u64' u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; Fixes: 3289025aedc0 ("RDS: add receive message trace used by application") Signed-off-by: Dmitry V. Levin Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 47c03ca5c404..198892b95f09 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -195,14 +195,14 @@ enum rds_message_rxpath_latency { }; struct rds_rx_trace_so { - u8 rx_traces; - u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; + __u8 rx_traces; + __u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; }; struct rds_cmsg_rx_trace { - u8 rx_traces; - u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; - u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; + __u8 rx_traces; + __u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; + __u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; }; /* -- cgit v1.2.3 From 47b1f6fd6ebc8b6b72a5aee39cb0513a38e9c099 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 23 Feb 2017 05:49:28 +0300 Subject: uapi: stop including linux/sysctl.h in uapi/linux/netfilter.h linux/netfilter.h is the last uapi header file that includes linux/sysctl.h but it does not depend on definitions provided by this essentially dead header file. Suggested-by: Eric W. Biederman Signed-off-by: Dmitry V. Levin Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 7550e9176a54..c111a91adcc0 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -3,7 +3,6 @@ #include #include -#include #include #include -- cgit v1.2.3 From d811914d87576c562e849c00d9f9beff45038801 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:56:02 -0800 Subject: userfaultfd: non-cooperative: rename *EVENT_MADVDONTNEED to *EVENT_REMOVE Patch series "userfaultfd: non-cooperative: add madvise() event for MADV_REMOVE request". These patches add notification of madvise(MADV_REMOVE) event to non-cooperative userfaultfd monitor. The first pacth renames EVENT_MADVDONTNEED to EVENT_REMOVE along with relevant functions and structures. Using _REMOVE instead of _MADVDONTNEED describes the event semantics more clearly and I hope it's not too late for such change in the ABI. This patch (of 3): The UFFD_EVENT_MADVDONTNEED purpose is to notify uffd monitor about removal of certain range from address space tracked by userfaultfd. Hence, UFFD_EVENT_REMOVE seems to better reflect the operation semantics. Respectively, 'madv_dn' field of uffd_msg is renamed to 'remove' and the madvise_userfault_dontneed callback is renamed to userfaultfd_remove. 
Link: http://lkml.kernel.org/r/1484814154-1557-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 14 +++++++------- include/linux/userfaultfd_k.h | 16 ++++++++-------- include/uapi/linux/userfaultfd.h | 8 ++++---- mm/madvise.c | 2 +- tools/testing/selftests/vm/userfaultfd.c | 16 ++++++++-------- 5 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 18406158e13f..8fe601b4875e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -681,16 +681,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } -void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue ewq; ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED)) + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) return; userfaultfd_ctx_get(ctx); @@ -700,9 +700,9 @@ void madvise_userfault_dontneed(struct vm_area_struct *vma, msg_init(&ewq.msg); - ewq.msg.event = UFFD_EVENT_MADVDONTNEED; - ewq.msg.arg.madv_dn.start = start; - ewq.msg.arg.madv_dn.end = end; + ewq.msg.event = UFFD_EVENT_REMOVE; + ewq.msg.arg.remove.start = start; + ewq.msg.arg.remove.end = end; userfaultfd_event_wait_completion(ctx, &ewq); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f431861f22f1..2521542f6c07 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,10 +61,10 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); -extern void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, - unsigned long end); +extern void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end); #else /* CONFIG_USERFAULTFD */ @@ -112,10 +112,10 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, { } -static inline void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, - unsigned long end) +static inline void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end) { } #endif /* CONFIG_USERFAULTFD */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9ac4b68c54d1..b742c40c2880 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -20,7 +20,7 @@ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ @@ -92,7 +92,7 @@ struct uffd_msg { struct { __u64 start; __u64 end; - } madv_dn; + } remove; struct { /* unused reserved fields */ @@ -109,7 +109,7 @@ struct uffd_msg { #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define 
UFFD_EVENT_REMAP 0x14 -#define UFFD_EVENT_MADVDONTNEED 0x15 +#define UFFD_EVENT_REMOVE 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -155,7 +155,7 @@ struct uffdio_api { #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) -#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; diff --git a/mm/madvise.c b/mm/madvise.c index b530a4986035..ab5ef141cc9b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -479,7 +479,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (!can_madv_dontneed_vma(vma)) return -EINVAL; - madvise_userfault_dontneed(vma, prev, start, end); + userfaultfd_remove(vma, prev, start, end); zap_page_range(vma, start, end - start); return 0; } diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 5a840a605a16..9eb77df568f7 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -398,12 +398,12 @@ static void *uffd_poll_thread(void *arg) uffd = msg.arg.fork.ufd; pollfd[0].fd = uffd; break; - case UFFD_EVENT_MADVDONTNEED: - uffd_reg.range.start = msg.arg.madv_dn.start; - uffd_reg.range.len = msg.arg.madv_dn.end - - msg.arg.madv_dn.start; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - fprintf(stderr, "madv_dn failure\n"), exit(1); + fprintf(stderr, "remove failure\n"), exit(1); break; case UFFD_EVENT_REMAP: area_dst = (char *)(unsigned long)msg.arg.remap.to; @@ -570,7 +570,7 @@ static int userfaultfd_open(int features) * mremap, the entire monitored area is accessed in a single pass for * HUGETLB_TEST. * The release of the pages currently generates event only for - * anonymous memory (UFFD_EVENT_MADVDONTNEED), hence it is not checked + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked * for hugetlb and shmem. */ static int faulting_process(void) @@ -715,14 +715,14 @@ static int userfaultfd_events_test(void) pid_t pid; char c; - printf("testing events (fork, remap, madv_dn): "); + printf("testing events (fork, remap, remove): "); fflush(stdout); if (release_pages(area_dst)) return 1; features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_MADVDONTNEED; + UFFD_FEATURE_EVENT_REMOVE; if (userfaultfd_open(features) < 0) return 1; fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); -- cgit v1.2.3 From 897ab3e0c49e24b62e2d54d165c7afec6bbca65b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:58:22 -0800 Subject: userfaultfd: non-cooperative: add event for memory unmaps When a non-cooperative userfaultfd monitor copies pages in the background, it may encounter regions that were already unmapped. Addition of UFFD_EVENT_UNMAP allows the uffd monitor to track precisely changes in the virtual memory layout. Since there might be different uffd contexts for the affected VMAs, we first should create a temporary representation for the unmap event for each uffd context and then notify them one by one to the appropriate userfault file descriptors. The event notification occurs after the mmap_sem has been released. 
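A monitor-side sketch of how the REMOVE and UNMAP events are consumed, in the spirit of the selftest above. This is hypothetical code: drop_range() is an invented bookkeeping helper and error handling is trimmed. Note that UFFD_EVENT_UNMAP deliberately reuses the msg.arg.remove layout, as the userfaultfd_unmap_complete() hunk below shows.

#include <linux/userfaultfd.h>
#include <poll.h>
#include <unistd.h>

static void drop_range(__u64 start, __u64 end);  /* hypothetical bookkeeping */

static void monitor_events(int uffd)
{
        struct pollfd pfd = { .fd = uffd, .events = POLLIN };
        struct uffd_msg msg;

        while (poll(&pfd, 1, -1) > 0) {
                if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
                        continue;
                switch (msg.event) {
                case UFFD_EVENT_REMOVE: /* madvise(MADV_REMOVE/MADV_DONTNEED) */
                case UFFD_EVENT_UNMAP:  /* munmap() and friends */
                        drop_range(msg.arg.remove.start, msg.arg.remove.end);
                        break;
                case UFFD_EVENT_PAGEFAULT:
                        /* resolve with UFFDIO_COPY as usual */
                        break;
                }
        }
}

Both events carry only a start/end pair, which is all the monitor needs in order to retire any pending background copies for that range.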
[arnd@arndb.de: fix nommu build] Link: http://lkml.kernel.org/r/20170203165141.3665284-1-arnd@arndb.de [mhocko@suse.com: fix nommu build] Link: http://lkml.kernel.org/r/20170202091503.GA22823@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1485542673-24387-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Michal Hocko Signed-off-by: Arnd Bergmann Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/vdso.c | 2 +- arch/tile/mm/elf.c | 2 +- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/mm/mpx.c | 4 +-- fs/aio.c | 2 +- fs/proc/vmcore.c | 4 +-- fs/userfaultfd.c | 65 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 14 +++++---- include/linux/userfaultfd_k.h | 18 +++++++++++ include/uapi/linux/userfaultfd.h | 3 ++ ipc/shm.c | 8 ++--- mm/mmap.c | 46 ++++++++++++++++++---------- mm/mremap.c | 23 ++++++++------ mm/nommu.c | 7 +++-- mm/util.c | 5 +++- 15 files changed, 160 insertions(+), 45 deletions(-) (limited to 'include/uapi') diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index f9dbfb14af33..093517e85a6c 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -111,7 +111,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, VM_READ|VM_WRITE|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - 0); + 0, NULL); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 6225cc998db1..889901824400 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -143,7 +143,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, unsigned long addr = MEM_USER_INTRPT; addr = mmap_region(NULL, addr, INTRPT_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0, NULL); if (addr > (unsigned long) -PAGE_SIZE) retval = (int) addr; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 10820f6cefbf..572cee3fccff 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -186,7 +186,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); - do_munmap(mm, text_start, image->size); + do_munmap(mm, text_start, image->size, NULL); } else { current->mm->context.vdso = (void __user *)text_start; current->mm->context.vdso_image = image; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index aad4ac386f98..c98079684bdb 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -51,7 +51,7 @@ static unsigned long mpx_mmap(unsigned long len) down_write(&mm->mmap_sem); addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL); up_write(&mm->mmap_sem); if (populate) mm_populate(addr, populate); @@ -893,7 +893,7 @@ static int unmap_entire_bt(struct mm_struct *mm, * avoid recursion, do_munmap() will check whether it comes * from one bounds table through VM_MPX flag. 
*/ - return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL); } static int try_unmap_single_bt(struct mm_struct *mm, diff --git a/fs/aio.c b/fs/aio.c index 873b4ca82ccb..7e2ab9c8e39c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -512,7 +512,7 @@ static int aio_setup_ring(struct kioctx *ctx) ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED, 0, &unused); + MAP_SHARED, 0, &unused, NULL); up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f52d8e857ff7..885d445afa0d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -388,7 +388,7 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, } return 0; fail: - do_munmap(vma->vm_mm, from, len); + do_munmap(vma->vm_mm, from, len, NULL); return -EAGAIN; } @@ -481,7 +481,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) return 0; fail: - do_munmap(vma->vm_mm, vma->vm_start, len); + do_munmap(vma->vm_mm, vma->vm_start, len, NULL); return -EAGAIN; } #else diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8fe601b4875e..4c78458ea78d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -71,6 +71,13 @@ struct userfaultfd_fork_ctx { struct list_head list; }; +struct userfaultfd_unmap_ctx { + struct userfaultfd_ctx *ctx; + unsigned long start; + unsigned long end; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; @@ -709,6 +716,64 @@ void userfaultfd_remove(struct vm_area_struct *vma, down_read(&mm->mmap_sem); } +static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, + unsigned long start, unsigned long end) +{ + struct userfaultfd_unmap_ctx *unmap_ctx; + + list_for_each_entry(unmap_ctx, unmaps, list) + if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && + unmap_ctx->end == end) + return true; + + return false; +} + +int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *unmaps) +{ + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + continue; + + unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); + if (!unmap_ctx) + return -ENOMEM; + + userfaultfd_ctx_get(ctx); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); + } + + return 0; +} + +void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) +{ + struct userfaultfd_unmap_ctx *ctx, *n; + struct userfaultfd_wait_queue ewq; + + list_for_each_entry_safe(ctx, n, uf, list) { + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_UNMAP; + ewq.msg.arg.remove.start = ctx->start; + ewq.msg.arg.remove.end = ctx->end; + + userfaultfd_event_wait_completion(ctx->ctx, &ewq); + + list_del(&ctx->list); + kfree(ctx); + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/mm.h b/include/linux/mm.h index c65aa43b5712..c6fcba1d1ae5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2090,18 +2090,22 @@ extern int install_special_mapping(struct mm_struct *mm, extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned 
long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate); -extern int do_munmap(struct mm_struct *, unsigned long, size_t); + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf); +extern int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { - return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate); + return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); } #ifdef CONFIG_MMU diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 2521542f6c07..a40be5d0661b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -66,6 +66,12 @@ extern void userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf); +extern void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -118,6 +124,18 @@ static inline void userfaultfd_remove(struct vm_area_struct *vma, unsigned long end) { } + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf) +{ + return 0; +} + +static inline void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index b742c40c2880..3b059530dac9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -21,6 +21,7 @@ #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ @@ -110,6 +111,7 @@ struct uffd_msg { #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -158,6 +160,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) __u64 features; __u64 ioctls; diff --git a/ipc/shm.c b/ipc/shm.c index 7f6537b84ef5..d7805acb44fd 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1222,7 +1222,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, goto invalid; } - addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) @@ -1329,7 +1329,7 @@ SYSCALL_DEFINE1(shmdt, 
char __user *, shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1356,7 +1356,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); vma = next; } @@ -1365,7 +1365,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) * given */ if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); retval = 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 13d16a2b7623..1cec28d20583 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -176,7 +176,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk(unsigned long addr, unsigned long len); +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -185,6 +185,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) struct mm_struct *mm = current->mm; unsigned long min_brk; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -222,7 +223,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Always allow shrinking brk. */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) goto set_brk; goto out; } @@ -232,13 +233,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. 
*/ - if (do_brk(oldbrk, newbrk-oldbrk) < 0) + if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0) goto out; set_brk: mm->brk = brk; populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; @@ -1304,7 +1306,8 @@ static inline int mlock_future_check(struct mm_struct *mm, unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; @@ -1447,7 +1450,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } - addr = mmap_region(file, addr, len, vm_flags, pgoff); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1583,7 +1586,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) } unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1609,7 +1613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2579,7 +2583,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2603,6 +2608,13 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (vma->vm_start >= end) return 0; + if (uf) { + int error = userfaultfd_unmap_prep(vma, start, end, uf); + + if (error) + return error; + } + /* * If we need to split any vma, do it now to save pain later. * @@ -2668,12 +2680,14 @@ int vm_munmap(unsigned long start, size_t len) { int ret; struct mm_struct *mm = current->mm; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len); + ret = do_munmap(mm, start, len, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } EXPORT_SYMBOL(vm_munmap); @@ -2773,7 +2787,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, file = get_file(vma->vm_file); ret = do_mmap_pgoff(vma->vm_file, start, size, - prot, flags, pgoff, &populate); + prot, flags, pgoff, &populate, NULL); fput(file); out: up_write(&mm->mmap_sem); @@ -2799,7 +2813,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. 
*/ -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -2838,7 +2852,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2885,9 +2899,9 @@ out: return 0; } -static int do_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf) { - return do_brk_flags(addr, len, 0); + return do_brk_flags(addr, len, 0, uf); } int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) @@ -2895,13 +2909,15 @@ int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) struct mm_struct *mm = current->mm; int ret; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk_flags(addr, len, flags); + ret = do_brk_flags(addr, len, flags, &uf); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; diff --git a/mm/mremap.c b/mm/mremap.c index 8779928d6a70..8233b0105c82 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -252,7 +252,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr, - bool *locked, struct vm_userfaultfd_ctx *uf) + bool *locked, struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -341,7 +342,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn_moved(vma); - if (do_munmap(mm, old_addr, old_len) < 0) { + if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; @@ -417,7 +418,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, - struct vm_userfaultfd_ctx *uf) + struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -435,12 +437,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; - ret = do_munmap(mm, new_addr, new_len); + ret = do_munmap(mm, new_addr, new_len, NULL); if (ret) goto out; if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); if (ret && old_len != new_len) goto out; old_len = new_len; @@ -462,7 +464,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, + uf_unmap); if (!(offset_in_page(ret))) goto out; out1: @@ -502,6 +505,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long 
charged = 0; bool locked = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; + LIST_HEAD(uf_unmap); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -528,7 +532,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked, &uf); + &locked, &uf, &uf_unmap); goto out; } @@ -538,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); if (ret && old_len != new_len) goto out; ret = addr; @@ -598,7 +602,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } ret = move_vma(vma, addr, old_len, new_len, new_addr, - &locked, &uf); + &locked, &uf, &uf_unmap); } out: if (offset_in_page(ret)) { @@ -609,5 +613,6 @@ out: if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); + userfaultfd_unmap_complete(mm, &uf_unmap); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 215c62296028..fe9f4fa4a7a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1205,7 +1205,8 @@ unsigned long do_mmap(struct file *file, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, - unsigned long *populate) + unsigned long *populate, + struct list_head *uf) { struct vm_area_struct *vma; struct vm_region *region; @@ -1577,7 +1578,7 @@ static int shrink_vma(struct mm_struct *mm, * - under NOMMU conditions the chunk to be unmapped must be backed by a single * VMA, though it need not cover the whole VMA */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { struct vm_area_struct *vma; unsigned long end; @@ -1643,7 +1644,7 @@ int vm_munmap(unsigned long addr, size_t len) int ret; down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); + ret = do_munmap(mm, addr, len, NULL); up_write(&mm->mmap_sem); return ret; } diff --git a/mm/util.c b/mm/util.c index 3cb2164f4099..b8f538863b5a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/userfaultfd_k.h> #include #include @@ -297,14 +298,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; + LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (down_write_killable(&mm->mmap_sem)) return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, - &populate); + &populate, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } -- cgit v1.2.3 From ca49ca7114553587736fe78319e22f073b631380 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:58:25 -0800 Subject: userfaultfd: non-cooperative: add event for exit() notification Allow the userfaultfd monitor to track termination of the processes that have memory backed by the uffd. [rppt@linux.vnet.ibm.com: add comment] Link: http://lkml.kernel.org/r/20170202135448.GB19804@rapoport-lnx Link: http://lkml.kernel.org/r/1485542673-24387-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr.
David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 28 ++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 7 +++++++ include/uapi/linux/userfaultfd.h | 5 ++++- kernel/exit.c | 2 ++ 4 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4c78458ea78d..b676575f2268 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -774,6 +774,34 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) } } +void userfaultfd_exit(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + /* + * We can do the vma walk without locking because the caller + * (exit_mm) knows it now has exclusive access + */ + while (vma) { + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) { + struct userfaultfd_wait_queue ewq; + + userfaultfd_ctx_get(ctx); + + msg_init(&ewq.msg); + ewq.msg.event = UFFD_EVENT_EXIT; + + userfaultfd_event_wait_completion(ctx, &ewq); + + ctx->features &= ~UFFD_FEATURE_EVENT_EXIT; + } + + vma = vma->vm_next; + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a40be5d0661b..0468548acebf 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -72,6 +72,8 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); +extern void userfaultfd_exit(struct mm_struct *mm); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -136,6 +138,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } + +static inline void userfaultfd_exit(struct mm_struct *mm) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 3b059530dac9..c055947c5c98 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). 
*/ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT | \ + UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ @@ -112,6 +113,7 @@ struct uffd_msg { #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 #define UFFD_EVENT_UNMAP 0x16 +#define UFFD_EVENT_EXIT 0x17 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -161,6 +163,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_EVENT_EXIT (1<<7) __u64 features; __u64 ioctls; diff --git a/kernel/exit.c b/kernel/exit.c index 9960accbf2ab..90b09ca35c84 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -547,6 +548,7 @@ static void exit_mm(void) enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); + userfaultfd_exit(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); -- cgit v1.2.3 From 1ca5eebb894a3625b2a543c7b550aa4ae33ba3cc Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 24 Feb 2017 15:00:26 -0800 Subject: uapi: mqueue.h: add missing linux/types.h include Commit 63159f5dcccb ("uapi: Use __kernel_long_t in struct mq_attr") changed the types from long to __kernel_long_t, but didn't add a linux/types.h include. Code that tries to include this header directly breaks: /usr/include/linux/mqueue.h:26:2: error: unknown type name '__kernel_long_t' __kernel_long_t mq_flags; /* message queue flags */ This also upsets configure tests for this header: checking linux/mqueue.h usability... no checking linux/mqueue.h presence... yes configure: WARNING: linux/mqueue.h: present but cannot be compiled configure: WARNING: linux/mqueue.h: check for missing prerequisite headers? configure: WARNING: linux/mqueue.h: see the Autoconf documentation configure: WARNING: linux/mqueue.h: section "Present But Cannot Be Compiled" configure: WARNING: linux/mqueue.h: proceeding with the compiler's result checking for linux/mqueue.h... no Link: http://lkml.kernel.org/r/20170119194644.4403-1-vapier@gentoo.org Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mqueue.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mqueue.h b/include/uapi/linux/mqueue.h index d0a2b8e89813..bbd5116ea739 100644 --- a/include/uapi/linux/mqueue.h +++ b/include/uapi/linux/mqueue.h @@ -18,6 +18,8 @@ #ifndef _LINUX_MQUEUE_H #define _LINUX_MQUEUE_H +#include + #define MQ_PRIO_MAX 32768 /* per-uid limit of kernel memory used by mqueue, in bytes */ #define MQ_BYTES_MAX 819200 -- cgit v1.2.3 From f2168273427c9358a21d1c77c629353c00d46c34 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Fri, 24 Feb 2017 03:23:20 +0300 Subject: uapi: fix linux/netfilter/xt_hashlimit.h userspace compilation error Include like some of uapi/linux/netfilter/xt_*.h headers do to fix the following linux/netfilter/xt_hashlimit.h userspace compilation error: /usr/include/linux/netfilter/xt_hashlimit.h:90:12: error: 'NAME_MAX' undeclared here (not in a function) char name[NAME_MAX]; Signed-off-by: Dmitry V. 
Levin Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_hashlimit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h index 3efc0ca18345..79da349f1060 100644 --- a/include/uapi/linux/netfilter/xt_hashlimit.h +++ b/include/uapi/linux/netfilter/xt_hashlimit.h @@ -2,6 +2,7 @@ #define _UAPI_XT_HASHLIMIT_H #include +#include #include /* timings are in milliseconds. */ -- cgit v1.2.3 From 51be7a9a261ce18c520fb3928b168feb77522745 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 12 Jan 2017 23:36:32 +0200 Subject: virtio_mmio: expose header to userspace It's handy for userspace emulators like QEMU. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mmio.c | 2 +- include/linux/virtio_mmio.h | 141 --------------------------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/virtio_mmio.h | 141 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 142 deletions(-) delete mode 100644 include/linux/virtio_mmio.h create mode 100644 include/uapi/linux/virtio_mmio.h (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index c71fde5fe835..08357d70a891 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #include diff --git a/include/linux/virtio_mmio.h b/include/linux/virtio_mmio.h deleted file mode 100644 index c4b09689ab64..000000000000 --- a/include/linux/virtio_mmio.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Virtio platform device driver - * - * Copyright 2011, ARM Ltd. - * - * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 - * - * This header is BSD licensed so anyone can use the definitions to implement - * compatible drivers/servers. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#ifndef _LINUX_VIRTIO_MMIO_H -#define _LINUX_VIRTIO_MMIO_H - -/* - * Control registers - */ - -/* Magic value ("virt" string) - Read Only */ -#define VIRTIO_MMIO_MAGIC_VALUE 0x000 - -/* Virtio device version - Read Only */ -#define VIRTIO_MMIO_VERSION 0x004 - -/* Virtio device ID - Read Only */ -#define VIRTIO_MMIO_DEVICE_ID 0x008 - -/* Virtio vendor ID - Read Only */ -#define VIRTIO_MMIO_VENDOR_ID 0x00c - -/* Bitmask of the features supported by the device (host) - * (32 bits per set) - Read Only */ -#define VIRTIO_MMIO_DEVICE_FEATURES 0x010 - -/* Device (host) features set selector - Write Only */ -#define VIRTIO_MMIO_DEVICE_FEATURES_SEL 0x014 - -/* Bitmask of features activated by the driver (guest) - * (32 bits per set) - Write Only */ -#define VIRTIO_MMIO_DRIVER_FEATURES 0x020 - -/* Activated features set selector - Write Only */ -#define VIRTIO_MMIO_DRIVER_FEATURES_SEL 0x024 - - -#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */ - -/* Guest's memory page size in bytes - Write Only */ -#define VIRTIO_MMIO_GUEST_PAGE_SIZE 0x028 - -#endif - - -/* Queue selector - Write Only */ -#define VIRTIO_MMIO_QUEUE_SEL 0x030 - -/* Maximum size of the currently selected queue - Read Only */ -#define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034 - -/* Queue size for the currently selected queue - Write Only */ -#define VIRTIO_MMIO_QUEUE_NUM 0x038 - - -#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */ - -/* Used Ring alignment for the currently selected queue - Write Only */ -#define VIRTIO_MMIO_QUEUE_ALIGN 0x03c - -/* Guest's PFN for the currently selected queue - Read Write */ -#define VIRTIO_MMIO_QUEUE_PFN 0x040 - -#endif - - -/* Ready bit for the currently selected queue - Read Write */ -#define VIRTIO_MMIO_QUEUE_READY 0x044 - -/* Queue notifier - Write Only */ -#define VIRTIO_MMIO_QUEUE_NOTIFY 0x050 - -/* Interrupt status - Read Only */ -#define VIRTIO_MMIO_INTERRUPT_STATUS 0x060 - -/* Interrupt acknowledge - Write Only */ -#define VIRTIO_MMIO_INTERRUPT_ACK 0x064 - -/* Device status register - Read Write */ -#define VIRTIO_MMIO_STATUS 0x070 - -/* Selected queue's Descriptor Table address, 64 bits in two halves */ -#define VIRTIO_MMIO_QUEUE_DESC_LOW 0x080 -#define VIRTIO_MMIO_QUEUE_DESC_HIGH 0x084 - -/* Selected queue's Available Ring address, 64 bits in two halves */ -#define VIRTIO_MMIO_QUEUE_AVAIL_LOW 0x090 -#define VIRTIO_MMIO_QUEUE_AVAIL_HIGH 0x094 - -/* Selected queue's Used Ring address, 64 bits in two halves */ -#define VIRTIO_MMIO_QUEUE_USED_LOW 0x0a0 -#define VIRTIO_MMIO_QUEUE_USED_HIGH 0x0a4 - -/* Configuration atomicity value */ -#define VIRTIO_MMIO_CONFIG_GENERATION 0x0fc - -/* The config space is defined by each driver as - * the per-driver configuration space - Read Write */ -#define VIRTIO_MMIO_CONFIG 0x100 - - - -/* - * Interrupt flags (re: interrupt status & acknowledge registers) - */ - -#define VIRTIO_MMIO_INT_VRING (1 << 0) -#define VIRTIO_MMIO_INT_CONFIG (1 << 1) - -#endif diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index f330ba4547cf..718fa73310e1 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -458,6 +458,7 @@ header-y += virtio_console.h header-y += virtio_gpu.h header-y += virtio_ids.h header-y += virtio_input.h +header-y += virtio_mmio.h header-y += virtio_net.h header-y += virtio_pci.h header-y += virtio_ring.h diff --git a/include/uapi/linux/virtio_mmio.h b/include/uapi/linux/virtio_mmio.h new file mode 100644 index 000000000000..c4b09689ab64 --- /dev/null +++ b/include/uapi/linux/virtio_mmio.h @@ -0,0 +1,141 @@ 
+/* + * Virtio platform device driver + * + * Copyright 2011, ARM Ltd. + * + * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 + * + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _LINUX_VIRTIO_MMIO_H +#define _LINUX_VIRTIO_MMIO_H + +/* + * Control registers + */ + +/* Magic value ("virt" string) - Read Only */ +#define VIRTIO_MMIO_MAGIC_VALUE 0x000 + +/* Virtio device version - Read Only */ +#define VIRTIO_MMIO_VERSION 0x004 + +/* Virtio device ID - Read Only */ +#define VIRTIO_MMIO_DEVICE_ID 0x008 + +/* Virtio vendor ID - Read Only */ +#define VIRTIO_MMIO_VENDOR_ID 0x00c + +/* Bitmask of the features supported by the device (host) + * (32 bits per set) - Read Only */ +#define VIRTIO_MMIO_DEVICE_FEATURES 0x010 + +/* Device (host) features set selector - Write Only */ +#define VIRTIO_MMIO_DEVICE_FEATURES_SEL 0x014 + +/* Bitmask of features activated by the driver (guest) + * (32 bits per set) - Write Only */ +#define VIRTIO_MMIO_DRIVER_FEATURES 0x020 + +/* Activated features set selector - Write Only */ +#define VIRTIO_MMIO_DRIVER_FEATURES_SEL 0x024 + + +#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! */ + +/* Guest's memory page size in bytes - Write Only */ +#define VIRTIO_MMIO_GUEST_PAGE_SIZE 0x028 + +#endif + + +/* Queue selector - Write Only */ +#define VIRTIO_MMIO_QUEUE_SEL 0x030 + +/* Maximum size of the currently selected queue - Read Only */ +#define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034 + +/* Queue size for the currently selected queue - Write Only */ +#define VIRTIO_MMIO_QUEUE_NUM 0x038 + + +#ifndef VIRTIO_MMIO_NO_LEGACY /* LEGACY DEVICES ONLY! 
*/ + +/* Used Ring alignment for the currently selected queue - Write Only */ +#define VIRTIO_MMIO_QUEUE_ALIGN 0x03c + +/* Guest's PFN for the currently selected queue - Read Write */ +#define VIRTIO_MMIO_QUEUE_PFN 0x040 + +#endif + + +/* Ready bit for the currently selected queue - Read Write */ +#define VIRTIO_MMIO_QUEUE_READY 0x044 + +/* Queue notifier - Write Only */ +#define VIRTIO_MMIO_QUEUE_NOTIFY 0x050 + +/* Interrupt status - Read Only */ +#define VIRTIO_MMIO_INTERRUPT_STATUS 0x060 + +/* Interrupt acknowledge - Write Only */ +#define VIRTIO_MMIO_INTERRUPT_ACK 0x064 + +/* Device status register - Read Write */ +#define VIRTIO_MMIO_STATUS 0x070 + +/* Selected queue's Descriptor Table address, 64 bits in two halves */ +#define VIRTIO_MMIO_QUEUE_DESC_LOW 0x080 +#define VIRTIO_MMIO_QUEUE_DESC_HIGH 0x084 + +/* Selected queue's Available Ring address, 64 bits in two halves */ +#define VIRTIO_MMIO_QUEUE_AVAIL_LOW 0x090 +#define VIRTIO_MMIO_QUEUE_AVAIL_HIGH 0x094 + +/* Selected queue's Used Ring address, 64 bits in two halves */ +#define VIRTIO_MMIO_QUEUE_USED_LOW 0x0a0 +#define VIRTIO_MMIO_QUEUE_USED_HIGH 0x0a4 + +/* Configuration atomicity value */ +#define VIRTIO_MMIO_CONFIG_GENERATION 0x0fc + +/* The config space is defined by each driver as + * the per-driver configuration space - Read Write */ +#define VIRTIO_MMIO_CONFIG 0x100 + + + +/* + * Interrupt flags (re: interrupt status & acknowledge registers) + */ + +#define VIRTIO_MMIO_INT_VRING (1 << 0) +#define VIRTIO_MMIO_INT_CONFIG (1 << 1) + +#endif -- cgit v1.2.3 From 53a020c661741f3b87ad3ac6fa545088aaebac9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 5 Feb 2017 18:15:20 +0100 Subject: virtio_pci: don't duplicate the msix_enable flag in struct pci_dev Signed-off-by: Christoph Hellwig Reviewed-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 5 ++--- drivers/virtio/virtio_pci_common.h | 2 -- drivers/virtio/virtio_pci_legacy.c | 2 +- drivers/virtio/virtio_pci_modern.c | 2 +- include/uapi/linux/virtio_pci.h | 2 +- 5 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 274dc1ff09c0..b83053082875 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -125,7 +125,7 @@ void vp_del_vqs(struct virtio_device *vdev) vp_remove_vqs(vdev); - if (vp_dev->msix_enabled) { + if (vp_dev->pci_dev->msix_enabled) { for (i = 0; i < vp_dev->msix_vectors; i++) free_cpumask_var(vp_dev->msix_affinity_masks[i]); @@ -244,7 +244,6 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, allocated_vectors++; } - vp_dev->msix_enabled = 1; return 0; out_remove_vqs: @@ -340,7 +339,7 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu) if (!vq->callback) return -EINVAL; - if (vp_dev->msix_enabled) { + if (vp_dev->pci_dev->msix_enabled) { int vec = vp_dev->msix_vector_map[vq->index]; struct cpumask *mask = vp_dev->msix_affinity_masks[vec]; unsigned int irq = pci_irq_vector(vp_dev->pci_dev, vec); diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 85593867e712..217ca876eed7 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -64,8 +64,6 @@ struct virtio_pci_device { /* the IO mapping for the PCI config space */ void __iomem *ioaddr; - /* MSI-X support */ - int msix_enabled; cpumask_var_t *msix_affinity_masks; /* Name strings for interrupts. 
This size should be enough, * and I'm too lazy to allocate each name separately. */ diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index 47292dad0ff9..2ab6aee51bf6 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -165,7 +165,7 @@ static void del_vq(struct virtqueue *vq) iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); - if (vp_dev->msix_enabled) { + if (vp_dev->pci_dev->msix_enabled) { iowrite16(VIRTIO_MSI_NO_VECTOR, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); /* Flush the write out to device */ diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 00e6fc1df407..e5ce31091953 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -412,7 +412,7 @@ static void del_vq(struct virtqueue *vq) vp_iowrite16(vq->index, &vp_dev->common->queue_select); - if (vp_dev->msix_enabled) { + if (vp_dev->pci_dev->msix_enabled) { vp_iowrite16(VIRTIO_MSI_NO_VECTOR, &vp_dev->common->queue_msix_vector); /* Flush the write out to device */ diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 90007a1abcab..15b4385a2be1 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -79,7 +79,7 @@ * configuration space */ #define VIRTIO_PCI_CONFIG_OFF(msix_enabled) ((msix_enabled) ? 24 : 20) /* Deprecated: please use VIRTIO_PCI_CONFIG_OFF instead */ -#define VIRTIO_PCI_CONFIG(dev) VIRTIO_PCI_CONFIG_OFF((dev)->msix_enabled) +#define VIRTIO_PCI_CONFIG(dev) VIRTIO_PCI_CONFIG_OFF((dev)->pci_dev->msix_enabled) /* Virtio ABI version, this must match exactly */ #define VIRTIO_PCI_ABI_VERSION 0 -- cgit v1.2.3 From 8723890d1d5d9216634ed662e55ee5bdacbe1ac7 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Mon, 27 Feb 2017 14:26:59 -0800 Subject: autofs: remove wrong comment This format seems to have been taken from device mapper header, but autofs has no such file:function in both kernel and userspace. Link: http://lkml.kernel.org/r/148577164094.9801.4775075118014742496.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/auto_dev-ioctl.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 021ed331dd71..388739b429b0 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -120,10 +120,6 @@ static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) in->ioctlfd = -1; } -/* - * If you change this make sure you make the corresponding change - * to autofs-dev-ioctl.c:lookup_ioctl() - */ enum { /* Get various version info */ AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, -- cgit v1.2.3 From 0fae77feca339699dbd6c079ded37bbd72432eb3 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Mon, 27 Feb 2017 14:27:11 -0800 Subject: autofs: add command enum/macros for root-dir ioctls Sync root-dir ioctl with misc-char-dev ioctl's enum/macro format since these two types of ioctls aren't completely independent of each other in terms of command nr. No functional changes. 
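Because the rebuilt macros expand to the same command numbers as the old open-coded 0x93/0x6x constants, an existing caller keeps working unmodified. A hypothetical illustration (the mount path below is assumed):

#include <limits.h>             /* NAME_MAX, used by auto_fs.h */
#include <linux/auto_fs.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int ver = 0;
        int fd = open("/mnt/autofs", O_RDONLY);  /* assumed autofs mount */

        /* same ioctl number before and after the enum conversion */
        if (fd >= 0 && ioctl(fd, AUTOFS_IOC_PROTOVER, &ver) == 0)
                printf("autofs protocol version %d\n", ver);
        if (fd >= 0)
                close(fd);
        return 0;
}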
Link: http://lkml.kernel.org/r/148577166143.9801.15511796506678428145.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/auto_dev-ioctl.h | 2 -- include/uapi/linux/auto_fs.h | 25 ++++++++++++++++++------- include/uapi/linux/auto_fs4.h | 16 +++++++++++----- 3 files changed, 29 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 388739b429b0..af107aa7b685 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -156,8 +156,6 @@ enum { AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, }; -#define AUTOFS_IOCTL 0x93 - #define AUTOFS_DEV_IOCTL_VERSION \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 1bfc3ed8b284..aa63451ef20a 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -61,12 +61,23 @@ struct autofs_packet_expire { char name[NAME_MAX+1]; }; -#define AUTOFS_IOC_READY _IO(0x93, 0x60) -#define AUTOFS_IOC_FAIL _IO(0x93, 0x61) -#define AUTOFS_IOC_CATATONIC _IO(0x93, 0x62) -#define AUTOFS_IOC_PROTOVER _IOR(0x93, 0x63, int) -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93, 0x64, compat_ulong_t) -#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93, 0x64, unsigned long) -#define AUTOFS_IOC_EXPIRE _IOR(0x93, 0x65, struct autofs_packet_expire) +#define AUTOFS_IOCTL 0x93 + +enum { + AUTOFS_IOC_READY_CMD = 0x60, + AUTOFS_IOC_FAIL_CMD, + AUTOFS_IOC_CATATONIC_CMD, + AUTOFS_IOC_PROTOVER_CMD, + AUTOFS_IOC_SETTIMEOUT_CMD, + AUTOFS_IOC_EXPIRE_CMD, +}; + +#define AUTOFS_IOC_READY _IO(AUTOFS_IOCTL, AUTOFS_IOC_READY_CMD) +#define AUTOFS_IOC_FAIL _IO(AUTOFS_IOCTL, AUTOFS_IOC_FAIL_CMD) +#define AUTOFS_IOC_CATATONIC _IO(AUTOFS_IOCTL, AUTOFS_IOC_CATATONIC_CMD) +#define AUTOFS_IOC_PROTOVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOVER_CMD, int) +#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, compat_ulong_t) +#define AUTOFS_IOC_SETTIMEOUT _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, unsigned long) +#define AUTOFS_IOC_EXPIRE _IOR(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_CMD, struct autofs_packet_expire) #endif /* _UAPI_LINUX_AUTO_FS_H */ diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 8f8f1bdcca8c..7c6da423d54e 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -148,10 +148,16 @@ union autofs_v5_packet_union { autofs_packet_expire_direct_t expire_direct; }; -#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93, 0x66, int) -#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI -#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI -#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93, 0x67, int) -#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93, 0x70, int) +enum { + AUTOFS_IOC_EXPIRE_MULTI_CMD = 0x66, /* AUTOFS_IOC_EXPIRE_CMD + 1 */ + AUTOFS_IOC_PROTOSUBVER_CMD, + AUTOFS_IOC_ASKUMOUNT_CMD = 0x70, /* AUTOFS_DEV_IOCTL_VERSION_CMD - 1 */ +}; + +#define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int) +#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI +#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI +#define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int) +#define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int) #endif /* _LINUX_AUTO_FS4_H */ -- cgit v1.2.3 From 3bb2fbdaba16386f520b5750d649a30643a4ab9e Mon Sep 17 00:00:00 2001 
From: Tomohiro Kusumi Date: Mon, 27 Feb 2017 14:27:14 -0800 Subject: autofs: remove duplicated AUTOFS_DEV_IOCTL_SIZE definition This macro is already defined in the uapi header. Also use this macro where possible. Link: http://lkml.kernel.org/r/148577166656.9801.10322423666945951186.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/dev-ioctl.c | 2 -- include/uapi/linux/auto_dev-ioctl.h | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 6f48d670c941..806df746f1a9 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -38,8 +38,6 @@ * which have been left busy at at service shutdown. */ -#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) - typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, struct autofs_dev_ioctl *); diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index af107aa7b685..744b3d060968 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -113,10 +113,10 @@ struct autofs_dev_ioctl { static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) { - memset(in, 0, sizeof(struct autofs_dev_ioctl)); + memset(in, 0, AUTOFS_DEV_IOCTL_SIZE); in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; - in->size = sizeof(struct autofs_dev_ioctl); + in->size = AUTOFS_DEV_IOCTL_SIZE; in->ioctlfd = -1; } -- cgit v1.2.3 From ae7e81c077d60507dcec139e40a6d10cf932cf4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 18:07:51 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to <uapi/linux/sched/types.h> We are going to move scheduler ABI details to <uapi/linux/sched/types.h>, which will be used from a number of .c files. Create empty placeholder header that maps to <linux/types.h>. Include the new header in the files that are going to need it.
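The shape of the conversion in a consumer file is roughly the following. A hedged sketch only: the file style is taken from the diffstat below, and the function body is invented to show why these files need the ABI types at all.

/* kernel/kthread.c-style consumer after this patch */
#include <uapi/linux/sched/types.h>  /* placeholder for now, real home later */
#include <linux/sched.h>             /* still defines struct sched_param here */
#include <linux/sched/rt.h>          /* MAX_RT_PRIO */
#include <linux/kthread.h>

static void example_make_fifo(struct task_struct *k)
{
        struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };

        /* keeps compiling unchanged once sched_param moves into the
         * new header in the follow-up commit */
        sched_setscheduler_nocheck(k, SCHED_FIFO, &param);
}

The value of the empty placeholder is that call sites can be converted first and the ABI moved second, with no intermediate breakage.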
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/common/bL_switcher.c | 1 + crypto/crypto_engine.c | 1 + drivers/acpi/acpi_pad.c | 1 + drivers/block/drbd/drbd_receiver.c | 1 + drivers/firmware/psci_checker.c | 1 + drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 1 + drivers/gpu/drm/i915/intel_breadcrumbs.c | 1 + drivers/media/pci/ivtv/ivtv-driver.c | 1 + drivers/mmc/core/sdio_irq.c | 1 + drivers/spi/spi.c | 1 + drivers/staging/android/ion/ion_heap.c | 1 + drivers/thermal/intel_powerclamp.c | 1 + drivers/tty/serial/sc16is7xx.c | 1 + include/uapi/linux/sched/types.h | 6 ++++++ kernel/irq/manage.c | 1 + kernel/kthread.c | 1 + kernel/locking/locktorture.c | 1 + kernel/rcu/rcuperf.c | 1 + kernel/rcu/rcutorture.c | 1 + kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 1 + kernel/sched/core.c | 1 + kernel/sched/cpufreq_schedutil.c | 1 + kernel/trace/ring_buffer_benchmark.c | 1 + kernel/trace/trace_selftest.c | 1 + kernel/watchdog.c | 1 + 26 files changed, 31 insertions(+) create mode 100644 include/uapi/linux/sched/types.h (limited to 'include/uapi') diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 46730017b3c5..083c9e517d22 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -14,6 +14,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index f1bf3418d968..727bd5c3569e 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -16,6 +16,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include "internal.h" #define CRYPTO_ENGINE_MAX_QLEN 10 diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index eb76a4c10dbf..754431031282 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -20,6 +20,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c7728dd77230..8b40a5b2f8e6 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -36,6 +36,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #define __KERNEL_SYSCALLS__ #include diff --git a/drivers/firmware/psci_checker.c b/drivers/firmware/psci_checker.c index 29d58feaf675..6523ce962865 100644 --- a/drivers/firmware/psci_checker.c +++ b/drivers/firmware/psci_checker.c @@ -20,6 +20,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 1bf83ed113b3..16f96563cd2b 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -24,6 +24,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include "gpu_scheduler.h" diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index fcfa423d08bd..7044e9a6abf7 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -23,6 +23,7 @@ */ #include +#include <uapi/linux/sched/types.h> #include "i915_drv.h" diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c index ab2ae53618e8..e73c153285f0 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.c +++ b/drivers/media/pci/ivtv/ivtv-driver.c @@ -59,6 +59,7 @@ #include #include #include "tuner-xc2028.h" +#include <uapi/linux/sched/types.h> /* If you have already X v4l cards,
This way the device numbers stay matched. Example: you have a WinTV card diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c index d29faf2addfe..6d4b72080d51 100644 --- a/drivers/mmc/core/sdio_irq.c +++ b/drivers/mmc/core/sdio_irq.c @@ -15,6 +15,7 @@ #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 44222ef9471e..90b5b2efafbf 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -33,6 +33,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c index 4e5c0f17f579..c69d0bd53693 100644 --- a/drivers/staging/android/ion/ion_heap.c +++ b/drivers/staging/android/ion/ion_heap.c @@ -20,6 +20,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include "ion.h" diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index a47103a659fa..d718cd179ddb 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -50,6 +50,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index 793395451982..ca54ce074a5f 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -29,6 +29,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #define SC16IS7XX_NAME "sc16is7xx" #define SC16IS7XX_MAX_DEVS 8 diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h new file mode 100644 index 000000000000..d162d315f4b5 --- /dev/null +++ b/include/uapi/linux/sched/types.h @@ -0,0 +1,6 @@ +#ifndef _UAPI_LINUX_SCHED_TYPES_H +#define _UAPI_LINUX_SCHED_TYPES_H + +#include <linux/types.h> + +#endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 944d068b6c48..09740952e4de 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include "internals.h" diff --git a/kernel/kthread.c b/kernel/kthread.c index 8461a4372e8a..ef9b9eb809c7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -5,6 +5,7 @@ * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.).
*/ +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 28350dc8ecbb..5ea0a8969ee2 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -32,6 +32,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 123ccbd22449..a4a86fb47e4a 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -30,6 +30,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d81345be730e..6a28b79710f0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -33,6 +33,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb62ce23ffc7..e456327a63d6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -49,6 +49,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a240f3308be6..9dabb04003be 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,6 +28,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include "../time/tick-internal.h" #ifdef CONFIG_RCU_BOOST diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a7fd3d21e5a..ed39d1d0b64a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,7 @@ */ #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index fd4659313640..8f8de3d4d6b7 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,7 @@ #include #include +#include <uapi/linux/sched/types.h> #include #include diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 6df9a83e20d7..c190a4d5013c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -6,6 +6,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b0f86ea77881..cb917cebae29 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,5 +1,6 @@ /* Include in trace.c */ +#include <uapi/linux/sched/types.h> #include #include #include diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 144d7b1b0364..52718f4512e9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -19,6 +19,7 @@ #include #include #include +#include <uapi/linux/sched/types.h> #include #include #include -- cgit v1.2.3 From e2d1e2aec572a2138dea74d53be54a1406d419c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 18:07:51 +0100 Subject: sched/headers: Move various ABI definitions to <uapi/linux/sched/types.h> Move scheduler ABI types (struct sched_attr, struct sched_param, etc.) into the new UAPI header. This further reduces the size and complexity of <linux/sched.h>.
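As a hedged illustration of the ABI being exported here (not part of this patch): once the headers are installed, userspace can fill in struct sched_attr and hand it to sched_setattr(). glibc provides no wrapper for that syscall, so a raw syscall() is assumed below, as is SYS_sched_setattr being defined by <sys/syscall.h>; requesting SCHED_DEADLINE additionally needs suitable privileges (typically root or CAP_SYS_NICE).

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/sched.h>		/* SCHED_DEADLINE */
	#include <linux/sched/types.h>		/* struct sched_attr, from this header */

	int main(void)
	{
		struct sched_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);	/* fwd/bwd compat, as the comment explains */
		attr.sched_policy = SCHED_DEADLINE;
		attr.sched_runtime  =  10 * 1000 * 1000;	/*  10 ms, in ns */
		attr.sched_deadline =  30 * 1000 * 1000;	/*  30 ms, in ns */
		attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms, in ns */

		/* pid 0 means the calling thread; the flags argument is unused */
		if (syscall(SYS_sched_setattr, 0, &attr, 0) < 0) {
			perror("sched_setattr");
			return 1;
		}
		printf("now running under SCHED_DEADLINE\n");
		return 0;
	}

The size field is what keeps the structure extensible: the kernel compares it against SCHED_ATTR_SIZE_VER0 and can cope with both older and newer layouts.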
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 70 ++-------------------------------------- include/uapi/linux/sched/types.h | 68 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 68 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d6998858fa3..5c481906e835 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -5,11 +5,6 @@ #include - -struct sched_param { - int sched_priority; -}; - #include /* for HZ */ #include @@ -64,69 +59,8 @@ struct sched_param { #include -#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ - -/* - * Extended scheduling parameters data structure. - * - * This is needed because the original struct sched_param can not be - * altered without introducing ABI issues with legacy applications - * (e.g., in sched_getparam()). - * - * However, the possibility of specifying more than just a priority for - * the tasks may be useful for a wide variety of application fields, e.g., - * multimedia, streaming, automation and control, and many others. - * - * This variant (sched_attr) is meant at describing a so-called - * sporadic time-constrained task. In such model a task is specified by: - * - the activation period or minimum instance inter-arrival time; - * - the maximum (or average, depending on the actual scheduling - * discipline) computation time of all instances, a.k.a. runtime; - * - the deadline (relative to the actual activation time) of each - * instance. - * Very briefly, a periodic (sporadic) task asks for the execution of - * some specific computation --which is typically called an instance-- - * (at most) every period. Moreover, each instance typically lasts no more - * than the runtime and must be completed by time instant t equal to - * the instance activation time + the deadline. - * - * This is reflected by the actual fields of the sched_attr structure: - * - * @size size of the structure, for fwd/bwd compat. - * - * @sched_policy task's scheduling policy - * @sched_flags for customizing the scheduler behaviour - * @sched_nice task's nice value (SCHED_NORMAL/BATCH) - * @sched_priority task's static priority (SCHED_FIFO/RR) - * @sched_deadline representative of the task's deadline - * @sched_runtime representative of the task's runtime - * @sched_period representative of the task's period - * - * Given this task model, there are a multiplicity of scheduling algorithms - * and policies, that can be used to ensure all the tasks will make their - * timing constraints. - * - * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the - * only user of this new interface. More information about the algorithm - * available in the scheduling class file or in Documentation/. 
- */ -struct sched_attr { - u32 size; - - u32 sched_policy; - u64 sched_flags; - - /* SCHED_NORMAL, SCHED_BATCH */ - s32 sched_nice; - - /* SCHED_FIFO, SCHED_RR */ - u32 sched_priority; - - /* SCHED_DEADLINE */ - u64 sched_runtime; - u64 sched_deadline; - u64 sched_period; -}; +struct sched_attr; +struct sched_param; struct futex_pi_state; struct robust_list_head; diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index d162d315f4b5..307acbc82d80 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -3,4 +3,72 @@ #include +struct sched_param { + int sched_priority; +}; + +#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + +/* + * Extended scheduling parameters data structure. + * + * This is needed because the original struct sched_param can not be + * altered without introducing ABI issues with legacy applications + * (e.g., in sched_getparam()). + * + * However, the possibility of specifying more than just a priority for + * the tasks may be useful for a wide variety of application fields, e.g., + * multimedia, streaming, automation and control, and many others. + * + * This variant (sched_attr) is meant at describing a so-called + * sporadic time-constrained task. In such model a task is specified by: + * - the activation period or minimum instance inter-arrival time; + * - the maximum (or average, depending on the actual scheduling + * discipline) computation time of all instances, a.k.a. runtime; + * - the deadline (relative to the actual activation time) of each + * instance. + * Very briefly, a periodic (sporadic) task asks for the execution of + * some specific computation --which is typically called an instance-- + * (at most) every period. Moreover, each instance typically lasts no more + * than the runtime and must be completed by time instant t equal to + * the instance activation time + the deadline. + * + * This is reflected by the actual fields of the sched_attr structure: + * + * @size size of the structure, for fwd/bwd compat. + * + * @sched_policy task's scheduling policy + * @sched_flags for customizing the scheduler behaviour + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) + * @sched_priority task's static priority (SCHED_FIFO/RR) + * @sched_deadline representative of the task's deadline + * @sched_runtime representative of the task's runtime + * @sched_period representative of the task's period + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their + * timing constraints. + * + * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the + * only user of this new interface. More information about the algorithm + * available in the scheduling class file or in Documentation/. + */ +struct sched_attr { + u32 size; + + u32 sched_policy; + u64 sched_flags; + + /* SCHED_NORMAL, SCHED_BATCH */ + s32 sched_nice; + + /* SCHED_FIFO, SCHED_RR */ + u32 sched_priority; + + /* SCHED_DEADLINE */ + u64 sched_runtime; + u64 sched_deadline; + u64 sched_period; +}; + #endif /* _UAPI_LINUX_SCHED_TYPES_H */ -- cgit v1.2.3 From a528d35e8bfcc521d7cb70aaf03e1bd296c8493f Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 Jan 2017 16:46:22 +0000 Subject: statx: Add a system call to make enhanced file info available Add a system call to make extended file information available, including file creation and some attribute flags where available through the underlying filesystem. 
The getattr inode operation is altered to take two additional arguments: a u32 request_mask and an unsigned int flags that indicate the synchronisation mode. This change is propagated to the vfs_getattr*() function. Functions like vfs_stat() are now inline wrappers around new functions vfs_statx() and vfs_statx_fd() to reduce stack usage. ======== OVERVIEW ======== The idea was initially proposed as a set of xattrs that could be retrieved with getxattr(), but the general preference proved to be for a new syscall with an extended stat structure. A number of requests were gathered for features to be included. The following have been included: (1) Make the fields a consistent size on all arches and make them large. (2) Spare space, request flags and information flags are provided for future expansion. (3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an __s64). (4) Creation time: The SMB protocol carries the creation time, which could be exported by Samba, which will in turn help CIFS make use of FS-Cache as that can be used for coherency data (stx_btime). This is also specified in NFSv4 as a recommended attribute and could be exported by NFSD [Steve French]. (5) Lightweight stat: Ask for just those details of interest, and allow a netfs (such as NFS) to approximate anything not of interest, possibly without going to the server [Trond Myklebust, Ulrich Drepper, Andreas Dilger] (AT_STATX_DONT_SYNC). (6) Heavyweight stat: Force a netfs to go to the server, even if it thinks its cached attributes are up to date [Trond Myklebust] (AT_STATX_FORCE_SYNC). And the following have been left out for future extension: (7) Data version number: Could be used by userspace NFS servers [Aneesh Kumar]. Can also be used to modify fill_post_wcc() in NFSD which retrieves i_version directly, but has just called vfs_getattr(). It could get it from the kstat struct if it used vfs_xgetattr() instead. (There's disagreement on the exact semantics of a single field, since not all filesystems do this the same way). (8) BSD stat compatibility: Including more fields from the BSD stat such as creation time (st_btime) and inode generation number (st_gen) [Jeremy Allison, Bernd Schubert]. (9) Inode generation number: Useful for FUSE and userspace NFS servers [Bernd Schubert]. (This was asked for but later deemed unnecessary with the open-by-handle capability available and caused disagreement as to whether it's a security hole or not). (10) Extra coherency data may be useful in making backups [Andreas Dilger]. (No particular data were offered, but things like last backup timestamp, the data version number and the DOS archive bit would come into this category). (11) Allow the filesystem to indicate what it can/cannot provide: A filesystem can now say it doesn't support a standard stat feature if that isn't available, so if, for instance, inode numbers or UIDs don't exist or are fabricated locally... (This requires a separate system call - I have an fsinfo() call idea for this). (12) Store a 16-byte volume ID in the superblock that can be returned in struct xstat [Steve French]. (Deferred to fsinfo). (13) Include granularity fields in the time data to indicate the granularity of each of the times (NFSv4 time_delta) [Steve French]. (Deferred to fsinfo). (14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags. 
Note that the Linux IOC flags are a mess and filesystems such as Ext4 define flags that aren't in linux/fs.h, so translation in the kernel may be a necessity (or, possibly, we provide the filesystem type too). (Some attributes are made available in stx_attributes, but the general feeling was that the IOC flags were too ext[234]-specific and shouldn't be exposed through statx this way). (15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer, Michael Kerrisk]. (Deferred, probably to fsinfo. Finding out if there's an ACL or seclabel might require extra filesystem operations). (16) Femtosecond-resolution timestamps [Dave Chinner]. (A __reserved field has been left in the statx_timestamp struct for this - if there proves to be a need). (17) A set-multiple-attributes syscall to go with this. =============== NEW SYSTEM CALL =============== The new system call is: int ret = statx(int dfd, const char *filename, unsigned int flags, unsigned int mask, struct statx *buffer); The dfd, filename and flags parameters indicate the file to query, in a similar way to fstatat(). There is no equivalent of lstat() as that can be emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is also no equivalent of fstat() as that can be emulated by passing a NULL filename to statx() with the fd of interest in dfd. Whether or not statx() synchronises the attributes with the backing store can be controlled by OR'ing a value into the flags argument (this typically only affects network filesystems): (1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this respect. (2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise its attributes with the server - which might require data writeback to occur to get the timestamps correct. (3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a network filesystem. The resulting values should be considered approximate. mask is a bitmask indicating the fields in struct statx that are of interest to the caller. The user should set this to STATX_BASIC_STATS to get the basic set returned by stat(). It should be noted that asking for more information may entail extra I/O operations. buffer points to the destination for the data. This must be 256 bytes in size.
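A minimal userspace sketch of the call just described (hedged: glibc had no statx() wrapper at the time, so the raw syscall is used, and __NR_statx is assumed to be provided by installed kernel headers - it is 332 on x86-64 per the syscall table changes below; passing 0 for flags is equivalent to AT_STATX_SYNC_AS_STAT):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>		/* AT_FDCWD */
	#include <sys/syscall.h>
	#include <linux/stat.h>		/* struct statx, STATX_* (added by this patch) */

	int main(int argc, char *argv[])
	{
		struct statx stx;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <path>\n", argv[0]);
			return 1;
		}
		/* flags 0 == AT_STATX_SYNC_AS_STAT: behave as stat() would */
		if (syscall(__NR_statx, AT_FDCWD, argv[1], 0,
			    STATX_BASIC_STATS | STATX_BTIME, &stx) < 0) {
			perror("statx");
			return 1;
		}
		printf("size=%llu blocks=%llu mask=0x%x\n",
		       (unsigned long long)stx.stx_size,
		       (unsigned long long)stx.stx_blocks,
		       (unsigned int)stx.stx_mask);
		if (stx.stx_mask & STATX_BTIME)	/* only if the fs supplied it */
			printf("btime=%lld\n", (long long)stx.stx_btime.tv_sec);
		return 0;
	}

Note how stx_mask must be consulted on return: requesting STATX_BTIME does not guarantee the filesystem can provide it.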
====================== MAIN ATTRIBUTES RECORD ====================== The following structures are defined in which to return the main attribute set: struct statx_timestamp { __s64 tv_sec; __s32 tv_nsec; __s32 __reserved; }; struct statx { __u32 stx_mask; __u32 stx_blksize; __u64 stx_attributes; __u32 stx_nlink; __u32 stx_uid; __u32 stx_gid; __u16 stx_mode; __u16 __spare0[1]; __u64 stx_ino; __u64 stx_size; __u64 stx_blocks; __u64 __spare1[1]; struct statx_timestamp stx_atime; struct statx_timestamp stx_btime; struct statx_timestamp stx_ctime; struct statx_timestamp stx_mtime; __u32 stx_rdev_major; __u32 stx_rdev_minor; __u32 stx_dev_major; __u32 stx_dev_minor; __u64 __spare2[14]; }; The defined bits in request_mask and stx_mask are: STATX_TYPE Want/got stx_mode & S_IFMT STATX_MODE Want/got stx_mode & ~S_IFMT STATX_NLINK Want/got stx_nlink STATX_UID Want/got stx_uid STATX_GID Want/got stx_gid STATX_ATIME Want/got stx_atime{,_ns} STATX_MTIME Want/got stx_mtime{,_ns} STATX_CTIME Want/got stx_ctime{,_ns} STATX_INO Want/got stx_ino STATX_SIZE Want/got stx_size STATX_BLOCKS Want/got stx_blocks STATX_BASIC_STATS [The stuff in the normal stat struct] STATX_BTIME Want/got stx_btime{,_ns} STATX_ALL [All currently available stuff] stx_btime is the file creation time, stx_mask is a bitmask indicating the data provided and __spares*[] are where as-yet undefined fields can be placed. Time fields are structures with separate seconds and nanoseconds fields plus a reserved field in case we want to add even finer resolution. Note that times will be negative if before 1970; in such a case, the nanosecond fields will also be negative if not zero. The bits defined in the stx_attributes field convey information about a file, how it is accessed, where it is and what it does. The following attributes map to FS_*_FL flags and are the same numerical value: STATX_ATTR_COMPRESSED File is compressed by the fs STATX_ATTR_IMMUTABLE File is marked immutable STATX_ATTR_APPEND File is append-only STATX_ATTR_NODUMP File is not to be dumped STATX_ATTR_ENCRYPTED File requires key to decrypt in fs Within the kernel, the supported flags are listed by: KSTAT_ATTR_FS_IOC_FLAGS [Are any other IOC flags of sufficient general interest to be exposed through this interface?] New flags include: STATX_ATTR_AUTOMOUNT Object is an automount trigger These are for the use of GUI tools that might want to mark files specially, depending on what they are. Fields in struct statx come in a number of classes: (0) stx_dev_*, stx_blksize. These are local system information and are always available. (1) stx_mode, stx_nlinks, stx_uid, stx_gid, stx_[amc]time, stx_ino, stx_size, stx_blocks. These will be returned whether the caller asks for them or not. The corresponding bits in stx_mask will be set to indicate whether they actually have valid values. If the caller didn't ask for them, then they may be approximated. For example, NFS won't waste any time updating them from the server, unless as a byproduct of updating something requested. If the values don't actually exist for the underlying object (such as UID or GID on a DOS file), then the bit won't be set in the stx_mask, even if the caller asked for the value. In such a case, the returned value will be a fabrication. Note that there are instances where the type might not be valid, for instance Windows reparse points. (2) stx_rdev_*. This will be set only if stx_mode indicates we're looking at a blockdev or a chardev, otherwise will be 0. (3) stx_btime. 
Similar to (1), except this will be set to 0 if it doesn't exist. ======= TESTING ======= The following test program can be used to test the statx system call: samples/statx/test-statx.c Just compile and run, passing it paths to the files you want to examine. The file is built automatically if CONFIG_SAMPLES is enabled. Here's some example output. Firstly, an NFS directory that crosses to another FSID. Note that the AUTOMOUNT attribute is set because transiting this directory will cause d_automount to be invoked by the VFS. [root@andromeda ~]# /tmp/test-statx -A /warthog/data statx(/warthog/data) = 0 results=7ff Size: 4096 Blocks: 8 IO Block: 1048576 directory Device: 00:26 Inode: 1703937 Links: 125 Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041 Access: 2016-11-24 09:02:12.219699527+0000 Modify: 2016-11-17 10:44:36.225653653+0000 Change: 2016-11-17 10:44:36.225653653+0000 Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------) Secondly, the result of automounting on that directory. [root@andromeda ~]# /tmp/test-statx /warthog/data statx(/warthog/data) = 0 results=7ff Size: 4096 Blocks: 8 IO Block: 1048576 directory Device: 00:27 Inode: 2 Links: 125 Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041 Access: 2016-11-24 09:02:12.219699527+0000 Modify: 2016-11-17 10:44:36.225653653+0000 Change: 2016-11-17 10:44:36.225653653+0000 Signed-off-by: David Howells Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 3 +- Documentation/filesystems/vfs.txt | 3 +- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + drivers/base/devtmpfs.c | 3 +- drivers/block/loop.c | 3 +- drivers/mtd/ubi/build.c | 2 +- drivers/mtd/ubi/kapi.c | 2 +- drivers/staging/lustre/lustre/llite/file.c | 9 +- .../staging/lustre/lustre/llite/llite_internal.h | 3 +- fs/9p/vfs_inode.c | 10 +- fs/9p/vfs_inode_dotl.c | 5 +- fs/afs/inode.c | 8 +- fs/afs/internal.h | 2 +- fs/bad_inode.c | 4 +- fs/btrfs/inode.c | 6 +- fs/ceph/inode.c | 6 +- fs/ceph/super.h | 4 +- fs/cifs/cifsfs.h | 2 +- fs/cifs/inode.c | 5 +- fs/coda/coda_linux.h | 2 +- fs/coda/inode.c | 7 +- fs/ecryptfs/inode.c | 13 +- fs/exportfs/expfs.c | 3 +- fs/ext4/ext4.h | 3 +- fs/ext4/inode.c | 6 +- fs/f2fs/f2fs.h | 4 +- fs/f2fs/file.c | 6 +- fs/fat/fat.h | 4 +- fs/fat/file.c | 5 +- fs/fuse/dir.c | 6 +- fs/gfs2/inode.c | 11 +- fs/kernfs/inode.c | 8 +- fs/kernfs/kernfs-internal.h | 4 +- fs/libfs.c | 12 +- fs/minix/inode.c | 11 +- fs/minix/minix.h | 2 +- fs/nfs/inode.c | 13 +- fs/nfs/namespace.c | 9 +- fs/nfsd/nfs4xdr.c | 4 +- fs/nfsd/vfs.h | 3 +- fs/ocfs2/file.c | 11 +- fs/ocfs2/file.h | 4 +- fs/orangefs/inode.c | 13 +- fs/orangefs/orangefs-kernel.h | 5 +- fs/overlayfs/copy_up.c | 6 +- fs/overlayfs/dir.c | 10 +- fs/overlayfs/inode.c | 7 +- fs/proc/base.c | 12 +- fs/proc/generic.c | 6 +- fs/proc/internal.h | 2 +- fs/proc/proc_net.c | 6 +- fs/proc/proc_sysctl.c | 5 +- fs/proc/root.c | 6 +- fs/stat.c | 214 ++++++++++++++--- fs/sysv/itree.c | 7 +- fs/sysv/sysv.h | 2 +- fs/ubifs/dir.c | 6 +- fs/ubifs/ubifs.h | 4 +- fs/udf/symlink.c | 5 +- fs/xfs/xfs_iops.c | 9 +- include/linux/fs.h | 35 ++- include/linux/nfs_fs.h | 2 +- include/linux/stat.h | 24 +- include/linux/syscalls.h | 3 + include/uapi/linux/fcntl.h | 5 + include/uapi/linux/stat.h | 131 +++++++++++ mm/shmem.c | 6 +- samples/Kconfig | 6 + samples/Makefile | 2 +- samples/statx/Makefile | 10 + samples/statx/test-statx.c | 254 +++++++++++++++++++++ 72 files changed, 822 insertions(+), 214 deletions(-) create mode 100644 samples/statx/Makefile create 
mode 100644 samples/statx/test-statx.c (limited to 'include/uapi') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index ace63cd7af8c..fdcfdd79682a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -58,7 +58,8 @@ prototypes: int (*permission) (struct inode *, int, unsigned int); int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); + int (*getattr) (const struct path *, struct dentry *, struct kstat *, + u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); void (*update_time)(struct inode *, struct timespec *, int); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b968084eeac1..569211703721 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -382,7 +382,8 @@ struct inode_operations { int (*permission) (struct inode *, int); int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); + int (*getattr) (const struct path *, struct dentry *, struct kstat *, + u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2b3618542544..9ba050fe47f3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -389,3 +389,4 @@ 380 i386 pkey_mprotect sys_pkey_mprotect 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free +383 i386 statx sys_statx diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index e93ef0b38db8..5aef183e2f85 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -338,6 +338,7 @@ 329 common pkey_mprotect sys_pkey_mprotect 330 common pkey_alloc sys_pkey_alloc 331 common pkey_free sys_pkey_free +332 common statx sys_statx # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 44a74cf1372c..d2fb9c8ed205 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -309,7 +309,8 @@ static int handle_remove(const char *nodename, struct device *dev) if (d_really_is_positive(dentry)) { struct kstat stat; struct path p = {.mnt = parent.mnt, .dentry = dentry}; - err = vfs_getattr(&p, &stat); + err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE, + AT_STATX_SYNC_AS_STAT); if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { struct iattr newattrs; /* diff --git a/drivers/block/loop.c b/drivers/block/loop.c index eeb1db73f44e..8f4051999741 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1175,7 +1175,8 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) if (lo->lo_state != Lo_bound) return -ENXIO; - error = vfs_getattr(&file->f_path, &stat); + error = vfs_getattr(&file->f_path, &stat, + STATX_INO, AT_STATX_SYNC_AS_STAT); if (error) return error; memset(info, 0, sizeof(*info)); diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 85d54f37e28f..77513195f50e 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c 
@@ -1159,7 +1159,7 @@ static struct mtd_info * __init open_mtd_by_chdev(const char *mtd_dev) if (err) return ERR_PTR(err); - err = vfs_getattr(&path, &stat); + err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); path_put(&path); if (err) return ERR_PTR(err); diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 88b1897aeb40..d4b2e8744498 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -314,7 +314,7 @@ struct ubi_volume_desc *ubi_open_volume_path(const char *pathname, int mode) if (error) return ERR_PTR(error); - error = vfs_getattr(&path, &stat); + error = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); path_put(&path); if (error) return ERR_PTR(error); diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index 10adfcdd7035..481c0d01d4c6 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -2952,15 +2952,16 @@ static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) return rc; } -int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +int ll_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(de); + struct inode *inode = d_inode(path->dentry); struct ll_sb_info *sbi = ll_i2sbi(inode); struct ll_inode_info *lli = ll_i2info(inode); int res; - res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LOOKUP); + res = ll_inode_revalidate(path->dentry, + MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP); ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); if (res) diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index ecdfd0c29b7f..55f68acd85d1 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -750,7 +750,8 @@ int ll_file_open(struct inode *inode, struct file *file); int ll_file_release(struct inode *inode, struct file *file); int ll_release_openhandle(struct inode *, struct lookup_intent *); int ll_md_real_close(struct inode *inode, fmode_t fmode); -int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); +int ll_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); struct posix_acl *ll_get_acl(struct inode *inode, int type); int ll_migrate(struct inode *parent, struct file *file, int mdtidx, const char *name, int namelen); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index f4f4450119e4..f1d96233670c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1047,16 +1047,18 @@ done: /** * v9fs_vfs_getattr - retrieve file metadata - * @mnt: mount information - * @dentry: file to get attributes on + * @path: Object to query * @stat: metadata structure to populate + * @request_mask: Mask of STATX_xxx flags indicating the caller's interests + * @flags: AT_STATX_xxx setting * */ static int -v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +v9fs_vfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_wstat *st; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5999bd050678..570e63ee5b71 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -468,9 +468,10 @@ error: } static int -v9fs_vfs_getattr_dotl(struct 
vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_stat_dotl *st; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 86cc7264c21c..1e4897a048d2 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -375,12 +375,10 @@ error_unlock: /* * read the attributes of an inode */ -int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int afs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode; - - inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 8acf3670e756..5dfa56903a2d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -533,7 +533,7 @@ extern struct inode *afs_iget(struct super_block *, struct key *, struct afs_callback *); extern void afs_zap_data(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); -extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 5f685c819298..bb53728c7a31 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -89,8 +89,8 @@ static int bad_inode_permission(struct inode *inode, int mask) return -EIO; } -static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int bad_inode_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { return -EIO; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee6978d80491..c40060cc481f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9413,11 +9413,11 @@ fail: return -ENOMEM; } -static int btrfs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) +static int btrfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { u64 delalloc_bytes; - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fd8f771f99b7..d449e1c03cbd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2187,10 +2187,10 @@ int ceph_permission(struct inode *inode, int mask) * Get all attributes. Hopefully somedata we'll have a statlite() * and can limit the fields we require to be accurate. 
*/ -int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int ceph_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct ceph_inode_info *ci = ceph_inode(inode); int err; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e9410bcf4113..fe6b9cfc4013 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -784,8 +784,8 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) extern int ceph_permission(struct inode *inode, int mask); extern int __ceph_setattr(struct inode *inode, struct iattr *attr); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); -extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +extern int ceph_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c9c00a862036..da717fee3026 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -83,7 +83,7 @@ extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); extern int cifs_revalidate_mapping(struct inode *inode); extern int cifs_zap_mapping(struct inode *inode); -extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int cifs_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int cifs_setattr(struct dentry *, struct iattr *); extern const struct inode_operations cifs_file_inode_ops; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7ab5be7944aa..1363fff460b9 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1990,9 +1990,10 @@ int cifs_revalidate_dentry(struct dentry *dentry) return cifs_revalidate_mapping(inode); } -int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int cifs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct inode *inode = d_inode(dentry); diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 5104d84c4f64..d3c361883c28 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -47,7 +47,7 @@ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); int coda_permission(struct inode *inode, int mask); int coda_revalidate_inode(struct inode *); -int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int coda_getattr(const struct path *, struct kstat *, u32, unsigned int); int coda_setattr(struct dentry *, struct iattr *); /* this file: heloers */ diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 71dbe7e287ce..2dea594da199 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -255,11 +255,12 @@ static void coda_evict_inode(struct inode *inode) coda_cache_clear_inode(inode); } -int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int coda_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - int err = coda_revalidate_inode(d_inode(dentry)); + int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(d_inode(path->dentry), stat); return 
err; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e7413f82d27b..efc2db42d175 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -959,9 +959,10 @@ out: return rc; } -static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; int rc = 0; @@ -983,13 +984,15 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, return rc; } -static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct kstat lower_stat; int rc; - rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat); + rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat, + request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index a4b531be9168..f2d24bb8d745 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -299,7 +299,8 @@ static int get_name(const struct path *path, char *name, struct dentry *child) * filesystem supports 64-bit inode numbers. So we need to * actually call ->getattr, not just read i_ino: */ - error = vfs_getattr_nosec(&child_path, &stat); + error = vfs_getattr_nosec(&child_path, &stat, + STATX_INO, AT_STATX_SYNC_AS_STAT); if (error) return error; buffer.ino = stat.ino; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2fd17e8e4984..025d2e85f454 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2462,8 +2462,7 @@ extern struct inode *ext4_iget(struct super_block *, unsigned long); extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); -extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_sync_inode(handle_t *, struct inode *); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 971f66342080..7385e6a6b6cb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5387,13 +5387,13 @@ err_out: return error; } -int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int ext4_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { struct inode *inode; unsigned long long delalloc_blocks; - inode = d_inode(dentry); + inode = d_inode(path->dentry); generic_fillattr(inode, stat); /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d1483136fed6..e849f83d6114 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2040,8 +2040,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void truncate_data_blocks(struct dnode_of_data *dn); int truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); -int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +int f2fs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int 
flags); int f2fs_setattr(struct dentry *dentry, struct iattr *attr); int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); int truncate_data_blocks_range(struct dnode_of_data *dn, int count); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 78e65288f2b2..5f7317875a67 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -633,10 +633,10 @@ int f2fs_truncate(struct inode *inode) return 0; } -int f2fs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) +int f2fs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); generic_fillattr(inode, stat); stat->blocks <<= 3; return 0; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e6b764a17a9c..051dac1ce3be 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -364,8 +364,8 @@ extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; extern int fat_setattr(struct dentry *dentry, struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); -extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +extern int fat_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/fat/file.c b/fs/fat/file.c index 3d04b124bce0..4724cc9ad650 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -365,9 +365,10 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) fat_flush_inodes(inode->i_sb, inode, NULL); } -int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int fat_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); generic_fillattr(inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 811fd8929a18..beb3d64f16e2 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1777,10 +1777,10 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) return ret; } -static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, - struct kstat *stat) +static int fuse_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(entry); + struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); if (!fuse_allow_current_process(fc)) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index eb7724b8578a..288c15f385bd 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1959,9 +1959,10 @@ out: /** * gfs2_getattr - Read out an inode's attributes - * @mnt: The vfsmount the inode is being accessed from - * @dentry: The dentry to stat + * @path: Object to query * @stat: The inode's stats + * @request_mask: Mask of STATX_xxx flags indicating the caller's interests + * @flags: AT_STATX_xxx setting * * This may be called from the VFS directly, or from within GFS2 with the * inode locked, so we look to see if the glock is already locked and only @@ -1972,10 +1973,10 @@ out: * Returns: errno */ -static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int gfs2_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); + 
struct inode *inode = d_inode(path->dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index ac9e108ce1ea..fb4b4a79a0d6 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -200,11 +200,11 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) set_nlink(inode, kn->dir.subdirs + 2); } -int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int kernfs_iop_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct kernfs_node *kn = dentry->d_fsdata; - struct inode *inode = d_inode(dentry); + struct kernfs_node *kn = path->dentry->d_fsdata; + struct inode *inode = d_inode(path->dentry); mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 3100987cf8ba..2d5144ab4251 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -80,8 +80,8 @@ extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); -int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +int kernfs_iop_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); /* diff --git a/fs/libfs.c b/fs/libfs.c index 28d6f35feed6..1dfaf8f606c0 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,10 +20,10 @@ #include "internal.h" -int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int simple_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); generic_fillattr(inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; @@ -1143,10 +1143,10 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int empty_dir_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); generic_fillattr(inode, stat); return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e7d9bf86d975..6ac76b0434e9 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -622,11 +622,14 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int minix_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct super_block *sb = dentry->d_sb; - generic_fillattr(d_inode(dentry), stat); - if (INODE_VERSION(d_inode(dentry)) == MINIX_V1) + struct super_block *sb = path->dentry->d_sb; + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(inode, stat); + if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 
01ad81dcacc5..663d66138d06 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -51,7 +51,7 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int minix_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); extern void V1_minix_truncate(struct inode *); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5ca4d96b1942..b5425315adcc 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -703,9 +703,10 @@ static bool nfs_need_revalidate_inode(struct inode *inode) return false; } -int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int nfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err = 0; @@ -726,17 +727,17 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is * no point in checking those. */ - if ((mnt->mnt_flags & MNT_NOATIME) || - ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + if ((path->mnt->mnt_flags & MNT_NOATIME) || + ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) need_atime = 0; if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - nfs_readdirplus_parent_cache_miss(dentry); + nfs_readdirplus_parent_cache_miss(path->dentry); err = __nfs_revalidate_inode(server, inode); } else - nfs_readdirplus_parent_cache_hit(dentry); + nfs_readdirplus_parent_cache_hit(path->dentry); if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e49d831c4e85..786f17580582 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -178,11 +178,12 @@ out_nofree: } static int -nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +nfs_namespace_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - if (NFS_FH(d_inode(dentry))->size != 0) - return nfs_getattr(mnt, dentry, stat); - generic_fillattr(d_inode(dentry), stat); + if (NFS_FH(d_inode(path->dentry))->size != 0) + return nfs_getattr(path, stat, request_mask, query_flags); + generic_fillattr(d_inode(path->dentry), stat); return 0; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 382c1fd05b4c..33017d652b1d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2301,7 +2301,7 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) if (path.dentry != path.mnt->mnt_root) break; } - err = vfs_getattr(&path, stat); + err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); path_put(&path); return err; } @@ -2385,7 +2385,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; } - err = vfs_getattr(&path, &stat); + err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | diff --git a/fs/nfsd/vfs.h 
b/fs/nfsd/vfs.h index db98c48c735a..1bbdccecbf3d 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -135,7 +135,8 @@ static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) { struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; - return nfserrno(vfs_getattr(&p, stat)); + return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT)); } static inline int nfsd_create_is_exclusive(int createmode) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8836305eb378..bfeb647459d9 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1306,16 +1306,15 @@ bail: return status; } -int ocfs2_getattr(struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *stat) +int ocfs2_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); - struct super_block *sb = dentry->d_sb; + struct inode *inode = d_inode(path->dentry); + struct super_block *sb = path->dentry->d_sb; struct ocfs2_super *osb = sb->s_fs_info; int err; - err = ocfs2_inode_revalidate(dentry); + err = ocfs2_inode_revalidate(path->dentry); if (err) { if (err != -ENOENT) mlog_errno(err); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 897fd9a2e51d..1fdc9839cd93 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -68,8 +68,8 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +int ocfs2_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); int ocfs2_permission(struct inode *inode, int mask); int ocfs2_should_update_atime(struct inode *inode, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 5cd617980fbf..a304bf34b212 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -245,25 +245,24 @@ out: /* * Obtain attributes of an object given a dentry */ -int orangefs_getattr(struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *kstat) +int orangefs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { int ret = -ENOENT; - struct inode *inode = dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct orangefs_inode_s *orangefs_inode = NULL; gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_getattr: called on %pd\n", - dentry); + path->dentry); ret = orangefs_inode_getattr(inode, 0, 0); if (ret == 0) { - generic_fillattr(inode, kstat); + generic_fillattr(inode, stat); /* override block size reported to stat */ orangefs_inode = ORANGEFS_I(inode); - kstat->blksize = orangefs_inode->blksize; + stat->blksize = orangefs_inode->blksize; } return ret; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 70355a9a2596..0c4f03c22ce0 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -439,9 +439,8 @@ struct inode *orangefs_new_inode(struct super_block *sb, int orangefs_setattr(struct dentry *dentry, struct iattr *iattr); -int orangefs_getattr(struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *kstat); +int orangefs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); int orangefs_permission(struct inode *inode, int mask); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index f57043dace62..a6f9ca621e0b 100644 
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -346,7 +346,8 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
 
 	ovl_path_upper(parent, &parentpath);
 	upperdir = parentpath.dentry;
 
-	err = vfs_getattr(&parentpath, &pstat);
+	err = vfs_getattr(&parentpath, &pstat,
+			  STATX_ATIME | STATX_MTIME, AT_STATX_SYNC_AS_STAT);
 	if (err)
 		return err;
 
@@ -409,7 +410,8 @@ int ovl_copy_up_flags(struct dentry *dentry, int flags)
 		}
 
 		ovl_path_lower(next, &lowerpath);
-		err = vfs_getattr(&lowerpath, &stat);
+		err = vfs_getattr(&lowerpath, &stat,
+				  STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
 		/* maybe truncate regular file. this has no effect on dirs */
 		if (flags & O_TRUNC)
 			stat.size = 0;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 16e06dd89457..6515796460df 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -138,9 +138,10 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
 	return err;
 }
 
-static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			   struct kstat *stat)
+static int ovl_dir_getattr(const struct path *path, struct kstat *stat,
+			   u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	int err;
 	enum ovl_path_type type;
 	struct path realpath;
@@ -148,7 +149,7 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
 
 	type = ovl_path_real(dentry, &realpath);
 	old_cred = ovl_override_creds(dentry->d_sb);
-	err = vfs_getattr(&realpath, stat);
+	err = vfs_getattr(&realpath, stat, request_mask, flags);
 	revert_creds(old_cred);
 	if (err)
 		return err;
@@ -264,7 +265,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
 		goto out;
 
 	ovl_path_upper(dentry, &upperpath);
-	err = vfs_getattr(&upperpath, &stat);
+	err = vfs_getattr(&upperpath, &stat,
+			  STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
 	if (err)
 		goto out_unlock;
 
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 08643ac44a02..d4bb54f7b6b4 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -56,16 +56,17 @@ out:
 	return err;
 }
 
-static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			 struct kstat *stat)
+static int ovl_getattr(const struct path *path, struct kstat *stat,
+		       u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct path realpath;
 	const struct cred *old_cred;
 	int err;
 
 	ovl_path_real(dentry, &realpath);
 	old_cred = ovl_override_creds(dentry->d_sb);
-	err = vfs_getattr(&realpath, stat);
+	err = vfs_getattr(&realpath, stat, request_mask, flags);
 	revert_creds(old_cred);
 	return err;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1e1e182d571b..3b5e6aa2a326 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1724,11 +1724,12 @@ out_unlock:
 	return NULL;
 }
 
-int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int pid_getattr(const struct path *path, struct kstat *stat,
+		u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct task_struct *task;
-	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
+	struct pid_namespace *pid = path->dentry->d_sb->s_fs_info;
 
 	generic_fillattr(inode, stat);
 
@@ -3511,9 +3512,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	return 0;
 }
 
-static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+static int proc_task_getattr(const struct path *path, struct kstat *stat,
+			     u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct task_struct *p = get_proc_task(inode);
 
 	generic_fillattr(inode, stat);
 
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 06c73904d497..ee27feb34cf4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -118,10 +118,10 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	return 0;
 }
 
-static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			struct kstat *stat)
+static int proc_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct proc_dir_entry *de = PDE(inode);
 	if (de && de->nlink)
 		set_nlink(inode, de->nlink);
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5d6960f5f1c0..e93cdc6ddb31 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -149,7 +149,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
  * base.c
  */
 extern const struct dentry_operations pid_dentry_operations;
-extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int proc_setattr(struct dentry *, struct iattr *);
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
 extern int pid_revalidate(struct dentry *, unsigned int);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index ffd72a6c6e04..9db1df2537fc 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -140,10 +140,10 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
 	return de;
 }
 
-static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		struct kstat *stat)
+static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
+				 u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct net *net;
 
 	net = get_proc_task_net(inode);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 3e64c6502dc8..3d8726445ad1 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -801,9 +801,10 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+static int proc_sys_getattr(const struct path *path, struct kstat *stat,
+			    u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b90da888b81a..fb1955c82274 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -149,10 +149,10 @@ void __init proc_root_init(void)
 	proc_sys_init();
 }
 
-static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
-)
+static int proc_root_getattr(const struct path *path, struct kstat *stat,
+			     u32 request_mask, unsigned int query_flags)
 {
-	generic_fillattr(d_inode(dentry), stat);
+	generic_fillattr(d_inode(path->dentry), stat);
 	stat->nlink = proc_root.nlink + nr_processes();
 	return 0;
 }
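
The conversions above all follow the same mechanical pattern. For a filesystem not covered by this patch, the new ->getattr prototype would be adopted along these lines; this is a minimal sketch only, and foofs, foofs_inode() and its i_crtime field are hypothetical names, not part of the patch:

    static int foofs_getattr(const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
    {
            struct inode *inode = d_inode(path->dentry);

            /* Fill in the STATX_BASIC_STATS fields from the inode. */
            generic_fillattr(inode, stat);

            /* A filesystem that records a creation time can now report it;
             * vfs_getattr_nosec() has already preset stat->result_mask to
             * STATX_BASIC_STATS, so only extra bits need to be added. */
            if (request_mask & STATX_BTIME) {
                    stat->btime = foofs_inode(inode)->i_crtime;
                    stat->result_mask |= STATX_BTIME;
            }
            return 0;
    }
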
diff --git a/fs/stat.c b/fs/stat.c
index 3f14d1ef0868..a3804feadade 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -18,6 +18,15 @@
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
 
+/**
+ * generic_fillattr - Fill in the basic attributes from the inode struct
+ * @inode: Inode to use as the source
+ * @stat: Where to fill in the attributes
+ *
+ * Fill in the basic attributes in the kstat structure from data that's to be
+ * found on the VFS inode structure.  This is the default if no getattr inode
+ * operation is supplied.
+ */
 void generic_fillattr(struct inode *inode, struct kstat *stat)
 {
 	stat->dev = inode->i_sb->s_dev;
@@ -33,81 +42,147 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
 	stat->ctime = inode->i_ctime;
 	stat->blksize = i_blocksize(inode);
 	stat->blocks = inode->i_blocks;
-}
 
+	if (IS_NOATIME(inode))
+		stat->result_mask &= ~STATX_ATIME;
+	if (IS_AUTOMOUNT(inode))
+		stat->attributes |= STATX_ATTR_AUTOMOUNT;
+}
 EXPORT_SYMBOL(generic_fillattr);
 
 /**
  * vfs_getattr_nosec - getattr without security checks
  * @path: file to get attributes from
  * @stat: structure to return attributes in
+ * @request_mask: STATX_xxx flags indicating what the caller wants
+ * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
  *
  * Get attributes without calling security_inode_getattr.
  *
  * Currently the only caller other than vfs_getattr is internal to the
- * filehandle lookup code, which uses only the inode number and returns
- * no attributes to any user.  Any other code probably wants
- * vfs_getattr.
+ * filehandle lookup code, which uses only the inode number and returns no
+ * attributes to any user.  Any other code probably wants vfs_getattr.
  */
-int vfs_getattr_nosec(struct path *path, struct kstat *stat)
+int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
+		      u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_backing_inode(path->dentry);
 
+	memset(stat, 0, sizeof(*stat));
+	stat->result_mask |= STATX_BASIC_STATS;
+	request_mask &= STATX_ALL;
+	query_flags &= KSTAT_QUERY_FLAGS;
 	if (inode->i_op->getattr)
-		return inode->i_op->getattr(path->mnt, path->dentry, stat);
+		return inode->i_op->getattr(path, stat, request_mask,
+					    query_flags);
 
 	generic_fillattr(inode, stat);
 	return 0;
 }
-
 EXPORT_SYMBOL(vfs_getattr_nosec);
 
-int vfs_getattr(struct path *path, struct kstat *stat)
+/**
+ * vfs_getattr - Get the enhanced basic attributes of a file
+ * @path: The file of interest
+ * @stat: Where to return the statistics
+ * @request_mask: STATX_xxx flags indicating what the caller wants
+ * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
+ *
+ * Ask the filesystem for a file's attributes.  The caller must set
+ * request_mask and query_flags to indicate what they want.
+ *
+ * If the file is remote, the filesystem can be forced to update the attributes
+ * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can
+ * suppress the update by passing AT_STATX_DONT_SYNC.
+ *
+ * Bits must have been set in request_mask to indicate which attributes the
+ * caller wants retrieved.  Any such attribute not requested may be returned
+ * anyway, but the value may be approximate, and, if remote, may not have been
+ * synchronised with the server.
+ *
+ * 0 will be returned on success, and a -ve error code if unsuccessful.
+ */
+int vfs_getattr(const struct path *path, struct kstat *stat,
+		u32 request_mask, unsigned int query_flags)
 {
 	int retval;
 
 	retval = security_inode_getattr(path);
 	if (retval)
 		return retval;
-	return vfs_getattr_nosec(path, stat);
+	return vfs_getattr_nosec(path, stat, request_mask, query_flags);
 }
-
 EXPORT_SYMBOL(vfs_getattr);
 
-int vfs_fstat(unsigned int fd, struct kstat *stat)
+/**
+ * vfs_statx_fd - Get the enhanced basic attributes by file descriptor
+ * @fd: The file descriptor referring to the file of interest
+ * @stat: The result structure to fill in.
+ * @request_mask: STATX_xxx flags indicating what the caller wants
+ * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
+ *
+ * This function is a wrapper around vfs_getattr().  The main difference is
+ * that it uses a file descriptor to determine the file location.
+ *
+ * 0 will be returned on success, and a -ve error code if unsuccessful.
+ */
+int vfs_statx_fd(unsigned int fd, struct kstat *stat,
+		 u32 request_mask, unsigned int query_flags)
 {
 	struct fd f = fdget_raw(fd);
 	int error = -EBADF;
 
 	if (f.file) {
-		error = vfs_getattr(&f.file->f_path, stat);
+		error = vfs_getattr(&f.file->f_path, stat,
+				    request_mask, query_flags);
 		fdput(f);
 	}
 	return error;
 }
-EXPORT_SYMBOL(vfs_fstat);
+EXPORT_SYMBOL(vfs_statx_fd);
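
In-kernel callers now say explicitly which attributes they need and whether a remote filesystem may serve them from cache. A hypothetical caller that only wants the file size might look like this (sketch; example_file_size is not part of the patch):

    static loff_t example_file_size(const struct path *path)
    {
            struct kstat stat;
            int err;

            /* Only the size is needed; do whatever stat() would do. */
            err = vfs_getattr(path, &stat, STATX_SIZE, AT_STATX_SYNC_AS_STAT);
            if (err)
                    return err;
            return stat.size;
    }
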
 
-int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
-		int flag)
+/**
+ * vfs_statx - Get basic and extra attributes by filename
+ * @dfd: A file descriptor representing the base dir for a relative filename
+ * @filename: The name of the file of interest
+ * @flags: Flags to control the query
+ * @stat: The result structure to fill in.
+ * @request_mask: STATX_xxx flags indicating what the caller wants
+ *
+ * This function is a wrapper around vfs_getattr().  The main difference is
+ * that it uses a filename and base directory to determine the file location.
+ * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink
+ * at the given name from being referenced.
+ *
+ * The caller must set request_mask as for vfs_getattr(); the AT_STATX_* bits
+ * in flags supply the query flags that are passed on to vfs_getattr().
+ *
+ * 0 will be returned on success, and a -ve error code if unsuccessful.
+ */
+int vfs_statx(int dfd, const char __user *filename, int flags,
+	      struct kstat *stat, u32 request_mask)
 {
 	struct path path;
 	int error = -EINVAL;
-	unsigned int lookup_flags = 0;
+	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
 
-	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
-		      AT_EMPTY_PATH)) != 0)
-		goto out;
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+		       AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
+		return -EINVAL;
 
-	if (!(flag & AT_SYMLINK_NOFOLLOW))
-		lookup_flags |= LOOKUP_FOLLOW;
-	if (flag & AT_EMPTY_PATH)
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;
+
 retry:
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
 		goto out;
 
-	error = vfs_getattr(&path, stat);
+	error = vfs_getattr(&path, stat, request_mask, flags);
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -116,19 +191,7 @@ retry:
 out:
 	return error;
 }
-EXPORT_SYMBOL(vfs_fstatat);
-
-int vfs_stat(const char __user *name, struct kstat *stat)
-{
-	return vfs_fstatat(AT_FDCWD, name, stat, 0);
-}
-EXPORT_SYMBOL(vfs_stat);
-
-int vfs_lstat(const char __user *name, struct kstat *stat)
-{
-	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
-}
-EXPORT_SYMBOL(vfs_lstat);
+EXPORT_SYMBOL(vfs_statx);
 
 #ifdef __ARCH_WANT_OLD_STAT
 
@@ -141,7 +204,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 {
 	static int warncount = 5;
 	struct __old_kernel_stat tmp;
-	
+
 	if (warncount > 0) {
 		warncount--;
 		printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n",
@@ -166,7 +229,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 #if BITS_PER_LONG == 32
 	if (stat->size > MAX_NON_LFS)
 		return -EOVERFLOW;
-#endif	
+#endif
 	tmp.st_size = stat->size;
 	tmp.st_atime = stat->atime.tv_sec;
 	tmp.st_mtime = stat->mtime.tv_sec;
@@ -445,6 +508,81 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 }
 #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
 
+static inline int __put_timestamp(struct timespec *kts,
+				  struct statx_timestamp __user *uts)
+{
+	return (__put_user(kts->tv_sec,		&uts->tv_sec	) ||
+		__put_user(kts->tv_nsec,	&uts->tv_nsec	) ||
+		__put_user(0,			&uts->__reserved	));
+}
+
+/*
+ * Set the statx results.
+ */
+static long statx_set_result(struct kstat *stat, struct statx __user *buffer)
+{
+	uid_t uid = from_kuid_munged(current_user_ns(), stat->uid);
+	gid_t gid = from_kgid_munged(current_user_ns(), stat->gid);
+
+	if (__put_user(stat->result_mask,	&buffer->stx_mask	) ||
+	    __put_user(stat->mode,		&buffer->stx_mode	) ||
+	    __clear_user(&buffer->__spare0, sizeof(buffer->__spare0)) ||
+	    __put_user(stat->nlink,		&buffer->stx_nlink	) ||
+	    __put_user(uid,			&buffer->stx_uid	) ||
+	    __put_user(gid,			&buffer->stx_gid	) ||
+	    __put_user(stat->attributes,	&buffer->stx_attributes	) ||
+	    __put_user(stat->blksize,		&buffer->stx_blksize	) ||
+	    __put_user(MAJOR(stat->rdev),	&buffer->stx_rdev_major	) ||
+	    __put_user(MINOR(stat->rdev),	&buffer->stx_rdev_minor	) ||
+	    __put_user(MAJOR(stat->dev),	&buffer->stx_dev_major	) ||
+	    __put_user(MINOR(stat->dev),	&buffer->stx_dev_minor	) ||
+	    __put_timestamp(&stat->atime,	&buffer->stx_atime	) ||
+	    __put_timestamp(&stat->btime,	&buffer->stx_btime	) ||
+	    __put_timestamp(&stat->ctime,	&buffer->stx_ctime	) ||
+	    __put_timestamp(&stat->mtime,	&buffer->stx_mtime	) ||
+	    __put_user(stat->ino,		&buffer->stx_ino	) ||
+	    __put_user(stat->size,		&buffer->stx_size	) ||
+	    __put_user(stat->blocks,		&buffer->stx_blocks	) ||
+	    __clear_user(&buffer->__spare1, sizeof(buffer->__spare1)) ||
+	    __clear_user(&buffer->__spare2, sizeof(buffer->__spare2)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * sys_statx - System call to get enhanced stats
+ * @dfd: Base directory to pathwalk from *or* fd to stat.
+ * @filename: File to stat *or* NULL.
+ * @flags: AT_* flags to control pathwalk.
+ * @mask: Parts of statx struct actually required.
+ * @buffer: Result buffer.
+ *
+ * Note that if filename is NULL, then it does the equivalent of fstat() using
+ * dfd to indicate the file of interest.
+ */
+SYSCALL_DEFINE5(statx,
+		int, dfd, const char __user *, filename, unsigned, flags,
+		unsigned int, mask,
+		struct statx __user *, buffer)
+{
+	struct kstat stat;
+	int error;
+
+	if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, buffer, sizeof(*buffer)))
+		return -EFAULT;
+
+	if (filename)
+		error = vfs_statx(dfd, filename, flags, &stat, mask);
+	else
+		error = vfs_statx_fd(dfd, &stat, mask, flags);
+	if (error)
+		return error;
+	return statx_set_result(&stat, buffer);
+}
+
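
Until libc grows a wrapper, the new system call can be exercised from userspace with syscall(2). A minimal sketch, assuming headers installed from a kernel containing this patch and a wired-up __NR_statx on the target architecture (the full test program appears at the end of this patch):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/stat.h>
    #include <linux/fcntl.h>

    int main(void)
    {
            struct statx stx;
            long ret;

            /* Ask only for the birth time and allow a cached answer. */
            ret = syscall(__NR_statx, AT_FDCWD, "/etc/hostname",
                          AT_STATX_DONT_SYNC, STATX_BTIME, &stx);
            if (ret == 0 && (stx.stx_mask & STATX_BTIME))
                    printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
            return 0;
    }
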
 /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
 void __inode_add_bytes(struct inode *inode, loff_t bytes)
 {
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 08d3e630b49c..83809f5b5eca 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -440,10 +440,11 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
 	return blocks;
 }
 
-int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int sysv_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
 {
-	struct super_block *s = dentry->d_sb;
-	generic_fillattr(d_inode(dentry), stat);
+	struct super_block *s = path->dentry->d_sb;
+	generic_fillattr(d_inode(path->dentry), stat);
 	stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
 	stat->blksize = s->s_blocksize;
 	return 0;
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 6c212288adcb..1e7e27c729af 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
 extern int sysv_sync_inode(struct inode *);
 extern void sysv_set_inode(struct inode *, dev_t);
-extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int sysv_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int sysv_init_icache(void);
 extern void sysv_destroy_icache(void);
 
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 528369f3e472..30825d882aa9 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1622,11 +1622,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
-int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat)
+int ubifs_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int flags)
 {
 	loff_t size;
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode = d_inode(path->dentry);
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
 	mutex_lock(&ui->ui_mutex);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index f0c86f076535..4d57e488038e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1749,8 +1749,8 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
 /* dir.c */
 struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 			      umode_t mode);
-int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat);
+int ubifs_getattr(const struct path *path, struct kstat *stat,
+		  u32 request_mask, unsigned int flags);
 int ubifs_check_dir_empty(struct inode *dir);
 
 /* xattr.c */
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index f7dfef53f739..6023c97c6da2 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -152,9 +152,10 @@ out_unmap:
 	return err;
 }
 
-static int udf_symlink_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			       struct kstat *stat)
+static int udf_symlink_getattr(const struct path *path, struct kstat *stat,
+			       u32 request_mask, unsigned int flags)
 {
+	struct dentry *dentry = path->dentry;
 	struct inode *inode = d_backing_inode(dentry);
 	struct page *page;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 22c16155f1b4..229cc6a6d8ef 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -489,11 +489,12 @@ xfs_vn_get_link_inline(
 
 STATIC int
 xfs_vn_getattr(
-	struct vfsmount		*mnt,
-	struct dentry		*dentry,
-	struct kstat		*stat)
+	const struct path	*path,
+	struct kstat		*stat,
+	u32			request_mask,
+	unsigned int		query_flags)
 {
-	struct inode		*inode = d_inode(dentry);
+	struct inode		*inode = d_inode(path->dentry);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 52350947c670..aad3fd0ff5f8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1709,7 +1709,7 @@ struct inode_operations {
 	int (*rename) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *, unsigned int);
 	int (*setattr) (struct dentry *, struct iattr *);
-	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
+	int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
 		      u64 len);
@@ -2902,8 +2902,8 @@ extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
 extern void generic_fillattr(struct inode *, struct kstat *);
-int vfs_getattr_nosec(struct path *path, struct kstat *stat);
-extern int vfs_getattr(struct path *, struct kstat *);
+extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
+extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 void __inode_add_bytes(struct inode *inode, loff_t bytes);
 void inode_add_bytes(struct inode *inode, loff_t bytes);
 void __inode_sub_bytes(struct inode *inode, loff_t bytes);
@@ -2916,10 +2916,29 @@ extern const struct inode_operations simple_symlink_inode_operations;
 
 extern int iterate_dir(struct file *, struct dir_context *);
 
-extern int vfs_stat(const char __user *, struct kstat *);
-extern int vfs_lstat(const char __user *, struct kstat *);
-extern int vfs_fstat(unsigned int, struct kstat *);
-extern int vfs_fstatat(int , const char __user *, struct kstat *, int);
+extern int vfs_statx(int, const char __user *, int, struct kstat *, u32);
+extern int vfs_statx_fd(unsigned int, struct kstat *, u32, unsigned int);
+
+static inline int vfs_stat(const char __user *filename, struct kstat *stat)
+{
+	return vfs_statx(AT_FDCWD, filename, 0, stat, STATX_BASIC_STATS);
+}
+static inline int vfs_lstat(const char __user *name, struct kstat *stat)
+{
+	return vfs_statx(AT_FDCWD, name, AT_SYMLINK_NOFOLLOW,
+			 stat, STATX_BASIC_STATS);
+}
+static inline int vfs_fstatat(int dfd, const char __user *filename,
+			      struct kstat *stat, int flags)
+{
+	return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
+}
+static inline int vfs_fstat(int fd, struct kstat *stat)
+{
+	return vfs_statx_fd(fd, stat, STATX_BASIC_STATS, 0);
+}
+
+
 extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
 extern int vfs_readlink(struct dentry *, char __user *, int);
 
@@ -2949,7 +2968,7 @@ extern int dcache_dir_close(struct inode *, struct file *);
 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
 extern int dcache_readdir(struct file *, struct dir_context *);
 extern int simple_setattr(struct dentry *, struct iattr *);
-extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int simple_statfs(struct dentry *, struct kstatfs *);
 extern int simple_open(struct inode *inode, struct file *file);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f1da8c8dd473..287f34161086 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -335,7 +335,7 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr);
-extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int nfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct inode *, int);
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 075cb0c7eb2a..c76e524fb34b 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -18,20 +18,32 @@
 #include <linux/time.h>
 #include <linux/uidgid.h>
 
+#define KSTAT_QUERY_FLAGS (AT_STATX_SYNC_TYPE)
+
 struct kstat {
-	u64		ino;
-	dev_t		dev;
+	u32		result_mask;	/* What fields the user got */
 	umode_t		mode;
 	unsigned int	nlink;
+	uint32_t	blksize;	/* Preferred I/O size */
+	u64		attributes;
+#define KSTAT_ATTR_FS_IOC_FLAGS				\
+	(STATX_ATTR_COMPRESSED |			\
+	 STATX_ATTR_IMMUTABLE |				\
+	 STATX_ATTR_APPEND |				\
+	 STATX_ATTR_NODUMP |				\
+	 STATX_ATTR_ENCRYPTED				\
+	 )/* Attrs corresponding to FS_*_FL flags */
+	u64		ino;
+	dev_t		dev;
+	dev_t		rdev;
 	kuid_t		uid;
 	kgid_t		gid;
-	dev_t		rdev;
 	loff_t		size;
-	struct timespec  atime;
+	struct timespec	atime;
 	struct timespec	mtime;
 	struct timespec	ctime;
-	unsigned long	blksize;
-	unsigned long long	blocks;
+	struct timespec	btime;			/* File creation time */
+	u64		blocks;
 };
 
 #endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 91a740f6b884..980c3c9b06f8 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -48,6 +48,7 @@ struct stat;
 struct stat64;
 struct statfs;
 struct statfs64;
+struct statx;
 struct __sysctl_args;
 struct sysinfo;
 struct timespec;
@@ -902,5 +903,7 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
 				  unsigned long prot, int pkey);
 asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 asmlinkage long sys_pkey_free(int pkey);
+asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
+			  unsigned mask, struct statx __user *buffer);
 
 #endif
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index beed138bd359..813afd6eee71 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -63,5 +63,10 @@
 #define AT_NO_AUTOMOUNT		0x800	/* Suppress terminal automount traversal */
 #define AT_EMPTY_PATH		0x1000	/* Allow empty relative pathname */
 
+#define AT_STATX_SYNC_TYPE	0x6000	/* Type of synchronisation required from statx() */
+#define AT_STATX_SYNC_AS_STAT	0x0000	/* - Do whatever stat() does */
+#define AT_STATX_FORCE_SYNC	0x2000	/* - Force the attributes to be sync'd with the server */
+#define AT_STATX_DONT_SYNC	0x4000	/* - Don't sync attributes with the server */
+
 #endif /* _UAPI_LINUX_FCNTL_H */
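
Note that AT_STATX_SYNC_TYPE (0x6000) is a two-bit field rather than a pair of independent flags, which is why sys_statx() above rejects the value with both bits set. A caller that switches sync behaviour should therefore clear the whole field before choosing a mode, as the sample program later in this patch does:

    atflag &= ~AT_STATX_SYNC_TYPE;  /* clear both sync bits */
    atflag |= AT_STATX_FORCE_SYNC;  /* then select exactly one mode */
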
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 7fec7e36d921..51a6b86e3700 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -1,6 +1,7 @@
 #ifndef _UAPI_LINUX_STAT_H
 #define _UAPI_LINUX_STAT_H
 
+#include <linux/types.h>
 
 #if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2)
 
@@ -41,5 +42,135 @@
 
 #endif
 
+/*
+ * Timestamp structure for the timestamps in struct statx.
+ *
+ * tv_sec holds the number of seconds before (negative) or after (positive)
+ * 00:00:00 1st January 1970 UTC.
+ *
+ * tv_nsec holds a number of nanoseconds before (0..-999,999,999 if tv_sec is
+ * negative) or after (0..999,999,999 if tv_sec is positive) the tv_sec time.
+ *
+ * Note that if both tv_sec and tv_nsec are non-zero, then the two values must
+ * either be both positive or both negative.
+ *
+ * __reserved is held in case we need a yet finer resolution.
+ */
+struct statx_timestamp {
+	__s64	tv_sec;
+	__s32	tv_nsec;
+	__s32	__reserved;
+};
+
+/*
+ * Structures for the extended file attribute retrieval system call
+ * (statx()).
+ *
+ * The caller passes a mask of what they're specifically interested in as a
+ * parameter to statx().  What statx() actually got will be indicated in
+ * stx_mask upon return.
+ *
+ * For each bit in the mask argument:
+ *
+ * - if the datum is not supported:
+ *
+ *   - the bit will be cleared, and
+ *
+ *   - the datum will be set to an appropriate fabricated value if one is
+ *     available (eg. CIFS can take a default uid and gid), otherwise
+ *
+ *   - the field will be cleared;
+ *
+ * - otherwise, if explicitly requested:
+ *
+ *   - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is
+ *     set or if the datum is considered out of date, and
+ *
+ *   - the field will be filled in and the bit will be set;
+ *
+ * - otherwise, if not requested, but available in approximate form without any
+ *   effort, it will be filled in anyway, and the bit will be set upon return
+ *   (it might not be up to date, however, and no attempt will be made to
+ *   synchronise the internal state first);
+ *
+ * - otherwise the field and the bit will be cleared before returning.
+ *
+ * Items in STATX_BASIC_STATS may be marked unavailable on return, but they
+ * will have values installed for compatibility purposes so that stat() and
+ * co. can be emulated in userspace.
+ */
+struct statx {
+	/* 0x00 */
+	__u32	stx_mask;	/* What results were written [uncond] */
+	__u32	stx_blksize;	/* Preferred general I/O size [uncond] */
+	__u64	stx_attributes;	/* Flags conveying information about the file [uncond] */
+	/* 0x10 */
+	__u32	stx_nlink;	/* Number of hard links */
+	__u32	stx_uid;	/* User ID of owner */
+	__u32	stx_gid;	/* Group ID of owner */
+	__u16	stx_mode;	/* File mode */
+	__u16	__spare0[1];
+	/* 0x20 */
+	__u64	stx_ino;	/* Inode number */
+	__u64	stx_size;	/* File size */
+	__u64	stx_blocks;	/* Number of 512-byte blocks allocated */
+	__u64	__spare1[1];
+	/* 0x40 */
+	struct statx_timestamp	stx_atime;	/* Last access time */
+	struct statx_timestamp	stx_btime;	/* File creation time */
+	struct statx_timestamp	stx_ctime;	/* Last attribute change time */
+	struct statx_timestamp	stx_mtime;	/* Last data modification time */
+	/* 0x80 */
+	__u32	stx_rdev_major;	/* Device ID of special file [if bdev/cdev] */
+	__u32	stx_rdev_minor;
+	__u32	stx_dev_major;	/* ID of device containing file [uncond] */
+	__u32	stx_dev_minor;
+	/* 0x90 */
+	__u64	__spare2[14];	/* Spare space for future expansion */
+	/* 0x100 */
+};
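
The /* 0xNN */ markers record the fixed offsets that userspace may rely on. A consumer built against these headers can verify the layout at compile time; a sketch using C11 _Static_assert (not part of the patch):

    #include <stddef.h>
    #include <linux/stat.h>

    _Static_assert(offsetof(struct statx, stx_nlink) == 0x10, "statx layout");
    _Static_assert(offsetof(struct statx, stx_ino) == 0x20, "statx layout");
    _Static_assert(offsetof(struct statx, stx_atime) == 0x40, "statx layout");
    _Static_assert(offsetof(struct statx, stx_rdev_major) == 0x80, "statx layout");
    _Static_assert(sizeof(struct statx) == 0x100, "statx layout");
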
+
+/*
+ * Flags to be found in stx_mask
+ *
+ * Query request/result mask for statx() and struct statx::stx_mask.
+ *
+ * These bits should be set in the mask argument of statx() to request
+ * particular items when calling statx().
+ */
+#define STATX_TYPE		0x00000001U	/* Want/got stx_mode & S_IFMT */
+#define STATX_MODE		0x00000002U	/* Want/got stx_mode & ~S_IFMT */
+#define STATX_NLINK		0x00000004U	/* Want/got stx_nlink */
+#define STATX_UID		0x00000008U	/* Want/got stx_uid */
+#define STATX_GID		0x00000010U	/* Want/got stx_gid */
+#define STATX_ATIME		0x00000020U	/* Want/got stx_atime */
+#define STATX_MTIME		0x00000040U	/* Want/got stx_mtime */
+#define STATX_CTIME		0x00000080U	/* Want/got stx_ctime */
+#define STATX_INO		0x00000100U	/* Want/got stx_ino */
+#define STATX_SIZE		0x00000200U	/* Want/got stx_size */
+#define STATX_BLOCKS		0x00000400U	/* Want/got stx_blocks */
+#define STATX_BASIC_STATS	0x000007ffU	/* The stuff in the normal stat struct */
+#define STATX_BTIME		0x00000800U	/* Want/got stx_btime */
+#define STATX_ALL		0x00000fffU	/* All currently supported flags */
+
+/*
+ * Attributes to be found in stx_attributes
+ *
+ * These give information about the features or the state of a file that might
+ * be of use to ordinary userspace programs such as GUIs or ls rather than
+ * specialised tools.
+ *
+ * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS
+ * semantically.  Where possible, the numerical value is picked to correspond
+ * also.
+ */
+#define STATX_ATTR_COMPRESSED		0x00000004 /* [I] File is compressed by the fs */
+#define STATX_ATTR_IMMUTABLE		0x00000010 /* [I] File is marked immutable */
+#define STATX_ATTR_APPEND		0x00000020 /* [I] File is append-only */
+#define STATX_ATTR_NODUMP		0x00000040 /* [I] File is not to be dumped */
+#define STATX_ATTR_ENCRYPTED		0x00000800 /* [I] File requires key to decrypt in fs */
+
+#define STATX_ATTR_AUTOMOUNT		0x00001000 /* Dir: Automount trigger */
+
 #endif /* _UAPI_LINUX_STAT_H */
diff --git a/mm/shmem.c b/mm/shmem.c
index a26649a6633f..e07728f716b2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -958,10 +958,10 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			 struct kstat *stat)
+static int shmem_getattr(const struct path *path, struct kstat *stat,
+			 u32 request_mask, unsigned int query_flags)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = path->dentry->d_inode;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
diff --git a/samples/Kconfig b/samples/Kconfig
index b124f62ed6cb..9cb63188d3ef 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -112,4 +112,10 @@ config SAMPLE_VFIO_MDEV_MTTY
 	  Build a virtual tty sample driver for use as a VFIO
 	  mediated device
 
+config SAMPLE_STATX
+	bool "Build example extended-stat user code"
+	depends on BROKEN
+	help
+	  Build an example userspace program that uses the new extended-stat
+	  (statx) system call.
+
 endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
index 86a137e451d9..db54e766ddb1 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -3,4 +3,4 @@
 obj-$(CONFIG_SAMPLES)	+= kobject/ kprobes/ trace_events/ livepatch/ \
 			   hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \
 			   configfs/ connector/ v4l/ trace_printk/ blackfin/ \
-			   vfio-mdev/
+			   vfio-mdev/ statx/
diff --git a/samples/statx/Makefile b/samples/statx/Makefile
new file mode 100644
index 000000000000..1f80a3d8cf45
--- /dev/null
+++ b/samples/statx/Makefile
@@ -0,0 +1,10 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include
diff --git a/samples/statx/test-statx.c b/samples/statx/test-statx.c
new file mode 100644
index 000000000000..8571d766331d
--- /dev/null
+++ b/samples/statx/test-statx.c
@@ -0,0 +1,254 @@
+/* Test the statx() system call.
+ *
+ * Note that the output of this program is intended to look like the output of
+ * /bin/stat where possible.
+ *
+ * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define _GNU_SOURCE
+#define _ATFILE_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <sys/stat.h>
+
+#define AT_STATX_SYNC_TYPE	0x6000
+#define AT_STATX_SYNC_AS_STAT	0x0000
+#define AT_STATX_FORCE_SYNC	0x2000
+#define AT_STATX_DONT_SYNC	0x4000
+
+static __attribute__((unused))
+ssize_t statx(int dfd, const char *filename, unsigned flags,
+	      unsigned int mask, struct statx *buffer)
+{
+	return syscall(__NR_statx, dfd, filename, flags, mask, buffer);
+}
+
+static void print_time(const char *field, struct statx_timestamp *ts)
+{
+	struct tm tm;
+	time_t tim;
+	char buffer[100];
+	int len;
+
+	tim = ts->tv_sec;
+	if (!localtime_r(&tim, &tm)) {
+		perror("localtime_r");
+		exit(1);
+	}
+	len = strftime(buffer, 100, "%F %T", &tm);
+	if (len == 0) {
+		perror("strftime");
+		exit(1);
+	}
+	printf("%s", field);
+	fwrite(buffer, 1, len, stdout);
+	printf(".%09u", ts->tv_nsec);
+	len = strftime(buffer, 100, "%z", &tm);
+	if (len == 0) {
+		perror("strftime2");
+		exit(1);
+	}
+	fwrite(buffer, 1, len, stdout);
+	printf("\n");
+}
+
+static void dump_statx(struct statx *stx)
+{
+	char buffer[256], ft = '?';
+
+	printf("results=%x\n", stx->stx_mask);
+
+	printf(" ");
+	if (stx->stx_mask & STATX_SIZE)
+		printf(" Size: %-15llu", (unsigned long long)stx->stx_size);
+	if (stx->stx_mask & STATX_BLOCKS)
+		printf(" Blocks: %-10llu", (unsigned long long)stx->stx_blocks);
+	printf(" IO Block: %-6llu", (unsigned long long)stx->stx_blksize);
+	if (stx->stx_mask & STATX_TYPE) {
+		switch (stx->stx_mode & S_IFMT) {
+		case S_IFIFO:	printf(" FIFO\n");			ft = 'p'; break;
+		case S_IFCHR:	printf(" character special file\n");	ft = 'c'; break;
+		case S_IFDIR:	printf(" directory\n");			ft = 'd'; break;
+		case S_IFBLK:	printf(" block special file\n");	ft = 'b'; break;
+		case S_IFREG:	printf(" regular file\n");		ft = '-'; break;
+		case S_IFLNK:	printf(" symbolic link\n");		ft = 'l'; break;
+		case S_IFSOCK:	printf(" socket\n");			ft = 's'; break;
+		default:
+			printf(" unknown type (%o)\n", stx->stx_mode & S_IFMT);
+			break;
+		}
+	} else {
+		printf(" no type\n");
+	}
+
+	sprintf(buffer, "%02x:%02x", stx->stx_dev_major, stx->stx_dev_minor);
+	printf("Device: %-15s", buffer);
+	if (stx->stx_mask & STATX_INO)
+		printf(" Inode: %-11llu", (unsigned long long) stx->stx_ino);
+	if (stx->stx_mask & STATX_NLINK)
+		printf(" Links: %-5u", stx->stx_nlink);
+	if (stx->stx_mask & STATX_TYPE) {
+		switch (stx->stx_mode & S_IFMT) {
+		case S_IFBLK:
+		case S_IFCHR:
+			printf(" Device type: %u,%u",
+			       stx->stx_rdev_major, stx->stx_rdev_minor);
+			break;
+		}
+	}
+	printf("\n");
+
+	if (stx->stx_mask & STATX_MODE)
+		printf("Access: (%04o/%c%c%c%c%c%c%c%c%c%c)  ",
+		       stx->stx_mode & 07777,
+		       ft,
+		       stx->stx_mode & S_IRUSR ? 'r' : '-',
+		       stx->stx_mode & S_IWUSR ? 'w' : '-',
+		       stx->stx_mode & S_IXUSR ? 'x' : '-',
+		       stx->stx_mode & S_IRGRP ? 'r' : '-',
+		       stx->stx_mode & S_IWGRP ? 'w' : '-',
+		       stx->stx_mode & S_IXGRP ? 'x' : '-',
+		       stx->stx_mode & S_IROTH ? 'r' : '-',
+		       stx->stx_mode & S_IWOTH ? 'w' : '-',
+		       stx->stx_mode & S_IXOTH ? 'x' : '-');
+	if (stx->stx_mask & STATX_UID)
+		printf("Uid: %5d   ", stx->stx_uid);
+	if (stx->stx_mask & STATX_GID)
+		printf("Gid: %5d\n", stx->stx_gid);
+
+	if (stx->stx_mask & STATX_ATIME)
+		print_time("Access: ", &stx->stx_atime);
+	if (stx->stx_mask & STATX_MTIME)
+		print_time("Modify: ", &stx->stx_mtime);
+	if (stx->stx_mask & STATX_CTIME)
+		print_time("Change: ", &stx->stx_ctime);
+	if (stx->stx_mask & STATX_BTIME)
+		print_time(" Birth: ", &stx->stx_btime);
+
+	if (stx->stx_attributes) {
+		unsigned char bits;
+		int loop, byte;
+
+		static char attr_representation[64 + 1] =
+			/* STATX_ATTR_ flags: */
+			"????????"	/* 63-56 */
+			"????????"	/* 55-48 */
+			"????????"	/* 47-40 */
+			"????????"	/* 39-32 */
+			"????????"	/* 31-24	0x00000000-ff000000 */
+			"????????"	/* 23-16	0x00000000-00ff0000 */
+			"???me???"	/* 15- 8	0x00000000-0000ff00 */
+			"?dai?c??"	/*  7- 0	0x00000000-000000ff */
+			;
+
+		printf("Attributes: %016llx (", stx->stx_attributes);
+		for (byte = 64 - 8; byte >= 0; byte -= 8) {
+			bits = stx->stx_attributes >> byte;
+			for (loop = 7; loop >= 0; loop--) {
+				int bit = byte + loop;
+
+				if (bits & 0x80)
+					putchar(attr_representation[63 - bit]);
+				else
+					putchar('-');
+				bits <<= 1;
+			}
+			if (byte)
+				putchar(' ');
+		}
+		printf(")\n");
+	}
+}
+
+static void dump_hex(unsigned long long *data, int from, int to)
+{
+	unsigned offset, print_offset = 1, col = 0;
+
+	from /= 8;
+	to = (to + 7) / 8;
+
+	for (offset = from; offset < to; offset++) {
+		if (print_offset) {
+			printf("%04x: ", offset * 8);
+			print_offset = 0;
+		}
+		printf("%016llx", data[offset]);
+		col++;
+		if ((col & 3) == 0) {
+			printf("\n");
+			print_offset = 1;
+		} else {
+			printf(" ");
+		}
+	}
+
+	if (!print_offset)
+		printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+	struct statx stx;
+	int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW;
+
+	unsigned int mask = STATX_ALL;
+
+	for (argv++; *argv; argv++) {
+		if (strcmp(*argv, "-F") == 0) {
+			atflag &= ~AT_STATX_SYNC_TYPE;
+			atflag |= AT_STATX_FORCE_SYNC;
+			continue;
+		}
+		if (strcmp(*argv, "-D") == 0) {
+			atflag &= ~AT_STATX_SYNC_TYPE;
+			atflag |= AT_STATX_DONT_SYNC;
+			continue;
+		}
+		if (strcmp(*argv, "-L") == 0) {
+			atflag &= ~AT_SYMLINK_NOFOLLOW;
+			continue;
+		}
+		if (strcmp(*argv, "-O") == 0) {
+			mask &= ~STATX_BASIC_STATS;
+			continue;
+		}
+		if (strcmp(*argv, "-A") == 0) {
+			atflag |= AT_NO_AUTOMOUNT;
+			continue;
+		}
+		if (strcmp(*argv, "-R") == 0) {
+			raw = 1;
+			continue;
+		}
+
+		memset(&stx, 0xbf, sizeof(stx));
+		ret = statx(AT_FDCWD, *argv, atflag, mask, &stx);
+		printf("statx(%s) = %d\n", *argv, ret);
+		if (ret < 0) {
+			perror(*argv);
+			exit(1);
+		}
+
+		if (raw)
+			dump_hex((unsigned long long *)&stx, 0, sizeof(stx));
+
+		dump_statx(&stx);
+	}
+	return 0;
+}
--
cgit v1.2.3
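
As the struct statx comment above notes, stat() and friends can be emulated in userspace from these results. A sketch of the field mapping (statx_to_stat is a hypothetical helper, not part of the patch; it assumes the kernel filled in STATX_BASIC_STATS, as it does for local filesystems):

    #include <sys/stat.h>
    #include <sys/sysmacros.h>
    #include <linux/stat.h>

    static void statx_to_stat(const struct statx *stx, struct stat *st)
    {
            st->st_dev          = makedev(stx->stx_dev_major, stx->stx_dev_minor);
            st->st_ino          = stx->stx_ino;
            st->st_mode         = stx->stx_mode;
            st->st_nlink        = stx->stx_nlink;
            st->st_uid          = stx->stx_uid;
            st->st_gid          = stx->stx_gid;
            st->st_rdev         = makedev(stx->stx_rdev_major, stx->stx_rdev_minor);
            st->st_size         = stx->stx_size;
            st->st_blksize      = stx->stx_blksize;
            st->st_blocks       = stx->stx_blocks;
            st->st_atim.tv_sec  = stx->stx_atime.tv_sec;
            st->st_atim.tv_nsec = stx->stx_atime.tv_nsec;
            st->st_mtim.tv_sec  = stx->stx_mtime.tv_sec;
            st->st_mtim.tv_nsec = stx->stx_mtime.tv_nsec;
            st->st_ctim.tv_sec  = stx->stx_ctime.tv_sec;
            st->st_ctim.tv_nsec = stx->stx_ctime.tv_nsec;
    }
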