diff options
Diffstat (limited to 'drivers/infiniband/ulp')
30 files changed, 4453 insertions, 750 deletions
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile index f3c7dcf03098..c28af1823a2d 100644 --- a/drivers/infiniband/ulp/Makefile +++ b/drivers/infiniband/ulp/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_INFINIBAND_SRP) += srp/ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ obj-$(CONFIG_INFINIBAND_ISER) += iser/ obj-$(CONFIG_INFINIBAND_ISERT) += isert/ +obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic/ diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index da12717a3eb7..ff50a7bd66d8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -52,7 +52,6 @@ #include <rdma/ib_pack.h> #include <rdma/ib_sa.h> #include <linux/sched.h> - /* constants */ enum ipoib_flush_level { @@ -153,6 +152,13 @@ static inline void skb_add_pseudo_hdr(struct sk_buff *skb) skb_pull(skb, IPOIB_HARD_LEN); } +static inline struct ipoib_dev_priv *ipoib_priv(const struct net_device *dev) +{ + struct rdma_netdev *rn = netdev_priv(dev); + + return rn->clnt_priv; +} + /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ struct ipoib_mcast { struct ib_sa_mcmember_rec mcmember; @@ -404,6 +410,7 @@ struct ipoib_dev_priv { struct timer_list poll_timer; unsigned max_send_sge; bool sm_fullmember_sendonly_support; + const struct net_device_ops *rn_ops; }; struct ipoib_ah { @@ -416,7 +423,7 @@ struct ipoib_ah { struct ipoib_path { struct net_device *dev; - struct ib_sa_path_rec pathrec; + struct sa_path_rec pathrec; struct ipoib_ah *ah; struct sk_buff_head queue; @@ -472,7 +479,7 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); struct ipoib_ah *ipoib_create_ah(struct net_device *dev, - struct ib_pd *pd, struct ib_ah_attr *attr); + struct ib_pd *pd, struct rdma_ah_attr *attr); void ipoib_free_ah(struct kref *kref); static inline void ipoib_put_ah(struct ipoib_ah *ah) { @@ -482,27 +489,28 @@ int ipoib_open(struct net_device 
*dev); int ipoib_add_pkey_attr(struct net_device *dev); int ipoib_add_umcast_attr(struct net_device *dev); -void ipoib_send(struct net_device *dev, struct sk_buff *skb, - struct ipoib_ah *address, u32 qpn); +int ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn); void ipoib_reap_ah(struct work_struct *work); struct ipoib_path *__path_find(struct net_device *dev, void *gid); void ipoib_mark_paths_invalid(struct net_device *dev); void ipoib_flush_paths(struct net_device *dev); -int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv); -struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); - -int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port, + const char *format); +void ipoib_ib_tx_timer_func(unsigned long ctx); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); +int ipoib_ib_dev_open_default(struct net_device *dev); int ipoib_ib_dev_open(struct net_device *dev); -int ipoib_ib_dev_up(struct net_device *dev); -int ipoib_ib_dev_down(struct net_device *dev); int ipoib_ib_dev_stop(struct net_device *dev); +void ipoib_ib_dev_up(struct net_device *dev); +void ipoib_ib_dev_down(struct net_device *dev); +int ipoib_ib_dev_stop_default(struct net_device *dev); void ipoib_pkey_dev_check_presence(struct net_device *dev); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); @@ -513,7 +521,7 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); void ipoib_mcast_restart_task(struct work_struct *work); -int ipoib_mcast_start_thread(struct net_device *dev); +void ipoib_mcast_start_thread(struct 
net_device *dev); int ipoib_mcast_stop_thread(struct net_device *dev); void ipoib_mcast_dev_down(struct net_device *dev); @@ -562,8 +570,10 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path); #endif -int ipoib_mcast_attach(struct net_device *dev, u16 mlid, - union ib_gid *mgid, int set_qkey); +int ipoib_mcast_attach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid, int set_qkey, u32 qkey); +int ipoib_mcast_detach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid); void ipoib_mcast_remove_list(struct list_head *remove_list); void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, struct list_head *remove_list); @@ -587,13 +597,13 @@ void __exit ipoib_netlink_fini(void); void ipoib_set_umcast(struct net_device *ndev, int umcast_val); int ipoib_set_mode(struct net_device *dev, const char *buf); -void ipoib_setup(struct net_device *dev); +void ipoib_setup_common(struct net_device *dev); void ipoib_pkey_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); -int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); +void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); #define IPOIB_FLAGS_RC 0x80 #define IPOIB_FLAGS_UC 0x40 @@ -607,14 +617,14 @@ extern int ipoib_max_conn_qp; static inline int ipoib_cm_admin_enabled(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); return IPOIB_CM_SUPPORTED(dev->dev_addr) && test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); } static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); return IPOIB_CM_SUPPORTED(hwaddr) && test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); } @@ -637,13 +647,13 @@ static inline void 
ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *t static inline int ipoib_cm_has_srq(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); return !!priv->cm.srq; } static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); return priv->cm.max_cm_mtu; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 096c4f6fbd65..f87d104837dc 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -38,6 +38,8 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/moduleparam.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> #include "ipoib.h" @@ -91,7 +93,7 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_recv_wr *bad_wr; int i, ret; @@ -117,7 +119,7 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, struct ib_recv_wr *wr, struct ib_sge *sge, int id) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_recv_wr *bad_wr; int i, ret; @@ -144,7 +146,7 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, u64 mapping[IPOIB_CM_RX_SG], gfp_t gfp) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sk_buff *skb; int i; @@ -195,7 +197,7 @@ partial_error: static void ipoib_cm_free_rx_ring(struct net_device *dev, struct ipoib_cm_rx_buf *rx_ring) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int i; for (i = 0; i < ipoib_recvq_size; ++i) @@ -234,7 +236,7 @@ static void 
ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) { struct ipoib_cm_rx *p = ctx; - struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); unsigned long flags; if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) @@ -250,7 +252,7 @@ static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, struct ipoib_cm_rx *p) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_init_attr attr = { .event_handler = ipoib_cm_rx_event_handler, .send_cq = priv->recv_cq, /* For drain WR */ @@ -275,7 +277,7 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev, struct ib_cm_id *cm_id, struct ib_qp *qp, unsigned psn) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_attr qp_attr; int qp_attr_mask, ret; @@ -330,7 +332,7 @@ static void ipoib_cm_init_rx_wr(struct net_device *dev, struct ib_recv_wr *wr, struct ib_sge *sge) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int i; for (i = 0; i < priv->cm.num_frags; ++i) @@ -348,7 +350,7 @@ static void ipoib_cm_init_rx_wr(struct net_device *dev, static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct { struct ib_recv_wr wr; struct ib_sge sge[IPOIB_CM_RX_SG]; @@ -363,7 +365,7 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i t = kmalloc(sizeof *t, GFP_KERNEL); if (!t) { ret = -ENOMEM; - goto err_free; + goto err_free_1; } ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); @@ -410,6 +412,8 @@ err_count: err_free: kfree(t); + +err_free_1: ipoib_cm_free_rx_ring(dev, rx->rx_ring); return 
ret; @@ -419,7 +423,7 @@ static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, struct ib_qp *qp, struct ib_cm_req_event_param *req, unsigned psn) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_data data = {}; struct ib_cm_rep_param rep = {}; @@ -439,7 +443,7 @@ static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct net_device *dev = cm_id->context; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_rx *p; unsigned psn; int ret; @@ -512,7 +516,7 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, /* Fall through */ case IB_CM_REJ_RECEIVED: p = cm_id->context; - priv = netdev_priv(p->dev); + priv = ipoib_priv(p->dev); if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) ipoib_warn(priv, "unable to move qp to error state\n"); /* Fall through */ @@ -556,7 +560,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_rx_buf *rx_ring; unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); struct sk_buff *skb, *newskb; @@ -705,7 +709,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_tx_buf *tx_req; int rc; unsigned usable_sge = tx->max_send_sge - !!skb_headlen(skb); @@ -783,7 +787,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { - struct ipoib_dev_priv *priv = 
netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_tx *tx = wc->qp->qp_context; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; struct ipoib_tx_buf *tx_req; @@ -820,9 +824,12 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) wc->status != IB_WC_WR_FLUSH_ERR) { struct ipoib_neigh *neigh; - ipoib_dbg(priv, "failed cm send event " - "(status=%d, wrid=%d vend_err %x)\n", - wc->status, wr_id, wc->vendor_err); + if (wc->status != IB_WC_RNR_RETRY_EXC_ERR) + ipoib_warn(priv, "failed cm send event (status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + else + ipoib_dbg(priv, "failed cm send event (status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); spin_lock_irqsave(&priv->lock, flags); neigh = tx->neigh; @@ -849,7 +856,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) int ipoib_cm_dev_open(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret; if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) @@ -881,7 +888,7 @@ err_cm: static void ipoib_cm_free_rx_reap_list(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_rx *rx, *n; LIST_HEAD(list); @@ -904,7 +911,7 @@ static void ipoib_cm_free_rx_reap_list(struct net_device *dev) void ipoib_cm_dev_stop(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_rx *p; unsigned long begin; int ret; @@ -948,7 +955,7 @@ void ipoib_cm_dev_stop(struct net_device *dev) break; } spin_unlock_irq(&priv->lock); - msleep(1); + usleep_range(1000, 2000); ipoib_drain_cq(dev); spin_lock_irq(&priv->lock); } @@ -963,7 +970,7 @@ void ipoib_cm_dev_stop(struct net_device *dev) static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_tx *p = cm_id->context; - 
struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); struct ipoib_cm_data *data = event->private_data; struct sk_buff_head skqueue; struct ib_qp_attr qp_attr; @@ -1015,9 +1022,10 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even while ((skb = __skb_dequeue(&skqueue))) { skb->dev = p->dev; - if (dev_queue_xmit(skb)) - ipoib_warn(priv, "dev_queue_xmit failed " - "to requeue packet\n"); + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); } ret = ib_send_cm_rtu(cm_id, NULL, 0); @@ -1030,7 +1038,7 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_init_attr attr = { .send_cq = priv->recv_cq, .recv_cq = priv->recv_cq, @@ -1040,9 +1048,8 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, .qp_context = tx, - .create_flags = IB_QP_CREATE_USE_GFP_NOIO + .create_flags = 0 }; - struct ib_qp *tx_qp; if (dev->features & NETIF_F_SG) @@ -1050,10 +1057,6 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); tx_qp = ib_create_qp(priv->pd, &attr); - if (PTR_ERR(tx_qp) == -EINVAL) { - attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; - tx_qp = ib_create_qp(priv->pd, &attr); - } tx->max_send_sge = attr.cap.max_send_sge; return tx_qp; } @@ -1061,9 +1064,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ static int ipoib_cm_send_req(struct net_device *dev, struct ib_cm_id *id, struct ib_qp *qp, u32 qpn, - struct ib_sa_path_rec *pathrec) + struct sa_path_rec *pathrec) { - struct 
ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_data data = {}; struct ib_cm_req_param req = {}; @@ -1098,7 +1101,7 @@ static int ipoib_cm_send_req(struct net_device *dev, static int ipoib_cm_modify_tx_init(struct net_device *dev, struct ib_cm_id *cm_id, struct ib_qp *qp) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_attr qp_attr; int qp_attr_mask, ret; ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); @@ -1121,13 +1124,14 @@ static int ipoib_cm_modify_tx_init(struct net_device *dev, } static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, - struct ib_sa_path_rec *pathrec) + struct sa_path_rec *pathrec) { - struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + unsigned int noio_flag; int ret; - p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, - GFP_NOIO, PAGE_KERNEL); + noio_flag = memalloc_noio_save(); + p->tx_ring = vzalloc(ipoib_sendq_size * sizeof(*p->tx_ring)); if (!p->tx_ring) { ret = -ENOMEM; goto err_tx; @@ -1135,9 +1139,10 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); p->qp = ipoib_cm_create_tx_qp(p->dev, p); + memalloc_noio_restore(noio_flag); if (IS_ERR(p->qp)) { ret = PTR_ERR(p->qp); - ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); + ipoib_warn(priv, "failed to create tx qp: %d\n", ret); goto err_qp; } @@ -1151,13 +1156,13 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); if (ret) { ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); - goto err_modify; + goto err_modify_send; } ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); if (ret) { ipoib_warn(priv, "failed to send cm req: %d\n", ret); - goto err_send_cm; + goto err_modify_send; } ipoib_dbg(priv, "Request 
connection 0x%x for gid %pI6 qpn 0x%x\n", @@ -1165,8 +1170,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, return 0; -err_send_cm: -err_modify: +err_modify_send: ib_destroy_cm_id(p->id); err_id: p->id = NULL; @@ -1180,7 +1184,7 @@ err_tx: static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) { - struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); struct ipoib_tx_buf *tx_req; unsigned long begin; @@ -1200,7 +1204,7 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) goto timeout; } - msleep(1); + usleep_range(1000, 2000); } } @@ -1230,7 +1234,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_tx *tx = cm_id->context; - struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + struct ipoib_dev_priv *priv = ipoib_priv(tx->dev); struct net_device *dev = priv->dev; struct ipoib_neigh *neigh; unsigned long flags; @@ -1281,7 +1285,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, struct ipoib_neigh *neigh) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_tx *tx; tx = kzalloc(sizeof *tx, GFP_ATOMIC); @@ -1300,7 +1304,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { - struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + struct ipoib_dev_priv *priv = ipoib_priv(tx->dev); unsigned long flags; if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { spin_lock_irqsave(&priv->lock, flags); @@ -1326,7 +1330,7 @@ static void ipoib_cm_tx_start(struct work_struct *work) struct ipoib_path *path; int ret; - struct ib_sa_path_rec pathrec; + struct sa_path_rec pathrec; u32 qpn; netif_tx_lock_bh(dev); @@ -1388,7 +1392,7 @@ static void ipoib_cm_tx_reap(struct work_struct *work) while 
(!list_empty(&priv->cm.reap_list)) { p = list_entry(priv->cm.reap_list.next, typeof(*p), list); - list_del(&p->list); + list_del_init(&p->list); spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); ipoib_cm_tx_destroy(p); @@ -1435,7 +1439,7 @@ static void ipoib_cm_skb_reap(struct work_struct *work) void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, unsigned int mtu) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int e = skb_queue_empty(&priv->cm.skb_queue); if (skb_dst(skb)) @@ -1484,7 +1488,8 @@ static void ipoib_cm_stale_task(struct work_struct *work) static ssize_t show_mode(struct device *d, struct device_attribute *attr, char *buf) { - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d)); + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = ipoib_priv(dev); if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) return sprintf(buf, "connected\n"); @@ -1497,7 +1502,7 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, { struct net_device *dev = to_net_dev(d); int ret; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); if (test_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags)) return -EPERM; @@ -1507,12 +1512,14 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, ret = ipoib_set_mode(dev, buf); - rtnl_unlock(); - - if (!ret) - return count; + /* The assumption is that the function ipoib_set_mode returned + * with the rtnl held by it, if not the value -EBUSY returned, + * then no need to rtnl_unlock + */ + if (ret != -EBUSY) + rtnl_unlock(); - return ret; + return (!ret || ret == -EBUSY) ? 
count : ret; } static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); @@ -1524,7 +1531,7 @@ int ipoib_cm_add_mode_attr(struct net_device *dev) static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_srq_init_attr srq_init_attr = { .srq_type = IB_SRQT_BASIC, .attr = { @@ -1553,7 +1560,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) int ipoib_cm_dev_init(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int max_srq_sge, i; INIT_LIST_HEAD(&priv->cm.passive_ids); @@ -1614,7 +1621,7 @@ int ipoib_cm_dev_init(struct net_device *dev) void ipoib_cm_dev_cleanup(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret; if (!priv->cm.srq) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 7b6d40ff1acf..7871379342f4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -60,12 +60,12 @@ static const struct ipoib_stats ipoib_gstrings_stats[] = { static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { - struct ipoib_dev_priv *priv = netdev_priv(netdev); + struct ipoib_dev_priv *priv = ipoib_priv(netdev); ib_get_device_fw_str(priv->ca, drvinfo->fw_version, sizeof(drvinfo->fw_version)); - strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), + strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent), sizeof(drvinfo->bus_info)); strlcpy(drvinfo->version, ipoib_driver_version, @@ -77,7 +77,7 @@ static void ipoib_get_drvinfo(struct net_device *netdev, static int ipoib_get_coalesce(struct net_device *dev, struct ethtool_coalesce *coal) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = 
ipoib_priv(dev); coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; @@ -88,7 +88,7 @@ static int ipoib_get_coalesce(struct net_device *dev, static int ipoib_set_coalesce(struct net_device *dev, struct ethtool_coalesce *coal) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret; /* @@ -155,7 +155,66 @@ static int ipoib_get_sset_count(struct net_device __always_unused *dev, return -EOPNOTSUPP; } +/* Return lane speed in unit of 1e6 bit/sec */ +static inline int ib_speed_enum_to_int(int speed) +{ + switch (speed) { + case IB_SPEED_SDR: + return SPEED_2500; + case IB_SPEED_DDR: + return SPEED_5000; + case IB_SPEED_QDR: + case IB_SPEED_FDR10: + return SPEED_10000; + case IB_SPEED_FDR: + return SPEED_14000; + case IB_SPEED_EDR: + return SPEED_25000; + } + + return SPEED_UNKNOWN; +} + +static int ipoib_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + struct ipoib_dev_priv *priv = ipoib_priv(netdev); + struct ib_port_attr attr; + int ret, speed, width; + + if (!netif_carrier_ok(netdev)) { + cmd->base.speed = SPEED_UNKNOWN; + cmd->base.duplex = DUPLEX_UNKNOWN; + return 0; + } + + ret = ib_query_port(priv->ca, priv->port, &attr); + if (ret < 0) + return -EINVAL; + + speed = ib_speed_enum_to_int(attr.active_speed); + width = ib_width_enum_to_int(attr.active_width); + + if (speed < 0 || width < 0) + return -EINVAL; + + /* Except the following are set, the other members of + * the struct ethtool_link_settings are initialized to + * zero in the function __ethtool_get_link_ksettings. 
+ */ + cmd->base.speed = speed * width; + cmd->base.duplex = DUPLEX_FULL; + + cmd->base.phy_address = 0xFF; + + cmd->base.autoneg = AUTONEG_ENABLE; + cmd->base.port = PORT_OTHER; + + return 0; +} + static const struct ethtool_ops ipoib_ethtool_ops = { + .get_link_ksettings = ipoib_get_link_ksettings, .get_drvinfo = ipoib_get_drvinfo, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 6bd5740e2691..11f74cbe6660 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -210,16 +210,16 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) seq_printf(file, "GID: %s\n" " complete: %6s\n", - gid_buf, path.pathrec.dlid ? "yes" : "no"); + gid_buf, sa_path_get_dlid(&path.pathrec) ? "yes" : "no"); - if (path.pathrec.dlid) { + if (sa_path_get_dlid(&path.pathrec)) { rate = ib_rate_to_mbps(path.pathrec.rate); seq_printf(file, " DLID: 0x%04x\n" " SL: %12d\n" " rate: %8d.%d Gb/sec\n", - be16_to_cpu(path.pathrec.dlid), + be32_to_cpu(sa_path_get_dlid(&path.pathrec)), path.pathrec.sl, rate / 1000, rate % 1000); } @@ -261,7 +261,7 @@ static const struct file_operations ipoib_path_fops = { void ipoib_create_debug_files(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); char name[IFNAMSIZ + sizeof "_path"]; snprintf(name, sizeof name, "%s_mcg", dev->name); @@ -279,10 +279,13 @@ void ipoib_create_debug_files(struct net_device *dev) void ipoib_delete_debug_files(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + WARN_ONCE(!priv->mcg_dentry, "null mcg debug file\n"); + WARN_ONCE(!priv->path_dentry, "null path debug file\n"); debugfs_remove(priv->mcg_dentry); debugfs_remove(priv->path_dentry); + priv->mcg_dentry = priv->path_dentry = NULL; } int 
ipoib_register_debugfs(void) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 5038f9d2d753..57a9655e844d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -52,7 +52,7 @@ MODULE_PARM_DESC(data_debug_level, #endif struct ipoib_ah *ipoib_create_ah(struct net_device *dev, - struct ib_pd *pd, struct ib_ah_attr *attr) + struct ib_pd *pd, struct rdma_ah_attr *attr) { struct ipoib_ah *ah; struct ib_ah *vah; @@ -65,13 +65,13 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev, ah->last_send = 0; kref_init(&ah->ref); - vah = ib_create_ah(pd, attr); + vah = rdma_create_ah(pd, attr); if (IS_ERR(vah)) { kfree(ah); ah = (struct ipoib_ah *)vah; } else { ah->ah = vah; - ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); + ipoib_dbg(ipoib_priv(dev), "Created ah %p\n", ah->ah); } return ah; @@ -80,7 +80,7 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev, void ipoib_free_ah(struct kref *kref) { struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); - struct ipoib_dev_priv *priv = netdev_priv(ah->dev); + struct ipoib_dev_priv *priv = ipoib_priv(ah->dev); unsigned long flags; @@ -99,7 +99,7 @@ static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, static int ipoib_ib_post_receive(struct net_device *dev, int id) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_recv_wr *bad_wr; int ret; @@ -121,7 +121,7 @@ static int ipoib_ib_post_receive(struct net_device *dev, int id) static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sk_buff *skb; int buf_size; u64 *mapping; @@ -153,7 +153,7 @@ error: static int ipoib_ib_post_receives(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int i; for (i 
= 0; i < ipoib_recvq_size; ++i) { @@ -172,7 +172,7 @@ static int ipoib_ib_post_receives(struct net_device *dev) static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; struct sk_buff *skb; u64 mapping[IPOIB_UD_RX_SG]; @@ -381,7 +381,7 @@ free_res: static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); unsigned int wr_id = wc->wr_id; struct ipoib_tx_buf *tx_req; @@ -485,14 +485,14 @@ poll_more: void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) { struct net_device *dev = dev_ptr; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); napi_schedule(&priv->napi); } static void drain_tx_cq(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); netif_tx_lock(dev); while (poll_tx(priv)) @@ -506,14 +506,14 @@ static void drain_tx_cq(struct net_device *dev) void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) { - struct ipoib_dev_priv *priv = netdev_priv(dev_ptr); + struct ipoib_dev_priv *priv = ipoib_priv(dev_ptr); mod_timer(&priv->poll_timer, jiffies); } static inline int post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, - struct ib_ah *address, u32 qpn, + struct ib_ah *address, u32 dqpn, struct ipoib_tx_buf *tx_req, void *head, int hlen) { @@ -523,7 +523,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, ipoib_build_sge(priv, tx_req); priv->tx_wr.wr.wr_id = wr_id; - priv->tx_wr.remote_qpn = qpn; + priv->tx_wr.remote_qpn = dqpn; priv->tx_wr.ah = address; if (head) { @@ -537,10 +537,10 @@ static inline int post_send(struct ipoib_dev_priv *priv, return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr); } -void ipoib_send(struct net_device 
*dev, struct sk_buff *skb, - struct ipoib_ah *address, u32 qpn) +int ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_tx_buf *tx_req; int hlen, rc; void *phead; @@ -554,7 +554,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ++dev->stats.tx_dropped; ++dev->stats.tx_errors; dev_kfree_skb_any(skb); - return; + return -1; } } else { if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { @@ -563,7 +563,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ++dev->stats.tx_dropped; ++dev->stats.tx_errors; ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); - return; + return -1; } phead = NULL; hlen = 0; @@ -574,7 +574,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ++dev->stats.tx_dropped; ++dev->stats.tx_errors; dev_kfree_skb_any(skb); - return; + return -1; } /* Does skb_linearize return ok without reducing nr_frags? 
*/ if (skb_shinfo(skb)->nr_frags > usable_sge) { @@ -582,12 +582,13 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ++dev->stats.tx_dropped; ++dev->stats.tx_errors; dev_kfree_skb_any(skb); - return; + return -1; } } - ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", - skb->len, address, qpn); + ipoib_dbg_data(priv, + "sending packet, length=%d address=%p dqpn=0x%06x\n", + skb->len, address, dqpn); /* * We put the skb into the tx_ring _before_ we call post_send() @@ -601,7 +602,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { ++dev->stats.tx_errors; dev_kfree_skb_any(skb); - return; + return -1; } if (skb->ip_summed == CHECKSUM_PARTIAL) @@ -620,7 +621,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, skb_dst_drop(skb); rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), - address->ah, qpn, tx_req, phead, hlen); + address, dqpn, tx_req, phead, hlen); if (unlikely(rc)) { ipoib_warn(priv, "post_send failed, error %d\n", rc); ++dev->stats.tx_errors; @@ -629,21 +630,24 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, dev_kfree_skb_any(skb); if (netif_queue_stopped(dev)) netif_wake_queue(dev); + rc = 0; } else { netif_trans_update(dev); - address->last_send = priv->tx_head; + rc = priv->tx_head; ++priv->tx_head; } if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) while (poll_tx(priv)) ; /* nothing */ + + return rc; } static void __ipoib_reap_ah(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_ah *ah, *tah; LIST_HEAD(remove_list); unsigned long flags; @@ -654,7 +658,7 @@ static void __ipoib_reap_ah(struct net_device *dev) list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) if ((int) priv->tx_tail - (int) ah->last_send >= 0) { list_del(&ah->list); - ib_destroy_ah(ah->ah); + rdma_destroy_ah(ah->ah); kfree(ah); } @@ -677,7 +681,7 @@ 
void ipoib_reap_ah(struct work_struct *work) static void ipoib_flush_ah(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); cancel_delayed_work(&priv->ah_reap_task); flush_workqueue(priv->wq); @@ -686,30 +690,124 @@ static void ipoib_flush_ah(struct net_device *dev) static void ipoib_stop_ah(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); set_bit(IPOIB_STOP_REAPER, &priv->flags); ipoib_flush_ah(dev); } -static void ipoib_ib_tx_timer_func(unsigned long ctx) +static int recvs_pending(struct net_device *dev) { - drain_tx_cq((struct net_device *)ctx); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int pending = 0; + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (priv->rx_ring[i].skb) + ++pending; + + return pending; } -int ipoib_ib_dev_open(struct net_device *dev) +int ipoib_ib_dev_stop_default(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - int ret; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr qp_attr; + unsigned long begin; + struct ipoib_tx_buf *tx_req; + int i; - ipoib_pkey_dev_check_presence(dev); + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_disable(&priv->napi); - if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { - ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey, - (!(priv->pkey & 0x7fff) ? "Invalid" : "not found")); - return -1; + ipoib_cm_dev_stop(dev); + + /* + * Move our QP to the error state and then reinitialize in + * when all work requests have completed or have been flushed. 
+ */ + qp_attr.qp_state = IB_QPS_ERR; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); + + /* Wait for all sends and receives to complete */ + begin = jiffies; + + while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, + "timing out; %d sends %d receives not completed\n", + priv->tx_head - priv->tx_tail, + recvs_pending(dev)); + + /* + * assume the HW is wedged and just free up + * all our pending work requests. + */ + while ((int)priv->tx_tail - (int)priv->tx_head < 0) { + tx_req = &priv->tx_ring[priv->tx_tail & + (ipoib_sendq_size - 1)]; + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(tx_req->skb); + ++priv->tx_tail; + --priv->tx_outstanding; + } + + for (i = 0; i < ipoib_recvq_size; ++i) { + struct ipoib_rx_buf *rx_req; + + rx_req = &priv->rx_ring[i]; + if (!rx_req->skb) + continue; + ipoib_ud_dma_unmap_rx(priv, + priv->rx_ring[i].mapping); + dev_kfree_skb_any(rx_req->skb); + rx_req->skb = NULL; + } + + goto timeout; + } + + ipoib_drain_cq(dev); + + usleep_range(1000, 2000); } + ipoib_dbg(priv, "All sends and receives done.\n"); + +timeout: + del_timer_sync(&priv->poll_timer); + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); + + return 0; +} + +int ipoib_ib_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + priv->rn_ops->ndo_stop(dev); + + clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + ipoib_flush_ah(dev); + + return 0; +} + +void ipoib_ib_tx_timer_func(unsigned long ctx) +{ + drain_tx_cq((struct net_device *)ctx); +} + +int ipoib_ib_dev_open_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + ret = ipoib_init_qp(dev); if (ret) { ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); @@ 
-719,33 +817,59 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - goto dev_stop; + goto out; } ret = ipoib_cm_dev_open(dev); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); - goto dev_stop; + goto out; + } + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + + return 0; +out: + return -1; +} + +int ipoib_ib_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey, + (!(priv->pkey & 0x7fff) ? "Invalid" : "not found")); + return -1; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); - if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - napi_enable(&priv->napi); + if (priv->rn_ops->ndo_open(dev)) { + pr_warn("%s: Failed to open dev\n", dev->name); + goto dev_stop; + } + + set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); return 0; + dev_stop: - if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - napi_enable(&priv->napi); + set_bit(IPOIB_STOP_REAPER, &priv->flags); + cancel_delayed_work(&priv->ah_reap_task); + set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); ipoib_ib_dev_stop(dev); return -1; } void ipoib_pkey_dev_check_presence(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); if (!(priv->pkey & 0x7fff) || ib_find_pkey(priv->ca, priv->port, priv->pkey, @@ -755,25 +879,25 @@ void ipoib_pkey_dev_check_presence(struct net_device *dev) set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } -int ipoib_ib_dev_up(struct net_device *dev) +void ipoib_ib_dev_up(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); 
ipoib_pkey_dev_check_presence(dev); if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { ipoib_dbg(priv, "PKEY is not assigned.\n"); - return 0; + return; } set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); - return ipoib_mcast_start_thread(dev); + ipoib_mcast_start_thread(dev); } -int ipoib_ib_dev_down(struct net_device *dev) +void ipoib_ib_dev_down(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "downing ib_dev\n"); @@ -784,26 +908,11 @@ int ipoib_ib_dev_down(struct net_device *dev) ipoib_mcast_dev_flush(dev); ipoib_flush_paths(dev); - - return 0; -} - -static int recvs_pending(struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - int pending = 0; - int i; - - for (i = 0; i < ipoib_recvq_size; ++i) - if (priv->rx_ring[i].skb) - ++pending; - - return pending; } void ipoib_drain_cq(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int i, n; /* @@ -840,109 +949,6 @@ void ipoib_drain_cq(struct net_device *dev) local_bh_enable(); } -int ipoib_ib_dev_stop(struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_qp_attr qp_attr; - unsigned long begin; - struct ipoib_tx_buf *tx_req; - int i; - - if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - napi_disable(&priv->napi); - - ipoib_cm_dev_stop(dev); - - /* - * Move our QP to the error state and then reinitialize in - * when all work requests have completed or have been flushed. 
- */ - qp_attr.qp_state = IB_QPS_ERR; - if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) - ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); - - /* Wait for all sends and receives to complete */ - begin = jiffies; - - while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { - if (time_after(jiffies, begin + 5 * HZ)) { - ipoib_warn(priv, "timing out; %d sends %d receives not completed\n", - priv->tx_head - priv->tx_tail, recvs_pending(dev)); - - /* - * assume the HW is wedged and just free up - * all our pending work requests. - */ - while ((int) priv->tx_tail - (int) priv->tx_head < 0) { - tx_req = &priv->tx_ring[priv->tx_tail & - (ipoib_sendq_size - 1)]; - ipoib_dma_unmap_tx(priv, tx_req); - dev_kfree_skb_any(tx_req->skb); - ++priv->tx_tail; - --priv->tx_outstanding; - } - - for (i = 0; i < ipoib_recvq_size; ++i) { - struct ipoib_rx_buf *rx_req; - - rx_req = &priv->rx_ring[i]; - if (!rx_req->skb) - continue; - ipoib_ud_dma_unmap_rx(priv, - priv->rx_ring[i].mapping); - dev_kfree_skb_any(rx_req->skb); - rx_req->skb = NULL; - } - - goto timeout; - } - - ipoib_drain_cq(dev); - - msleep(1); - } - - ipoib_dbg(priv, "All sends and receives done.\n"); - -timeout: - del_timer_sync(&priv->poll_timer); - qp_attr.qp_state = IB_QPS_RESET; - if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) - ipoib_warn(priv, "Failed to modify QP to RESET state\n"); - - ipoib_flush_ah(dev); - - ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); - - return 0; -} - -int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - - priv->ca = ca; - priv->port = port; - priv->qp = NULL; - - if (ipoib_transport_dev_init(dev, ca)) { - printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name); - return -ENODEV; - } - - setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, - (unsigned long) dev); - - if (dev->flags & IFF_UP) { - if (ipoib_ib_dev_open(dev)) { - ipoib_transport_dev_cleanup(dev); - return 
-ENODEV; - } - } - - return 0; -} - /* * Takes whatever value which is in pkey index 0 and updates priv->pkey * returns 0 if the pkey value was changed. @@ -971,6 +977,19 @@ static inline int update_parent_pkey(struct ipoib_dev_priv *priv) */ priv->dev->broadcast[8] = priv->pkey >> 8; priv->dev->broadcast[9] = priv->pkey & 0xff; + + /* + * Update the broadcast address in the priv->broadcast object, + * in case it already exists, otherwise no one will do that. + */ + if (priv->broadcast) { + spin_lock_irq(&priv->lock); + memcpy(priv->broadcast->mcmember.mgid.raw, + priv->dev->broadcast + 4, + sizeof(union ib_gid)); + spin_unlock_irq(&priv->lock); + } + return 0; } @@ -1220,7 +1239,7 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work) void ipoib_ib_dev_cleanup(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); /* @@ -1240,7 +1259,13 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) */ ipoib_stop_ah(dev); - ipoib_transport_dev_cleanup(dev); -} + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + priv->rn_ops->ndo_uninit(dev); + + if (priv->pd) { + ib_dealloc_pd(priv->pd); + priv->pd = NULL; + } +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 3ce0765a05ab..4ce315c92b48 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -108,9 +108,36 @@ static struct ib_client ipoib_client = { .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static int ipoib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netdev_notifier_info *ni = ptr; + struct net_device *dev = ni->dev; + + if (dev->netdev_ops->ndo_open != ipoib_open) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + ipoib_create_debug_files(dev); + break; + case NETDEV_CHANGENAME: + 
ipoib_delete_debug_files(dev); + ipoib_create_debug_files(dev); + break; + case NETDEV_UNREGISTER: + ipoib_delete_debug_files(dev); + break; + } + + return NOTIFY_DONE; +} +#endif + int ipoib_open(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "bringing up interface\n"); @@ -126,8 +153,7 @@ int ipoib_open(struct net_device *dev) goto err_disable; } - if (ipoib_ib_dev_up(dev)) - goto err_stop; + ipoib_ib_dev_up(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; @@ -150,9 +176,6 @@ int ipoib_open(struct net_device *dev) return 0; -err_stop: - ipoib_ib_dev_stop(dev); - err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); @@ -161,7 +184,7 @@ err_disable: static int ipoib_stop(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "stopping interface\n"); @@ -199,7 +222,7 @@ static void ipoib_uninit(struct net_device *dev) static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); @@ -209,7 +232,8 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret = 0; /* dev->mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(dev)) { @@ -229,9 +253,38 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) priv->admin_mtu = new_mtu; - dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + if (priv->mcast_mtu < priv->admin_mtu) + ipoib_dbg(priv, "MTU must be smaller than the underlying " + "link layer MTU - 4 
(%u)\n", priv->mcast_mtu); - return 0; + new_mtu = min(priv->mcast_mtu, priv->admin_mtu); + + if (priv->rn_ops->ndo_change_mtu) { + bool carrier_status = netif_carrier_ok(dev); + + netif_carrier_off(dev); + + /* notify lower level on the real mtu */ + ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); + + if (carrier_status) + netif_carrier_on(dev); + } else { + dev->mtu = new_mtu; + } + + return ret; +} + +static void ipoib_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (priv->rn_ops->ndo_get_stats64) + priv->rn_ops->ndo_get_stats64(dev, stats); + else + netdev_stats_to_stats64(stats, &dev->stats); } /* Called with an RCU read lock taken */ @@ -468,7 +521,14 @@ static struct net_device *ipoib_get_net_dev_by_params( int ipoib_set_mode(struct net_device *dev, const char *buf) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + !strcmp(buf, "connected\n")) || + (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + !strcmp(buf, "datagram\n"))) { + return 0; + } /* flush paths if we switch modes so that connections are restarted */ if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { @@ -481,8 +541,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf) priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; ipoib_flush_paths(dev); - rtnl_lock(); - return 0; + return (!rtnl_trylock()) ? -EBUSY : 0; } if (!strcmp(buf, "datagram\n")) { @@ -491,8 +550,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf) dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); rtnl_unlock(); ipoib_flush_paths(dev); - rtnl_lock(); - return 0; + return (!rtnl_trylock()) ? 
-EBUSY : 0; } return -EINVAL; @@ -500,7 +558,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf) struct ipoib_path *__path_find(struct net_device *dev, void *gid) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; @@ -524,7 +582,7 @@ struct ipoib_path *__path_find(struct net_device *dev, void *gid) static int __path_add(struct net_device *dev, struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; @@ -559,7 +617,7 @@ static void path_free(struct net_device *dev, struct ipoib_path *path) while ((skb = __skb_dequeue(&path->queue))) dev_kfree_skb_irq(skb); - ipoib_dbg(netdev_priv(dev), "path_free\n"); + ipoib_dbg(ipoib_priv(dev), "path_free\n"); /* remove all neigh connected to this path */ ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); @@ -593,7 +651,7 @@ struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) int ipoib_path_iter_next(struct ipoib_path_iter *iter) { - struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); struct rb_node *n; struct ipoib_path *path; int ret = 1; @@ -630,95 +688,32 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter, void ipoib_mark_paths_invalid(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { - ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n", - be16_to_cpu(path->pathrec.dlid), - path->pathrec.dgid.raw); + ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n", + be32_to_cpu(sa_path_get_dlid(&path->pathrec)), + path->pathrec.dgid.raw); 
path->valid = 0; } spin_unlock_irq(&priv->lock); } -struct classport_info_context { - struct ipoib_dev_priv *priv; - struct completion done; - struct ib_sa_query *sa_query; -}; - -static void classport_info_query_cb(int status, struct ib_class_port_info *rec, - void *context) +static void push_pseudo_header(struct sk_buff *skb, const char *daddr) { - struct classport_info_context *cb_ctx = context; - struct ipoib_dev_priv *priv; - - WARN_ON(!context); - - priv = cb_ctx->priv; - - if (status || !rec) { - pr_debug("device: %s failed query classport_info status: %d\n", - priv->dev->name, status); - /* keeps the default, will try next mcast_restart */ - priv->sm_fullmember_sendonly_support = false; - goto out; - } - - if (ib_get_cpi_capmask2(rec) & - IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) { - pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n", - priv->dev->name); - priv->sm_fullmember_sendonly_support = true; - } else { - pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n", - priv->dev->name); - priv->sm_fullmember_sendonly_support = false; - } - -out: - complete(&cb_ctx->done); -} - -int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv) -{ - struct classport_info_context *callback_context; - int ret; - - callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL); - if (!callback_context) - return -ENOMEM; - - callback_context->priv = priv; - init_completion(&callback_context->done); - - ret = ib_sa_classport_info_rec_query(&ipoib_sa_client, - priv->ca, priv->port, 3000, - GFP_KERNEL, - classport_info_query_cb, - callback_context, - &callback_context->sa_query); - if (ret < 0) { - pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n", - priv->dev->name, ret); - kfree(callback_context); - return ret; - } - - /* waiting for the callback to finish before returnning */ - wait_for_completion(&callback_context->done); - kfree(callback_context); + struct ipoib_pseudo_header *phdr; - return ret; + 
phdr = skb_push(skb, sizeof(*phdr)); + memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); } void ipoib_flush_paths(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; @@ -747,12 +742,12 @@ void ipoib_flush_paths(struct net_device *dev) } static void path_rec_completion(int status, - struct ib_sa_path_rec *pathrec, + struct sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct net_device *dev = path->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ipoib_neigh *neigh, *tn; @@ -762,7 +757,8 @@ static void path_rec_completion(int status, if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", - be16_to_cpu(pathrec->dlid), pathrec->dgid.raw); + be32_to_cpu(sa_path_get_dlid(pathrec)), + pathrec->dgid.raw); else ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", status, path->pathrec.dgid.raw); @@ -770,7 +766,7 @@ static void path_rec_completion(int status, skb_queue_head_init(&skqueue); if (!status) { - struct ib_ah_attr av; + struct rdma_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(dev, priv->pd, &av); @@ -785,7 +781,8 @@ static void path_rec_completion(int status, path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", - ah, be16_to_cpu(pathrec->dlid), pathrec->sl); + ah, be32_to_cpu(sa_path_get_dlid(pathrec)), + pathrec->sl); while ((skb = __skb_dequeue(&path->queue))) __skb_queue_tail(&skqueue, skb); @@ -834,16 +831,18 @@ static void path_rec_completion(int status, ipoib_put_ah(old_ah); while ((skb = __skb_dequeue(&skqueue))) { + int ret; skb->dev = dev; - if (dev_queue_xmit(skb)) - ipoib_warn(priv, "dev_queue_xmit failed " - "to requeue packet\n"); + ret = dev_queue_xmit(skb); + if (ret) 
+ ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); } } static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path; if (!priv->broadcast) @@ -859,6 +858,10 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) INIT_LIST_HEAD(&path->neigh_list); + if (rdma_cap_opa_ah(priv->ca, priv->port)) + path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; + else + path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); @@ -871,7 +874,7 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) static int path_rec_start(struct net_device *dev, struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "Start path record lookup for %pI6\n", path->pathrec.dgid.raw); @@ -902,7 +905,8 @@ static int path_rec_start(struct net_device *dev, static void neigh_add_path(struct sk_buff *skb, u8 *daddr, struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; struct ipoib_neigh *neigh; unsigned long flags; @@ -940,8 +944,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - /* put pseudoheader back on for next time */ - skb_push(skb, IPOIB_PSEUDO_LEN); + push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { ipoib_warn(priv, "queue length limit %d. 
Packet drop.\n", @@ -950,7 +953,8 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, } } else { spin_unlock_irqrestore(&priv->lock, flags); - ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr)); + path->ah->last_send = rn->send(dev, skb, path->ah->ah, + IPOIB_QPN(daddr)); ipoib_neigh_put(neigh); return; } @@ -959,10 +963,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, if (!path->query && path_rec_start(dev, path)) goto err_path; - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); - else + } else { goto err_drop; + } } spin_unlock_irqrestore(&priv->lock, flags); @@ -982,7 +988,8 @@ err_drop: static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, struct ipoib_pseudo_header *phdr) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; unsigned long flags; @@ -998,8 +1005,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, } if (path) { if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - /* put pseudoheader back on for next time */ - skb_push(skb, IPOIB_PSEUDO_LEN); + push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); } else { ++dev->stats.tx_dropped; @@ -1023,16 +1029,16 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, } if (path->ah) { - ipoib_dbg(priv, "Send unicast ARP to %04x\n", - be16_to_cpu(path->pathrec.dlid)); + ipoib_dbg(priv, "Send unicast ARP to %08x\n", + be32_to_cpu(sa_path_get_dlid(&path->pathrec))); spin_unlock_irqrestore(&priv->lock, flags); - ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); + path->ah->last_send = rn->send(dev, skb, path->ah->ah, + IPOIB_QPN(phdr->hwaddr)); return; } else if ((path->query || !path_rec_start(dev, path)) && 
skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - /* put pseudoheader back on for next time */ - skb_push(skb, IPOIB_PSEUDO_LEN); + push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); } else { ++dev->stats.tx_dropped; @@ -1044,7 +1050,8 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_neigh *neigh; struct ipoib_pseudo_header *phdr; struct ipoib_header *header; @@ -1108,13 +1115,13 @@ send_using_neigh: goto unref; } } else if (neigh->ah) { - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr)); + neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah, + IPOIB_QPN(phdr->hwaddr)); goto unref; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof(*phdr)); + push_pseudo_header(skb, phdr->hwaddr); spin_lock_irqsave(&priv->lock, flags); __skb_queue_tail(&neigh->queue, skb); spin_unlock_irqrestore(&priv->lock, flags); @@ -1131,7 +1138,7 @@ unref: static void ipoib_timeout(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_warn(priv, "transmit timeout: latency %d msecs\n", jiffies_to_msecs(jiffies - dev_trans_start(dev))); @@ -1146,10 +1153,9 @@ static int ipoib_hard_header(struct sk_buff *skb, unsigned short type, const void *daddr, const void *saddr, unsigned len) { - struct ipoib_pseudo_header *phdr; struct ipoib_header *header; - header = (struct ipoib_header *) skb_push(skb, sizeof *header); + header = skb_push(skb, sizeof *header); header->proto = htons(type); header->reserved = 0; @@ -1159,15 +1165,14 @@ static int ipoib_hard_header(struct sk_buff *skb, * destination address into skb hard header so we can figure out 
where * to send the packet later. */ - phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr)); - memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); + push_pseudo_header(skb, daddr); return IPOIB_HARD_LEN; } static void ipoib_set_mcast_list(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); @@ -1179,7 +1184,7 @@ static void ipoib_set_mcast_list(struct net_device *dev) static int ipoib_get_iflink(const struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); /* parent interface */ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) @@ -1207,7 +1212,7 @@ static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh *neigh = NULL; @@ -1286,7 +1291,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; @@ -1336,7 +1341,7 @@ static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh *neigh; @@ -1393,7 +1398,7 @@ void ipoib_neigh_dtor(struct ipoib_neigh *neigh) { /* neigh reference count was dropprd to zero */ struct net_device *dev = neigh->dev; - struct 
ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sk_buff *skb; if (neigh->ah) ipoib_put_ah(neigh->ah); @@ -1403,7 +1408,7 @@ void ipoib_neigh_dtor(struct ipoib_neigh *neigh) } if (ipoib_cm_get(neigh)) ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); - ipoib_dbg(netdev_priv(dev), + ipoib_dbg(ipoib_priv(dev), "neigh free for %06x %pI6\n", IPOIB_QPN(neigh->daddr), neigh->daddr + 4); @@ -1425,7 +1430,7 @@ static void ipoib_neigh_reclaim(struct rcu_head *rp) void ipoib_neigh_free(struct ipoib_neigh *neigh) { struct net_device *dev = neigh->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh __rcu **np; @@ -1450,7 +1455,7 @@ void ipoib_neigh_free(struct ipoib_neigh *neigh) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); return; } else { @@ -1508,7 +1513,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long flags; @@ -1535,7 +1540,7 @@ void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; @@ -1577,7 +1582,7 @@ static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ - list_del(&neigh->list); + list_del_init(&neigh->list); 
call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } } @@ -1594,7 +1599,7 @@ out_unlock: static void ipoib_neigh_hash_uninit(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int stopped; ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); @@ -1611,10 +1616,28 @@ static void ipoib_neigh_hash_uninit(struct net_device *dev) wait_for_completion(&priv->ntbl.deleted); } +static void ipoib_dev_uninit_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_transport_dev_cleanup(dev); -int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) + netif_napi_del(&priv->napi); + + ipoib_cm_dev_cleanup(dev); + + kfree(priv->rx_ring); + vfree(priv->tx_ring); + + priv->rx_ring = NULL; + priv->tx_ring = NULL; +} + +static int ipoib_dev_init_default(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, @@ -1625,46 +1648,112 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", - ca->name, ipoib_sendq_size); + priv->ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ - if (ipoib_ib_dev_init(dev, ca, port)) + if (ipoib_transport_dev_init(dev, priv->ca)) { + pr_warn("%s: ipoib_transport_dev_init failed\n", + priv->ca->name); goto out_tx_ring_cleanup; + } + + /* after qp created set dev address */ + priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; + priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; + priv->dev->dev_addr[3] = (priv->qp->qp_num) & 0xff; + + 
setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, + (unsigned long)dev); + + return 0; + +out_tx_ring_cleanup: + vfree(priv->tx_ring); + +out_rx_ring_cleanup: + kfree(priv->rx_ring); + +out: + netif_napi_del(&priv->napi); + return -ENOMEM; +} + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret = -ENOMEM; + + priv->ca = ca; + priv->port = port; + priv->qp = NULL; /* - * Must be after ipoib_ib_dev_init so we can allocate a per - * device wq there and use it here + * the various IPoIB tasks assume they will never race against + * themselves, so always use a single thread workqueue */ - if (ipoib_neigh_hash_init(priv) < 0) + priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM); + if (!priv->wq) { + pr_warn("%s: failed to allocate device WQ\n", dev->name); + goto out; + } + + /* create pd, which used both for control and datapath*/ + priv->pd = ib_alloc_pd(priv->ca, 0); + if (IS_ERR(priv->pd)) { + pr_warn("%s: failed to allocate PD\n", ca->name); + goto clean_wq; + } + + ret = priv->rn_ops->ndo_init(dev); + if (ret) { + pr_warn("%s failed to init HW resource\n", dev->name); + goto out_free_pd; + } + + if (ipoib_neigh_hash_init(priv) < 0) { + pr_warn("%s failed to init neigh hash\n", dev->name); goto out_dev_uninit; + } + + if (dev->flags & IFF_UP) { + if (ipoib_ib_dev_open(dev)) { + pr_warn("%s failed to open device\n", dev->name); + ret = -ENODEV; + goto out_dev_uninit; + } + } return 0; out_dev_uninit: ipoib_ib_dev_cleanup(dev); -out_tx_ring_cleanup: - vfree(priv->tx_ring); +out_free_pd: + if (priv->pd) { + ib_dealloc_pd(priv->pd); + priv->pd = NULL; + } -out_rx_ring_cleanup: - kfree(priv->rx_ring); +clean_wq: + if (priv->wq) { + destroy_workqueue(priv->wq); + priv->wq = NULL; + } out: - return -ENOMEM; + return ret; } void ipoib_dev_cleanup(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; + struct ipoib_dev_priv *priv = 
ipoib_priv(dev), *cpriv, *tcpriv; LIST_HEAD(head); ASSERT_RTNL(); - ipoib_delete_debug_files(dev); - /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { /* Stop GC on child */ @@ -1674,24 +1763,21 @@ void ipoib_dev_cleanup(struct net_device *dev) } unregister_netdevice_many(&head); - /* - * Must be before ipoib_ib_dev_cleanup or we delete an in use - * work queue - */ ipoib_neigh_hash_uninit(dev); ipoib_ib_dev_cleanup(dev); - kfree(priv->rx_ring); - vfree(priv->tx_ring); - - priv->rx_ring = NULL; - priv->tx_ring = NULL; + /* no more works over the priv->wq */ + if (priv->wq) { + flush_workqueue(priv->wq); + destroy_workqueue(priv->wq); + priv->wq = NULL; + } } static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state); } @@ -1699,7 +1785,7 @@ static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_stat static int ipoib_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivf) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int err; err = ib_get_vf_config(priv->ca, vf, priv->port, ivf); @@ -1713,7 +1799,7 @@ static int ipoib_get_vf_config(struct net_device *dev, int vf, static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID) return -EINVAL; @@ -1724,7 +1810,7 @@ static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) static int ipoib_get_vf_stats(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); return 
ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); } @@ -1748,6 +1834,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = { .ndo_get_vf_stats = ipoib_get_vf_stats, .ndo_set_vf_guid = ipoib_set_vf_guid, .ndo_set_mac_address = ipoib_set_mac, + .ndo_get_stats64 = ipoib_get_stats, }; static const struct net_device_ops ipoib_netdev_ops_vf = { @@ -1762,21 +1849,12 @@ static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_get_iflink = ipoib_get_iflink, }; -void ipoib_setup(struct net_device *dev) +void ipoib_setup_common(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - - if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION) - dev->netdev_ops = &ipoib_netdev_ops_vf; - else - dev->netdev_ops = &ipoib_netdev_ops_pf; - dev->header_ops = &ipoib_header_ops; ipoib_set_ethtool_ops(dev); - netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); - dev->watchdog_timeo = HZ; dev->flags |= IFF_BROADCAST | IFF_MULTICAST; @@ -1790,11 +1868,14 @@ void ipoib_setup(struct net_device *dev) netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); +} - priv->dev = dev; +static void ipoib_build_priv(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + priv->dev = dev; spin_lock_init(&priv->lock); - init_rwsem(&priv->vlan_rwsem); INIT_LIST_HEAD(&priv->path_list); @@ -1812,22 +1893,100 @@ void ipoib_setup(struct net_device *dev) INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); } -struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) +static const struct net_device_ops ipoib_netdev_default_pf = { + .ndo_init = ipoib_dev_init_default, + .ndo_uninit = ipoib_dev_uninit_default, + .ndo_open = ipoib_ib_dev_open_default, + .ndo_stop = ipoib_ib_dev_stop_default, +}; + +static struct net_device +*ipoib_create_netdev_default(struct ib_device *hca, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)) { struct net_device *dev; + struct rdma_netdev *rn; - 
dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name, - NET_NAME_UNKNOWN, ipoib_setup); + dev = alloc_netdev((int)sizeof(struct rdma_netdev), + name, + name_assign_type, setup); if (!dev) return NULL; - return netdev_priv(dev); + rn = netdev_priv(dev); + + rn->send = ipoib_send; + rn->attach_mcast = ipoib_mcast_attach; + rn->detach_mcast = ipoib_mcast_detach; + rn->free_rdma_netdev = free_netdev; + rn->hca = hca; + + dev->netdev_ops = &ipoib_netdev_default_pf; + + return dev; +} + +static struct net_device *ipoib_get_netdev(struct ib_device *hca, u8 port, + const char *name) +{ + struct net_device *dev; + + if (hca->alloc_rdma_netdev) { + dev = hca->alloc_rdma_netdev(hca, port, + RDMA_NETDEV_IPOIB, name, + NET_NAME_UNKNOWN, + ipoib_setup_common); + if (IS_ERR_OR_NULL(dev) && PTR_ERR(dev) != -EOPNOTSUPP) + return NULL; + } + + if (!hca->alloc_rdma_netdev || PTR_ERR(dev) == -EOPNOTSUPP) + dev = ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN, + ipoib_setup_common); + + return dev; +} + +struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port, + const char *name) +{ + struct net_device *dev; + struct ipoib_dev_priv *priv; + struct rdma_netdev *rn; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return NULL; + + dev = ipoib_get_netdev(hca, port, name); + if (!dev) + goto free_priv; + + priv->rn_ops = dev->netdev_ops; + + /* fixme : should be after the query_cap */ + if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION) + dev->netdev_ops = &ipoib_netdev_ops_vf; + else + dev->netdev_ops = &ipoib_netdev_ops_pf; + + rn = netdev_priv(dev); + rn->clnt_priv = priv; + ipoib_build_priv(dev); + + return priv; +free_priv: + kfree(priv); + return NULL; } static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, char *buf) { - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + struct net_device *ndev = to_net_dev(dev); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); return sprintf(buf, "0x%04x\n", priv->pkey); } 
@@ -1836,14 +1995,15 @@ static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); static ssize_t show_umcast(struct device *dev, struct device_attribute *attr, char *buf) { - struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + struct net_device *ndev = to_net_dev(dev); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); } void ipoib_set_umcast(struct net_device *ndev, int umcast_val) { - struct ipoib_dev_priv *priv = netdev_priv(ndev); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); if (umcast_val > 0) { set_bit(IPOIB_FLAG_UMCAST, &priv->flags); @@ -1916,7 +2076,7 @@ static int ipoib_check_lladdr(struct net_device *dev, static int ipoib_set_mac(struct net_device *dev, void *addr) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sockaddr_storage *ss = addr; int ret; @@ -1984,20 +2144,18 @@ int ipoib_add_pkey_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_pkey); } -int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) +void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { priv->hca_caps = hca->attrs.device_cap_flags; if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { - priv->dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; if (priv->hca_caps & IB_DEVICE_UD_TSO) priv->dev->hw_features |= NETIF_F_TSO; priv->dev->features |= priv->dev->hw_features; } - - return 0; } static struct net_device *ipoib_add_port(const char *format, @@ -2007,11 +2165,11 @@ static struct net_device *ipoib_add_port(const char *format, struct ib_port_attr attr; int result = -ENOMEM; - priv = ipoib_intf_alloc(format); + priv = ipoib_intf_alloc(hca, port, format); if (!priv) goto alloc_mem_failed; - SET_NETDEV_DEV(priv->dev, hca->dma_device); + SET_NETDEV_DEV(priv->dev, hca->dev.parent); priv->dev->dev_id = port - 1; result = 
ib_query_port(hca, port, &attr); @@ -2037,9 +2195,7 @@ static struct net_device *ipoib_add_port(const char *format, goto device_init_failed; } - result = ipoib_set_dev_features(priv, hca); - if (result) - goto device_init_failed; + ipoib_set_dev_features(priv, hca); /* * Set the full membership bit, so that we join the right @@ -2083,8 +2239,7 @@ static struct net_device *ipoib_add_port(const char *format, goto register_failed; } - ipoib_create_debug_files(priv->dev); - + result = -ENOMEM; if (ipoib_cm_add_mode_attr(priv->dev)) goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) @@ -2099,7 +2254,6 @@ static struct net_device *ipoib_add_port(const char *format, return priv->dev; sysfs_failed: - ipoib_delete_debug_files(priv->dev); unregister_netdev(priv->dev); register_failed: @@ -2115,6 +2269,7 @@ event_failed: device_init_failed: free_netdev(priv->dev); + kfree(priv); alloc_mem_failed: return ERR_PTR(result); @@ -2139,7 +2294,7 @@ static void ipoib_add_one(struct ib_device *device) continue; dev = ipoib_add_port("ib%d", device, p); if (!IS_ERR(dev)) { - priv = netdev_priv(dev); + priv = ipoib_priv(dev); list_add_tail(&priv->list, dev_list); count++; } @@ -2155,13 +2310,15 @@ static void ipoib_add_one(struct ib_device *device) static void ipoib_remove_one(struct ib_device *device, void *client_data) { - struct ipoib_dev_priv *priv, *tmp; + struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv; struct list_head *dev_list = client_data; if (!dev_list) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { + struct rdma_netdev *rn = netdev_priv(priv->dev); + ib_unregister_event_handler(&priv->event_handler); flush_workqueue(ipoib_workqueue); @@ -2178,12 +2335,23 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) flush_workqueue(priv->wq); unregister_netdev(priv->dev); - free_netdev(priv->dev); + rn->free_rdma_netdev(priv->dev); + + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) + kfree(cpriv); + + kfree(priv); } 
kfree(dev_list); } +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static struct notifier_block ipoib_netdev_notifier = { + .notifier_call = ipoib_netdev_event, +}; +#endif + static int __init ipoib_init_module(void) { int ret; @@ -2236,6 +2404,9 @@ static int __init ipoib_init_module(void) if (ret) goto err_client; +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + register_netdevice_notifier(&ipoib_netdev_notifier); +#endif return 0; err_client: @@ -2253,6 +2424,9 @@ err_fs: static void __exit ipoib_cleanup_module(void) { +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + unregister_netdevice_notifier(&ipoib_netdev_notifier); +#endif ipoib_netlink_fini(); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index fddff403d5d2..057f58e6afca 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -114,7 +114,7 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast) struct net_device *dev = mcast->dev; int tx_dropped = 0; - ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", + ipoib_dbg_mcast(ipoib_priv(dev), "deleting multicast group %pI6\n", mcast->mcmember.mgid.raw); /* remove all neigh connected to this mcast */ @@ -158,7 +158,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node *n = priv->multicast_tree.rb_node; while (n) { @@ -182,7 +182,7 @@ static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; 
while (*n) { @@ -212,8 +212,10 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, struct ib_sa_mcmember_rec *mcmember) { struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_ah *ah; + struct rdma_ah_attr av; int ret; int set_qkey = 0; @@ -260,8 +262,9 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, return 0; } - ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid), - &mcast->mcmember.mgid, set_qkey); + ret = rn->attach_mcast(dev, priv->ca, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid), + set_qkey, priv->qkey); if (ret < 0) { ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n", mcast->mcmember.mgid.raw); @@ -271,40 +274,34 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, } } - { - struct ib_ah_attr av = { - .dlid = be16_to_cpu(mcast->mcmember.mlid), - .port_num = priv->port, - .sl = mcast->mcmember.sl, - .ah_flags = IB_AH_GRH, - .static_rate = mcast->mcmember.rate, - .grh = { - .flow_label = be32_to_cpu(mcast->mcmember.flow_label), - .hop_limit = mcast->mcmember.hop_limit, - .sgid_index = 0, - .traffic_class = mcast->mcmember.traffic_class - } - }; - av.grh.dgid = mcast->mcmember.mgid; - - ah = ipoib_create_ah(dev, priv->pd, &av); - if (IS_ERR(ah)) { - ipoib_warn(priv, "ib_address_create failed %ld\n", - -PTR_ERR(ah)); - /* use original error */ - return PTR_ERR(ah); - } else { - spin_lock_irq(&priv->lock); - mcast->ah = ah; - spin_unlock_irq(&priv->lock); - - ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n", - mcast->mcmember.mgid.raw, - mcast->ah->ah, - be16_to_cpu(mcast->mcmember.mlid), - mcast->mcmember.sl); - } + memset(&av, 0, sizeof(av)); + av.type = rdma_ah_find_type(priv->ca, priv->port); + rdma_ah_set_dlid(&av, be16_to_cpu(mcast->mcmember.mlid)), + rdma_ah_set_port_num(&av, priv->port); + rdma_ah_set_sl(&av, 
mcast->mcmember.sl); + rdma_ah_set_static_rate(&av, mcast->mcmember.rate); + + rdma_ah_set_grh(&av, &mcast->mcmember.mgid, + be32_to_cpu(mcast->mcmember.flow_label), + 0, mcast->mcmember.hop_limit, + mcast->mcmember.traffic_class); + + ah = ipoib_create_ah(dev, priv->pd, &av); + if (IS_ERR(ah)) { + ipoib_warn(priv, "ib_address_create failed %ld\n", + -PTR_ERR(ah)); + /* use original error */ + return PTR_ERR(ah); } + spin_lock_irq(&priv->lock); + mcast->ah = ah; + spin_unlock_irq(&priv->lock); + + ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n", + mcast->mcmember.mgid.raw, + mcast->ah->ah, + be16_to_cpu(mcast->mcmember.mlid), + mcast->mcmember.sl); /* actually send any queued packets */ netif_tx_lock_bh(dev); @@ -314,9 +311,11 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, netif_tx_unlock_bh(dev); skb->dev = dev; - if (dev_queue_xmit(skb)) - ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); netif_tx_lock_bh(dev); } netif_tx_unlock_bh(dev); @@ -329,7 +328,6 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; - int ret; if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { @@ -342,11 +340,9 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) * because the broadcast group must always be joined first and is always * re-joined if the SM changes substantially. 
*/ - ret = ipoib_check_sm_sendonly_fullmember_support(priv); - if (ret < 0) - pr_debug("%s failed query sm support for sendonly-fullmember (ret: %d)\n", - priv->dev->name, ret); - + priv->sm_fullmember_sendonly_support = + ib_sa_sendonly_fullmem_support(&ipoib_sa_client, + priv->ca, priv->port); /* * Take rtnl_lock to avoid racing with ipoib_stop() and * turning the carrier back on while a device is being @@ -373,7 +369,7 @@ static int ipoib_mcast_join_complete(int status, { struct ipoib_mcast *mcast = multicast->context; struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg_mcast(priv, "%sjoin completion for %pI6 (status %d)\n", test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? @@ -475,7 +471,7 @@ out_locked: */ static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_sa_multicast *multicast; struct ib_sa_mcmember_rec rec = { .join_state = 1 @@ -487,6 +483,9 @@ static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast) !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) return -EINVAL; + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw); rec.mgid = mcast->mcmember.mgid; @@ -645,8 +644,6 @@ void ipoib_mcast_join_task(struct work_struct *work) if (mcast->backoff == 1 || time_after_eq(jiffies, mcast->delay_until)) { /* Found the next unjoined group */ - init_completion(&mcast->done); - set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); if (ipoib_mcast_join(dev, mcast)) { spin_unlock_irq(&priv->lock); return; @@ -666,17 +663,15 @@ out: queue_delayed_work(priv->wq, &priv->mcast_task, delay_until - jiffies); } - if (mcast) { - init_completion(&mcast->done); - set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + if (mcast) 
ipoib_mcast_join(dev, mcast); - } + spin_unlock_irq(&priv->lock); } -int ipoib_mcast_start_thread(struct net_device *dev) +void ipoib_mcast_start_thread(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); unsigned long flags; ipoib_dbg_mcast(priv, "starting multicast thread\n"); @@ -684,13 +679,11 @@ int ipoib_mcast_start_thread(struct net_device *dev) spin_lock_irqsave(&priv->lock, flags); __ipoib_mcast_schedule_join_thread(priv, NULL, 0); spin_unlock_irqrestore(&priv->lock, flags); - - return 0; } int ipoib_mcast_stop_thread(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); unsigned long flags; ipoib_dbg_mcast(priv, "stopping multicast thread\n"); @@ -706,7 +699,8 @@ int ipoib_mcast_stop_thread(struct net_device *dev) static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) @@ -720,8 +714,8 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) mcast->mcmember.mgid.raw); /* Remove ourselves from the multicast group */ - ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, - be16_to_cpu(mcast->mcmember.mlid)); + ret = rn->detach_mcast(dev, priv->ca, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid)); if (ret) ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) @@ -762,7 +756,8 @@ void ipoib_mcast_remove_list(struct list_head *remove_list) void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); struct 
ipoib_mcast *mcast; unsigned long flags; void *mgid = daddr + 4; @@ -825,7 +820,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) } } spin_unlock_irqrestore(&priv->lock, flags); - ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + mcast->ah->last_send = rn->send(dev, skb, mcast->ah->ah, + IB_MULTICAST_QPN); if (neigh) ipoib_neigh_put(neigh); return; @@ -837,7 +833,7 @@ unlock: void ipoib_mcast_dev_flush(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); LIST_HEAD(remove_list); struct ipoib_mcast *mcast, *tmcast; unsigned long flags; @@ -1029,7 +1025,7 @@ struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) { - struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); struct rb_node *n; struct ipoib_mcast *mcast; int ret = 1; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index cdc7df4fdb8a..3e44087935ae 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -44,7 +44,7 @@ static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); u16 val; if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey)) @@ -64,8 +64,9 @@ nla_put_failure: return -EMSGSIZE; } -static int ipoib_changelink(struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int ipoib_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) { u16 mode, umcast; int ret = 0; @@ -93,7 +94,8 @@ out_err: } static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct 
nlattr *data[]) + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { struct net_device *pdev; struct ipoib_dev_priv *ppriv; @@ -107,7 +109,7 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, if (!pdev || pdev->type != ARPHRD_INFINIBAND) return -ENODEV; - ppriv = netdev_priv(pdev); + ppriv = ipoib_priv(pdev); if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) { ipoib_warn(ppriv, "child creation disallowed for child devices\n"); @@ -129,10 +131,11 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, */ child_pkey |= 0x8000; - err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD); + err = __ipoib_vlan_add(ppriv, ipoib_priv(dev), + child_pkey, IPOIB_RTNL_CHILD); if (!err && data) - err = ipoib_changelink(dev, tb, data); + err = ipoib_changelink(dev, tb, data, extack); return err; } @@ -140,8 +143,8 @@ static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head { struct ipoib_dev_priv *priv, *ppriv; - priv = netdev_priv(dev); - ppriv = netdev_priv(priv->parent); + priv = ipoib_priv(dev); + ppriv = ipoib_priv(priv->parent); down_write(&ppriv->vlan_rwsem); unregister_netdevice_queue(dev, head); @@ -161,7 +164,7 @@ static struct rtnl_link_ops ipoib_link_ops __read_mostly = { .maxtype = IFLA_IPOIB_MAX, .policy = ipoib_policy, .priv_size = sizeof(struct ipoib_dev_priv), - .setup = ipoib_setup, + .setup = ipoib_setup_common, .newlink = ipoib_new_child_link, .changelink = ipoib_changelink, .dellink = ipoib_unregister_child_dev, diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 189dcd1709d2..bb64baf25309 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -35,9 +35,10 @@ #include "ipoib.h" -int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) +int ipoib_mcast_attach(struct net_device *dev, struct 
ib_device *hca, + union ib_gid *mgid, u16 mlid, int set_qkey, u32 qkey) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_attr *qp_attr = NULL; int ret; u16 pkey_index; @@ -56,7 +57,7 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int goto out; /* set correct QKey for QP */ - qp_attr->qkey = priv->qkey; + qp_attr->qkey = qkey; ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); if (ret) { ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); @@ -74,9 +75,20 @@ out: return ret; } +int ipoib_mcast_detach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + ret = ib_detach_mcast(priv->qp, mgid, mlid); + + return ret; +} + int ipoib_init_qp(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret; struct ib_qp_attr qp_attr; int attr_mask; @@ -130,7 +142,7 @@ out_fail: int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_init_attr init_attr = { .cap = { .max_send_wr = ipoib_sendq_size, @@ -147,22 +159,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) int ret, size; int i; - priv->pd = ib_alloc_pd(priv->ca, 0); - if (IS_ERR(priv->pd)) { - printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); - return -ENODEV; - } - - /* - * the various IPoIB tasks assume they will never race against - * themselves, so always use a single thread workqueue - */ - priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM); - if (!priv->wq) { - printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); - goto out_free_pd; - } - size = ipoib_recvq_size + 1; ret = ipoib_cm_dev_init(dev); if (!ret) { @@ -173,7 +169,7 @@ int 
ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) size += ipoib_recvq_size * ipoib_max_conn_qp; } else if (ret != -ENOSYS) - goto out_free_wq; + return -ENODEV; cq_attr.cqe = size; priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, @@ -212,10 +208,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) goto out_free_send_cq; } - priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; - priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; - priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; - for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) priv->tx_sge[i].lkey = priv->pd->local_dma_lkey; @@ -247,26 +239,18 @@ out_free_recv_cq: out_cm_dev_cleanup: ipoib_cm_dev_cleanup(dev); -out_free_wq: - destroy_workqueue(priv->wq); - priv->wq = NULL; - -out_free_pd: - ib_dealloc_pd(priv->pd); - return -ENODEV; } void ipoib_transport_dev_cleanup(struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); if (priv->qp) { if (ib_destroy_qp(priv->qp)) ipoib_warn(priv, "ib_qp_destroy failed\n"); priv->qp = NULL; - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } if (ib_destroy_cq(priv->send_cq)) @@ -274,16 +258,6 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) if (ib_destroy_cq(priv->recv_cq)) ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); - - ipoib_cm_dev_cleanup(dev); - - if (priv->wq) { - flush_workqueue(priv->wq); - destroy_workqueue(priv->wq); - priv->wq = NULL; - } - - ib_dealloc_pd(priv->pd); } void ipoib_event(struct ib_event_handler *handler, diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index fd811115af49..081b33deff1b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -31,6 +31,7 @@ */ #include <linux/module.h> +#include <linux/sched/signal.h> #include <linux/init.h> #include <linux/seq_file.h> @@ -43,7 +44,7 @@ static ssize_t 
show_parent(struct device *d, struct device_attribute *attr, char *buf) { struct net_device *dev = to_net_dev(d); - struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_dev_priv *priv = ipoib_priv(dev); return sprintf(buf, "%s\n", priv->parent->name); } @@ -61,9 +62,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, priv->parent = ppriv->dev; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); - result = ipoib_set_dev_features(priv, ppriv->ca); - if (result) - goto err; + ipoib_set_dev_features(priv, ppriv->ca); priv->pkey = pkey; @@ -87,8 +86,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, goto register_failed; } - ipoib_create_debug_files(priv->dev); - /* RTNL childs don't need proprietary sysfs entries */ if (type == IPOIB_LEGACY_CHILD) { if (ipoib_cm_add_mode_attr(priv->dev)) @@ -109,7 +106,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, sysfs_failed: result = -ENOMEM; - ipoib_delete_debug_files(priv->dev); unregister_netdevice(priv->dev); register_failed: @@ -129,20 +125,21 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) if (!capable(CAP_NET_ADMIN)) return -EPERM; - ppriv = netdev_priv(pdev); + ppriv = ipoib_priv(pdev); if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) return -EPERM; snprintf(intf_name, sizeof intf_name, "%s.%04x", ppriv->dev->name, pkey); - priv = ipoib_intf_alloc(intf_name); - if (!priv) - return -ENOMEM; if (!rtnl_trylock()) return restart_syscall(); + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); + if (!priv) + return -ENOMEM; + down_write(&ppriv->vlan_rwsem); /* @@ -168,11 +165,13 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) out: up_write(&ppriv->vlan_rwsem); - if (result) - free_netdev(priv->dev); - rtnl_unlock(); + if (result) { + free_netdev(priv->dev); + kfree(priv); + } + return result; } @@ -184,7 +183,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned 
short pkey) if (!capable(CAP_NET_ADMIN)) return -EPERM; - ppriv = netdev_priv(pdev); + ppriv = ipoib_priv(pdev); if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) return -EPERM; @@ -196,7 +195,6 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { - unregister_netdevice(priv->dev); list_del(&priv->list); dev = priv->dev; break; @@ -204,10 +202,16 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) } up_write(&ppriv->vlan_rwsem); + if (dev) { + ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); + unregister_netdevice(dev); + } + rtnl_unlock(); if (dev) { free_netdev(dev); + kfree(priv); return 0; } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index e71af717e71b..37b33d708c2d 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -83,6 +83,7 @@ static struct scsi_host_template iscsi_iser_sht; static struct iscsi_transport iscsi_iser_transport; static struct scsi_transport_template *iscsi_iser_scsi_transport; static struct workqueue_struct *release_wq; +static DEFINE_MUTEX(unbind_iser_conn_mutex); struct iser_global ig; int iser_debug_level = 0; @@ -550,12 +551,14 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) */ if (iser_conn) { mutex_lock(&iser_conn->state_mutex); + mutex_lock(&unbind_iser_conn_mutex); iser_conn_terminate(iser_conn); iscsi_conn_stop(cls_conn, flag); /* unbind */ iser_conn->iscsi_conn = NULL; conn->dd_data = NULL; + mutex_unlock(&unbind_iser_conn_mutex); complete(&iser_conn->stop_completion); mutex_unlock(&iser_conn->state_mutex); @@ -652,7 +655,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, } if (iscsi_host_add(shost, - ib_conn->device->ib_device->dma_device)) { + ib_conn->device->ib_device->dev.parent)) { mutex_unlock(&iser_conn->state_mutex); 
goto free_host; } @@ -977,13 +980,21 @@ static int iscsi_iser_slave_alloc(struct scsi_device *sdev) struct iser_conn *iser_conn; struct ib_device *ib_dev; + mutex_lock(&unbind_iser_conn_mutex); + session = starget_to_session(scsi_target(sdev))->dd_data; iser_conn = session->leadconn->dd_data; + if (!iser_conn) { + mutex_unlock(&unbind_iser_conn_mutex); + return -ENOTCONN; + } ib_dev = iser_conn->ib_conn.device->ib_device; if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K); + mutex_unlock(&unbind_iser_conn_mutex); + return 0; } @@ -994,6 +1005,7 @@ static struct scsi_host_template iscsi_iser_sht = { .change_queue_depth = scsi_change_queue_depth, .sg_tablesize = ISCSI_ISER_DEF_SG_TABLESIZE, .cmd_per_lun = ISER_DEF_CMD_PER_LUN, + .eh_timed_out = iscsi_eh_cmd_timed_out, .eh_abort_handler = iscsi_eh_abort, .eh_device_reset_handler= iscsi_eh_device_reset, .eh_target_reset_handler = iscsi_eh_recover_target, diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 9d0b22ad58c1..c1ae4aeae2f9 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -430,6 +430,7 @@ struct iser_fr_desc { struct list_head list; struct iser_reg_resources rsc; struct iser_pi_context *pi_ctx; + struct list_head all_list; }; /** @@ -443,6 +444,7 @@ struct iser_fr_pool { struct list_head list; spinlock_t lock; int size; + struct list_head all_list; }; /** diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 81ae2e30dd12..2a07692007bd 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -137,8 +137,10 @@ iser_prepare_write_cmd(struct iscsi_task *task, if (unsol_sz < edtl) { hdr->flags |= ISER_WSV; - hdr->write_stag = cpu_to_be32(mem_reg->rkey); - hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz); + if (buf_out->data_len 
> imm_sz) { + hdr->write_stag = cpu_to_be32(mem_reg->rkey); + hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz); + } iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X " "VA:%#llX + unsol:%d\n", @@ -612,7 +614,7 @@ iser_check_remote_inv(struct iser_conn *iser_conn, iser_conn, rkey); if (unlikely(!iser_conn->snd_w_inv)) { - iser_err("conn %p: unexepected remote invalidation, " + iser_err("conn %p: unexpected remote invalidation, " "terminating connection\n", iser_conn); return -EPROTO; } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 6a9d1cb548ee..26a004e97ae0 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -362,6 +362,7 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, int i, ret; INIT_LIST_HEAD(&fr_pool->list); + INIT_LIST_HEAD(&fr_pool->all_list); spin_lock_init(&fr_pool->lock); fr_pool->size = 0; for (i = 0; i < cmds_max; i++) { @@ -373,6 +374,7 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, } list_add_tail(&desc->list, &fr_pool->list); + list_add_tail(&desc->all_list, &fr_pool->all_list); fr_pool->size++; } @@ -392,13 +394,13 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn) struct iser_fr_desc *desc, *tmp; int i = 0; - if (list_empty(&fr_pool->list)) + if (list_empty(&fr_pool->all_list)) return; iser_info("freeing conn %p fr pool\n", ib_conn); - list_for_each_entry_safe(desc, tmp, &fr_pool->list, list) { - list_del(&desc->list); + list_for_each_entry_safe(desc, tmp, &fr_pool->all_list, all_list) { + list_del(&desc->all_list); iser_free_reg_res(&desc->rsc); if (desc->pi_ctx) iser_free_pi_ctx(desc->pi_ctx); @@ -597,7 +599,9 @@ static void iser_free_ib_conn_res(struct iser_conn *iser_conn, iser_conn, ib_conn->cma_id, ib_conn->qp); if (ib_conn->qp != NULL) { + mutex_lock(&ig.connlist_mutex); ib_conn->comp->active_qps--; + mutex_unlock(&ig.connlist_mutex); rdma_destroy_qp(ib_conn->cma_id); ib_conn->qp = NULL; } @@ -704,8 +708,14 @@ 
iser_calc_scsi_params(struct iser_conn *iser_conn, unsigned short sg_tablesize, sup_sg_tablesize; sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K); - sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE, - device->ib_device->attrs.max_fast_reg_page_list_len); + if (device->ib_device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS) + sup_sg_tablesize = + min_t( + uint, ISCSI_ISER_MAX_SG_TABLESIZE, + device->ib_device->attrs.max_fast_reg_page_list_len); + else + sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE; iser_conn->scsi_sg_tablesize = min(sg_tablesize, sup_sg_tablesize); } diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 314e95516068..0e662656ef42 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -728,7 +728,7 @@ isert_disconnected_handler(struct rdma_cm_id *cma_id, iscsit_cause_connection_reinstatement(isert_conn->conn, 0); break; default: - isert_warn("conn %p teminating in state %d\n", + isert_warn("conn %p terminating in state %d\n", isert_conn, isert_conn->state); } mutex_unlock(&isert_conn->mutex); @@ -817,6 +817,7 @@ isert_post_recvm(struct isert_conn *isert_conn, u32 count) rx_wr->sg_list = &rx_desc->rx_sg; rx_wr->num_sge = 1; rx_wr->next = rx_wr + 1; + rx_desc->in_use = false; } rx_wr--; rx_wr->next = NULL; /* mark end of work requests list */ @@ -835,6 +836,15 @@ isert_post_recv(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc) struct ib_recv_wr *rx_wr_failed, rx_wr; int ret; + if (!rx_desc->in_use) { + /* + * if the descriptor is not in-use we already reposted it + * for recv, so just silently return + */ + return 0; + } + + rx_desc->in_use = false; rx_wr.wr_cqe = &rx_desc->rx_cqe; rx_wr.sg_list = &rx_desc->rx_sg; rx_wr.num_sge = 1; @@ -1397,6 +1407,8 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc) return; } + rx_desc->in_use = true; + ib_dma_sync_single_for_cpu(ib_dev, rx_desc->dma_addr, 
ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); @@ -1440,7 +1452,7 @@ static void isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct isert_conn *isert_conn = wc->qp->qp_context; - struct ib_device *ib_dev = isert_conn->cm_id->device; + struct ib_device *ib_dev = isert_conn->device->ib_device; if (unlikely(wc->status != IB_WC_SUCCESS)) { isert_print_wc(wc, "login recv"); @@ -1659,10 +1671,23 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr); isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); - if (ret) - transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0); - else - isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd); + if (ret) { + /* + * transport_generic_request_failure() expects to have + * plus two references to handle queue-full, so re-add + * one here as target-core will have already dropped + * it after the first isert_put_datain() callback. + */ + kref_get(&cmd->cmd_kref); + transport_generic_request_failure(cmd, cmd->pi_err); + } else { + /* + * XXX: isert_put_response() failure is not retried. + */ + ret = isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd); + if (ret) + pr_warn_ratelimited("isert_put_response() ret: %d\n", ret); + } } static void @@ -1699,13 +1724,15 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; spin_unlock_bh(&cmd->istate_lock); - if (ret) { - target_put_sess_cmd(se_cmd); - transport_send_check_condition_and_sense(se_cmd, - se_cmd->pi_err, 0); - } else { + /* + * transport_generic_request_failure() will drop the extra + * se_cmd->cmd_kref reference after T10-PI error, and handle + * any non-zero ->queue_status() callback error retries. 
+ */ + if (ret) + transport_generic_request_failure(se_cmd, se_cmd->pi_err); + else target_execute_cmd(se_cmd); - } } static void @@ -2171,26 +2198,28 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) chain_wr = &isert_cmd->tx_desc.send_wr; } - isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr); - isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", isert_cmd); - return 1; + rc = isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr); + isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ rc: %d\n", + isert_cmd, rc); + return rc; } static int isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + int ret; isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done); isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; - isert_rdma_rw_ctx_post(isert_cmd, conn->context, - &isert_cmd->tx_desc.tx_cqe, NULL); + ret = isert_rdma_rw_ctx_post(isert_cmd, conn->context, + &isert_cmd->tx_desc.tx_cqe, NULL); - isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", - isert_cmd); - return 0; + isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE rc: %d\n", + isert_cmd, ret); + return ret; } static int diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index c02ada57d7f5..87d994de8c91 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -60,7 +60,7 @@ #define ISER_RX_PAD_SIZE (ISCSI_DEF_MAX_RECV_SEG_LEN + 4096 - \ (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge) + \ - sizeof(struct ib_cqe))) + sizeof(struct ib_cqe) + sizeof(bool))) #define ISCSI_ISER_SG_TABLESIZE 256 @@ -85,6 +85,7 @@ struct iser_rx_desc { u64 dma_addr; struct ib_sge rx_sg; struct ib_cqe rx_cqe; + bool in_use; char pad[ISER_RX_PAD_SIZE]; } __packed; diff --git 
a/drivers/infiniband/ulp/opa_vnic/Kconfig b/drivers/infiniband/ulp/opa_vnic/Kconfig new file mode 100644 index 000000000000..48132ab5e6b9 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/Kconfig @@ -0,0 +1,8 @@ +config INFINIBAND_OPA_VNIC + tristate "Intel OPA VNIC support" + depends on X86_64 && INFINIBAND + ---help--- + This is Omni-Path (OPA) Virtual Network Interface Controller (VNIC) + driver for Ethernet over Omni-Path feature. It implements the HW + independent VNIC functionality. It interfaces with Linux stack for + data path and IB MAD for the control path. diff --git a/drivers/infiniband/ulp/opa_vnic/Makefile b/drivers/infiniband/ulp/opa_vnic/Makefile new file mode 100644 index 000000000000..8061b287cfe4 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/Makefile @@ -0,0 +1,7 @@ +# Makefile - Intel Omni-Path Virtual Network Controller driver +# Copyright(c) 2017, Intel Corporation. +# +obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic.o + +opa_vnic-y := opa_vnic_netdev.o opa_vnic_encap.o opa_vnic_ethtool.o \ + opa_vnic_vema.o opa_vnic_vema_iface.o diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c new file mode 100644 index 000000000000..afa938bd26d6 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c @@ -0,0 +1,475 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA VNIC encapsulation/decapsulation function. 
+ */ + +#include <linux/if_ether.h> +#include <linux/if_vlan.h> + +#include "opa_vnic_internal.h" + +/* OPA 16B Header fields */ +#define OPA_16B_LID_MASK 0xFFFFFull +#define OPA_16B_SLID_HIGH_SHFT 8 +#define OPA_16B_SLID_MASK 0xF00ull +#define OPA_16B_DLID_MASK 0xF000ull +#define OPA_16B_DLID_HIGH_SHFT 12 +#define OPA_16B_LEN_SHFT 20 +#define OPA_16B_SC_SHFT 20 +#define OPA_16B_RC_SHFT 25 +#define OPA_16B_PKEY_SHFT 16 + +#define OPA_VNIC_L4_HDR_SHFT 16 + +/* L2+L4 hdr len is 20 bytes (5 quad words) */ +#define OPA_VNIC_HDR_QW_LEN 5 + +static inline void opa_vnic_make_header(u8 *hdr, u32 slid, u32 dlid, u16 len, + u16 pkey, u16 entropy, u8 sc, u8 rc, + u8 l4_type, u16 l4_hdr) +{ + /* h[1]: LT=1, 16B L2=10 */ + u32 h[OPA_VNIC_HDR_QW_LEN] = {0, 0xc0000000, 0, 0, 0}; + + h[2] = l4_type; + h[3] = entropy; + h[4] = l4_hdr << OPA_VNIC_L4_HDR_SHFT; + + /* Extract and set 4 upper bits and 20 lower bits of the lids */ + h[0] |= (slid & OPA_16B_LID_MASK); + h[2] |= ((slid >> (20 - OPA_16B_SLID_HIGH_SHFT)) & OPA_16B_SLID_MASK); + + h[1] |= (dlid & OPA_16B_LID_MASK); + h[2] |= ((dlid >> (20 - OPA_16B_DLID_HIGH_SHFT)) & OPA_16B_DLID_MASK); + + h[0] |= (len << OPA_16B_LEN_SHFT); + h[1] |= (rc << OPA_16B_RC_SHFT); + h[1] |= (sc << OPA_16B_SC_SHFT); + h[2] |= ((u32)pkey << OPA_16B_PKEY_SHFT); + + memcpy(hdr, h, OPA_VNIC_HDR_LEN); +} + +/* + * Using a simple hash table for mac table implementation with the last octet + * of mac address as a key. 
+ */ +static void opa_vnic_free_mac_tbl(struct hlist_head *mactbl) +{ + struct opa_vnic_mac_tbl_node *node; + struct hlist_node *tmp; + int bkt; + + if (!mactbl) + return; + + vnic_hash_for_each_safe(mactbl, bkt, tmp, node, hlist) { + hash_del(&node->hlist); + kfree(node); + } + kfree(mactbl); +} + +static struct hlist_head *opa_vnic_alloc_mac_tbl(void) +{ + u32 size = sizeof(struct hlist_head) * OPA_VNIC_MAC_TBL_SIZE; + struct hlist_head *mactbl; + + mactbl = kzalloc(size, GFP_KERNEL); + if (!mactbl) + return ERR_PTR(-ENOMEM); + + vnic_hash_init(mactbl); + return mactbl; +} + +/* opa_vnic_release_mac_tbl - empty and free the mac table */ +void opa_vnic_release_mac_tbl(struct opa_vnic_adapter *adapter) +{ + struct hlist_head *mactbl; + + mutex_lock(&adapter->mactbl_lock); + mactbl = rcu_access_pointer(adapter->mactbl); + rcu_assign_pointer(adapter->mactbl, NULL); + synchronize_rcu(); + opa_vnic_free_mac_tbl(mactbl); + mutex_unlock(&adapter->mactbl_lock); +} + +/* + * opa_vnic_query_mac_tbl - query the mac table for a section + * + * This function implements query of a specific section of the mac table. + * The function also expects the requested range to be valid.
+ */ +void opa_vnic_query_mac_tbl(struct opa_vnic_adapter *adapter, + struct opa_veswport_mactable *tbl) +{ + struct opa_vnic_mac_tbl_node *node; + struct hlist_head *mactbl; + int bkt; + u16 loffset, lnum_entries; + + rcu_read_lock(); + mactbl = rcu_dereference(adapter->mactbl); + if (!mactbl) + goto get_mac_done; + + loffset = be16_to_cpu(tbl->offset); + lnum_entries = be16_to_cpu(tbl->num_entries); + + vnic_hash_for_each(mactbl, bkt, node, hlist) { + struct __opa_vnic_mactable_entry *nentry = &node->entry; + struct opa_veswport_mactable_entry *entry; + + if ((node->index < loffset) || + (node->index >= (loffset + lnum_entries))) + continue; + + /* populate entry in the tbl corresponding to the index */ + entry = &tbl->tbl_entries[node->index - loffset]; + memcpy(entry->mac_addr, nentry->mac_addr, + ARRAY_SIZE(entry->mac_addr)); + memcpy(entry->mac_addr_mask, nentry->mac_addr_mask, + ARRAY_SIZE(entry->mac_addr_mask)); + entry->dlid_sd = cpu_to_be32(nentry->dlid_sd); + } + tbl->mac_tbl_digest = cpu_to_be32(adapter->info.vport.mac_tbl_digest); +get_mac_done: + rcu_read_unlock(); +} + +/* + * opa_vnic_update_mac_tbl - update mac table section + * + * This function updates the specified section of the mac table. + * The procedure includes following steps. + * - Allocate a new mac (hash) table. + * - Add the specified entries to the new table. + * (except the ones that are requested to be deleted). + * - Add all the other entries from the old mac table. + * - If there is a failure, free the new table and return. + * - Switch to the new table. + * - Free the old table and return. + * + * The function also expects the requested range to be valid. 
+ */ +int opa_vnic_update_mac_tbl(struct opa_vnic_adapter *adapter, + struct opa_veswport_mactable *tbl) +{ + struct opa_vnic_mac_tbl_node *node, *new_node; + struct hlist_head *new_mactbl, *old_mactbl; + int i, bkt, rc = 0; + u8 key; + u16 loffset, lnum_entries; + + mutex_lock(&adapter->mactbl_lock); + /* allocate new mac table */ + new_mactbl = opa_vnic_alloc_mac_tbl(); + if (IS_ERR(new_mactbl)) { + mutex_unlock(&adapter->mactbl_lock); + return PTR_ERR(new_mactbl); + } + + loffset = be16_to_cpu(tbl->offset); + lnum_entries = be16_to_cpu(tbl->num_entries); + + /* add updated entries to the new mac table */ + for (i = 0; i < lnum_entries; i++) { + struct __opa_vnic_mactable_entry *nentry; + struct opa_veswport_mactable_entry *entry = + &tbl->tbl_entries[i]; + u8 *mac_addr = entry->mac_addr; + u8 empty_mac[ETH_ALEN] = { 0 }; + + v_dbg("new mac entry %4d: %02x:%02x:%02x:%02x:%02x:%02x %x\n", + loffset + i, mac_addr[0], mac_addr[1], mac_addr[2], + mac_addr[3], mac_addr[4], mac_addr[5], + entry->dlid_sd); + + /* if the entry is being removed, do not add it */ + if (!memcmp(mac_addr, empty_mac, ARRAY_SIZE(empty_mac))) + continue; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) { + rc = -ENOMEM; + goto updt_done; + } + + node->index = loffset + i; + nentry = &node->entry; + memcpy(nentry->mac_addr, entry->mac_addr, + ARRAY_SIZE(nentry->mac_addr)); + memcpy(nentry->mac_addr_mask, entry->mac_addr_mask, + ARRAY_SIZE(nentry->mac_addr_mask)); + nentry->dlid_sd = be32_to_cpu(entry->dlid_sd); + key = node->entry.mac_addr[OPA_VNIC_MAC_HASH_IDX]; + vnic_hash_add(new_mactbl, &node->hlist, key); + } + + /* add other entries from current mac table to new mac table */ + old_mactbl = rcu_access_pointer(adapter->mactbl); + if (!old_mactbl) + goto switch_tbl; + + vnic_hash_for_each(old_mactbl, bkt, node, hlist) { + if ((node->index >= loffset) && + (node->index < (loffset + lnum_entries))) + continue; + + new_node = kzalloc(sizeof(*new_node), GFP_KERNEL); + if (!new_node) { 
+ rc = -ENOMEM; + goto updt_done; + } + + new_node->index = node->index; + memcpy(&new_node->entry, &node->entry, sizeof(node->entry)); + key = new_node->entry.mac_addr[OPA_VNIC_MAC_HASH_IDX]; + vnic_hash_add(new_mactbl, &new_node->hlist, key); + } + +switch_tbl: + /* switch to new table */ + rcu_assign_pointer(adapter->mactbl, new_mactbl); + synchronize_rcu(); + + adapter->info.vport.mac_tbl_digest = be32_to_cpu(tbl->mac_tbl_digest); +updt_done: + /* upon failure, free the new table; otherwise, free the old table */ + if (rc) + opa_vnic_free_mac_tbl(new_mactbl); + else + opa_vnic_free_mac_tbl(old_mactbl); + + mutex_unlock(&adapter->mactbl_lock); + return rc; +} + +/* opa_vnic_chk_mac_tbl - check mac table for dlid */ +static uint32_t opa_vnic_chk_mac_tbl(struct opa_vnic_adapter *adapter, + struct ethhdr *mac_hdr) +{ + struct opa_vnic_mac_tbl_node *node; + struct hlist_head *mactbl; + u32 dlid = 0; + u8 key; + + rcu_read_lock(); + mactbl = rcu_dereference(adapter->mactbl); + if (unlikely(!mactbl)) + goto chk_done; + + key = mac_hdr->h_dest[OPA_VNIC_MAC_HASH_IDX]; + vnic_hash_for_each_possible(mactbl, node, hlist, key) { + struct __opa_vnic_mactable_entry *entry = &node->entry; + + /* if related to source mac, skip */ + if (unlikely(OPA_VNIC_DLID_SD_IS_SRC_MAC(entry->dlid_sd))) + continue; + + if (!memcmp(node->entry.mac_addr, mac_hdr->h_dest, + ARRAY_SIZE(node->entry.mac_addr))) { + /* mac address found */ + dlid = OPA_VNIC_DLID_SD_GET_DLID(node->entry.dlid_sd); + break; + } + } + +chk_done: + rcu_read_unlock(); + return dlid; +} + +/* opa_vnic_get_dlid - find and return the DLID */ +static uint32_t opa_vnic_get_dlid(struct opa_vnic_adapter *adapter, + struct sk_buff *skb, u8 def_port) +{ + struct __opa_veswport_info *info = &adapter->info; + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + u32 dlid; + + dlid = opa_vnic_chk_mac_tbl(adapter, mac_hdr); + if (dlid) + return dlid; + + if (is_multicast_ether_addr(mac_hdr->h_dest)) { + dlid = 
info->vesw.u_mcast_dlid; + } else { + if (is_local_ether_addr(mac_hdr->h_dest)) { + dlid = ((uint32_t)mac_hdr->h_dest[5] << 16) | + ((uint32_t)mac_hdr->h_dest[4] << 8) | + mac_hdr->h_dest[3]; + if (unlikely(!dlid)) + v_warn("Null dlid in MAC address\n"); + } else if (def_port != OPA_VNIC_INVALID_PORT) { + dlid = info->vesw.u_ucast_dlid[def_port]; + } + } + + return dlid; +} + +/* opa_vnic_get_sc - return the service class */ +static u8 opa_vnic_get_sc(struct __opa_veswport_info *info, + struct sk_buff *skb) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + u16 vlan_tci; + u8 sc; + + if (!__vlan_get_tag(skb, &vlan_tci)) { + u8 pcp = OPA_VNIC_VLAN_PCP(vlan_tci); + + if (is_multicast_ether_addr(mac_hdr->h_dest)) + sc = info->vport.pcp_to_sc_mc[pcp]; + else + sc = info->vport.pcp_to_sc_uc[pcp]; + } else { + if (is_multicast_ether_addr(mac_hdr->h_dest)) + sc = info->vport.non_vlan_sc_mc; + else + sc = info->vport.non_vlan_sc_uc; + } + + return sc; +} + +u8 opa_vnic_get_vl(struct opa_vnic_adapter *adapter, struct sk_buff *skb) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + struct __opa_veswport_info *info = &adapter->info; + u8 vl; + + if (skb_vlan_tag_present(skb)) { + u8 pcp = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT; + + if (is_multicast_ether_addr(mac_hdr->h_dest)) + vl = info->vport.pcp_to_vl_mc[pcp]; + else + vl = info->vport.pcp_to_vl_uc[pcp]; + } else { + if (is_multicast_ether_addr(mac_hdr->h_dest)) + vl = info->vport.non_vlan_vl_mc; + else + vl = info->vport.non_vlan_vl_uc; + } + + return vl; +} + +/* opa_vnic_calc_entropy - calculate the packet entropy */ +u8 opa_vnic_calc_entropy(struct opa_vnic_adapter *adapter, struct sk_buff *skb) +{ + u16 hash16; + + /* + * Get flow based 16-bit hash and then XOR the upper and lower bytes + * to get the entropy. + * __skb_tx_hash limits qcount to 16 bits. Hence, get 15-bit hash. 
+ */ + hash16 = __skb_tx_hash(adapter->netdev, skb, BIT(15)); + return (u8)((hash16 >> 8) ^ (hash16 & 0xff)); +} + +/* opa_vnic_get_def_port - get default port based on entropy */ +static inline u8 opa_vnic_get_def_port(struct opa_vnic_adapter *adapter, + u8 entropy) +{ + u8 flow_id; + + /* Add the upper and lower 4-bits of entropy to get the flow id */ + flow_id = ((entropy & 0xf) + (entropy >> 4)); + return adapter->flow_tbl[flow_id & (OPA_VNIC_FLOW_TBL_SIZE - 1)]; +} + +/* Calculate packet length including OPA header, crc and padding */ +static inline int opa_vnic_wire_length(struct sk_buff *skb) +{ + u32 pad_len; + + /* padding for 8 bytes size alignment */ + pad_len = -(skb->len + OPA_VNIC_ICRC_TAIL_LEN) & 0x7; + pad_len += OPA_VNIC_ICRC_TAIL_LEN; + + return (skb->len + pad_len) >> 3; +} + +/* opa_vnic_encap_skb - encapsulate skb packet with OPA header and meta data */ +void opa_vnic_encap_skb(struct opa_vnic_adapter *adapter, struct sk_buff *skb) +{ + struct __opa_veswport_info *info = &adapter->info; + struct opa_vnic_skb_mdata *mdata; + u8 def_port, sc, entropy, *hdr; + u16 len, l4_hdr; + u32 dlid; + + hdr = skb_push(skb, OPA_VNIC_HDR_LEN); + + entropy = opa_vnic_calc_entropy(adapter, skb); + def_port = opa_vnic_get_def_port(adapter, entropy); + len = opa_vnic_wire_length(skb); + dlid = opa_vnic_get_dlid(adapter, skb, def_port); + sc = opa_vnic_get_sc(info, skb); + l4_hdr = info->vesw.vesw_id; + + mdata = skb_push(skb, sizeof(*mdata)); + mdata->vl = opa_vnic_get_vl(adapter, skb); + mdata->entropy = entropy; + mdata->flags = 0; + if (unlikely(!dlid)) { + mdata->flags = OPA_VNIC_SKB_MDATA_ENCAP_ERR; + return; + } + + opa_vnic_make_header(hdr, info->vport.encap_slid, dlid, len, + info->vesw.pkey, entropy, sc, 0, + OPA_VNIC_L4_ETHR, l4_hdr); +} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h new file mode 100644 index 000000000000..4c434b9dd84c --- /dev/null +++ 
b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h @@ -0,0 +1,489 @@ +#ifndef _OPA_VNIC_ENCAP_H +#define _OPA_VNIC_ENCAP_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains all OPA VNIC declaration required for encapsulation + * and decapsulation of Ethernet packets + */ + +#include <linux/types.h> +#include <rdma/ib_mad.h> + +/* EMA class version */ +#define OPA_EMA_CLASS_VERSION 0x80 + +/* + * Define the Intel vendor management class for OPA + * ETHERNET MANAGEMENT + */ +#define OPA_MGMT_CLASS_INTEL_EMA 0x34 + +/* EM attribute IDs */ +#define OPA_EM_ATTR_CLASS_PORT_INFO 0x0001 +#define OPA_EM_ATTR_VESWPORT_INFO 0x0011 +#define OPA_EM_ATTR_VESWPORT_MAC_ENTRIES 0x0012 +#define OPA_EM_ATTR_IFACE_UCAST_MACS 0x0013 +#define OPA_EM_ATTR_IFACE_MCAST_MACS 0x0014 +#define OPA_EM_ATTR_DELETE_VESW 0x0015 +#define OPA_EM_ATTR_VESWPORT_SUMMARY_COUNTERS 0x0020 +#define OPA_EM_ATTR_VESWPORT_ERROR_COUNTERS 0x0022 + +/* VNIC configured and operational state values */ +#define OPA_VNIC_STATE_DROP_ALL 0x1 +#define OPA_VNIC_STATE_FORWARDING 0x3 + +#define OPA_VESW_MAX_NUM_DEF_PORT 16 +#define OPA_VNIC_MAX_NUM_PCP 8 + +#define OPA_VNIC_EMA_DATA (OPA_MGMT_MAD_SIZE - IB_MGMT_VENDOR_HDR) + +/* Defines for vendor specific notice(trap) attributes */ +#define OPA_INTEL_EMA_NOTICE_TYPE_INFO 0x04 + +/* INTEL OUI */ +#define INTEL_OUI_1 0x00 +#define INTEL_OUI_2 0x06 +#define INTEL_OUI_3 0x6a + +/* Trap opcodes sent from VNIC */ +#define OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE 0x1 +#define OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE 0x2 +#define OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE 0x3 + 
+#define OPA_VNIC_DLID_SD_IS_SRC_MAC(dlid_sd) (!!((dlid_sd) & 0x20)) +#define OPA_VNIC_DLID_SD_GET_DLID(dlid_sd) ((dlid_sd) >> 8) + +/* VNIC Ethernet link status */ +#define OPA_VNIC_ETH_LINK_UP 1 +#define OPA_VNIC_ETH_LINK_DOWN 2 + +/** + * struct opa_vesw_info - OPA vnic switch information + * @fabric_id: 10-bit fabric id + * @vesw_id: 12-bit virtual ethernet switch id + * @def_port_mask: bitmask of default ports + * @pkey: partition key + * @u_mcast_dlid: unknown multicast dlid + * @u_ucast_dlid: array of unknown unicast dlids + * @eth_mtu: MTUs for each vlan PCP + * @eth_mtu_non_vlan: MTU for non vlan packets + */ +struct opa_vesw_info { + __be16 fabric_id; + __be16 vesw_id; + + u8 rsvd0[6]; + __be16 def_port_mask; + + u8 rsvd1[2]; + __be16 pkey; + + u8 rsvd2[4]; + __be32 u_mcast_dlid; + __be32 u_ucast_dlid[OPA_VESW_MAX_NUM_DEF_PORT]; + + u8 rsvd3[44]; + __be16 eth_mtu[OPA_VNIC_MAX_NUM_PCP]; + __be16 eth_mtu_non_vlan; + u8 rsvd4[2]; +} __packed; + +/** + * struct opa_per_veswport_info - OPA vnic per port information + * @port_num: port number + * @eth_link_status: current ethernet link state + * @base_mac_addr: base mac address + * @config_state: configured port state + * @oper_state: operational port state + * @max_mac_tbl_ent: max number of mac table entries + * @max_smac_ent: max smac entries in mac table + * @mac_tbl_digest: mac table digest + * @encap_slid: base slid for the port + * @pcp_to_sc_uc: sc by pcp index for unicast ethernet packets + * @pcp_to_vl_uc: vl by pcp index for unicast ethernet packets + * @pcp_to_sc_mc: sc by pcp index for multicast ethernet packets + * @pcp_to_vl_mc: vl by pcp index for multicast ethernet packets + * @non_vlan_sc_uc: sc for non-vlan unicast ethernet packets + * @non_vlan_vl_uc: vl for non-vlan unicast ethernet packets + * @non_vlan_sc_mc: sc for non-vlan multicast ethernet packets + * @non_vlan_vl_mc: vl for non-vlan multicast ethernet packets + * @uc_macs_gen_count: generation count for unicast macs list + * 
@mc_macs_gen_count: generation count for multicast macs list + */ +struct opa_per_veswport_info { + __be32 port_num; + + u8 eth_link_status; + u8 rsvd0[3]; + + u8 base_mac_addr[ETH_ALEN]; + u8 config_state; + u8 oper_state; + + __be16 max_mac_tbl_ent; + __be16 max_smac_ent; + __be32 mac_tbl_digest; + u8 rsvd1[4]; + + __be32 encap_slid; + + u8 pcp_to_sc_uc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_vl_uc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_sc_mc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_vl_mc[OPA_VNIC_MAX_NUM_PCP]; + + u8 non_vlan_sc_uc; + u8 non_vlan_vl_uc; + u8 non_vlan_sc_mc; + u8 non_vlan_vl_mc; + + u8 rsvd2[48]; + + __be16 uc_macs_gen_count; + __be16 mc_macs_gen_count; + + u8 rsvd3[8]; +} __packed; + +/** + * struct opa_veswport_info - OPA vnic port information + * @vesw: OPA vnic switch information + * @vport: OPA vnic per port information + * + * On host, each of the virtual ethernet ports belongs + * to a different virtual ethernet switch. + */ +struct opa_veswport_info { + struct opa_vesw_info vesw; + struct opa_per_veswport_info vport; +}; + +/** + * struct opa_veswport_mactable_entry - single entry in the forwarding table + * @mac_addr: MAC address + * @mac_addr_mask: MAC address bit mask + * @dlid_sd: Matching DLID and side data + * + * On the host each virtual ethernet port will have + * a forwarding table. These tables are used to + * map a MAC to a LID and other data. For more + * details see struct opa_veswport_mactable. 
+ * This is the structure of a single mactable entry. + */ +struct opa_veswport_mactable_entry { + u8 mac_addr[ETH_ALEN]; + u8 mac_addr_mask[ETH_ALEN]; + __be32 dlid_sd; +} __packed; + +/** + * struct opa_veswport_mactable - Forwarding table array + * @offset: mac table starting offset + * @num_entries: Number of entries to get or set + * @mac_tbl_digest: mac table digest + * @tbl_entries[]: Array of table entries + * + * The EM sends down this structure in a MAD indicating + * the starting offset in the forwarding table that this + * entry is to be loaded into and the number of entries + * that this MAD instance contains. + * The mac_tbl_digest has been added to this MAD structure. It will be set by + * the EM and it will be used by the EM to check if there are any + * discrepancies with this value and the value + * maintained by the EM in the case of VNIC port being deleted or unloaded. + * A new instantiation of a VNIC will always have a value of zero. + * This value is stored as part of the vnic adapter structure and will be + * accessed by the GET and SET routines for both the mactable entries and the + * veswport info. 
+ */ +struct opa_veswport_mactable { + __be16 offset; + __be16 num_entries; + __be32 mac_tbl_digest; + struct opa_veswport_mactable_entry tbl_entries[]; +} __packed; + +/** + * struct opa_veswport_summary_counters - summary counters + * @vp_instance: vport instance on the OPA port + * @vesw_id: virtual ethernet switch id + * @veswport_num: virtual ethernet switch port number + * @tx_errors: transmit errors + * @rx_errors: receive errors + * @tx_packets: transmit packets + * @rx_packets: receive packets + * @tx_bytes: transmit bytes + * @rx_bytes: receive bytes + * @tx_unicast: unicast packets transmitted + * @tx_mcastbcast: multicast/broadcast packets transmitted + * @tx_untagged: non-vlan packets transmitted + * @tx_vlan: vlan packets transmitted + * @tx_64_size: transmit packet length is 64 bytes + * @tx_65_127: transmit packet length is >=65 and <= 127 bytes + * @tx_128_255: transmit packet length is >=128 and <= 255 bytes + * @tx_256_511: transmit packet length is >=256 and <= 511 bytes + * @tx_512_1023: transmit packet length is >=512 and <= 1023 bytes + * @tx_1024_1518: transmit packet length is >=1024 and <= 1518 bytes + * @tx_1519_max: transmit packet length >= 1519 bytes + * @rx_unicast: unicast packets received + * @rx_mcastbcast: multicast/broadcast packets received + * @rx_untagged: non-vlan packets received + * @rx_vlan: vlan packets received + * @rx_64_size: received packet length is 64 bytes + * @rx_65_127: received packet length is >=65 and <= 127 bytes + * @rx_128_255: received packet length is >=128 and <= 255 bytes + * @rx_256_511: received packet length is >=256 and <= 511 bytes + * @rx_512_1023: received packet length is >=512 and <= 1023 bytes + * @rx_1024_1518: received packet length is >=1024 and <= 1518 bytes + * @rx_1519_max: received packet length >= 1519 bytes + * + * All the above are counters of corresponding conditions. 
+ */ +struct opa_veswport_summary_counters { + __be16 vp_instance; + __be16 vesw_id; + __be32 veswport_num; + + __be64 tx_errors; + __be64 rx_errors; + __be64 tx_packets; + __be64 rx_packets; + __be64 tx_bytes; + __be64 rx_bytes; + + __be64 tx_unicast; + __be64 tx_mcastbcast; + + __be64 tx_untagged; + __be64 tx_vlan; + + __be64 tx_64_size; + __be64 tx_65_127; + __be64 tx_128_255; + __be64 tx_256_511; + __be64 tx_512_1023; + __be64 tx_1024_1518; + __be64 tx_1519_max; + + __be64 rx_unicast; + __be64 rx_mcastbcast; + + __be64 rx_untagged; + __be64 rx_vlan; + + __be64 rx_64_size; + __be64 rx_65_127; + __be64 rx_128_255; + __be64 rx_256_511; + __be64 rx_512_1023; + __be64 rx_1024_1518; + __be64 rx_1519_max; + + __be64 reserved[16]; +} __packed; + +/** + * struct opa_veswport_error_counters - error counters + * @vp_instance: vport instance on the OPA port + * @vesw_id: virtual ethernet switch id + * @veswport_num: virtual ethernet switch port number + * @tx_errors: transmit errors + * @rx_errors: receive errors + * @tx_smac_filt: smac filter errors + * @tx_dlid_zero: transmit packets with invalid dlid + * @tx_logic: other transmit errors + * @tx_drop_state: packet transmission in non-forwarding port state + * @rx_bad_veswid: received packet with invalid vesw id + * @rx_runt: received ethernet packet with length < 64 bytes + * @rx_oversize: received ethernet packet with length > MTU size + * @rx_eth_down: received packets when interface is down + * @rx_drop_state: received packets in non-forwarding port state + * @rx_logic: other receive errors + * + * All the above are counters of corresponding error conditions. 
+ */ +struct opa_veswport_error_counters { + __be16 vp_instance; + __be16 vesw_id; + __be32 veswport_num; + + __be64 tx_errors; + __be64 rx_errors; + + __be64 rsvd0; + __be64 tx_smac_filt; + __be64 rsvd1; + __be64 rsvd2; + __be64 rsvd3; + __be64 tx_dlid_zero; + __be64 rsvd4; + __be64 tx_logic; + __be64 rsvd5; + __be64 tx_drop_state; + + __be64 rx_bad_veswid; + __be64 rsvd6; + __be64 rx_runt; + __be64 rx_oversize; + __be64 rsvd7; + __be64 rx_eth_down; + __be64 rx_drop_state; + __be64 rx_logic; + __be64 rsvd8; + + __be64 rsvd9[16]; +} __packed; + +/** + * struct opa_veswport_trap - Trap message sent to EM by VNIC + * @fabric_id: 10 bit fabric id + * @veswid: 12 bit virtual ethernet switch id + * @veswportnum: logical port number on the Virtual switch + * @opaportnum: physical port num (redundant on host) + * @veswportindex: switch port index on opa port 0 based + * @opcode: operation + * @reserved: 32 bit for alignment + * + * The VNIC will send trap messages to the Ethernet manager to + * inform it about changes to the VNIC config, behaviour etc. + * This is the format of the trap payload. 
+ */ +struct opa_veswport_trap { + __be16 fabric_id; + __be16 veswid; + __be32 veswportnum; + __be16 opaportnum; + u8 veswportindex; + u8 opcode; + __be32 reserved; +} __packed; + +/** + * struct opa_vnic_iface_mac_entry - single entry in the mac list + * @mac_addr: MAC address + */ +struct opa_vnic_iface_mac_entry { + u8 mac_addr[ETH_ALEN]; +}; + +/** + * struct opa_veswport_iface_macs - Msg to set globally administered MAC + * @start_idx: position of first entry (0 based) + * @num_macs_in_msg: number of MACs in this message + * @tot_macs_in_lst: The total number of MACs the agent has + * @gen_count: gen_count to indicate change + * @entry: The mac list entry + * + * Same attribute IDS and attribute modifiers as in locally administered + * addresses used to set globally administered addresses + */ +struct opa_veswport_iface_macs { + __be16 start_idx; + __be16 num_macs_in_msg; + __be16 tot_macs_in_lst; + __be16 gen_count; + struct opa_vnic_iface_mac_entry entry[]; +} __packed; + +/** + * struct opa_vnic_vema_mad - Generic VEMA MAD + * @mad_hdr: Generic MAD header + * @rmpp_hdr: RMPP header for vendor specific MADs + * @reserved: reserved byte ahead of the OUI + * @oui: Unique org identifier + * @data: MAD data + */ +struct opa_vnic_vema_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 reserved; + u8 oui[3]; + u8 data[OPA_VNIC_EMA_DATA]; +}; + +/** + * struct opa_vnic_notice_attr - Generic Notice MAD + * @gen_type: Generic/Specific bit and type of notice + * @oui_1: Vendor ID byte 1 + * @oui_2: Vendor ID byte 2 + * @oui_3: Vendor ID byte 3 + * @trap_num: Trap number + * @toggle_count: Notice toggle bit and count value + * @issuer_lid: Trap issuer's lid + * @reserved: reserved 32 bits + * @issuer_gid: Issuer GID (only if Report method) + * @raw_data: Trap message body + */ +struct opa_vnic_notice_attr { + u8 gen_type; + u8 oui_1; + u8 oui_2; + u8 oui_3; + __be16 trap_num; + __be16 toggle_count; + __be32 issuer_lid; + __be32 reserved; + u8 issuer_gid[16]; + u8 raw_data[64]; +} __packed; + +/** + * struct 
opa_vnic_vema_mad_trap - Generic VEMA MAD Trap + * @mad_hdr: Generic MAD header + * @rmpp_hdr: RMPP header for vendor specific MADs + * @oui: Unique org identifier + * @notice: Notice structure + */ +struct opa_vnic_vema_mad_trap { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 reserved; + u8 oui[3]; + struct opa_vnic_notice_attr notice; +}; + +#endif /* _OPA_VNIC_ENCAP_H */ diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c new file mode 100644 index 000000000000..62390e9e0023 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c @@ -0,0 +1,187 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +/* + * This file contains OPA VNIC ethtool functions + */ + +#include <linux/ethtool.h> + +#include "opa_vnic_internal.h" + +enum {NETDEV_STATS, VNIC_STATS}; + +struct vnic_stats { + char stat_string[ETH_GSTRING_LEN]; + struct { + int sizeof_stat; + int stat_offset; + }; +}; + +#define VNIC_STAT(m) { FIELD_SIZEOF(struct opa_vnic_stats, m), \ + offsetof(struct opa_vnic_stats, m) } + +static struct vnic_stats vnic_gstrings_stats[] = { + /* NETDEV stats */ + {"rx_packets", VNIC_STAT(netstats.rx_packets)}, + {"tx_packets", VNIC_STAT(netstats.tx_packets)}, + {"rx_bytes", VNIC_STAT(netstats.rx_bytes)}, + {"tx_bytes", VNIC_STAT(netstats.tx_bytes)}, + {"rx_errors", VNIC_STAT(netstats.rx_errors)}, + {"tx_errors", VNIC_STAT(netstats.tx_errors)}, + {"rx_dropped", VNIC_STAT(netstats.rx_dropped)}, + {"tx_dropped", VNIC_STAT(netstats.tx_dropped)}, + + /* SUMMARY counters */ + {"tx_unicast", VNIC_STAT(tx_grp.unicast)}, + {"tx_mcastbcast", VNIC_STAT(tx_grp.mcastbcast)}, + {"tx_untagged", VNIC_STAT(tx_grp.untagged)}, + {"tx_vlan", VNIC_STAT(tx_grp.vlan)}, + + {"tx_64_size", VNIC_STAT(tx_grp.s_64)}, + {"tx_65_127", VNIC_STAT(tx_grp.s_65_127)}, + {"tx_128_255", VNIC_STAT(tx_grp.s_128_255)}, + {"tx_256_511", VNIC_STAT(tx_grp.s_256_511)}, + {"tx_512_1023", VNIC_STAT(tx_grp.s_512_1023)}, + {"tx_1024_1518", VNIC_STAT(tx_grp.s_1024_1518)}, + {"tx_1519_max", VNIC_STAT(tx_grp.s_1519_max)}, + + {"rx_unicast", VNIC_STAT(rx_grp.unicast)}, + {"rx_mcastbcast", VNIC_STAT(rx_grp.mcastbcast)}, + {"rx_untagged", VNIC_STAT(rx_grp.untagged)}, + {"rx_vlan", VNIC_STAT(rx_grp.vlan)}, + + {"rx_64_size", VNIC_STAT(rx_grp.s_64)}, + {"rx_65_127", VNIC_STAT(rx_grp.s_65_127)}, + {"rx_128_255", VNIC_STAT(rx_grp.s_128_255)}, + {"rx_256_511", VNIC_STAT(rx_grp.s_256_511)}, + {"rx_512_1023", VNIC_STAT(rx_grp.s_512_1023)}, + {"rx_1024_1518", VNIC_STAT(rx_grp.s_1024_1518)}, + {"rx_1519_max", VNIC_STAT(rx_grp.s_1519_max)}, + + /* ERROR counters */ + {"rx_fifo_errors", VNIC_STAT(netstats.rx_fifo_errors)}, + 
{"rx_length_errors", VNIC_STAT(netstats.rx_length_errors)}, + + {"tx_fifo_errors", VNIC_STAT(netstats.tx_fifo_errors)}, + {"tx_carrier_errors", VNIC_STAT(netstats.tx_carrier_errors)}, + + {"tx_dlid_zero", VNIC_STAT(tx_dlid_zero)}, + {"tx_drop_state", VNIC_STAT(tx_drop_state)}, + {"rx_drop_state", VNIC_STAT(rx_drop_state)}, + {"rx_oversize", VNIC_STAT(rx_oversize)}, + {"rx_runt", VNIC_STAT(rx_runt)}, +}; + +#define VNIC_STATS_LEN ARRAY_SIZE(vnic_gstrings_stats) + +/* vnic_get_drvinfo - get driver info */ +static void vnic_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->driver, opa_vnic_driver_name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, opa_vnic_driver_version, + sizeof(drvinfo->version)); + strlcpy(drvinfo->bus_info, dev_name(netdev->dev.parent), + sizeof(drvinfo->bus_info)); +} + +/* vnic_get_sset_count - get string set count */ +static int vnic_get_sset_count(struct net_device *netdev, int sset) +{ + return (sset == ETH_SS_STATS) ? VNIC_STATS_LEN : -EOPNOTSUPP; +} + +/* vnic_get_ethtool_stats - get statistics */ +static void vnic_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, u64 *data) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct opa_vnic_stats vstats; + int i; + + memset(&vstats, 0, sizeof(vstats)); + spin_lock(&adapter->stats_lock); + adapter->rn_ops->ndo_get_stats64(netdev, &vstats.netstats); + spin_unlock(&adapter->stats_lock); + for (i = 0; i < VNIC_STATS_LEN; i++) { + char *p = (char *)&vstats + vnic_gstrings_stats[i].stat_offset; + + data[i] = (vnic_gstrings_stats[i].sizeof_stat == + sizeof(u64)) ? 
*(u64 *)p : *(u32 *)p; + } +} + +/* vnic_get_strings - get strings */ +static void vnic_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +{ + int i; + + if (stringset != ETH_SS_STATS) + return; + + for (i = 0; i < VNIC_STATS_LEN; i++) + memcpy(data + i * ETH_GSTRING_LEN, + vnic_gstrings_stats[i].stat_string, + ETH_GSTRING_LEN); +} + +/* ethtool ops */ +static const struct ethtool_ops opa_vnic_ethtool_ops = { + .get_drvinfo = vnic_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = vnic_get_strings, + .get_sset_count = vnic_get_sset_count, + .get_ethtool_stats = vnic_get_ethtool_stats, +}; + +/* opa_vnic_set_ethtool_ops - set ethtool ops */ +void opa_vnic_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &opa_vnic_ethtool_ops; +} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h new file mode 100644 index 000000000000..ca29e6d5aedc --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h @@ -0,0 +1,329 @@ +#ifndef _OPA_VNIC_INTERNAL_H +#define _OPA_VNIC_INTERNAL_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +/* + * This file contains OPA VNIC driver internal declarations + */ + +#include <linux/bitops.h> +#include <linux/etherdevice.h> +#include <linux/hashtable.h> +#include <linux/sizes.h> +#include <rdma/opa_vnic.h> + +#include "opa_vnic_encap.h" + +#define OPA_VNIC_VLAN_PCP(vlan_tci) \ + (((vlan_tci) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) + +/* Flow to default port redirection table size */ +#define OPA_VNIC_FLOW_TBL_SIZE 32 + +/* Invalid port number */ +#define OPA_VNIC_INVALID_PORT 0xff + +struct opa_vnic_adapter; + +/** + * struct __opa_vesw_info - OPA vnic virtual switch info + * + * Same as opa_vesw_info without bitwise attribute. + */ +struct __opa_vesw_info { + u16 fabric_id; + u16 vesw_id; + + u8 rsvd0[6]; + u16 def_port_mask; + + u8 rsvd1[2]; + u16 pkey; + + u8 rsvd2[4]; + u32 u_mcast_dlid; + u32 u_ucast_dlid[OPA_VESW_MAX_NUM_DEF_PORT]; + + u8 rsvd3[44]; + u16 eth_mtu[OPA_VNIC_MAX_NUM_PCP]; + u16 eth_mtu_non_vlan; + u8 rsvd4[2]; +} __packed; + +/** + * struct __opa_per_veswport_info - OPA vnic per port info + * + * Same as opa_per_veswport_info without bitwise attribute. + */ +struct __opa_per_veswport_info { + u32 port_num; + + u8 eth_link_status; + u8 rsvd0[3]; + + u8 base_mac_addr[ETH_ALEN]; + u8 config_state; + u8 oper_state; + + u16 max_mac_tbl_ent; + u16 max_smac_ent; + u32 mac_tbl_digest; + u8 rsvd1[4]; + + u32 encap_slid; + + u8 pcp_to_sc_uc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_vl_uc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_sc_mc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_vl_mc[OPA_VNIC_MAX_NUM_PCP]; + + u8 non_vlan_sc_uc; + u8 non_vlan_vl_uc; + u8 non_vlan_sc_mc; + u8 non_vlan_vl_mc; + + u8 rsvd2[48]; + + u16 uc_macs_gen_count; + u16 mc_macs_gen_count; + + u8 rsvd3[8]; +} __packed; + +/** + * struct __opa_veswport_info - OPA vnic port info + * + * Same as opa_veswport_info without bitwise attribute. 
+ */ +struct __opa_veswport_info { + struct __opa_vesw_info vesw; + struct __opa_per_veswport_info vport; +}; + +/** + * struct __opa_veswport_trap - OPA vnic trap info + * + * Same as opa_veswport_trap without bitwise attribute. + */ +struct __opa_veswport_trap { + u16 fabric_id; + u16 veswid; + u32 veswportnum; + u16 opaportnum; + u8 veswportindex; + u8 opcode; + u32 reserved; +} __packed; + +/** + * struct opa_vnic_ctrl_port - OPA virtual NIC control port + * @ibdev: pointer to ib device + * @ops: opa vnic control operations + * @num_ports: number of opa ports + */ +struct opa_vnic_ctrl_port { + struct ib_device *ibdev; + struct opa_vnic_ctrl_ops *ops; + u8 num_ports; +}; + +/** + * struct opa_vnic_adapter - OPA VNIC netdev private data structure + * @netdev: pointer to associated netdev + * @ibdev: ib device + * @cport: pointer to opa vnic control port + * @rn_ops: rdma netdev's net_device_ops + * @port_num: OPA port number + * @vport_num: vesw port number + * @lock: adapter lock + * @info: virtual ethernet switch port information + * @vema_mac_addr: mac address configured by vema + * @umac_hash: unicast maclist hash + * @mmac_hash: multicast maclist hash + * @mactbl: hash table of MAC entries + * @mactbl_lock: mac table lock + * @stats_lock: statistics lock + * @flow_tbl: flow to default port redirection table + * @trap_timeout: trap timeout + * @trap_count: no. 
of traps allowed within timeout period + */ +struct opa_vnic_adapter { + struct net_device *netdev; + struct ib_device *ibdev; + struct opa_vnic_ctrl_port *cport; + const struct net_device_ops *rn_ops; + + u8 port_num; + u8 vport_num; + + /* Lock used around concurrent updates to netdev */ + struct mutex lock; + + struct __opa_veswport_info info; + u8 vema_mac_addr[ETH_ALEN]; + u32 umac_hash; + u32 mmac_hash; + struct hlist_head __rcu *mactbl; + + /* Lock used to protect updates to mac table */ + struct mutex mactbl_lock; + + /* Lock used to protect access to vnic counters */ + spinlock_t stats_lock; + + u8 flow_tbl[OPA_VNIC_FLOW_TBL_SIZE]; + + unsigned long trap_timeout; + u8 trap_count; +}; + +/* Same as opa_veswport_mactable_entry, but without bitwise attribute */ +struct __opa_vnic_mactable_entry { + u8 mac_addr[ETH_ALEN]; + u8 mac_addr_mask[ETH_ALEN]; + u32 dlid_sd; +} __packed; + +/** + * struct opa_vnic_mac_tbl_node - OPA VNIC mac table node + * @hlist: hash list handle + * @index: index of entry in the mac table + * @entry: entry in the table + */ +struct opa_vnic_mac_tbl_node { + struct hlist_node hlist; + u16 index; + struct __opa_vnic_mactable_entry entry; +}; + +#define v_dbg(format, arg...) \ + netdev_dbg(adapter->netdev, format, ## arg) +#define v_err(format, arg...) \ + netdev_err(adapter->netdev, format, ## arg) +#define v_info(format, arg...) \ + netdev_info(adapter->netdev, format, ## arg) +#define v_warn(format, arg...) \ + netdev_warn(adapter->netdev, format, ## arg) + +#define c_err(format, arg...) \ + dev_err(&cport->ibdev->dev, format, ## arg) +#define c_info(format, arg...) \ + dev_info(&cport->ibdev->dev, format, ## arg) +#define c_dbg(format, arg...) 
\ + dev_dbg(&cport->ibdev->dev, format, ## arg) + +/* The maximum allowed entries in the mac table */ +#define OPA_VNIC_MAC_TBL_MAX_ENTRIES 2048 +/* Limit of smac entries in mac table */ +#define OPA_VNIC_MAX_SMAC_LIMIT 256 + +/* The last octet of the MAC address is used as the key to the hash table */ +#define OPA_VNIC_MAC_HASH_IDX 5 + +/* The VNIC MAC hash table is of size 2^8 */ +#define OPA_VNIC_MAC_TBL_HASH_BITS 8 +#define OPA_VNIC_MAC_TBL_SIZE BIT(OPA_VNIC_MAC_TBL_HASH_BITS) + +/* VNIC HASH MACROS */ +#define vnic_hash_init(hashtable) __hash_init(hashtable, OPA_VNIC_MAC_TBL_SIZE) + +#define vnic_hash_add(hashtable, node, key) \ + hlist_add_head(node, \ + &hashtable[hash_min(key, ilog2(OPA_VNIC_MAC_TBL_SIZE))]) + +#define vnic_hash_for_each_safe(name, bkt, tmp, obj, member) \ + for ((bkt) = 0, obj = NULL; \ + !obj && (bkt) < OPA_VNIC_MAC_TBL_SIZE; (bkt)++) \ + hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) + +#define vnic_hash_for_each_possible(name, obj, member, key) \ + hlist_for_each_entry(obj, \ + &name[hash_min(key, ilog2(OPA_VNIC_MAC_TBL_SIZE))], member) + +#define vnic_hash_for_each(name, bkt, obj, member) \ + for ((bkt) = 0, obj = NULL; \ + !obj && (bkt) < OPA_VNIC_MAC_TBL_SIZE; (bkt)++) \ + hlist_for_each_entry(obj, &name[bkt], member) + +extern char opa_vnic_driver_name[]; +extern const char opa_vnic_driver_version[]; + +struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev, + u8 port_num, u8 vport_num); +void opa_vnic_rem_netdev(struct opa_vnic_adapter *adapter); +void opa_vnic_encap_skb(struct opa_vnic_adapter *adapter, struct sk_buff *skb); +u8 opa_vnic_get_vl(struct opa_vnic_adapter *adapter, struct sk_buff *skb); +u8 opa_vnic_calc_entropy(struct opa_vnic_adapter *adapter, struct sk_buff *skb); +void opa_vnic_process_vema_config(struct opa_vnic_adapter *adapter); +void opa_vnic_release_mac_tbl(struct opa_vnic_adapter *adapter); +void opa_vnic_query_mac_tbl(struct opa_vnic_adapter *adapter, + struct opa_veswport_mactable 
*tbl); +int opa_vnic_update_mac_tbl(struct opa_vnic_adapter *adapter, + struct opa_veswport_mactable *tbl); +void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs); +void opa_vnic_query_mcast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs); +void opa_vnic_get_summary_counters(struct opa_vnic_adapter *adapter, + struct opa_veswport_summary_counters *cntrs); +void opa_vnic_get_error_counters(struct opa_vnic_adapter *adapter, + struct opa_veswport_error_counters *cntrs); +void opa_vnic_get_vesw_info(struct opa_vnic_adapter *adapter, + struct opa_vesw_info *info); +void opa_vnic_set_vesw_info(struct opa_vnic_adapter *adapter, + struct opa_vesw_info *info); +void opa_vnic_get_per_veswport_info(struct opa_vnic_adapter *adapter, + struct opa_per_veswport_info *info); +void opa_vnic_set_per_veswport_info(struct opa_vnic_adapter *adapter, + struct opa_per_veswport_info *info); +void opa_vnic_vema_report_event(struct opa_vnic_adapter *adapter, u8 event); +void opa_vnic_set_ethtool_ops(struct net_device *netdev); +void opa_vnic_vema_send_trap(struct opa_vnic_adapter *adapter, + struct __opa_veswport_trap *data, u32 lid); + +#endif /* _OPA_VNIC_INTERNAL_H */ diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c new file mode 100644 index 000000000000..1a3c25364b64 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c @@ -0,0 +1,387 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA Virtual Network Interface Controller (VNIC) driver + * netdev functionality. 
+ */ + +#include <linux/module.h> +#include <linux/if_vlan.h> +#include <linux/crc32.h> + +#include "opa_vnic_internal.h" + +#define OPA_TX_TIMEOUT_MS 1000 + +#define OPA_VNIC_SKB_HEADROOM \ + ALIGN((OPA_VNIC_HDR_LEN + OPA_VNIC_SKB_MDATA_LEN), 8) + +/* This function is overloaded for opa_vnic specific implementation */ +static void opa_vnic_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct opa_vnic_stats vstats; + + memset(&vstats, 0, sizeof(vstats)); + spin_lock(&adapter->stats_lock); + adapter->rn_ops->ndo_get_stats64(netdev, &vstats.netstats); + spin_unlock(&adapter->stats_lock); + memcpy(stats, &vstats.netstats, sizeof(*stats)); +} + +/* opa_netdev_start_xmit - transmit function */ +static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + + v_dbg("xmit: queue %d skb len %d\n", skb->queue_mapping, skb->len); + /* pad to ensure minimum ethernet packet length */ + if (unlikely(skb->len < ETH_ZLEN)) { + if (skb_padto(skb, ETH_ZLEN)) + return NETDEV_TX_OK; + + skb_put(skb, ETH_ZLEN - skb->len); + } + + opa_vnic_encap_skb(adapter, skb); + return adapter->rn_ops->ndo_start_xmit(skb, netdev); +} + +static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, + void *accel_priv, + select_queue_fallback_t fallback) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct opa_vnic_skb_mdata *mdata; + int rc; + + /* pass entropy and vl as metadata in skb */ + mdata = skb_push(skb, sizeof(*mdata)); + mdata->entropy = opa_vnic_calc_entropy(adapter, skb); + mdata->vl = opa_vnic_get_vl(adapter, skb); + rc = adapter->rn_ops->ndo_select_queue(netdev, skb, + accel_priv, fallback); + skb_pull(skb, sizeof(*mdata)); + return rc; +} + +/* opa_vnic_process_vema_config - process vema configuration updates */ +void opa_vnic_process_vema_config(struct 
opa_vnic_adapter *adapter) +{ + struct __opa_veswport_info *info = &adapter->info; + struct rdma_netdev *rn = netdev_priv(adapter->netdev); + u8 port_num[OPA_VESW_MAX_NUM_DEF_PORT] = { 0 }; + struct net_device *netdev = adapter->netdev; + u8 i, port_count = 0; + u16 port_mask; + + /* If the base_mac_addr is changed, update the interface mac address */ + if (memcmp(info->vport.base_mac_addr, adapter->vema_mac_addr, + ARRAY_SIZE(info->vport.base_mac_addr))) { + struct sockaddr saddr; + + memcpy(saddr.sa_data, info->vport.base_mac_addr, + ARRAY_SIZE(info->vport.base_mac_addr)); + mutex_lock(&adapter->lock); + eth_mac_addr(netdev, &saddr); + memcpy(adapter->vema_mac_addr, + info->vport.base_mac_addr, ETH_ALEN); + mutex_unlock(&adapter->lock); + } + + rn->set_id(netdev, info->vesw.vesw_id); + + /* Handle MTU limit change */ + rtnl_lock(); + netdev->max_mtu = max_t(unsigned int, info->vesw.eth_mtu_non_vlan, + netdev->min_mtu); + if (netdev->mtu > netdev->max_mtu) + dev_set_mtu(netdev, netdev->max_mtu); + rtnl_unlock(); + + /* Update flow to default port redirection table */ + port_mask = info->vesw.def_port_mask; + for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) { + if (port_mask & 1) + port_num[port_count++] = i; + port_mask >>= 1; + } + + /* + * Build the flow table. Flow table is required when destination LID + * is not available. Up to OPA_VNIC_FLOW_TBL_SIZE flows supported. + * Each flow need a default port number to get its dlid from the + * u_ucast_dlid array. + */ + for (i = 0; i < OPA_VNIC_FLOW_TBL_SIZE; i++) + adapter->flow_tbl[i] = port_count ? 
port_num[i % port_count] : + OPA_VNIC_INVALID_PORT; + + /* Operational state can only be DROP_ALL or FORWARDING */ + if (info->vport.config_state == OPA_VNIC_STATE_FORWARDING) { + info->vport.oper_state = OPA_VNIC_STATE_FORWARDING; + netif_dormant_off(netdev); + } else { + info->vport.oper_state = OPA_VNIC_STATE_DROP_ALL; + netif_dormant_on(netdev); + } +} + +/* + * Set the power on default values in adapter's vema interface structure. + */ +static inline void opa_vnic_set_pod_values(struct opa_vnic_adapter *adapter) +{ + adapter->info.vport.max_mac_tbl_ent = OPA_VNIC_MAC_TBL_MAX_ENTRIES; + adapter->info.vport.max_smac_ent = OPA_VNIC_MAX_SMAC_LIMIT; + adapter->info.vport.config_state = OPA_VNIC_STATE_DROP_ALL; + adapter->info.vport.eth_link_status = OPA_VNIC_ETH_LINK_DOWN; +} + +/* opa_vnic_set_mac_addr - change mac address */ +static int opa_vnic_set_mac_addr(struct net_device *netdev, void *addr) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct sockaddr *sa = addr; + int rc; + + if (!memcmp(netdev->dev_addr, sa->sa_data, ETH_ALEN)) + return 0; + + mutex_lock(&adapter->lock); + rc = eth_mac_addr(netdev, addr); + mutex_unlock(&adapter->lock); + if (rc) + return rc; + + adapter->info.vport.uc_macs_gen_count++; + opa_vnic_vema_report_event(adapter, + OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE); + return 0; +} + +/* + * opa_vnic_mac_send_event - post event on possible mac list exchange + * Send trap when digest from uc/mc mac list differs from previous run. + * Digest is evaluated similar to how cksum does. 
+ */ +static void opa_vnic_mac_send_event(struct net_device *netdev, u8 event) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct netdev_hw_addr *ha; + struct netdev_hw_addr_list *hw_list; + u32 *ref_crc; + u32 l, crc = 0; + + switch (event) { + case OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE: + hw_list = &netdev->uc; + adapter->info.vport.uc_macs_gen_count++; + ref_crc = &adapter->umac_hash; + break; + case OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE: + hw_list = &netdev->mc; + adapter->info.vport.mc_macs_gen_count++; + ref_crc = &adapter->mmac_hash; + break; + default: + return; + } + netdev_hw_addr_list_for_each(ha, hw_list) { + crc = crc32_le(crc, ha->addr, ETH_ALEN); + } + l = netdev_hw_addr_list_count(hw_list) * ETH_ALEN; + crc = ~crc32_le(crc, (void *)&l, sizeof(l)); + + if (crc != *ref_crc) { + *ref_crc = crc; + opa_vnic_vema_report_event(adapter, event); + } +} + +/* opa_vnic_set_rx_mode - handle uc/mc mac list change */ +static void opa_vnic_set_rx_mode(struct net_device *netdev) +{ + opa_vnic_mac_send_event(netdev, + OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE); + + opa_vnic_mac_send_event(netdev, + OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE); +} + +/* opa_netdev_open - activate network interface */ +static int opa_netdev_open(struct net_device *netdev) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + int rc; + + rc = adapter->rn_ops->ndo_open(adapter->netdev); + if (rc) { + v_dbg("open failed %d\n", rc); + return rc; + } + + /* Update eth link status and send trap */ + adapter->info.vport.eth_link_status = OPA_VNIC_ETH_LINK_UP; + opa_vnic_vema_report_event(adapter, + OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE); + return 0; +} + +/* opa_netdev_close - disable network interface */ +static int opa_netdev_close(struct net_device *netdev) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + int rc; + + rc = adapter->rn_ops->ndo_stop(adapter->netdev); + if (rc) { + v_dbg("close failed %d\n", rc); + return rc; + } + + /* 
Update eth link status and send trap */ + adapter->info.vport.eth_link_status = OPA_VNIC_ETH_LINK_DOWN; + opa_vnic_vema_report_event(adapter, + OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE); + return 0; +} + +/* netdev ops */ +static const struct net_device_ops opa_netdev_ops = { + .ndo_open = opa_netdev_open, + .ndo_stop = opa_netdev_close, + .ndo_start_xmit = opa_netdev_start_xmit, + .ndo_get_stats64 = opa_vnic_get_stats64, + .ndo_set_rx_mode = opa_vnic_set_rx_mode, + .ndo_select_queue = opa_vnic_select_queue, + .ndo_set_mac_address = opa_vnic_set_mac_addr, +}; + +/* opa_vnic_add_netdev - create vnic netdev interface */ +struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev, + u8 port_num, u8 vport_num) +{ + struct opa_vnic_adapter *adapter; + struct net_device *netdev; + struct rdma_netdev *rn; + int rc; + + netdev = ibdev->alloc_rdma_netdev(ibdev, port_num, + RDMA_NETDEV_OPA_VNIC, + "veth%d", NET_NAME_UNKNOWN, + ether_setup); + if (!netdev) + return ERR_PTR(-ENOMEM); + else if (IS_ERR(netdev)) + return ERR_CAST(netdev); + + rn = netdev_priv(netdev); + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + if (!adapter) { + rc = -ENOMEM; + goto adapter_err; + } + + rn->clnt_priv = adapter; + rn->hca = ibdev; + rn->port_num = port_num; + adapter->netdev = netdev; + adapter->ibdev = ibdev; + adapter->port_num = port_num; + adapter->vport_num = vport_num; + adapter->rn_ops = netdev->netdev_ops; + + netdev->netdev_ops = &opa_netdev_ops; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->hard_header_len += OPA_VNIC_SKB_HEADROOM; + mutex_init(&adapter->lock); + mutex_init(&adapter->mactbl_lock); + spin_lock_init(&adapter->stats_lock); + + SET_NETDEV_DEV(netdev, ibdev->dev.parent); + + opa_vnic_set_ethtool_ops(netdev); + + opa_vnic_set_pod_values(adapter); + + rc = register_netdev(netdev); + if (rc) + goto netdev_err; + + netif_carrier_off(netdev); + netif_dormant_on(netdev); + v_info("initialized\n"); + + return adapter; +netdev_err: + 
mutex_destroy(&adapter->lock); + mutex_destroy(&adapter->mactbl_lock); + kfree(adapter); +adapter_err: + rn->free_rdma_netdev(netdev); + + return ERR_PTR(rc); +} + +/* opa_vnic_rem_netdev - remove vnic netdev interface */ +void opa_vnic_rem_netdev(struct opa_vnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct rdma_netdev *rn = netdev_priv(netdev); + + v_info("removing\n"); + unregister_netdev(netdev); + opa_vnic_release_mac_tbl(adapter); + mutex_destroy(&adapter->lock); + mutex_destroy(&adapter->mactbl_lock); + kfree(adapter); + rn->free_rdma_netdev(netdev); +} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c new file mode 100644 index 000000000000..cf768dd78d1b --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -0,0 +1,1056 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA Virtual Network Interface Controller (VNIC) + * Ethernet Management Agent (EMA) driver + */ + +#include <linux/module.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_smi.h> + +#include "opa_vnic_internal.h" + +#define DRV_VERSION "1.0" +char opa_vnic_driver_name[] = "opa_vnic"; +const char opa_vnic_driver_version[] = DRV_VERSION; + +/* + * The trap service level is kept in bits 3 to 7 in the trap_sl_rsvd + * field in the class port info MAD. + */ +#define GET_TRAP_SL_FROM_CLASS_PORT_INFO(x) (((x) >> 3) & 0x1f) + +/* Cap trap bursts to a reasonable limit good for normal cases */ +#define OPA_VNIC_TRAP_BURST_LIMIT 4 + +/* + * VNIC trap limit timeout. 
+ * Inverse of cap2_mask response time out (1.0737 secs) = 0.9 + * secs approx IB spec 13.4.6.2.1 PortInfoSubnetTimeout and + * 13.4.9 Traps. + */ +#define OPA_VNIC_TRAP_TIMEOUT ((4096 * (1UL << 18)) / 1000) + +#define OPA_VNIC_UNSUP_ATTR \ + cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB) + +#define OPA_VNIC_INVAL_ATTR \ + cpu_to_be16(IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE) + +#define OPA_VNIC_CLASS_CAP_TRAP 0x1 + +/* Maximum number of VNIC ports supported */ +#define OPA_VNIC_MAX_NUM_VPORT 255 + +/** + * struct opa_vnic_vema_port -- VNIC VEMA port details + * @cport: pointer to port + * @mad_agent: pointer to mad agent for port + * @class_port_info: Class port info information. + * @tid: Transaction id + * @port_num: OPA port number + * @vport_idr: vnic ports idr + * @event_handler: ib event handler + * @lock: adapter interface lock + */ +struct opa_vnic_vema_port { + struct opa_vnic_ctrl_port *cport; + struct ib_mad_agent *mad_agent; + struct opa_class_port_info class_port_info; + u64 tid; + u8 port_num; + struct idr vport_idr; + struct ib_event_handler event_handler; + + /* Lock to query/update network adapter */ + struct mutex lock; +}; + +static void opa_vnic_vema_add_one(struct ib_device *device); +static void opa_vnic_vema_rem_one(struct ib_device *device, + void *client_data); + +static struct ib_client opa_vnic_client = { + .name = opa_vnic_driver_name, + .add = opa_vnic_vema_add_one, + .remove = opa_vnic_vema_rem_one, +}; + +/** + * vema_get_vport_num -- Get the vnic from the mad + * @recvd_mad: Received mad + * + * Return: returns value of the vnic port number + */ +static inline u8 vema_get_vport_num(struct opa_vnic_vema_mad *recvd_mad) +{ + return be32_to_cpu(recvd_mad->mad_hdr.attr_mod) & 0xff; +} + +/** + * vema_get_vport_adapter -- Get vnic port adapter from recvd mad + * @recvd_mad: received mad + * @port: ptr to port struct on which MAD was recvd + * + * Return: vnic adapter + */ +static inline struct opa_vnic_adapter * 
+vema_get_vport_adapter(struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_port *port) +{ + u8 vport_num = vema_get_vport_num(recvd_mad); + + return idr_find(&port->vport_idr, vport_num); +} + +/** + * vema_mac_tbl_req_ok -- Check if mac request has correct values + * @mac_tbl: mac table + * + * This function checks for the validity of the offset and number of + * entries required. + * + * Return: true if offset and num_entries are valid + */ +static inline bool vema_mac_tbl_req_ok(struct opa_veswport_mactable *mac_tbl) +{ + u16 offset, num_entries; + u16 req_entries = ((OPA_VNIC_EMA_DATA - sizeof(*mac_tbl)) / + sizeof(mac_tbl->tbl_entries[0])); + + offset = be16_to_cpu(mac_tbl->offset); + num_entries = be16_to_cpu(mac_tbl->num_entries); + + return ((num_entries <= req_entries) && + (offset + num_entries <= OPA_VNIC_MAC_TBL_MAX_ENTRIES)); +} + +/* + * Return the power on default values in the port info structure + * in big endian format as required by MAD. + */ +static inline void vema_get_pod_values(struct opa_veswport_info *port_info) +{ + memset(port_info, 0, sizeof(*port_info)); + port_info->vport.max_mac_tbl_ent = + cpu_to_be16(OPA_VNIC_MAC_TBL_MAX_ENTRIES); + port_info->vport.max_smac_ent = + cpu_to_be16(OPA_VNIC_MAX_SMAC_LIMIT); + port_info->vport.oper_state = OPA_VNIC_STATE_DROP_ALL; + port_info->vport.config_state = OPA_VNIC_STATE_DROP_ALL; +} + +/** + * vema_add_vport -- Add a new vnic port + * @port: ptr to opa_vnic_vema_port struct + * @vport_num: vnic port number (to be added) + * + * Return a pointer to the vnic adapter structure + */ +static struct opa_vnic_adapter *vema_add_vport(struct opa_vnic_vema_port *port, + u8 vport_num) +{ + struct opa_vnic_ctrl_port *cport = port->cport; + struct opa_vnic_adapter *adapter; + + adapter = opa_vnic_add_netdev(cport->ibdev, port->port_num, vport_num); + if (!IS_ERR(adapter)) { + int rc; + + adapter->cport = cport; + rc = idr_alloc(&port->vport_idr, adapter, vport_num, + vport_num + 1, GFP_NOWAIT); + if 
(rc < 0) { + opa_vnic_rem_netdev(adapter); + adapter = ERR_PTR(rc); + } + } + + return adapter; +} + +/** + * vema_get_class_port_info -- Get class info for port + * @port: Port on whic MAD was received + * @recvd_mad: pointer to the received mad + * @rsp_mad: pointer to respose mad + * + * This function copies the latest class port info value set for the + * port and stores it for generating traps + */ +static void vema_get_class_port_info(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_class_port_info *port_info; + + port_info = (struct opa_class_port_info *)rsp_mad->data; + memcpy(port_info, &port->class_port_info, sizeof(*port_info)); + port_info->base_version = OPA_MGMT_BASE_VERSION, + port_info->class_version = OPA_EMA_CLASS_VERSION; + + /* + * Set capability mask bit indicating agent generates traps, + * and set the maximum number of VNIC ports supported. + */ + port_info->cap_mask = cpu_to_be16((OPA_VNIC_CLASS_CAP_TRAP | + (OPA_VNIC_MAX_NUM_VPORT << 8))); + + /* + * Since a get routine is always sent by the EM first we + * set the expected response time to + * 4.096 usec * 2^18 == 1.0737 sec here. 
+ */ + port_info->cap_mask2_resp_time = cpu_to_be32(18); +} + +/** + * vema_set_class_port_info -- Get class info for port + * @port: Port on whic MAD was received + * @recvd_mad: pointer to the received mad + * @rsp_mad: pointer to respose mad + * + * This function updates the port class info for the specific vnic + * and sets up the response mad data + */ +static void vema_set_class_port_info(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + memcpy(&port->class_port_info, recvd_mad->data, + sizeof(port->class_port_info)); + + vema_get_class_port_info(port, recvd_mad, rsp_mad); +} + +/** + * vema_get_veswport_info -- Get veswport info + * @port: source port on which MAD was received + * @recvd_mad: pointer to the received mad + * @rsp_mad: pointer to respose mad + */ +static void vema_get_veswport_info(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_info *port_info = + (struct opa_veswport_info *)rsp_mad->data; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (adapter) { + memset(port_info, 0, sizeof(*port_info)); + opa_vnic_get_vesw_info(adapter, &port_info->vesw); + opa_vnic_get_per_veswport_info(adapter, + &port_info->vport); + } else { + vema_get_pod_values(port_info); + } +} + +/** + * vema_set_veswport_info -- Set veswport info + * @port: source port on which MAD was received + * @recvd_mad: pointer to the received mad + * @rsp_mad: pointer to respose mad + * + * This function gets the port class infor for vnic + */ +static void vema_set_veswport_info(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_vnic_ctrl_port *cport = port->cport; + struct opa_veswport_info *port_info; + struct opa_vnic_adapter *adapter; + u8 vport_num; + + vport_num = vema_get_vport_num(recvd_mad); + + adapter = 
vema_get_vport_adapter(recvd_mad, port); + if (!adapter) { + adapter = vema_add_vport(port, vport_num); + if (IS_ERR(adapter)) { + c_err("failed to add vport %d: %ld\n", + vport_num, PTR_ERR(adapter)); + goto err_exit; + } + } + + port_info = (struct opa_veswport_info *)recvd_mad->data; + opa_vnic_set_vesw_info(adapter, &port_info->vesw); + opa_vnic_set_per_veswport_info(adapter, &port_info->vport); + + /* Process the new config settings */ + opa_vnic_process_vema_config(adapter); + + vema_get_veswport_info(port, recvd_mad, rsp_mad); + return; + +err_exit: + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; +} + +/** + * vema_get_mac_entries -- Get MAC entries in VNIC MAC table + * @port: source port on which MAD was received + * @recvd_mad: pointer to the received mad + * @rsp_mad: pointer to respose mad + * + * This function gets the MAC entries that are programmed into + * the VNIC MAC forwarding table. It checks for the validity of + * the index into the MAC table and the number of entries that + * are to be retrieved. 
+ */
static void vema_get_mac_entries(struct opa_vnic_vema_port *port,
				 struct opa_vnic_vema_mad *recvd_mad,
				 struct opa_vnic_vema_mad *rsp_mad)
{
	struct opa_veswport_mactable *req_tbl, *rsp_tbl;
	struct opa_vnic_adapter *adapter;

	adapter = vema_get_vport_adapter(recvd_mad, port);
	if (!adapter)
		goto invalid;

	req_tbl = (struct opa_veswport_mactable *)recvd_mad->data;
	rsp_tbl = (struct opa_veswport_mactable *)rsp_mad->data;

	if (!vema_mac_tbl_req_ok(req_tbl))
		goto invalid;

	/* Echo the requested window, then let the adapter fill it in */
	rsp_tbl->offset = req_tbl->offset;
	rsp_tbl->num_entries = req_tbl->num_entries;
	opa_vnic_query_mac_tbl(adapter, rsp_tbl);
	return;

invalid:
	rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR;
}

/**
 * vema_set_mac_entries -- Set MAC entries in VNIC MAC table
 * @port:      source port on which MAD was received
 * @recvd_mad: pointer to the received mad
 * @rsp_mad:   pointer to response mad
 *
 * This function sets the MAC entries in the VNIC forwarding table
 * It checks for the validity of the index and the number of forwarding
 * table entries to be programmed.
+ */ +static void vema_set_mac_entries(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_mactable *mac_tbl; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (!adapter) { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + return; + } + + mac_tbl = (struct opa_veswport_mactable *)recvd_mad->data; + if (vema_mac_tbl_req_ok(mac_tbl)) { + if (opa_vnic_update_mac_tbl(adapter, mac_tbl)) + rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + } else { + rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + } + vema_get_mac_entries(port, recvd_mad, rsp_mad); +} + +/** + * vema_set_delete_vesw -- Reset VESW info to POD values + * @port: source port on which MAD was received + * @recvd_mad: pointer to the received mad + * @rsp_mad: pointer to respose mad + * + * This function clears all the fields of veswport info for the requested vesw + * and sets them back to the power-on default values. It does not delete the + * vesw. + */ +static void vema_set_delete_vesw(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_info *port_info = + (struct opa_veswport_info *)rsp_mad->data; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (!adapter) { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + return; + } + + vema_get_pod_values(port_info); + opa_vnic_set_vesw_info(adapter, &port_info->vesw); + opa_vnic_set_per_veswport_info(adapter, &port_info->vport); + + /* Process the new config settings */ + opa_vnic_process_vema_config(adapter); + + opa_vnic_release_mac_tbl(adapter); + + vema_get_veswport_info(port, recvd_mad, rsp_mad); +} + +/** + * vema_get_mac_list -- Get the unicast/multicast macs. 
+ * @port: source port on which MAD was received + * @recvd_mad: Received mad contains fields to set vnic parameters + * @rsp_mad: Response mad to be built + * @attr_id: Attribute ID indicating multicast or unicast mac list + */ +static void vema_get_mac_list(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad, + u16 attr_id) +{ + struct opa_veswport_iface_macs *macs_in, *macs_out; + int max_entries = (OPA_VNIC_EMA_DATA - sizeof(*macs_out)) / ETH_ALEN; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (!adapter) { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + return; + } + + macs_in = (struct opa_veswport_iface_macs *)recvd_mad->data; + macs_out = (struct opa_veswport_iface_macs *)rsp_mad->data; + + macs_out->start_idx = macs_in->start_idx; + if (macs_in->num_macs_in_msg) + macs_out->num_macs_in_msg = macs_in->num_macs_in_msg; + else + macs_out->num_macs_in_msg = cpu_to_be16(max_entries); + + if (attr_id == OPA_EM_ATTR_IFACE_MCAST_MACS) + opa_vnic_query_mcast_macs(adapter, macs_out); + else + opa_vnic_query_ucast_macs(adapter, macs_out); +} + +/** + * vema_get_summary_counters -- Gets summary counters. + * @port: source port on which MAD was received + * @recvd_mad: Received mad contains fields to set vnic parameters + * @rsp_mad: Response mad to be built + */ +static void vema_get_summary_counters(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_summary_counters *cntrs; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (adapter) { + cntrs = (struct opa_veswport_summary_counters *)rsp_mad->data; + opa_vnic_get_summary_counters(adapter, cntrs); + } else { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + } +} + +/** + * vema_get_error_counters -- Gets summary counters. 
+ * @port: source port on which MAD was received + * @recvd_mad: Received mad contains fields to set vnic parameters + * @rsp_mad: Response mad to be built + */ +static void vema_get_error_counters(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_error_counters *cntrs; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (adapter) { + cntrs = (struct opa_veswport_error_counters *)rsp_mad->data; + opa_vnic_get_error_counters(adapter, cntrs); + } else { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + } +} + +/** + * vema_get -- Process received get MAD + * @port: source port on which MAD was received + * @recvd_mad: Received mad + * @rsp_mad: Response mad to be built + */ +static void vema_get(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + u16 attr_id = be16_to_cpu(recvd_mad->mad_hdr.attr_id); + + switch (attr_id) { + case OPA_EM_ATTR_CLASS_PORT_INFO: + vema_get_class_port_info(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_INFO: + vema_get_veswport_info(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_MAC_ENTRIES: + vema_get_mac_entries(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_IFACE_UCAST_MACS: + /* fall through */ + case OPA_EM_ATTR_IFACE_MCAST_MACS: + vema_get_mac_list(port, recvd_mad, rsp_mad, attr_id); + break; + case OPA_EM_ATTR_VESWPORT_SUMMARY_COUNTERS: + vema_get_summary_counters(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_ERROR_COUNTERS: + vema_get_error_counters(port, recvd_mad, rsp_mad); + break; + default: + rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + break; + } +} + +/** + * vema_set -- Process received set MAD + * @port: source port on which MAD was received + * @recvd_mad: Received mad contains fields to set vnic parameters + * @rsp_mad: Response mad to be built + */ +static void 
vema_set(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + u16 attr_id = be16_to_cpu(recvd_mad->mad_hdr.attr_id); + + switch (attr_id) { + case OPA_EM_ATTR_CLASS_PORT_INFO: + vema_set_class_port_info(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_INFO: + vema_set_veswport_info(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_MAC_ENTRIES: + vema_set_mac_entries(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_DELETE_VESW: + vema_set_delete_vesw(port, recvd_mad, rsp_mad); + break; + default: + rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + break; + } +} + +/** + * vema_send -- Send handler for VEMA MAD agent + * @mad_agent: pointer to the mad agent + * @mad_wc: pointer to mad send work completion information + * + * Free all the data structures associated with the sent MAD + */ +static void vema_send(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_wc) +{ + rdma_destroy_ah(mad_wc->send_buf->ah); + ib_free_send_mad(mad_wc->send_buf); +} + +/** + * vema_recv -- Recv handler for VEMA MAD agent + * @mad_agent: pointer to the mad agent + * @send_buf: Send buffer if found, else NULL + * @mad_wc: pointer to mad send work completion information + * + * Handle only set and get methods and respond to other methods + * as unsupported. Allocate response buffer and address handle + * for the response MAD. 
+ */ +static void vema_recv(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_wc) +{ + struct opa_vnic_vema_port *port; + struct ib_ah *ah; + struct ib_mad_send_buf *rsp; + struct opa_vnic_vema_mad *vema_mad; + + if (!mad_wc || !mad_wc->recv_buf.mad) + return; + + port = mad_agent->context; + ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc, + mad_wc->recv_buf.grh, mad_agent->port_num); + if (IS_ERR(ah)) + goto free_recv_mad; + + rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp, + mad_wc->wc->pkey_index, 0, + IB_MGMT_VENDOR_HDR, OPA_VNIC_EMA_DATA, + GFP_KERNEL, OPA_MGMT_BASE_VERSION); + if (IS_ERR(rsp)) + goto err_rsp; + + rsp->ah = ah; + vema_mad = rsp->mad; + memcpy(vema_mad, mad_wc->recv_buf.mad, IB_MGMT_VENDOR_HDR); + vema_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + vema_mad->mad_hdr.status = 0; + + /* Lock ensures network adapter is not removed */ + mutex_lock(&port->lock); + + switch (mad_wc->recv_buf.mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + vema_get(port, (struct opa_vnic_vema_mad *)mad_wc->recv_buf.mad, + vema_mad); + break; + case IB_MGMT_METHOD_SET: + vema_set(port, (struct opa_vnic_vema_mad *)mad_wc->recv_buf.mad, + vema_mad); + break; + default: + vema_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + break; + } + mutex_unlock(&port->lock); + + if (!ib_post_send_mad(rsp, NULL)) { + /* + * with post send successful ah and send mad + * will be destroyed in send handler + */ + goto free_recv_mad; + } + + ib_free_send_mad(rsp); + +err_rsp: + rdma_destroy_ah(ah); +free_recv_mad: + ib_free_recv_mad(mad_wc); +} + +/** + * vema_get_port -- Gets the opa_vnic_vema_port + * @cport: pointer to control dev + * @port_num: Port number + * + * This function loops through the ports and returns + * the opa_vnic_vema port structure that is associated + * with the OPA port number + * + * Return: ptr to requested opa_vnic_vema_port strucure + * if success, NULL if not + */ +static struct 
opa_vnic_vema_port * +vema_get_port(struct opa_vnic_ctrl_port *cport, u8 port_num) +{ + struct opa_vnic_vema_port *port = (void *)cport + sizeof(*cport); + + if (port_num > cport->num_ports) + return NULL; + + return port + (port_num - 1); +} + +/** + * opa_vnic_vema_send_trap -- This function sends a trap to the EM + * @cport: pointer to vnic control port + * @data: pointer to trap data filled by calling function + * @lid: issuers lid (encap_slid from vesw_port_info) + * + * This function is called from the VNIC driver to send a trap if there + * is somethng the EM should be notified about. These events currently + * are + * 1) UNICAST INTERFACE MACADDRESS changes + * 2) MULTICAST INTERFACE MACADDRESS changes + * 3) ETHERNET LINK STATUS changes + * While allocating the send mad the remote site qpn used is 1 + * as this is the well known QP. + * + */ +void opa_vnic_vema_send_trap(struct opa_vnic_adapter *adapter, + struct __opa_veswport_trap *data, u32 lid) +{ + struct opa_vnic_ctrl_port *cport = adapter->cport; + struct ib_mad_send_buf *send_buf; + struct opa_vnic_vema_port *port; + struct ib_device *ibp; + struct opa_vnic_vema_mad_trap *trap_mad; + struct opa_class_port_info *class; + struct rdma_ah_attr ah_attr; + struct ib_ah *ah; + struct opa_veswport_trap *trap; + u32 trap_lid; + u16 pkey_idx; + + if (!cport) + goto err_exit; + ibp = cport->ibdev; + port = vema_get_port(cport, data->opaportnum); + if (!port || !port->mad_agent) + goto err_exit; + + if (time_before(jiffies, adapter->trap_timeout)) { + if (adapter->trap_count == OPA_VNIC_TRAP_BURST_LIMIT) { + v_warn("Trap rate exceeded\n"); + goto err_exit; + } else { + adapter->trap_count++; + } + } else { + adapter->trap_count = 0; + } + + class = &port->class_port_info; + /* Set up address handle */ + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.type = rdma_ah_find_type(ibp, port->port_num); + rdma_ah_set_sl(&ah_attr, + GET_TRAP_SL_FROM_CLASS_PORT_INFO(class->trap_sl_rsvd)); + 
rdma_ah_set_port_num(&ah_attr, port->port_num); + trap_lid = be32_to_cpu(class->trap_lid); + /* + * check for trap lid validity, must not be zero + * The trap sink could change after we fashion the MAD but since traps + * are not guaranteed we won't use a lock as anyway the change will take + * place even with locking. + */ + if (!trap_lid) { + c_err("%s: Invalid dlid\n", __func__); + goto err_exit; + } + + rdma_ah_set_dlid(&ah_attr, trap_lid); + ah = rdma_create_ah(port->mad_agent->qp->pd, &ah_attr); + if (IS_ERR(ah)) { + c_err("%s:Couldn't create new AH = %p\n", __func__, ah); + c_err("%s:dlid = %d, sl = %d, port = %d\n", __func__, + rdma_ah_get_dlid(&ah_attr), rdma_ah_get_sl(&ah_attr), + rdma_ah_get_port_num(&ah_attr)); + goto err_exit; + } + + if (ib_find_pkey(ibp, data->opaportnum, IB_DEFAULT_PKEY_FULL, + &pkey_idx) < 0) { + c_err("%s:full key not found, defaulting to partial\n", + __func__); + if (ib_find_pkey(ibp, data->opaportnum, IB_DEFAULT_PKEY_PARTIAL, + &pkey_idx) < 0) + pkey_idx = 1; + } + + send_buf = ib_create_send_mad(port->mad_agent, 1, pkey_idx, 0, + IB_MGMT_VENDOR_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, OPA_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) { + c_err("%s:Couldn't allocate send buf\n", __func__); + goto err_sndbuf; + } + + send_buf->ah = ah; + + /* Set up common MAD hdr */ + trap_mad = send_buf->mad; + trap_mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION; + trap_mad->mad_hdr.mgmt_class = OPA_MGMT_CLASS_INTEL_EMA; + trap_mad->mad_hdr.class_version = OPA_EMA_CLASS_VERSION; + trap_mad->mad_hdr.method = IB_MGMT_METHOD_TRAP; + port->tid++; + trap_mad->mad_hdr.tid = cpu_to_be64(port->tid); + trap_mad->mad_hdr.attr_id = IB_SMP_ATTR_NOTICE; + + /* Set up vendor OUI */ + trap_mad->oui[0] = INTEL_OUI_1; + trap_mad->oui[1] = INTEL_OUI_2; + trap_mad->oui[2] = INTEL_OUI_3; + + /* Setup notice attribute portion */ + trap_mad->notice.gen_type = OPA_INTEL_EMA_NOTICE_TYPE_INFO << 1; + trap_mad->notice.oui_1 = INTEL_OUI_1; + trap_mad->notice.oui_2 = 
INTEL_OUI_2; + trap_mad->notice.oui_3 = INTEL_OUI_3; + trap_mad->notice.issuer_lid = cpu_to_be32(lid); + + /* copy the actual trap data */ + trap = (struct opa_veswport_trap *)trap_mad->notice.raw_data; + trap->fabric_id = cpu_to_be16(data->fabric_id); + trap->veswid = cpu_to_be16(data->veswid); + trap->veswportnum = cpu_to_be32(data->veswportnum); + trap->opaportnum = cpu_to_be16(data->opaportnum); + trap->veswportindex = data->veswportindex; + trap->opcode = data->opcode; + + /* If successful send set up rate limit timeout else bail */ + if (ib_post_send_mad(send_buf, NULL)) { + ib_free_send_mad(send_buf); + } else { + if (adapter->trap_count) + return; + adapter->trap_timeout = jiffies + + usecs_to_jiffies(OPA_VNIC_TRAP_TIMEOUT); + return; + } + +err_sndbuf: + rdma_destroy_ah(ah); +err_exit: + v_err("Aborting trap\n"); +} + +static int vema_rem_vport(int id, void *p, void *data) +{ + struct opa_vnic_adapter *adapter = p; + + opa_vnic_rem_netdev(adapter); + return 0; +} + +static int vema_enable_vport(int id, void *p, void *data) +{ + struct opa_vnic_adapter *adapter = p; + + netif_carrier_on(adapter->netdev); + return 0; +} + +static int vema_disable_vport(int id, void *p, void *data) +{ + struct opa_vnic_adapter *adapter = p; + + netif_carrier_off(adapter->netdev); + return 0; +} + +static void opa_vnic_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct opa_vnic_vema_port *port = + container_of(handler, struct opa_vnic_vema_port, event_handler); + struct opa_vnic_ctrl_port *cport = port->cport; + + if (record->element.port_num != port->port_num) + return; + + c_dbg("OPA_VNIC received event %d on device %s port %d\n", + record->event, record->device->name, record->element.port_num); + + if (record->event == IB_EVENT_PORT_ERR) + idr_for_each(&port->vport_idr, vema_disable_vport, NULL); + if (record->event == IB_EVENT_PORT_ACTIVE) + idr_for_each(&port->vport_idr, vema_enable_vport, NULL); +} + +/** + * vema_unregister -- Unregisters 
agent + * @cport: pointer to control port + * + * This deletes the registration by VEMA for MADs + */ +static void vema_unregister(struct opa_vnic_ctrl_port *cport) +{ + int i; + + for (i = 1; i <= cport->num_ports; i++) { + struct opa_vnic_vema_port *port = vema_get_port(cport, i); + + if (!port->mad_agent) + continue; + + /* Lock ensures no MAD is being processed */ + mutex_lock(&port->lock); + idr_for_each(&port->vport_idr, vema_rem_vport, NULL); + mutex_unlock(&port->lock); + + ib_unregister_mad_agent(port->mad_agent); + port->mad_agent = NULL; + mutex_destroy(&port->lock); + idr_destroy(&port->vport_idr); + ib_unregister_event_handler(&port->event_handler); + } +} + +/** + * vema_register -- Registers agent + * @cport: pointer to control port + * + * This function registers the handlers for the VEMA MADs + * + * Return: returns 0 on success. non zero otherwise + */ +static int vema_register(struct opa_vnic_ctrl_port *cport) +{ + struct ib_mad_reg_req reg_req = { + .mgmt_class = OPA_MGMT_CLASS_INTEL_EMA, + .mgmt_class_version = OPA_MGMT_BASE_VERSION, + .oui = { INTEL_OUI_1, INTEL_OUI_2, INTEL_OUI_3 } + }; + int i; + + set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); + set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask); + + /* register ib event handler and mad agent for each port on dev */ + for (i = 1; i <= cport->num_ports; i++) { + struct opa_vnic_vema_port *port = vema_get_port(cport, i); + int ret; + + port->cport = cport; + port->port_num = i; + + INIT_IB_EVENT_HANDLER(&port->event_handler, + cport->ibdev, opa_vnic_event); + ret = ib_register_event_handler(&port->event_handler); + if (ret) { + c_err("port %d: event handler register failed\n", i); + vema_unregister(cport); + return ret; + } + + idr_init(&port->vport_idr); + mutex_init(&port->lock); + port->mad_agent = ib_register_mad_agent(cport->ibdev, i, + IB_QPT_GSI, &reg_req, + IB_MGMT_RMPP_VERSION, + vema_send, vema_recv, + port, 0); + if (IS_ERR(port->mad_agent)) { + ret = PTR_ERR(port->mad_agent); + 
port->mad_agent = NULL; + mutex_destroy(&port->lock); + idr_destroy(&port->vport_idr); + vema_unregister(cport); + return ret; + } + } + + return 0; +} + +/** + * opa_vnic_vema_add_one -- Handle new ib device + * @device: ib device pointer + * + * Allocate the vnic control port and initialize it. + */ +static void opa_vnic_vema_add_one(struct ib_device *device) +{ + struct opa_vnic_ctrl_port *cport; + int rc, size = sizeof(*cport); + + if (!rdma_cap_opa_vnic(device)) + return; + + size += device->phys_port_cnt * sizeof(struct opa_vnic_vema_port); + cport = kzalloc(size, GFP_KERNEL); + if (!cport) + return; + + cport->num_ports = device->phys_port_cnt; + cport->ibdev = device; + + /* Initialize opa vnic management agent (vema) */ + rc = vema_register(cport); + if (!rc) + c_info("VNIC client initialized\n"); + + ib_set_client_data(device, &opa_vnic_client, cport); +} + +/** + * opa_vnic_vema_rem_one -- Handle ib device removal + * @device: ib device pointer + * @client_data: ib client data + * + * Uninitialize and free the vnic control port. 
+ */ +static void opa_vnic_vema_rem_one(struct ib_device *device, + void *client_data) +{ + struct opa_vnic_ctrl_port *cport = client_data; + + if (!cport) + return; + + c_info("removing VNIC client\n"); + vema_unregister(cport); + kfree(cport); +} + +static int __init opa_vnic_init(void) +{ + int rc; + + pr_info("OPA Virtual Network Driver - v%s\n", + opa_vnic_driver_version); + + rc = ib_register_client(&opa_vnic_client); + if (rc) + pr_err("VNIC driver register failed %d\n", rc); + + return rc; +} +module_init(opa_vnic_init); + +static void opa_vnic_deinit(void) +{ + ib_unregister_client(&opa_vnic_client); +} +module_exit(opa_vnic_deinit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel OPA Virtual Network driver"); +MODULE_VERSION(DRV_VERSION); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c new file mode 100644 index 000000000000..c2733964379c --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c @@ -0,0 +1,390 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA VNIC EMA Interface functions. + */ + +#include "opa_vnic_internal.h" + +/** + * opa_vnic_vema_report_event - send trap to report the specified event + * @adapter: vnic port adapter + * @event: event to be reported + * + * This function calls vema api to send a trap for the given event. 
+ */ +void opa_vnic_vema_report_event(struct opa_vnic_adapter *adapter, u8 event) +{ + struct __opa_veswport_info *info = &adapter->info; + struct __opa_veswport_trap trap_data; + + trap_data.fabric_id = info->vesw.fabric_id; + trap_data.veswid = info->vesw.vesw_id; + trap_data.veswportnum = info->vport.port_num; + trap_data.opaportnum = adapter->port_num; + trap_data.veswportindex = adapter->vport_num; + trap_data.opcode = event; + + opa_vnic_vema_send_trap(adapter, &trap_data, info->vport.encap_slid); +} + +/** + * opa_vnic_get_summary_counters - get summary counters + * @adapter: vnic port adapter + * @cntrs: pointer to destination summary counters structure + * + * This function populates the summary counters that are maintained by the + * given adapter to destination address provided. + */ +void opa_vnic_get_summary_counters(struct opa_vnic_adapter *adapter, + struct opa_veswport_summary_counters *cntrs) +{ + struct opa_vnic_stats vstats; + __be64 *dst; + u64 *src; + + memset(&vstats, 0, sizeof(vstats)); + spin_lock(&adapter->stats_lock); + adapter->rn_ops->ndo_get_stats64(adapter->netdev, &vstats.netstats); + spin_unlock(&adapter->stats_lock); + + cntrs->vp_instance = cpu_to_be16(adapter->vport_num); + cntrs->vesw_id = cpu_to_be16(adapter->info.vesw.vesw_id); + cntrs->veswport_num = cpu_to_be32(adapter->port_num); + + cntrs->tx_errors = cpu_to_be64(vstats.netstats.tx_errors); + cntrs->rx_errors = cpu_to_be64(vstats.netstats.rx_errors); + cntrs->tx_packets = cpu_to_be64(vstats.netstats.tx_packets); + cntrs->rx_packets = cpu_to_be64(vstats.netstats.rx_packets); + cntrs->tx_bytes = cpu_to_be64(vstats.netstats.tx_bytes); + cntrs->rx_bytes = cpu_to_be64(vstats.netstats.rx_bytes); + + /* + * This loop depends on layout of + * opa_veswport_summary_counters opa_vnic_stats structures. 
+ */ + for (dst = &cntrs->tx_unicast, src = &vstats.tx_grp.unicast; + dst < &cntrs->reserved[0]; dst++, src++) { + *dst = cpu_to_be64(*src); + } +} + +/** + * opa_vnic_get_error_counters - get error counters + * @adapter: vnic port adapter + * @cntrs: pointer to destination error counters structure + * + * This function populates the error counters that is maintained by the + * given adapter to destination address provided. + */ +void opa_vnic_get_error_counters(struct opa_vnic_adapter *adapter, + struct opa_veswport_error_counters *cntrs) +{ + struct opa_vnic_stats vstats; + + memset(&vstats, 0, sizeof(vstats)); + spin_lock(&adapter->stats_lock); + adapter->rn_ops->ndo_get_stats64(adapter->netdev, &vstats.netstats); + spin_unlock(&adapter->stats_lock); + + cntrs->vp_instance = cpu_to_be16(adapter->vport_num); + cntrs->vesw_id = cpu_to_be16(adapter->info.vesw.vesw_id); + cntrs->veswport_num = cpu_to_be32(adapter->port_num); + + cntrs->tx_errors = cpu_to_be64(vstats.netstats.tx_errors); + cntrs->rx_errors = cpu_to_be64(vstats.netstats.rx_errors); + cntrs->tx_dlid_zero = cpu_to_be64(vstats.tx_dlid_zero); + cntrs->tx_drop_state = cpu_to_be64(vstats.tx_drop_state); + cntrs->tx_logic = cpu_to_be64(vstats.netstats.tx_fifo_errors + + vstats.netstats.tx_carrier_errors); + + cntrs->rx_bad_veswid = cpu_to_be64(vstats.netstats.rx_nohandler); + cntrs->rx_runt = cpu_to_be64(vstats.rx_runt); + cntrs->rx_oversize = cpu_to_be64(vstats.rx_oversize); + cntrs->rx_drop_state = cpu_to_be64(vstats.rx_drop_state); + cntrs->rx_logic = cpu_to_be64(vstats.netstats.rx_fifo_errors); +} + +/** + * opa_vnic_get_vesw_info -- Get the vesw information + * @adapter: vnic port adapter + * @info: pointer to destination vesw info structure + * + * This function copies the vesw info that is maintained by the + * given adapter to destination address provided. 
+ */ +void opa_vnic_get_vesw_info(struct opa_vnic_adapter *adapter, + struct opa_vesw_info *info) +{ + struct __opa_vesw_info *src = &adapter->info.vesw; + int i; + + info->fabric_id = cpu_to_be16(src->fabric_id); + info->vesw_id = cpu_to_be16(src->vesw_id); + memcpy(info->rsvd0, src->rsvd0, ARRAY_SIZE(src->rsvd0)); + info->def_port_mask = cpu_to_be16(src->def_port_mask); + memcpy(info->rsvd1, src->rsvd1, ARRAY_SIZE(src->rsvd1)); + info->pkey = cpu_to_be16(src->pkey); + + memcpy(info->rsvd2, src->rsvd2, ARRAY_SIZE(src->rsvd2)); + info->u_mcast_dlid = cpu_to_be32(src->u_mcast_dlid); + for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) + info->u_ucast_dlid[i] = cpu_to_be32(src->u_ucast_dlid[i]); + + memcpy(info->rsvd3, src->rsvd3, ARRAY_SIZE(src->rsvd3)); + for (i = 0; i < OPA_VNIC_MAX_NUM_PCP; i++) + info->eth_mtu[i] = cpu_to_be16(src->eth_mtu[i]); + + info->eth_mtu_non_vlan = cpu_to_be16(src->eth_mtu_non_vlan); + memcpy(info->rsvd4, src->rsvd4, ARRAY_SIZE(src->rsvd4)); +} + +/** + * opa_vnic_set_vesw_info -- Set the vesw information + * @adapter: vnic port adapter + * @info: pointer to vesw info structure + * + * This function updates the vesw info that is maintained by the + * given adapter with vesw info provided. Reserved fields are stored + * and returned back to EM as is. 
+ */ +void opa_vnic_set_vesw_info(struct opa_vnic_adapter *adapter, + struct opa_vesw_info *info) +{ + struct __opa_vesw_info *dst = &adapter->info.vesw; + int i; + + dst->fabric_id = be16_to_cpu(info->fabric_id); + dst->vesw_id = be16_to_cpu(info->vesw_id); + memcpy(dst->rsvd0, info->rsvd0, ARRAY_SIZE(info->rsvd0)); + dst->def_port_mask = be16_to_cpu(info->def_port_mask); + memcpy(dst->rsvd1, info->rsvd1, ARRAY_SIZE(info->rsvd1)); + dst->pkey = be16_to_cpu(info->pkey); + + memcpy(dst->rsvd2, info->rsvd2, ARRAY_SIZE(info->rsvd2)); + dst->u_mcast_dlid = be32_to_cpu(info->u_mcast_dlid); + for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) + dst->u_ucast_dlid[i] = be32_to_cpu(info->u_ucast_dlid[i]); + + memcpy(dst->rsvd3, info->rsvd3, ARRAY_SIZE(info->rsvd3)); + for (i = 0; i < OPA_VNIC_MAX_NUM_PCP; i++) + dst->eth_mtu[i] = be16_to_cpu(info->eth_mtu[i]); + + dst->eth_mtu_non_vlan = be16_to_cpu(info->eth_mtu_non_vlan); + memcpy(dst->rsvd4, info->rsvd4, ARRAY_SIZE(info->rsvd4)); +} + +/** + * opa_vnic_get_per_veswport_info -- Get the vesw per port information + * @adapter: vnic port adapter + * @info: pointer to destination vport info structure + * + * This function copies the vesw per port info that is maintained by the + * given adapter to destination address provided. + * Note that the read only fields are not copied. 
+ */ +void opa_vnic_get_per_veswport_info(struct opa_vnic_adapter *adapter, + struct opa_per_veswport_info *info) +{ + struct __opa_per_veswport_info *src = &adapter->info.vport; + + info->port_num = cpu_to_be32(src->port_num); + info->eth_link_status = src->eth_link_status; + memcpy(info->rsvd0, src->rsvd0, ARRAY_SIZE(src->rsvd0)); + + memcpy(info->base_mac_addr, src->base_mac_addr, + ARRAY_SIZE(info->base_mac_addr)); + info->config_state = src->config_state; + info->oper_state = src->oper_state; + info->max_mac_tbl_ent = cpu_to_be16(src->max_mac_tbl_ent); + info->max_smac_ent = cpu_to_be16(src->max_smac_ent); + info->mac_tbl_digest = cpu_to_be32(src->mac_tbl_digest); + memcpy(info->rsvd1, src->rsvd1, ARRAY_SIZE(src->rsvd1)); + + info->encap_slid = cpu_to_be32(src->encap_slid); + memcpy(info->pcp_to_sc_uc, src->pcp_to_sc_uc, + ARRAY_SIZE(info->pcp_to_sc_uc)); + memcpy(info->pcp_to_vl_uc, src->pcp_to_vl_uc, + ARRAY_SIZE(info->pcp_to_vl_uc)); + memcpy(info->pcp_to_sc_mc, src->pcp_to_sc_mc, + ARRAY_SIZE(info->pcp_to_sc_mc)); + memcpy(info->pcp_to_vl_mc, src->pcp_to_vl_mc, + ARRAY_SIZE(info->pcp_to_vl_mc)); + info->non_vlan_sc_uc = src->non_vlan_sc_uc; + info->non_vlan_vl_uc = src->non_vlan_vl_uc; + info->non_vlan_sc_mc = src->non_vlan_sc_mc; + info->non_vlan_vl_mc = src->non_vlan_vl_mc; + memcpy(info->rsvd2, src->rsvd2, ARRAY_SIZE(src->rsvd2)); + + info->uc_macs_gen_count = cpu_to_be16(src->uc_macs_gen_count); + info->mc_macs_gen_count = cpu_to_be16(src->mc_macs_gen_count); + memcpy(info->rsvd3, src->rsvd3, ARRAY_SIZE(src->rsvd3)); +} + +/** + * opa_vnic_set_per_veswport_info -- Set vesw per port information + * @adapter: vnic port adapter + * @info: pointer to vport info structure + * + * This function updates the vesw per port info that is maintained by the + * given adapter with vesw per port info provided. Reserved fields are + * stored and returned back to EM as is. 
+ */ +void opa_vnic_set_per_veswport_info(struct opa_vnic_adapter *adapter, + struct opa_per_veswport_info *info) +{ + struct __opa_per_veswport_info *dst = &adapter->info.vport; + + dst->port_num = be32_to_cpu(info->port_num); + memcpy(dst->rsvd0, info->rsvd0, ARRAY_SIZE(info->rsvd0)); + + memcpy(dst->base_mac_addr, info->base_mac_addr, + ARRAY_SIZE(dst->base_mac_addr)); + dst->config_state = info->config_state; + memcpy(dst->rsvd1, info->rsvd1, ARRAY_SIZE(info->rsvd1)); + + dst->encap_slid = be32_to_cpu(info->encap_slid); + memcpy(dst->pcp_to_sc_uc, info->pcp_to_sc_uc, + ARRAY_SIZE(dst->pcp_to_sc_uc)); + memcpy(dst->pcp_to_vl_uc, info->pcp_to_vl_uc, + ARRAY_SIZE(dst->pcp_to_vl_uc)); + memcpy(dst->pcp_to_sc_mc, info->pcp_to_sc_mc, + ARRAY_SIZE(dst->pcp_to_sc_mc)); + memcpy(dst->pcp_to_vl_mc, info->pcp_to_vl_mc, + ARRAY_SIZE(dst->pcp_to_vl_mc)); + dst->non_vlan_sc_uc = info->non_vlan_sc_uc; + dst->non_vlan_vl_uc = info->non_vlan_vl_uc; + dst->non_vlan_sc_mc = info->non_vlan_sc_mc; + dst->non_vlan_vl_mc = info->non_vlan_vl_mc; + memcpy(dst->rsvd2, info->rsvd2, ARRAY_SIZE(info->rsvd2)); + memcpy(dst->rsvd3, info->rsvd3, ARRAY_SIZE(info->rsvd3)); +} + +/** + * opa_vnic_query_mcast_macs - query multicast mac list + * @adapter: vnic port adapter + * @macs: pointer mac list + * + * This function populates the provided mac list with the configured + * multicast addresses in the adapter. 
+ */ +void opa_vnic_query_mcast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs) +{ + u16 start_idx, num_macs, idx = 0, count = 0; + struct netdev_hw_addr *ha; + + start_idx = be16_to_cpu(macs->start_idx); + num_macs = be16_to_cpu(macs->num_macs_in_msg); + netdev_for_each_mc_addr(ha, adapter->netdev) { + struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; + + if (start_idx > idx++) + continue; + else if (num_macs == count) + break; + memcpy(entry, ha->addr, sizeof(*entry)); + count++; + } + + macs->tot_macs_in_lst = cpu_to_be16(netdev_mc_count(adapter->netdev)); + macs->num_macs_in_msg = cpu_to_be16(count); + macs->gen_count = cpu_to_be16(adapter->info.vport.mc_macs_gen_count); +} + +/** + * opa_vnic_query_ucast_macs - query unicast mac list + * @adapter: vnic port adapter + * @macs: pointer mac list + * + * This function populates the provided mac list with the configured + * unicast addresses in the adapter. + */ +void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs) +{ + u16 start_idx, tot_macs, num_macs, idx = 0, count = 0; + struct netdev_hw_addr *ha; + + start_idx = be16_to_cpu(macs->start_idx); + num_macs = be16_to_cpu(macs->num_macs_in_msg); + /* loop through dev_addrs list first */ + for_each_dev_addr(adapter->netdev, ha) { + struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; + + /* Do not include EM specified MAC address */ + if (!memcmp(adapter->info.vport.base_mac_addr, ha->addr, + ARRAY_SIZE(adapter->info.vport.base_mac_addr))) + continue; + + if (start_idx > idx++) + continue; + else if (num_macs == count) + break; + memcpy(entry, ha->addr, sizeof(*entry)); + count++; + } + + /* loop through uc list */ + netdev_for_each_uc_addr(ha, adapter->netdev) { + struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; + + if (start_idx > idx++) + continue; + else if (num_macs == count) + break; + memcpy(entry, ha->addr, sizeof(*entry)); + count++; + } + + 
tot_macs = netdev_hw_addr_list_count(&adapter->netdev->dev_addrs) + + netdev_uc_count(adapter->netdev); + macs->tot_macs_in_lst = cpu_to_be16(tot_macs); + macs->num_macs_in_msg = cpu_to_be16(count); + macs->gen_count = cpu_to_be16(adapter->info.vport.uc_macs_gen_count); +} diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 79bf48477ddb..2354c742caa1 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -40,6 +40,7 @@ #include <linux/parser.h> #include <linux/random.h> #include <linux/jiffies.h> +#include <linux/lockdep.h> #include <rdma/ib_cache.h> #include <linux/atomic.h> @@ -311,6 +312,11 @@ static int srp_new_cm_id(struct srp_rdma_ch *ch) if (ch->cm_id) ib_destroy_cm_id(ch->cm_id); ch->cm_id = new_cm_id; + if (rdma_cap_opa_ah(target->srp_host->srp_dev->dev, + target->srp_host->port)) + ch->path.rec_type = SA_PATH_REC_TYPE_OPA; + else + ch->path.rec_type = SA_PATH_REC_TYPE_IB; ch->path.sgid = target->sgid; ch->path.dgid = target->orig_dgid; ch->path.pkey = target->pkey; @@ -371,7 +377,6 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, struct srp_fr_desc *d; struct ib_mr *mr; int i, ret = -EINVAL; - enum ib_mr_type mr_type; if (pool_size <= 0) goto err; @@ -385,13 +390,9 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, spin_lock_init(&pool->lock); INIT_LIST_HEAD(&pool->free_list); - if (device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) - mr_type = IB_MR_TYPE_SG_GAPS; - else - mr_type = IB_MR_TYPE_MEM_REG; - for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { - mr = ib_alloc_mr(pd, mr_type, max_page_list_len); + mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, + max_page_list_len); if (IS_ERR(mr)) { ret = PTR_ERR(mr); if (ret == -ENOMEM) @@ -470,9 +471,13 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) * completion handler can access the queue pair while it is * being destroyed. 
*/ -static void srp_destroy_qp(struct ib_qp *qp) +static void srp_destroy_qp(struct srp_rdma_ch *ch, struct ib_qp *qp) { - ib_drain_rq(qp); + spin_lock_irq(&ch->lock); + ib_process_cq_direct(ch->send_cq, -1); + spin_unlock_irq(&ch->lock); + + ib_drain_qp(qp); ib_destroy_qp(qp); } @@ -546,7 +551,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) } if (ch->qp) - srp_destroy_qp(ch->qp); + srp_destroy_qp(ch, ch->qp); if (ch->recv_cq) ib_free_cq(ch->recv_cq); if (ch->send_cq) @@ -570,7 +575,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) return 0; err_qp: - srp_destroy_qp(qp); + ib_destroy_qp(qp); err_send_cq: ib_free_cq(send_cq); @@ -613,7 +618,7 @@ static void srp_free_ch_ib(struct srp_target_port *target, ib_destroy_fmr_pool(ch->fmr_pool); } - srp_destroy_qp(ch->qp); + srp_destroy_qp(ch, ch->qp); ib_free_cq(ch->send_cq); ib_free_cq(ch->recv_cq); @@ -643,7 +648,7 @@ static void srp_free_ch_ib(struct srp_target_port *target, } static void srp_path_rec_completion(int status, - struct ib_sa_path_rec *pathrec, + struct sa_path_rec *pathrec, void *ch_ptr) { struct srp_rdma_ch *ch = ch_ptr; @@ -1804,6 +1809,8 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; struct srp_iu *iu; + lockdep_assert_held(&ch->lock); + ib_process_cq_direct(ch->send_cq, -1); if (list_empty(&ch->free_tx)) @@ -1824,6 +1831,11 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, return iu; } +/* + * Note: if this function is called from inside ib_drain_sq() then it will + * be called without ch->lock being held. If ib_drain_sq() dequeues a WQE + * with status IB_WC_SUCCESS then that's a bug. 
+ */ static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); @@ -1834,6 +1846,8 @@ static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc) return; } + lockdep_assert_held(&ch->lock); + list_add(&iu->list, &ch->free_tx); } @@ -1889,17 +1903,24 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { spin_lock_irqsave(&ch->lock, flags); ch->req_lim += be32_to_cpu(rsp->req_lim_delta); + if (rsp->tag == ch->tsk_mgmt_tag) { + ch->tsk_mgmt_status = -1; + if (be32_to_cpu(rsp->resp_data_len) >= 4) + ch->tsk_mgmt_status = rsp->data[3]; + complete(&ch->tsk_mgmt_done); + } else { + shost_printk(KERN_ERR, target->scsi_host, + "Received tsk mgmt response too late for tag %#llx\n", + rsp->tag); + } spin_unlock_irqrestore(&ch->lock, flags); - - ch->tsk_mgmt_status = -1; - if (be32_to_cpu(rsp->resp_data_len) >= 4) - ch->tsk_mgmt_status = rsp->data[3]; - complete(&ch->tsk_mgmt_done); } else { scmnd = scsi_host_find_tag(target->scsi_host, rsp->tag); - if (scmnd) { + if (scmnd && scmnd->host_scribble) { req = (void *)scmnd->host_scribble; scmnd = srp_claim_req(ch, req, NULL, scmnd); + } else { + scmnd = NULL; } if (!scmnd) { shost_printk(KERN_ERR, target->scsi_host, @@ -2383,12 +2404,12 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, switch (event->param.rej_rcvd.reason) { case IB_CM_REJ_PORT_CM_REDIRECT: cpi = event->param.rej_rcvd.ari; - ch->path.dlid = cpi->redirect_lid; + sa_path_set_dlid(&ch->path, htonl(ntohs(cpi->redirect_lid))); ch->path.pkey = cpi->redirect_pkey; cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff; memcpy(ch->path.dgid.raw, cpi->redirect_gid, 16); - ch->status = ch->path.dlid ? + ch->status = sa_path_get_dlid(&ch->path) ? 
SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; break; @@ -2531,19 +2552,18 @@ srp_change_queue_depth(struct scsi_device *sdev, int qdepth) } static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun, - u8 func) + u8 func, u8 *status) { struct srp_target_port *target = ch->target; struct srp_rport *rport = target->rport; struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; + int res; if (!ch->connected || target->qp_in_error) return -1; - init_completion(&ch->tsk_mgmt_done); - /* * Lock the rport mutex to avoid that srp_create_ch_ib() is * invoked while a task management function is being sent. @@ -2566,10 +2586,16 @@ static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun, tsk_mgmt->opcode = SRP_TSK_MGMT; int_to_scsilun(lun, &tsk_mgmt->lun); - tsk_mgmt->tag = req_tag | SRP_TAG_TSK_MGMT; tsk_mgmt->tsk_mgmt_func = func; tsk_mgmt->task_tag = req_tag; + spin_lock_irq(&ch->lock); + ch->tsk_mgmt_tag = (ch->tsk_mgmt_tag + 1) | SRP_TAG_TSK_MGMT; + tsk_mgmt->tag = ch->tsk_mgmt_tag; + spin_unlock_irq(&ch->lock); + + init_completion(&ch->tsk_mgmt_done); + ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); if (srp_post_send(ch, iu, sizeof(*tsk_mgmt))) { @@ -2578,13 +2604,15 @@ static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun, return -1; } + res = wait_for_completion_timeout(&ch->tsk_mgmt_done, + msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)); + if (res > 0 && status) + *status = ch->tsk_mgmt_status; mutex_unlock(&rport->mutex); - if (!wait_for_completion_timeout(&ch->tsk_mgmt_done, - msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) - return -1; + WARN_ON_ONCE(res < 0); - return 0; + return res > 0 ? 
0 : -1; } static int srp_abort(struct scsi_cmnd *scmnd) @@ -2610,7 +2638,7 @@ static int srp_abort(struct scsi_cmnd *scmnd) shost_printk(KERN_ERR, target->scsi_host, "Sending SRP abort for tag %#x\n", tag); if (srp_send_tsk_mgmt(ch, tag, scmnd->device->lun, - SRP_TSK_ABORT_TASK) == 0) + SRP_TSK_ABORT_TASK, NULL) == 0) ret = SUCCESS; else if (target->rport->state == SRP_RPORT_LOST) ret = FAST_IO_FAIL; @@ -2628,14 +2656,15 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_rdma_ch *ch; int i; + u8 status; shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); ch = &target->ch[0]; if (srp_send_tsk_mgmt(ch, SRP_TAG_NO_REQ, scmnd->device->lun, - SRP_TSK_LUN_RESET)) + SRP_TSK_LUN_RESET, &status)) return FAILED; - if (ch->tsk_mgmt_status) + if (status) return FAILED; for (i = 0; i < target->ch_count; i++) { @@ -2664,9 +2693,8 @@ static int srp_slave_alloc(struct scsi_device *sdev) struct Scsi_Host *shost = sdev->host; struct srp_target_port *target = host_to_target(shost); struct srp_device *srp_dev = target->srp_host->srp_dev; - struct ib_device *ibdev = srp_dev->dev; - if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + if (true) blk_queue_virt_boundary(sdev->request_queue, ~srp_dev->mr_page_mask); @@ -2869,6 +2897,7 @@ static struct scsi_host_template srp_template = { .info = srp_target_info, .queuecommand = srp_queuecommand, .change_queue_depth = srp_change_queue_depth, + .eh_timed_out = srp_timed_out, .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, @@ -2909,7 +2938,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) sprintf(target->target_name, "SRP.T10:%016llX", be64_to_cpu(target->id_ext)); - if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dma_device)) + if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dev.parent)) return -ENODEV; 
memcpy(ids.port_id, &target->id_ext, 8); @@ -3421,11 +3450,12 @@ static ssize_t srp_create_target(struct device *dev, ret = srp_connect_ch(ch, multich); if (ret) { shost_printk(KERN_ERR, target->scsi_host, - PFX "Connection %d/%d failed\n", + PFX "Connection %d/%d to %pI6 failed\n", ch_start + cpu_idx, - target->ch_count); + target->ch_count, + ch->target->orig_dgid.raw); if (node_idx == 0 && cpu_idx == 0) { - goto err_disconnect; + goto free_ch; } else { srp_free_ch_ib(target, ch); srp_free_req_data(target, ch); @@ -3472,6 +3502,7 @@ put: err_disconnect: srp_disconnect_target(target); +free_ch: for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; srp_free_ch_ib(target, ch); @@ -3520,7 +3551,7 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port) host->port = port; host->dev.class = &srp_class; - host->dev.parent = device->dev->dma_device; + host->dev.parent = device->dev->dev.parent; dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port); if (device_register(&host->dev)) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 21c69695f9d4..ab9077b81d5a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -152,7 +152,7 @@ struct srp_rdma_ch { struct completion done; int status; - struct ib_sa_path_rec path; + struct sa_path_rec path; struct ib_sa_query *path_query; int path_query_id; @@ -163,6 +163,7 @@ struct srp_rdma_ch { int max_ti_iu_len; int comp_vector; + u64 tsk_mgmt_tag; struct completion tsk_mgmt_done; u8 tsk_mgmt_status; bool connected; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index d21ba9d857c3..402275be0931 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -417,7 +417,7 @@ static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad, static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc 
*mad_wc) { - ib_destroy_ah(mad_wc->send_buf->ah); + rdma_destroy_ah(mad_wc->send_buf->ah); ib_free_send_mad(mad_wc->send_buf); } @@ -481,7 +481,7 @@ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, ib_free_send_mad(rsp); err_rsp: - ib_destroy_ah(ah); + rdma_destroy_ah(ah); err: ib_free_recv_mad(mad_wc); } @@ -500,6 +500,7 @@ static int srpt_refresh_port(struct srpt_port *sport) struct ib_mad_reg_req reg_req; struct ib_port_modify port_modify; struct ib_port_attr port_attr; + __be16 *guid; int ret; memset(&port_modify, 0, sizeof(port_modify)); @@ -522,10 +523,17 @@ static int srpt_refresh_port(struct srpt_port *sport) if (ret) goto err_query_port; + sport->port_guid_wwn.priv = sport; + guid = (__be16 *)&sport->gid.global.interface_id; snprintf(sport->port_guid, sizeof(sport->port_guid), - "0x%016llx%016llx", - be64_to_cpu(sport->gid.global.subnet_prefix), - be64_to_cpu(sport->gid.global.interface_id)); + "%04x:%04x:%04x:%04x", + be16_to_cpu(guid[0]), be16_to_cpu(guid[1]), + be16_to_cpu(guid[2]), be16_to_cpu(guid[3])); + sport->port_gid_wwn.priv = sport; + snprintf(sport->port_gid, sizeof(sport->port_gid), + "0x%016llx%016llx", + be64_to_cpu(sport->gid.global.subnet_prefix), + be64_to_cpu(sport->gid.global.interface_id)); if (!sport->mad_agent) { memset(&reg_req, 0, sizeof(reg_req)); @@ -1149,8 +1157,8 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) } spin_unlock_irqrestore(&ioctx->spinlock, flags); - pr_debug("Aborting cmd with state %d and tag %lld\n", state, - ioctx->cmd.tag); + pr_debug("Aborting cmd with state %d -> %d and tag %lld\n", state, + ioctx->state, ioctx->cmd.tag); switch (state) { case SRPT_STATE_NEW: @@ -1838,6 +1846,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, struct srp_login_rej *rej; struct ib_cm_rep_param *rep_param; struct srpt_rdma_ch *ch, *tmp_ch; + __be16 *guid; u32 it_iu_len; int i, ret = 0; @@ -1983,26 +1992,30 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto destroy_ib; } - /* - * Use the 
initator port identifier as the session name, when - * checking against se_node_acl->initiatorname[] this can be - * with or without preceeding '0x'. - */ + guid = (__be16 *)&param->primary_path->sgid.global.interface_id; + snprintf(ch->ini_guid, sizeof(ch->ini_guid), "%04x:%04x:%04x:%04x", + be16_to_cpu(guid[0]), be16_to_cpu(guid[1]), + be16_to_cpu(guid[2]), be16_to_cpu(guid[3])); snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx", be64_to_cpu(*(__be64 *)ch->i_port_id), be64_to_cpu(*(__be64 *)(ch->i_port_id + 8))); pr_debug("registering session %s\n", ch->sess_name); - ch->sess = target_alloc_session(&sport->port_tpg_1, 0, 0, + if (sport->port_guid_tpg.se_tpg_wwn) + ch->sess = target_alloc_session(&sport->port_guid_tpg, 0, 0, + TARGET_PROT_NORMAL, + ch->ini_guid, ch, NULL); + if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess)) + ch->sess = target_alloc_session(&sport->port_gid_tpg, 0, 0, TARGET_PROT_NORMAL, ch->sess_name, ch, NULL); /* Retry without leading "0x" */ - if (IS_ERR(ch->sess)) - ch->sess = target_alloc_session(&sport->port_tpg_1, 0, 0, + if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess)) + ch->sess = target_alloc_session(&sport->port_gid_tpg, 0, 0, TARGET_PROT_NORMAL, ch->sess_name + 2, ch, NULL); - if (IS_ERR(ch->sess)) { + if (IS_ERR_OR_NULL(ch->sess)) { pr_info("Rejected login because no ACL has been configured yet for initiator %s.\n", ch->sess_name); rej->reason = cpu_to_be32((PTR_ERR(ch->sess) == -ENOMEM) ? @@ -2289,12 +2302,8 @@ static void srpt_queue_response(struct se_cmd *cmd) } spin_unlock_irqrestore(&ioctx->spinlock, flags); - if (unlikely(transport_check_aborted_status(&ioctx->cmd, false) - || WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) { - atomic_inc(&ch->req_lim_delta); - srpt_abort_cmd(ioctx); + if (unlikely(WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) return; - } /* For read commands, transfer the data to the initiator. 
*/ if (ioctx->cmd.data_direction == DMA_FROM_DEVICE && @@ -2420,7 +2429,7 @@ static int srpt_release_sdev(struct srpt_device *sdev) return 0; } -static struct srpt_port *__srpt_lookup_port(const char *name) +static struct se_wwn *__srpt_lookup_wwn(const char *name) { struct ib_device *dev; struct srpt_device *sdev; @@ -2435,23 +2444,25 @@ static struct srpt_port *__srpt_lookup_port(const char *name) for (i = 0; i < dev->phys_port_cnt; i++) { sport = &sdev->port[i]; - if (!strcmp(sport->port_guid, name)) - return sport; + if (strcmp(sport->port_guid, name) == 0) + return &sport->port_guid_wwn; + if (strcmp(sport->port_gid, name) == 0) + return &sport->port_gid_wwn; } } return NULL; } -static struct srpt_port *srpt_lookup_port(const char *name) +static struct se_wwn *srpt_lookup_wwn(const char *name) { - struct srpt_port *sport; + struct se_wwn *wwn; spin_lock(&srpt_dev_lock); - sport = __srpt_lookup_port(name); + wwn = __srpt_lookup_wwn(name); spin_unlock(&srpt_dev_lock); - return sport; + return wwn; } /** @@ -2464,8 +2475,7 @@ static void srpt_add_one(struct ib_device *device) struct ib_srq_init_attr srq_attr; int i; - pr_debug("device = %p, device->dma_ops = %p\n", device, - device->dma_ops); + pr_debug("device = %p\n", device); sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); if (!sdev) @@ -2643,11 +2653,19 @@ static char *srpt_get_fabric_name(void) return "srpt"; } +static struct srpt_port *srpt_tpg_to_sport(struct se_portal_group *tpg) +{ + return tpg->se_tpg_wwn->priv; +} + static char *srpt_get_fabric_wwn(struct se_portal_group *tpg) { - struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(tpg); - return sport->port_guid; + WARN_ON_ONCE(tpg != &sport->port_guid_tpg && + tpg != &sport->port_gid_tpg); + return tpg == &sport->port_guid_tpg ? 
sport->port_guid : + sport->port_gid; } static u16 srpt_get_tag(struct se_portal_group *tpg) @@ -2667,7 +2685,8 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) struct srpt_rdma_ch *ch = ioctx->ch; unsigned long flags; - WARN_ON(ioctx->state != SRPT_STATE_DONE); + WARN_ON_ONCE(ioctx->state != SRPT_STATE_DONE && + !(ioctx->cmd.transport_state & CMD_T_ABORTED)); if (ioctx->n_rw_ctx) { srpt_free_rw_ctxs(ch, ioctx); @@ -2737,6 +2756,19 @@ static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd) return srpt_get_cmd_state(ioctx); } +static int srpt_parse_guid(u64 *guid, const char *name) +{ + u16 w[4]; + int ret = -EINVAL; + + if (sscanf(name, "%hx:%hx:%hx:%hx", &w[0], &w[1], &w[2], &w[3]) != 4) + goto out; + *guid = get_unaligned_be64(w); + ret = 0; +out: + return ret; +} + /** * srpt_parse_i_port_id() - Parse an initiator port ID. * @name: ASCII representation of a 128-bit initiator port ID. @@ -2772,20 +2804,23 @@ out: */ static int srpt_init_nodeacl(struct se_node_acl *se_nacl, const char *name) { + u64 guid; u8 i_port_id[16]; + int ret; - if (srpt_parse_i_port_id(i_port_id, name) < 0) { + ret = srpt_parse_guid(&guid, name); + if (ret < 0) + ret = srpt_parse_i_port_id(i_port_id, name); + if (ret < 0) pr_err("invalid initiator port ID %s\n", name); - return -EINVAL; - } - return 0; + return ret; } static ssize_t srpt_tpg_attrib_srp_max_rdma_size_show(struct config_item *item, char *page) { struct se_portal_group *se_tpg = attrib_to_tpg(item); - struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); return sprintf(page, "%u\n", sport->port_attrib.srp_max_rdma_size); } @@ -2794,7 +2829,7 @@ static ssize_t srpt_tpg_attrib_srp_max_rdma_size_store(struct config_item *item, const char *page, size_t count) { struct se_portal_group *se_tpg = attrib_to_tpg(item); - struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = 
srpt_tpg_to_sport(se_tpg); unsigned long val; int ret; @@ -2822,7 +2857,7 @@ static ssize_t srpt_tpg_attrib_srp_max_rsp_size_show(struct config_item *item, char *page) { struct se_portal_group *se_tpg = attrib_to_tpg(item); - struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); return sprintf(page, "%u\n", sport->port_attrib.srp_max_rsp_size); } @@ -2831,7 +2866,7 @@ static ssize_t srpt_tpg_attrib_srp_max_rsp_size_store(struct config_item *item, const char *page, size_t count) { struct se_portal_group *se_tpg = attrib_to_tpg(item); - struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); unsigned long val; int ret; @@ -2859,7 +2894,7 @@ static ssize_t srpt_tpg_attrib_srp_sq_size_show(struct config_item *item, char *page) { struct se_portal_group *se_tpg = attrib_to_tpg(item); - struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); return sprintf(page, "%u\n", sport->port_attrib.srp_sq_size); } @@ -2868,7 +2903,7 @@ static ssize_t srpt_tpg_attrib_srp_sq_size_store(struct config_item *item, const char *page, size_t count) { struct se_portal_group *se_tpg = attrib_to_tpg(item); - struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); unsigned long val; int ret; @@ -2906,7 +2941,7 @@ static struct configfs_attribute *srpt_tpg_attrib_attrs[] = { static ssize_t srpt_tpg_enable_show(struct config_item *item, char *page) { struct se_portal_group *se_tpg = to_tpg(item); - struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); return snprintf(page, PAGE_SIZE, "%d\n", (sport->enabled) ? 
1: 0); } @@ -2915,7 +2950,7 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, const char *page, size_t count) { struct se_portal_group *se_tpg = to_tpg(item); - struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); struct srpt_device *sdev = sport->sdev; struct srpt_rdma_ch *ch; unsigned long tmp; @@ -2967,15 +3002,19 @@ static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, struct config_group *group, const char *name) { - struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn); + struct srpt_port *sport = wwn->priv; + static struct se_portal_group *tpg; int res; - /* Initialize sport->port_wwn and sport->port_tpg_1 */ - res = core_tpg_register(&sport->port_wwn, &sport->port_tpg_1, SCSI_PROTOCOL_SRP); + WARN_ON_ONCE(wwn != &sport->port_guid_wwn && + wwn != &sport->port_gid_wwn); + tpg = wwn == &sport->port_guid_wwn ? &sport->port_guid_tpg : + &sport->port_gid_tpg; + res = core_tpg_register(wwn, tpg, SCSI_PROTOCOL_SRP); if (res) return ERR_PTR(res); - return &sport->port_tpg_1; + return tpg; } /** @@ -2984,11 +3023,10 @@ static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, */ static void srpt_drop_tpg(struct se_portal_group *tpg) { - struct srpt_port *sport = container_of(tpg, - struct srpt_port, port_tpg_1); + struct srpt_port *sport = srpt_tpg_to_sport(tpg); sport->enabled = false; - core_tpg_deregister(&sport->port_tpg_1); + core_tpg_deregister(tpg); } /** @@ -2999,19 +3037,7 @@ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, struct config_group *group, const char *name) { - struct srpt_port *sport; - int ret; - - sport = srpt_lookup_port(name); - pr_debug("make_tport(%s)\n", name); - ret = -EINVAL; - if (!sport) - goto err; - - return &sport->port_wwn; - -err: - return ERR_PTR(ret); + return srpt_lookup_wwn(name) ? 
: ERR_PTR(-EINVAL); } /** @@ -3020,9 +3046,6 @@ err: */ static void srpt_drop_tport(struct se_wwn *wwn) { - struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn); - - pr_debug("drop_tport(%s\n", config_item_name(&sport->port_wwn.wwn_group.cg_item)); } static ssize_t srpt_wwn_version_show(struct config_item *item, char *buf) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 581878782854..cc1183851af5 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -258,6 +258,7 @@ enum rdma_ch_state { * against concurrent modification by the cm_id spinlock. * @sess: Session information associated with this SRP channel. * @sess_name: Session name. + * @ini_guid: Initiator port GUID. * @release_work: Allows scheduling of srpt_release_channel(). * @release_done: Enables waiting for srpt_release_channel() completion. */ @@ -284,6 +285,7 @@ struct srpt_rdma_ch { struct list_head cmd_wait_list; struct se_session *sess; u8 sess_name[36]; + u8 ini_guid[24]; struct work_struct release_work; struct completion *release_done; }; @@ -306,28 +308,34 @@ struct srpt_port_attrib { * @mad_agent: per-port management datagram processing information. * @enabled: Whether or not this target port is enabled. * @port_guid: ASCII representation of Port GUID + * @port_gid: ASCII representation of Port GID * @port: one-based port number. * @sm_lid: cached value of the port's sm_lid. * @lid: cached value of the port's lid. * @gid: cached value of the port's gid. * @port_acl_lock spinlock for port_acl_list: * @work: work structure for refreshing the aforementioned cached values. - * @port_tpg_1 Target portal group = 1 data. - * @port_wwn: Target core WWN data. + * @port_guid_tpg: TPG associated with target port GUID. + * @port_guid_wwn: WWN associated with target port GUID. + * @port_gid_tpg: TPG associated with target port GID. + * @port_gid_wwn: WWN associated with target port GID. 
* @port_acl_list: Head of the list with all node ACLs for this port. */ struct srpt_port { struct srpt_device *sdev; struct ib_mad_agent *mad_agent; bool enabled; - u8 port_guid[64]; + u8 port_guid[24]; + u8 port_gid[64]; u8 port; u16 sm_lid; u16 lid; union ib_gid gid; struct work_struct work; - struct se_portal_group port_tpg_1; - struct se_wwn port_wwn; + struct se_portal_group port_guid_tpg; + struct se_wwn port_guid_wwn; + struct se_portal_group port_gid_tpg; + struct se_wwn port_gid_wwn; struct srpt_port_attrib port_attrib; }; |