author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-03 17:49:17 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-03 17:49:17 -0700
commit | aa9d4648c2fbb455df7750ade1b73dd9ad9b3690 (patch)
tree | bc4590c27e6f30ec0612b28f3f38a539535b9930 /drivers/infiniband/core/verbs.c
parent | 906dde0f355bd97c080c215811ae7db1137c4af8 (diff)
parent | 8eb19e8e7c8658226d8b7e75728e6dfa2ef32717 (diff)
Merge tag 'for-linus-ioctl' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
Pull rdma updates from Doug Ledford:
"This is a big pull request.
Of note is that I'm sending you the new ioctl API for the rdma
subsystem. We put it up on linux-api@, but didn't get much response.
The API is complex, but it solves two different problems in one go:
1) The bi-directional nature of the RDMA file write calls, which
created the security hole we had to handle (and for which the fix
is now causing problems for systems in production; we were a bit
overzealous in the fix, and the ability to open a device, then
fork, then create new queue pairs on the device and use them is
now broken).
2) The bloat caused by different vendors implementing extensions to
the base verbs API. Each vendor's hardware is slightly different,
and the hardware might be suitable for one extension but not
another.
By the time we add generic extensions for all the different ways
that the different hardware can offload things, the API becomes
bloated. Things like our completion structs have started to exceed
a cache line in size because of all the elements needed to support
this. That in turn shows up heavily in the performance graphs, with
a noticeable drop in performance on 100 Gigabit links as our
completion structs grow from one cache line to more than one.
This API makes things like the completion structs modular, in a
way very similar to netlink, so that your structs include only
the items needed for the offloads/features you are actually using
on a given queue pair. In that way we support everything, but only
use what we need, and our structs stay smaller.
The ioctl API is explained better by the posting on linux-api@ than
I could manage here, so I'll just leave it at that.
The rest of the pull request is typical stuff.
Updates for the 4.14 kernel merge window:
- Lots of hfi1 driver updates (mixed with a few qib and core updates
as well)
- rxe updates
- various mlx updates
- Set the default RoCE type to RoCEv2
- Several larger fixes for bnxt_re that were too big for -rc
- Several larger fixes for qedr that, likewise, were too big for -rc
- Misc core changes
- Make the hns_roce driver compilable on arches other than aarch64 so
we can more easily debug build issues related to it
- Add rdma-netlink infrastructure updates
- Add automatic IRQ affinity infrastructure
- Add 32-bit LID support
- Lots of misc fixes across the subsystem from random people
- Autoloading of RDMA netlink modules
- PCI pool cleanups from Romain Perier
- mlx5 driver feature additions and fixes
- Hardware tag matching feature
- Fix sleeping in atomic context when resolving a RoCE AH
- Add experimental ioctl interface as posted to linux-api@"
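
To make the completion-struct point above concrete, here is a minimal
sketch of the netlink-style layout being described. This is an
illustration only, not the actual uverbs ioctl ABI, and every name in
it is invented:

```c
/*
 * Hypothetical sketch only -- not the real uverbs ABI. A completion
 * carries a small fixed base, then a chain of type/length attributes
 * for just the features enabled on this QP, so unused offloads never
 * widen the struct that every consumer pays for.
 */
#include <stdint.h>

struct cq_attr_hdr {
	uint16_t type;		/* which optional field follows */
	uint16_t len;		/* payload length, for skipping */
};

enum {				/* invented attribute types */
	CQ_ATTR_TIMESTAMP = 1,
	CQ_ATTR_FLOW_TAG  = 2,
};

struct cq_entry_base {		/* always present, stays compact */
	uint64_t wr_id;
	uint32_t status;
	uint32_t byte_len;
	/* cq_attr_hdr + payload pairs follow, netlink-style */
};

/* Walk the attribute chain the way nlmsg parsing does: read a header,
 * dispatch on its type, advance by its length. */
static void walk_cq_attrs(const unsigned char *p, const unsigned char *end)
{
	while (p + sizeof(struct cq_attr_hdr) <= end) {
		const struct cq_attr_hdr *h = (const struct cq_attr_hdr *)p;

		switch (h->type) {
		case CQ_ATTR_TIMESTAMP:
			/* consume timestamp payload here */
			break;
		case CQ_ATTR_FLOW_TAG:
			/* consume flow-tag payload here */
			break;
		default:
			break;	/* unknown attribute: skip it */
		}
		p += sizeof(*h) + h->len;
	}
}
```

The win is in the parse loop: a consumer pays only for attributes that
are actually present, and new offloads can be added later without
growing the base entry that every user touches.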
* tag 'for-linus-ioctl' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (328 commits)
IB/core: Expose ioctl interface through experimental Kconfig
IB/core: Assign root to all drivers
IB/core: Add completion queue (cq) object actions
IB/core: Add legacy driver's user-data
IB/core: Export ioctl enum types to user-space
IB/core: Explicitly destroy an object while keeping uobject
IB/core: Add macros for declaring methods and attributes
IB/core: Add uverbs merge trees functionality
IB/core: Add DEVICE object and root tree structure
IB/core: Declare an object instead of declaring only type attributes
IB/core: Add new ioctl interface
RDMA/vmw_pvrdma: Fix a signedness
RDMA/vmw_pvrdma: Report network header type in WC
IB/core: Add might_sleep() annotation to ib_init_ah_from_wc()
IB/cm: Fix sleeping in atomic when RoCE is used
IB/core: Add support to finalize objects in one transaction
IB/core: Add a generic way to execute an operation on a uobject
Documentation: Hardware tag matching
IB/mlx5: Support IB_SRQT_TM
net/mlx5: Add XRQ support
...
Diffstat (limited to 'drivers/infiniband/core/verbs.c')
-rw-r--r-- | drivers/infiniband/core/verbs.c | 169 |
1 file changed, 134 insertions, 35 deletions
```diff
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index b456e3ca1876..ee9e27dc799b 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -180,39 +180,29 @@ EXPORT_SYMBOL(ib_rate_to_mbps);
 __attribute_const__ enum rdma_transport_type
 rdma_node_get_transport(enum rdma_node_type node_type)
 {
-	switch (node_type) {
-	case RDMA_NODE_IB_CA:
-	case RDMA_NODE_IB_SWITCH:
-	case RDMA_NODE_IB_ROUTER:
-		return RDMA_TRANSPORT_IB;
-	case RDMA_NODE_RNIC:
-		return RDMA_TRANSPORT_IWARP;
-	case RDMA_NODE_USNIC:
+
+	if (node_type == RDMA_NODE_USNIC)
 		return RDMA_TRANSPORT_USNIC;
-	case RDMA_NODE_USNIC_UDP:
+	if (node_type == RDMA_NODE_USNIC_UDP)
 		return RDMA_TRANSPORT_USNIC_UDP;
-	default:
-		BUG();
-		return 0;
-	}
+	if (node_type == RDMA_NODE_RNIC)
+		return RDMA_TRANSPORT_IWARP;
+
+	return RDMA_TRANSPORT_IB;
 }
 EXPORT_SYMBOL(rdma_node_get_transport);
 
 enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
 {
+	enum rdma_transport_type lt;
 	if (device->get_link_layer)
 		return device->get_link_layer(device, port_num);
 
-	switch (rdma_node_get_transport(device->node_type)) {
-	case RDMA_TRANSPORT_IB:
+	lt = rdma_node_get_transport(device->node_type);
+	if (lt == RDMA_TRANSPORT_IB)
 		return IB_LINK_LAYER_INFINIBAND;
-	case RDMA_TRANSPORT_IWARP:
-	case RDMA_TRANSPORT_USNIC:
-	case RDMA_TRANSPORT_USNIC_UDP:
-		return IB_LINK_LAYER_ETHERNET;
-	default:
-		return IB_LINK_LAYER_UNSPECIFIED;
-	}
+
+	return IB_LINK_LAYER_ETHERNET;
 }
 EXPORT_SYMBOL(rdma_port_get_link_layer);
 
@@ -478,6 +468,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 	union ib_gid dgid;
 	union ib_gid sgid;
 
+	might_sleep();
+
 	memset(ah_attr, 0, sizeof *ah_attr);
 	ah_attr->type = rdma_ah_find_type(device, port_num);
 	if (rdma_cap_eth_ah(device, port_num)) {
@@ -632,11 +624,13 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd,
 		srq->event_handler = srq_init_attr->event_handler;
 		srq->srq_context = srq_init_attr->srq_context;
 		srq->srq_type = srq_init_attr->srq_type;
+		if (ib_srq_has_cq(srq->srq_type)) {
+			srq->ext.cq = srq_init_attr->ext.cq;
+			atomic_inc(&srq->ext.cq->usecnt);
+		}
 		if (srq->srq_type == IB_SRQT_XRC) {
 			srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
-			srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq;
 			atomic_inc(&srq->ext.xrc.xrcd->usecnt);
-			atomic_inc(&srq->ext.xrc.cq->usecnt);
 		}
 		atomic_inc(&pd->usecnt);
 		atomic_set(&srq->usecnt, 0);
@@ -677,18 +671,18 @@ int ib_destroy_srq(struct ib_srq *srq)
 	pd = srq->pd;
 	srq_type = srq->srq_type;
-	if (srq_type == IB_SRQT_XRC) {
+	if (ib_srq_has_cq(srq_type))
+		cq = srq->ext.cq;
+	if (srq_type == IB_SRQT_XRC)
 		xrcd = srq->ext.xrc.xrcd;
-		cq = srq->ext.xrc.cq;
-	}
 
 	ret = srq->device->destroy_srq(srq);
 	if (!ret) {
 		atomic_dec(&pd->usecnt);
-		if (srq_type == IB_SRQT_XRC) {
+		if (srq_type == IB_SRQT_XRC)
 			atomic_dec(&xrcd->usecnt);
+		if (ib_srq_has_cq(srq_type))
 			atomic_dec(&cq->usecnt);
-		}
 	}
 
 	return ret;
@@ -1244,6 +1238,18 @@ int ib_resolve_eth_dmac(struct ib_device *device,
 	if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) {
 		rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
 				ah_attr->roce.dmac);
+		return 0;
+	}
+	if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) {
+		if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) {
+			__be32 addr = 0;
+
+			memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4);
+			ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac);
+		} else {
+			ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw,
+					(char *)ah_attr->roce.dmac);
+		}
 	} else {
 		union ib_gid sgid;
 		struct ib_gid_attr sgid_attr;
@@ -1306,6 +1312,61 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr,
 }
 EXPORT_SYMBOL(ib_modify_qp_with_udata);
 
+int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width)
+{
+	int rc;
+	u32 netdev_speed;
+	struct net_device *netdev;
+	struct ethtool_link_ksettings lksettings;
+
+	if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
+		return -EINVAL;
+
+	if (!dev->get_netdev)
+		return -EOPNOTSUPP;
+
+	netdev = dev->get_netdev(dev, port_num);
+	if (!netdev)
+		return -ENODEV;
+
+	rtnl_lock();
+	rc = __ethtool_get_link_ksettings(netdev, &lksettings);
+	rtnl_unlock();
+
+	dev_put(netdev);
+
+	if (!rc) {
+		netdev_speed = lksettings.base.speed;
+	} else {
+		netdev_speed = SPEED_1000;
+		pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name,
+			netdev_speed);
+	}
+
+	if (netdev_speed <= SPEED_1000) {
+		*width = IB_WIDTH_1X;
+		*speed = IB_SPEED_SDR;
+	} else if (netdev_speed <= SPEED_10000) {
+		*width = IB_WIDTH_1X;
+		*speed = IB_SPEED_FDR10;
+	} else if (netdev_speed <= SPEED_20000) {
+		*width = IB_WIDTH_4X;
+		*speed = IB_SPEED_DDR;
+	} else if (netdev_speed <= SPEED_25000) {
+		*width = IB_WIDTH_1X;
+		*speed = IB_SPEED_EDR;
+	} else if (netdev_speed <= SPEED_40000) {
+		*width = IB_WIDTH_4X;
+		*speed = IB_SPEED_FDR10;
+	} else {
+		*width = IB_WIDTH_4X;
+		*speed = IB_SPEED_EDR;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_get_eth_speed);
+
 int ib_modify_qp(struct ib_qp *qp,
 		 struct ib_qp_attr *qp_attr,
 		 int qp_attr_mask)
@@ -1573,15 +1634,53 @@ EXPORT_SYMBOL(ib_dealloc_fmr);
 
 /* Multicast groups */
 
+static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid)
+{
+	struct ib_qp_init_attr init_attr = {};
+	struct ib_qp_attr attr = {};
+	int num_eth_ports = 0;
+	int port;
+
+	/* If QP state >= init, it is assigned to a port and we can check this
+	 * port only.
+	 */
+	if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) {
+		if (attr.qp_state >= IB_QPS_INIT) {
+			if (qp->device->get_link_layer(qp->device, attr.port_num) !=
+			    IB_LINK_LAYER_INFINIBAND)
+				return true;
+			goto lid_check;
+		}
+	}
+
+	/* Can't get a quick answer, iterate over all ports */
+	for (port = 0; port < qp->device->phys_port_cnt; port++)
+		if (qp->device->get_link_layer(qp->device, port) !=
+		    IB_LINK_LAYER_INFINIBAND)
+			num_eth_ports++;
+
+	/* If we have at lease one Ethernet port, RoCE annex declares that
+	 * multicast LID should be ignored. We can't tell at this step if the
+	 * QP belongs to an IB or Ethernet port.
+	 */
+	if (num_eth_ports)
+		return true;
+
+	/* If all the ports are IB, we can check according to IB spec. */
+lid_check:
+	return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
+		 lid == be16_to_cpu(IB_LID_PERMISSIVE));
+}
+
 int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
 {
 	int ret;
 
 	if (!qp->device->attach_mcast)
 		return -ENOSYS;
-	if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD ||
-	    lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
-	    lid == be16_to_cpu(IB_LID_PERMISSIVE))
+
+	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
+	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
 		return -EINVAL;
 
 	ret = qp->device->attach_mcast(qp, gid, lid);
@@ -1597,9 +1696,9 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
 
 	if (!qp->device->detach_mcast)
 		return -ENOSYS;
-	if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD ||
-	    lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
-	    lid == be16_to_cpu(IB_LID_PERMISSIVE))
+
+	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
+	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
 		return -EINVAL;
 
 	ret = qp->device->detach_mcast(qp, gid, lid);
```
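
A natural consumer of the new ib_get_eth_speed() exported above is a
RoCE driver's port-query path. Below is a hedged sketch of such a
caller; "example_query_port" and the surrounding logic are
hypothetical, and only ib_get_eth_speed() itself comes from the diff:

```c
/* Hypothetical consumer of the ib_get_eth_speed() helper. The helper
 * takes rtnl_lock() internally, so it must be called from a context
 * that may sleep. */
#include <rdma/ib_verbs.h>

static int example_query_port(struct ib_device *ibdev, u8 port_num,
			      struct ib_port_attr *props)
{
	int err;

	/* ... fill the rest of *props from device/firmware state ... */

	/* Derive the IB active_speed/active_width enums from the
	 * underlying netdev link speed instead of hard-coding them. */
	err = ib_get_eth_speed(ibdev, port_num, &props->active_speed,
			       &props->active_width);
	if (err)
		return err;

	return 0;
}
```

If ethtool cannot report a link rate, the helper falls back to
SPEED_1000 (reported as 1X/SDR) and logs a warning, as the hunk above
shows.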