diff options
275 files changed, 14866 insertions, 5221 deletions
diff --git a/Documentation/infiniband/tag_matching.txt b/Documentation/infiniband/tag_matching.txt new file mode 100644 index 000000000000..d2a3bf819226 --- /dev/null +++ b/Documentation/infiniband/tag_matching.txt @@ -0,0 +1,64 @@ +Tag matching logic + +The MPI standard defines a set of rules, known as tag-matching, for matching +source send operations to destination receives. The following parameters must +match the following source and destination parameters: +* Communicator +* User tag - wild card may be specified by the receiver +* Source rank – wild car may be specified by the receiver +* Destination rank – wild +The ordering rules require that when more than one pair of send and receive +message envelopes may match, the pair that includes the earliest posted-send +and the earliest posted-receive is the pair that must be used to satisfy the +matching operation. However, this doesn’t imply that tags are consumed in +the order they are created, e.g., a later generated tag may be consumed, if +earlier tags can’t be used to satisfy the matching rules. + +When a message is sent from the sender to the receiver, the communication +library may attempt to process the operation either after or before the +corresponding matching receive is posted. If a matching receive is posted, +this is an expected message, otherwise it is called an unexpected message. +Implementations frequently use different matching schemes for these two +different matching instances. + +To keep MPI library memory footprint down, MPI implementations typically use +two different protocols for this purpose: + +1. The Eager protocol- the complete message is sent when the send is +processed by the sender. A completion send is received in the send_cq +notifying that the buffer can be reused. + +2. The Rendezvous Protocol - the sender sends the tag-matching header, +and perhaps a portion of data when first notifying the receiver. When the +corresponding buffer is posted, the responder will use the information from +the header to initiate an RDMA READ operation directly to the matching buffer. +A fin message needs to be received in order for the buffer to be reused. + +Tag matching implementation + +There are two types of matching objects used, the posted receive list and the +unexpected message list. The application posts receive buffers through calls +to the MPI receive routines in the posted receive list and posts send messages +using the MPI send routines. The head of the posted receive list may be +maintained by the hardware, with the software expected to shadow this list. + +When send is initiated and arrives at the receive side, if there is no +pre-posted receive for this arriving message, it is passed to the software and +placed in the unexpected message list. Otherwise the match is processed, +including rendezvous processing, if appropriate, delivering the data to the +specified receive buffer. This allows overlapping receive-side MPI tag +matching with computation. + +When a receive-message is posted, the communication library will first check +the software unexpected message list for a matching receive. If a match is +found, data is delivered to the user buffer, using a software controlled +protocol. The UCX implementation uses either an eager or rendezvous protocol, +depending on data size. If no match is found, the entire pre-posted receive +list is maintained by the hardware, and there is space to add one more +pre-posted receive to this list, this receive is passed to the hardware. +Software is expected to shadow this list, to help with processing MPI cancel +operations. In addition, because hardware and software are not expected to be +tightly synchronized with respect to the tag-matching operation, this shadow +list is used to detect the case that a pre-posted receive is passed to the +hardware, as the matching unexpected message is being passed from the hardware +to the software. diff --git a/block/Kconfig b/block/Kconfig index 89cd28f8d051..3ab42bbb06d5 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -206,4 +206,9 @@ config BLK_MQ_VIRTIO depends on BLOCK && VIRTIO default y +config BLK_MQ_RDMA + bool + depends on BLOCK && INFINIBAND + default y + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 2b281cf258a0..9396ebc85d24 100644 --- a/block/Makefile +++ b/block/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o +obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c new file mode 100644 index 000000000000..996167f1de18 --- /dev/null +++ b/block/blk-mq-rdma.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 Sagi Grimberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/blk-mq.h> +#include <linux/blk-mq-rdma.h> +#include <rdma/ib_verbs.h> + +/** + * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device + * @set: tagset to provide the mapping for + * @dev: rdma device associated with @set. + * @first_vec: first interrupt vectors to use for queues (usually 0) + * + * This function assumes the rdma device @dev has at least as many available + * interrupt vetors as @set has queues. It will then query it's affinity mask + * and built queue mapping that maps a queue to the CPUs that have irq affinity + * for the corresponding vector. + * + * In case either the driver passed a @dev with less vectors than + * @set->nr_hw_queues, or @dev does not provide an affinity mask for a + * vector, we fallback to the naive mapping. + */ +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = ib_get_vector_affinity(dev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; + +fallback: + return blk_mq_map_queues(set); +} +EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 234fe01904e7..3726205c8704 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -34,6 +34,15 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from <http://www.openfabrics.org/git/>. +config INFINIBAND_EXP_USER_ACCESS + bool "Allow experimental support for Infiniband ABI" + depends on INFINIBAND_USER_ACCESS + ---help--- + IOCTL based ABI support for Infiniband. This allows userspace + to invoke the experimental IOCTL based ABI. + These commands are parsed via per-device parsing tree and + enables per-device features. + config INFINIBAND_USER_MEM bool depends on INFINIBAND_USER_ACCESS != n diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index e3cdafff8ece..b4df164f71a6 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -11,7 +11,8 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - security.o + security.o nldev.o + ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o @@ -31,4 +32,5 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ - rdma_core.o uverbs_std_types.o + rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ + uverbs_ioctl_merge.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 437522ca97b4..12523f630b61 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -130,13 +130,11 @@ static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) } int ib_nl_handle_ip_res_resp(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; - if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; if (ib_nl_is_good_ip_resp(nlh)) @@ -186,7 +184,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. @@ -326,7 +324,7 @@ static void queue_req(struct addr_req *req) static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (ibnl_chk_listeners(RDMA_NL_GROUP_LS)) + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; /* We fill in what we can, the response will fill the rest */ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index efc94304dee3..77515638c55c 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1199,30 +1199,23 @@ int ib_cache_setup_one(struct ib_device *device) device->cache.ports = kzalloc(sizeof(*device->cache.ports) * (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); - if (!device->cache.ports) { - err = -ENOMEM; - goto out; - } + if (!device->cache.ports) + return -ENOMEM; err = gid_table_setup_one(device); - if (err) - goto out; + if (err) { + kfree(device->cache.ports); + device->cache.ports = NULL; + return err; + } for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) ib_cache_update(device, p + rdma_start_port(device), true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); - err = ib_register_event_handler(&device->cache.event_handler); - if (err) - goto err; - + ib_register_event_handler(&device->cache.event_handler); return 0; - -err: - gid_table_cleanup_one(device); -out: - return err; } void ib_cache_release_one(struct ib_device *device) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 2b4d613a3474..4c4b46586af2 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -373,11 +373,19 @@ out: return ret; } -static int cm_alloc_response_msg(struct cm_port *port, - struct ib_mad_recv_wc *mad_recv_wc, - struct ib_mad_send_buf **msg) +static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); +} + +static int cm_create_response_msg_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf *msg) { - struct ib_mad_send_buf *m; struct ib_ah *ah; ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, @@ -385,27 +393,40 @@ static int cm_alloc_response_msg(struct cm_port *port, if (IS_ERR(ah)) return PTR_ERR(ah); - m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, - 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC, - IB_MGMT_BASE_VERSION); - if (IS_ERR(m)) { - rdma_destroy_ah(ah); - return PTR_ERR(m); - } - m->ah = ah; - *msg = m; + msg->ah = ah; return 0; } static void cm_free_msg(struct ib_mad_send_buf *msg) { - rdma_destroy_ah(msg->ah); + if (msg->ah) + rdma_destroy_ah(msg->ah); if (msg->context[0]) cm_deref_id(msg->context[0]); ib_free_send_mad(msg); } +static int cm_alloc_response_msg(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf **msg) +{ + struct ib_mad_send_buf *m; + int ret; + + m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); + if (IS_ERR(m)) + return PTR_ERR(m); + + ret = cm_create_response_msg_ah(port, mad_recv_wc, m); + if (ret) { + cm_free_msg(m); + return ret; + } + + *msg = m; + return 0; +} + static void * cm_copy_private_data(const void *private_data, u8 private_data_len) { @@ -1175,6 +1196,11 @@ static void cm_format_req(struct cm_req_msg *req_msg, { struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; + bool pri_ext = false; + + if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) + pri_ext = opa_is_extended_lid(pri_path->opa.dlid, + pri_path->opa.slid); cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -1202,18 +1228,24 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_srq(req_msg, param->srq); } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + if (pri_ext) { + req_msg->primary_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + req_msg->primary_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + } if (pri_path->hop_limit <= 1) { - req_msg->primary_local_lid = + req_msg->primary_local_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_slid(pri_path))); - req_msg->primary_remote_lid = + req_msg->primary_remote_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_dlid(pri_path))); } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; req_msg->primary_remote_lid = IB_LID_PERMISSIVE; } - req_msg->primary_local_gid = pri_path->sgid; - req_msg->primary_remote_gid = pri_path->dgid; cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); cm_req_set_primary_packet_rate(req_msg, pri_path->rate); req_msg->primary_traffic_class = pri_path->traffic_class; @@ -1225,17 +1257,29 @@ static void cm_format_req(struct cm_req_msg *req_msg, pri_path->packet_life_time)); if (alt_path) { + bool alt_ext = false; + + if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alt_path->opa.dlid, + alt_path->opa.slid); + + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; + if (alt_ext) { + req_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + req_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + } if (alt_path->hop_limit <= 1) { - req_msg->alt_local_lid = + req_msg->alt_local_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_slid(alt_path))); - req_msg->alt_remote_lid = + req_msg->alt_remote_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_dlid(alt_path))); } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; } - req_msg->alt_local_gid = alt_path->sgid; - req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, alt_path->flow_label); cm_req_set_alt_packet_rate(req_msg, alt_path->rate); @@ -1405,16 +1449,63 @@ static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); } +static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) +{ + return ((req_msg->alt_local_lid) || + (ib_is_opa_gid(&req_msg->alt_local_gid))); +} + +static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, + struct sa_path_rec *path, union ib_gid *gid) +{ + if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) + path->rec_type = SA_PATH_REC_TYPE_OPA; + else + path->rec_type = SA_PATH_REC_TYPE_IB; +} + +static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path) +{ + u32 lid; + + if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(primary_path, + htonl(ntohs(req_msg->primary_local_lid))); + sa_path_set_slid(primary_path, + htonl(ntohs(req_msg->primary_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&req_msg->primary_local_gid); + sa_path_set_dlid(primary_path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&req_msg->primary_remote_gid); + sa_path_set_slid(primary_path, cpu_to_be32(lid)); + } + + if (!cm_req_has_alt_path(req_msg)) + return; + + if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(alt_path, + htonl(ntohs(req_msg->alt_local_lid))); + sa_path_set_slid(alt_path, + htonl(ntohs(req_msg->alt_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&req_msg->alt_local_gid); + sa_path_set_dlid(alt_path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&req_msg->alt_remote_gid); + sa_path_set_slid(alt_path, cpu_to_be32(lid)); + } +} + static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, struct sa_path_rec *alt_path) { primary_path->dgid = req_msg->primary_local_gid; primary_path->sgid = req_msg->primary_remote_gid; - sa_path_set_dlid(primary_path, - htonl(ntohs(req_msg->primary_local_lid))); - sa_path_set_slid(primary_path, - htonl(ntohs(req_msg->primary_remote_lid))); primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); primary_path->hop_limit = req_msg->primary_hop_limit; primary_path->traffic_class = req_msg->primary_traffic_class; @@ -1431,13 +1522,9 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, primary_path->packet_life_time -= (primary_path->packet_life_time > 0); primary_path->service_id = req_msg->service_id; - if (req_msg->alt_local_lid) { + if (cm_req_has_alt_path(req_msg)) { alt_path->dgid = req_msg->alt_local_gid; alt_path->sgid = req_msg->alt_remote_gid; - sa_path_set_dlid(alt_path, - htonl(ntohs(req_msg->alt_local_lid))); - sa_path_set_slid(alt_path, - htonl(ntohs(req_msg->alt_remote_lid))); alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); alt_path->hop_limit = req_msg->alt_hop_limit; alt_path->traffic_class = req_msg->alt_traffic_class; @@ -1454,6 +1541,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, alt_path->packet_life_time -= (alt_path->packet_life_time > 0); alt_path->service_id = req_msg->service_id; } + cm_format_path_lid_from_req(req_msg, primary_path, alt_path); } static u16 cm_get_bth_pkey(struct cm_work *work) @@ -1703,7 +1791,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = cpu_to_be16(wc->slid); + req_msg->primary_local_lid = ib_lid_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } @@ -1713,7 +1801,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = cpu_to_be16(wc->slid); + req_msg->alt_local_lid = ib_lid_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } @@ -1784,9 +1872,12 @@ static int cm_req_handler(struct cm_work *work) dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); } else { - work->path[0].rec_type = SA_PATH_REC_TYPE_IB; + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } - if (req_msg->alt_local_lid) + if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); @@ -1811,16 +1902,19 @@ static int cm_req_handler(struct cm_work *work) dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); } else { - work->path[0].rec_type = SA_PATH_REC_TYPE_IB; + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } - if (req_msg->alt_local_lid) + if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } - if (req_msg->alt_local_lid) { + if (cm_req_has_alt_path(req_msg)) { ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, cm_id_priv); if (ret) { @@ -2424,7 +2518,8 @@ static int cm_dreq_handler(struct cm_work *work) case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) goto unlock; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, @@ -2432,7 +2527,8 @@ static int cm_dreq_handler(struct cm_work *work) cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); - if (ib_post_send_mad(msg, NULL)) + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: @@ -2843,6 +2939,11 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, const void *private_data, u8 private_data_len) { + bool alt_ext = false; + + if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, + alternate_path->opa.slid); cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); lap_msg->local_comm_id = cm_id_priv->id.local_id; @@ -2856,6 +2957,12 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, htons(ntohl(sa_path_get_dlid(alternate_path))); lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; + if (alt_ext) { + lap_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid)); + lap_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid)); + } cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); lap_msg->alt_hop_limit = alternate_path->hop_limit; @@ -2924,16 +3031,29 @@ out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); } EXPORT_SYMBOL(ib_send_cm_lap); +static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg, + struct sa_path_rec *path) +{ + u32 lid; + + if (path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); + sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&lap_msg->alt_local_gid); + sa_path_set_dlid(path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&lap_msg->alt_remote_gid); + sa_path_set_slid(path, cpu_to_be32(lid)); + } +} + static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct sa_path_rec *path, struct cm_lap_msg *lap_msg) { - memset(path, 0, sizeof *path); - path->rec_type = SA_PATH_REC_TYPE_IB; path->dgid = lap_msg->alt_local_gid; path->sgid = lap_msg->alt_remote_gid; - sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); - sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); path->flow_label = cm_lap_get_flow_label(lap_msg); path->hop_limit = lap_msg->alt_hop_limit; path->traffic_class = cm_lap_get_traffic_class(lap_msg); @@ -2947,6 +3067,7 @@ static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, path->packet_life_time_selector = IB_SA_EQ; path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); path->packet_life_time -= (path->packet_life_time > 0); + cm_format_path_lid_from_lap(lap_msg, path); } static int cm_lap_handler(struct cm_work *work) @@ -2965,6 +3086,11 @@ static int cm_lap_handler(struct cm_work *work) return -EINVAL; param = &work->cm_event.param.lap_rcvd; + memset(&work->path[0], 0, sizeof(work->path[1])); + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &lap_msg->alt_local_gid); param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); work->cm_event.private_data = &lap_msg->private_data; @@ -2980,7 +3106,8 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_LAP_COUNTER]); - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) goto unlock; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, @@ -2990,7 +3117,8 @@ static int cm_lap_handler(struct cm_work *work) cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); - if (ib_post_send_mad(msg, NULL)) + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: @@ -4201,7 +4329,7 @@ static int __init ib_cm_init(void) goto error1; } - cm.wq = create_workqueue("ib_cm"); + cm.wq = alloc_workqueue("ib_cm", 0, 1); if (!cm.wq) { ret = -ENOMEM; goto error2; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0eb393237ba2..852c8fec8088 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -72,6 +72,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", @@ -3998,7 +3999,8 @@ static void iboe_mcast_work_handler(struct work_struct *work) kfree(mw); } -static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; @@ -4008,8 +4010,8 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = 0xff; - mgid->raw[1] = 0x0e; + mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0; + mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; @@ -4050,7 +4052,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, goto out1; } - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) @@ -4066,8 +4070,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; @@ -4280,8 +4282,12 @@ static void cma_add_one(struct ib_device *device) for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); - cma_dev->default_gid_type[i - rdma_start_port(device)] = - find_first_bit(&supported_gids, BITS_PER_LONG); + if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) + cma_dev->default_gid_type[i - rdma_start_port(device)] = + CMA_PREFERRED_ROCE_GID_TYPE; + else + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } @@ -4452,9 +4458,8 @@ out: return skb->len; } -static const struct ibnl_client_cbs cma_cb_table[] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats, - .module = THIS_MODULE }, +static const struct rdma_nl_cbs cma_cb_table[] = { + [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, }; static int cma_init_net(struct net *net) @@ -4506,9 +4511,7 @@ static int __init cma_init(void) if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), - cma_cb_table)) - pr_warn("RDMA CMA: failed to add netlink callback\n"); + rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4525,7 +4528,7 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - ibnl_remove_client(RDMA_NL_RDMA_CM); + rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); @@ -4534,5 +4537,7 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); + module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 11ae67514e13..a1d687a664f8 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,7 @@ #include <linux/cgroup_rdma.h> #include <rdma/ib_verbs.h> +#include <rdma/opa_addr.h> #include <rdma/ib_mad.h> #include "mad_priv.h" @@ -102,6 +103,14 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, roce_netdev_callback cb, void *cookie); +typedef int (*nldev_callback)(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx); + +int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb); + enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE @@ -179,8 +188,8 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); -int ibnl_init(void); -void ibnl_cleanup(void); +int rdma_nl_init(void); +void rdma_nl_exit(void); /** * Check if there are any listeners to the netlink group @@ -190,11 +199,14 @@ void ibnl_cleanup(void); int ibnl_chk_listeners(unsigned int group); int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_nl_handle_ip_res_resp(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_get_cached_subnet_prefix(struct ib_device *device, u8 port_num, @@ -301,4 +313,9 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, return 0; } #endif + +struct ib_device *__ib_device_get_by_index(u32 ifindex); +/* RDMA device netlink */ +void nldev_init(void); +void nldev_exit(void); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 221468f77128..84fc32a2c8b3 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -134,6 +134,17 @@ static int ib_device_check_mandatory(struct ib_device *device) return 0; } +struct ib_device *__ib_device_get_by_index(u32 index) +{ + struct ib_device *device; + + list_for_each_entry(device, &device_list, core_list) + if (device->index == index) + return device; + + return NULL; +} + static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; @@ -145,7 +156,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } - static int alloc_name(char *name) { unsigned long *inuse; @@ -326,10 +336,10 @@ static int read_port_immutable(struct ib_device *device) return 0; } -void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) +void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->get_dev_fw_str) - dev->get_dev_fw_str(dev, str, str_len); + dev->get_dev_fw_str(dev, str); else str[0] = '\0'; } @@ -395,6 +405,30 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, } /** + * __dev_new_index - allocate an device index + * + * Returns a suitable unique value for a new device interface + * number. It assumes that there are less than 2^32-1 ib devices + * will be present in the system. + */ +static u32 __dev_new_index(void) +{ + /* + * The device index to allow stable naming. + * Similar to struct net -> ifindex. + */ + static u32 index; + + for (;;) { + if (!(++index)) + index = 1; + + if (!__ib_device_get_by_index(index)) + return index; + } +} + +/** * ib_register_device - Register an IB device with IB core * @device:Device to register * @@ -489,9 +523,10 @@ int ib_register_device(struct ib_device *device, device->reg_state = IB_DEV_REGISTERED; list_for_each_entry(client, &client_list, list) - if (client->add && !add_client_context(device, client)) + if (!add_client_context(device, client) && client->add) client->add(device); + device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); @@ -578,7 +613,7 @@ int ib_register_client(struct ib_client *client) mutex_lock(&device_mutex); list_for_each_entry(device, &device_list, core_list) - if (client->add && !add_client_context(device, client)) + if (!add_client_context(device, client) && client->add) client->add(device); down_write(&lists_rwsem); @@ -712,7 +747,7 @@ EXPORT_SYMBOL(ib_set_client_data); * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ -int ib_register_event_handler (struct ib_event_handler *event_handler) +void ib_register_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; @@ -720,8 +755,6 @@ int ib_register_event_handler (struct ib_event_handler *event_handler) list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_register_event_handler); @@ -732,15 +765,13 @@ EXPORT_SYMBOL(ib_register_event_handler); * Unregister an event handler registered with * ib_register_event_handler(). */ -int ib_unregister_event_handler(struct ib_event_handler *event_handler) +void ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); @@ -894,6 +925,31 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, } /** + * ib_enum_all_devs - enumerate all ib_devices + * @cb: Callback to call for each found ib_device + * + * Enumerates all ib_devices and calls callback() on each device. + */ +int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ib_device *dev; + unsigned int idx = 0; + int ret = 0; + + down_read(&lists_rwsem); + list_for_each_entry(dev, &device_list, core_list) { + ret = nldev_cb(dev, skb, cb, idx); + if (ret) + break; + idx++; + } + + up_read(&lists_rwsem); + return ret; +} + +/** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query @@ -945,14 +1001,17 @@ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { - if (!device->modify_port) - return -ENOSYS; + int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - return device->modify_port(device, port_num, port_modify_mask, - port_modify); + if (device->modify_port) + rc = device->modify_port(device, port_num, port_modify_mask, + port_modify); + else + rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS; + return rc; } EXPORT_SYMBOL(ib_modify_port); @@ -1087,29 +1146,21 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); -static struct ibnl_client_cbs ibnl_ls_cb_table[] = { +static const struct rdma_nl_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp, - .module = THIS_MODULE }, + .doit = ib_nl_handle_resolve_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout, - .module = THIS_MODULE }, + .doit = ib_nl_handle_set_timeout, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_IP_RESOLVE] = { - .dump = ib_nl_handle_ip_res_resp, - .module = THIS_MODULE }, + .doit = ib_nl_handle_ip_res_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, }; -static int ib_add_ibnl_clients(void) -{ - return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table), - ibnl_ls_cb_table); -} - -static void ib_remove_ibnl_clients(void) -{ - ibnl_remove_client(RDMA_NL_LS); -} - static int __init ib_core_init(void) { int ret; @@ -1131,9 +1182,9 @@ static int __init ib_core_init(void) goto err_comp; } - ret = ibnl_init(); + ret = rdma_nl_init(); if (ret) { - pr_warn("Couldn't init IB netlink interface\n"); + pr_warn("Couldn't init IB netlink interface: err %d\n", ret); goto err_sysfs; } @@ -1155,24 +1206,18 @@ static int __init ib_core_init(void) goto err_mad; } - ret = ib_add_ibnl_clients(); - if (ret) { - pr_warn("Couldn't register ibnl clients\n"); - goto err_sa; - } - ret = register_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); - goto err_ibnl_clients; + goto err_sa; } + nldev_init(); + rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ib_cache_setup(); return 0; -err_ibnl_clients: - ib_remove_ibnl_clients(); err_sa: ib_sa_cleanup(); err_mad: @@ -1180,7 +1225,7 @@ err_mad: err_addr: addr_cleanup(); err_ibnl: - ibnl_cleanup(); + rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); err_comp: @@ -1192,18 +1237,21 @@ err: static void __exit ib_core_cleanup(void) { - unregister_lsm_notifier(&ibdev_lsm_nb); ib_cache_cleanup(); - ib_remove_ibnl_clients(); + nldev_exit(); + rdma_nl_unregister(RDMA_NL_LS); + unregister_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); - ibnl_cleanup(); + rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); + module_init(ib_core_init); module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 31661b5c1743..fcf42f6bb82a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -80,7 +80,7 @@ const char *__attribute_const__ iwcm_reject_msg(int reason) } EXPORT_SYMBOL(iwcm_reject_msg); -static struct ibnl_client_cbs iwcm_nl_cb_table[] = { +static struct rdma_nl_cbs iwcm_nl_cb_table[] = { [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, @@ -1175,13 +1175,9 @@ static int __init iw_cm_init(void) ret = iwpm_init(RDMA_NL_IWCM); if (ret) pr_err("iw_cm: couldn't init iwpm\n"); - - ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table), - iwcm_nl_cb_table); - if (ret) - pr_err("iw_cm: couldn't register netlink callbacks\n"); - - iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); + else + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); if (!iwcm_wq) return -ENOMEM; @@ -1200,9 +1196,11 @@ static void __exit iw_cm_cleanup(void) { unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); - ibnl_remove_client(RDMA_NL_IWCM); + rdma_nl_unregister(RDMA_NL_IWCM); iwpm_exit(RDMA_NL_IWCM); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); + module_init(iw_cm_init); module_exit(iw_cm_cleanup); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index a0e7c16d8bd8..30825bb9b8e9 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -42,7 +42,6 @@ int iwpm_valid_pid(void) { return iwpm_user_pid > 0; } -EXPORT_SYMBOL(iwpm_valid_pid); /* * iwpm_register_pid - Send a netlink query to user space @@ -104,7 +103,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; @@ -122,7 +121,6 @@ pid_query_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_register_pid); /* * iwpm_add_mapping - Send a netlink add mapping message @@ -174,7 +172,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto add_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -191,7 +189,6 @@ add_mapping_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_add_mapping); /* * iwpm_add_and_query_mapping - Send a netlink add and query @@ -251,7 +248,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto query_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -267,7 +264,6 @@ query_mapping_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_add_and_query_mapping); /* * iwpm_remove_mapping - Send a netlink remove mapping message @@ -312,7 +308,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -328,7 +324,6 @@ remove_mapping_error: dev_kfree_skb_any(skb); return ret; } -EXPORT_SYMBOL(iwpm_remove_mapping); /* netlink attribute policy for the received response to register pid request */ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { @@ -397,7 +392,6 @@ register_pid_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_register_pid_cb); /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { @@ -466,7 +460,6 @@ add_mapping_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_add_mapping_cb); /* netlink attribute policy for the response to add and query mapping request * and response with remote address info */ @@ -558,7 +551,6 @@ query_mapping_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb); /* * iwpm_remote_info_cb - Process a port mapper message, containing @@ -627,7 +619,6 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) "remote_info: Mapped remote sockaddr:"); return ret; } -EXPORT_SYMBOL(iwpm_remote_info_cb); /* netlink attribute policy for the received request for mapping info */ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { @@ -677,7 +668,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid); return ret; } -EXPORT_SYMBOL(iwpm_mapping_info_cb); /* netlink attribute policy for the received mapping info ack */ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { @@ -707,7 +697,6 @@ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); return 0; } -EXPORT_SYMBOL(iwpm_ack_mapping_info_cb); /* netlink attribute policy for the received port mapper error message */ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { @@ -751,4 +740,3 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_mapping_error_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index f13870e69ccd..c81c55942626 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -54,8 +54,6 @@ static struct iwpm_admin_data iwpm_admin; int iwpm_init(u8 nl_client) { int ret = 0; - if (iwpm_valid_client(nl_client)) - return -EINVAL; mutex_lock(&iwpm_admin_lock); if (atomic_read(&iwpm_admin.refcount) == 0) { iwpm_hash_bucket = kzalloc(IWPM_MAPINFO_HASH_SIZE * @@ -83,7 +81,6 @@ init_exit: } return ret; } -EXPORT_SYMBOL(iwpm_init); static void free_hash_bucket(void); static void free_reminfo_bucket(void); @@ -109,7 +106,6 @@ int iwpm_exit(u8 nl_client) iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } -EXPORT_SYMBOL(iwpm_exit); static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); @@ -148,7 +144,6 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_create_mapinfo); int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) @@ -184,7 +179,6 @@ remove_mapinfo_exit: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_remove_mapinfo); static void free_hash_bucket(void) { @@ -297,7 +291,6 @@ get_remote_info_exit: spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_get_remote_info); struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, u8 nl_client, gfp_t gfp) @@ -383,15 +376,11 @@ int iwpm_get_nlmsg_seq(void) int iwpm_valid_client(u8 nl_client) { - if (nl_client >= RDMA_NL_NUM_CLIENTS) - return 0; return iwpm_admin.client_list[nl_client]; } void iwpm_set_valid(u8 nl_client, int valid) { - if (nl_client >= RDMA_NL_NUM_CLIENTS) - return; iwpm_admin.client_list[nl_client] = valid; } @@ -608,7 +597,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; - ret = ibnl_unicast(skb, nlh, iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; @@ -637,7 +626,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; - ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 0d3cca0a8890..e5cf09c66fe6 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -64,7 +64,7 @@ struct mad_rmpp_recv { __be64 tid; u32 src_qp; - u16 slid; + u32 slid; u8 mgmt_class; u8 class_version; u8 method; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 94931c474d41..e685148dd3e6 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved. * Copyright (c) 2010 Voltaire Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -37,239 +38,267 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <rdma/rdma_netlink.h> +#include <linux/module.h> #include "core_priv.h" -struct ibnl_client { - struct list_head list; - int index; - int nops; - const struct ibnl_client_cbs *cb_table; -}; +#include "core_priv.h" -static DEFINE_MUTEX(ibnl_mutex); +static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; -static LIST_HEAD(client_list); +static struct { + const struct rdma_nl_cbs *cb_table; +} rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int ibnl_chk_listeners(unsigned int group) +int rdma_nl_chk_listeners(unsigned int group) { - if (netlink_has_listeners(nls, group) == 0) - return -1; - return 0; + return (netlink_has_listeners(nls, group)) ? 0 : -1; } +EXPORT_SYMBOL(rdma_nl_chk_listeners); -int ibnl_add_client(int index, int nops, - const struct ibnl_client_cbs cb_table[]) +static bool is_nl_msg_valid(unsigned int type, unsigned int op) { - struct ibnl_client *cur; - struct ibnl_client *nl_client; + static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS - 1] = { + RDMA_NL_RDMA_CM_NUM_OPS, + RDMA_NL_IWPM_NUM_OPS, + 0, + RDMA_NL_LS_NUM_OPS, + RDMA_NLDEV_NUM_OPS }; - nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL); - if (!nl_client) - return -ENOMEM; + /* + * This BUILD_BUG_ON is intended to catch addition of new + * RDMA netlink protocol without updating the array above. + */ + BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); - nl_client->index = index; - nl_client->nops = nops; - nl_client->cb_table = cb_table; + if (type > RDMA_NL_NUM_CLIENTS - 1) + return false; - mutex_lock(&ibnl_mutex); + return (op < max_num_ops[type - 1]) ? true : false; +} - list_for_each_entry(cur, &client_list, list) { - if (cur->index == index) { - pr_warn("Client for %d already exists\n", index); - mutex_unlock(&ibnl_mutex); - kfree(nl_client); - return -EINVAL; - } +static bool is_nl_valid(unsigned int type, unsigned int op) +{ + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_msg_valid(type, op)) + return false; + + cb_table = rdma_nl_types[type].cb_table; +#ifdef CONFIG_MODULES + if (!cb_table) { + mutex_unlock(&rdma_nl_mutex); + request_module("rdma-netlink-subsys-%d", type); + mutex_lock(&rdma_nl_mutex); + cb_table = rdma_nl_types[type].cb_table; } +#endif - list_add_tail(&nl_client->list, &client_list); - - mutex_unlock(&ibnl_mutex); - - return 0; + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) + return false; + return true; } -EXPORT_SYMBOL(ibnl_add_client); -int ibnl_remove_client(int index) +void rdma_nl_register(unsigned int index, + const struct rdma_nl_cbs cb_table[]) { - struct ibnl_client *cur, *next; - - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - if (cur->index == index) { - list_del(&(cur->list)); - mutex_unlock(&ibnl_mutex); - kfree(cur); - return 0; - } + mutex_lock(&rdma_nl_mutex); + if (!is_nl_msg_valid(index, 0)) { + /* + * All clients are not interesting in success/failure of + * this call. They want to see the print to error log and + * continue their initialization. Print warning for them, + * because it is programmer's error to be here. + */ + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The not-valid %u index was supplied to RDMA netlink\n", + index); + return; } - pr_warn("Can't remove callback for client idx %d. Not found\n", index); - mutex_unlock(&ibnl_mutex); - return -EINVAL; + if (rdma_nl_types[index].cb_table) { + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The %u index is already registered in RDMA netlink\n", + index); + return; + } + + rdma_nl_types[index].cb_table = cb_table; + mutex_unlock(&rdma_nl_mutex); +} +EXPORT_SYMBOL(rdma_nl_register); + +void rdma_nl_unregister(unsigned int index) +{ + mutex_lock(&rdma_nl_mutex); + rdma_nl_types[index].cb_table = NULL; + mutex_unlock(&rdma_nl_mutex); } -EXPORT_SYMBOL(ibnl_remove_client); +EXPORT_SYMBOL(rdma_nl_unregister); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int len, int client, int op, int flags) { - unsigned char *prev_tail; - - prev_tail = skb_tail_pointer(skb); - *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), - len, flags); + *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags); if (!*nlh) - goto out_nlmsg_trim; - (*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail; + return NULL; return nlmsg_data(*nlh); - -out_nlmsg_trim: - nlmsg_trim(skb, prev_tail); - return NULL; } EXPORT_SYMBOL(ibnl_put_msg); int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int len, void *data, int type) { - unsigned char *prev_tail; - - prev_tail = skb_tail_pointer(skb); - if (nla_put(skb, type, len, data)) - goto nla_put_failure; - nlh->nlmsg_len += skb_tail_pointer(skb) - prev_tail; + if (nla_put(skb, type, len, data)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } return 0; - -nla_put_failure: - nlmsg_trim(skb, prev_tail - nlh->nlmsg_len); - return -EMSGSIZE; } EXPORT_SYMBOL(ibnl_put_attr); -static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct ibnl_client *client; int type = nlh->nlmsg_type; - int index = RDMA_NL_GET_CLIENT(type); + unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_valid(index, op)) + return -EINVAL; + + cb_table = rdma_nl_types[index].cb_table; - list_for_each_entry(client, &client_list, list) { - if (client->index == index) { - if (op >= client->nops || !client->cb_table[op].dump) - return -EINVAL; - - /* - * For response or local service set_timeout request, - * there is no need to use netlink_dump_start. - */ - if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - (index == RDMA_NL_LS && - op == RDMA_NL_LS_OP_SET_TIMEOUT)) { - struct netlink_callback cb = { - .skb = skb, - .nlh = nlh, - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - - return cb.dump(skb, &cb); - } - - { - struct netlink_dump_control c = { - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - return netlink_dump_start(nls, skb, nlh, &c); - } - } + if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + /* FIXME: Convert IWCM to properly handle doit callbacks */ + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || + index == RDMA_NL_IWCM) { + struct netlink_dump_control c = { + .dump = cb_table[op].dump, + }; + return netlink_dump_start(nls, skb, nlh, &c); } - pr_info("Index %d wasn't found in client list\n", index); - return -EINVAL; + if (cb_table[op].doit) + return cb_table[op].doit(skb, nlh, extack); + + return 0; } -static void ibnl_rcv_reply_skb(struct sk_buff *skb) +/* + * This function is similar to netlink_rcv_skb with one exception: + * It calls to the callback for the netlink messages without NLM_F_REQUEST + * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed + * for that consumer only. + */ +static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *, + struct netlink_ext_ack *)) { + struct netlink_ext_ack extack = {}; struct nlmsghdr *nlh; - int msglen; + int err; - /* - * Process responses until there is no more message or the first - * request. Generally speaking, it is not recommended to mix responses - * with requests. - */ while (skb->len >= nlmsg_total_size(0)) { + int msglen; + nlh = nlmsg_hdr(skb); + err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) - return; - - /* Handle response only */ - if (nlh->nlmsg_flags & NLM_F_REQUEST) - return; - - ibnl_rcv_msg(skb, nlh, NULL); + return 0; + /* + * Generally speaking, the only requests are handled + * by the kernel, but RDMA_NL_LS is different, because it + * runs backward netlink scheme. Kernel initiates messages + * and waits for reply with data to keep pathrecord cache + * in sync. + */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) && + (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS)) + goto ack; + + /* Skip control messages */ + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ack; + + err = cb(skb, nlh, &extack); + if (err == -EINTR) + goto skip; + +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) + netlink_ack(skb, nlh, err, &extack); + +skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } + + return 0; } -static void ibnl_rcv(struct sk_buff *skb) +static void rdma_nl_rcv(struct sk_buff *skb) { - mutex_lock(&ibnl_mutex); - ibnl_rcv_reply_skb(skb); - netlink_rcv_skb(skb, &ibnl_rcv_msg); - mutex_unlock(&ibnl_mutex); + mutex_lock(&rdma_nl_mutex); + rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); + mutex_unlock(&rdma_nl_mutex); } -int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid) +int rdma_nl_unicast(struct sk_buff *skb, u32 pid) +{ + int err; + + err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); + return (err < 0) ? err : 0; +} +EXPORT_SYMBOL(rdma_nl_unicast); + +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) { int err; err = netlink_unicast(nls, skb, pid, 0); return (err < 0) ? err : 0; } -EXPORT_SYMBOL(ibnl_unicast); +EXPORT_SYMBOL(rdma_nl_unicast_wait); -int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, - unsigned int group, gfp_t flags) +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) { return nlmsg_multicast(nls, skb, 0, group, flags); } -EXPORT_SYMBOL(ibnl_multicast); +EXPORT_SYMBOL(rdma_nl_multicast); -int __init ibnl_init(void) +int __init rdma_nl_init(void) { struct netlink_kernel_cfg cfg = { - .input = ibnl_rcv, + .input = rdma_nl_rcv, }; nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); - if (!nls) { - pr_warn("Failed to create netlink socket\n"); + if (!nls) return -ENOMEM; - } nls->sk_sndtimeo = 10 * HZ; return 0; } -void ibnl_cleanup(void) +void rdma_nl_exit(void) { - struct ibnl_client *cur, *next; + int idx; - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - list_del(&(cur->list)); - kfree(cur); - } - mutex_unlock(&ibnl_mutex); + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + rdma_nl_unregister(idx); netlink_kernel_release(nls); } + +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c new file mode 100644 index 000000000000..3ba24c428c3b --- /dev/null +++ b/drivers/infiniband/core/nldev.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <net/netlink.h> +#include <rdma/rdma_netlink.h> + +#include "core_priv.h" + +static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = IB_FW_VERSION_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, +}; + +static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +{ + char fw[IB_FW_VERSION_NAME_MAX]; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) + return -EMSGSIZE; + + BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + device->attrs.device_cap_flags, 0)) + return -EMSGSIZE; + + ib_get_device_fw_str(device, fw); + /* Device without FW has strlen(fw) */ + if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, + be64_to_cpu(device->node_guid), 0)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, + be64_to_cpu(device->attrs.sys_image_guid), 0)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) + return -EMSGSIZE; + return 0; +} + +static int fill_port_info(struct sk_buff *msg, + struct ib_device *device, u32 port) +{ + struct ib_port_attr attr; + int ret; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + return -EMSGSIZE; + + ret = ib_query_port(device, port, &attr); + if (ret) + return ret; + + BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + (u64)attr.port_cap_flags, 0)) + return -EMSGSIZE; + if (rdma_protocol_ib(device, port) && + nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, + attr.subnet_prefix, 0)) + return -EMSGSIZE; + if (rdma_protocol_ib(device, port)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) + return -EMSGSIZE; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) + return -EMSGSIZE; + return 0; +} + +static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + + device = __ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_dev_info(msg, device); + if (err) { + nlmsg_free(msg); + return err; + } + + nlmsg_end(msg, nlh); + + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); +} + +static int _nldev_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, NLM_F_MULTI); + + if (fill_dev_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: cb->args[0] = idx; + return skb->len; +} + +static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + /* + * There is no need to take lock, because + * we are relying on ib_core's lists_rwsem + */ + return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); +} + +static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + u32 port; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = __ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_port_info(msg, device, port); + if (err) { + nlmsg_free(msg); + return err; + } + + nlmsg_end(msg, nlh); + + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); +} + +static int nldev_port_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + int start = cb->args[0]; + struct nlmsghdr *nlh; + u32 idx = 0; + u32 ifindex; + int err; + u32 p; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = __ib_device_get_by_index(ifindex); + if (!device) + return -EINVAL; + + for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + /* + * The dumpit function returns all information from specific + * index. This specific index is taken from the netlink + * messages request sent by user and it is available + * in cb->args[0]. + * + * Usually, the user doesn't fill this field and it causes + * to return everything. + * + */ + if (idx < start) { + idx++; + continue; + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_PORT_GET), + 0, NLM_F_MULTI); + + if (fill_port_info(skb, device, p)) { + nlmsg_cancel(skb, nlh); + goto out; + } + idx++; + nlmsg_end(skb, nlh); + } + +out: cb->args[0] = idx; + return skb->len; +} + +static const struct rdma_nl_cbs nldev_cb_table[] = { + [RDMA_NLDEV_CMD_GET] = { + .doit = nldev_get_doit, + .dump = nldev_get_dumpit, + }, + [RDMA_NLDEV_CMD_PORT_GET] = { + .doit = nldev_port_get_doit, + .dump = nldev_port_get_dumpit, + }, +}; + +void __init nldev_init(void) +{ + rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); +} + +void __exit nldev_exit(void) +{ + rdma_nl_unregister(RDMA_NL_NLDEV); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 41c31a2bf093..85b5ee4defa4 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -35,10 +35,57 @@ #include <rdma/ib_verbs.h> #include <rdma/uverbs_types.h> #include <linux/rcupdate.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/rdma_user_ioctl.h> #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" +int uverbs_ns_idx(u16 *id, unsigned int ns_count) +{ + int ret = (*id & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT; + + if (ret >= ns_count) + return -EINVAL; + + *id &= ~UVERBS_ID_NS_MASK; + return ret; +} + +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object) +{ + const struct uverbs_root_spec *object_hash = ibdev->specs_root; + const struct uverbs_object_spec_hash *objects; + int ret = uverbs_ns_idx(&object, object_hash->num_buckets); + + if (ret < 0) + return NULL; + + objects = object_hash->object_buckets[ret]; + + if (object >= objects->num_objects) + return NULL; + + return objects->objects[object]; +} + +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method) +{ + const struct uverbs_method_spec_hash *methods; + int ret = uverbs_ns_idx(&method, object->num_buckets); + + if (ret < 0) + return NULL; + + methods = object->method_buckets[ret]; + if (method >= methods->num_methods) + return NULL; + + return methods->methods[method]; +} + void uverbs_uobject_get(struct ib_uobject *uobject) { kref_get(&uobject->ref); @@ -404,6 +451,41 @@ int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj) return ret; } +static int null_obj_type_class_remove_commit(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +} + +static const struct uverbs_obj_type null_obj_type = { + .type_class = &((const struct uverbs_obj_type_class){ + .remove_commit = null_obj_type_class_remove_commit, + /* be cautious */ + .needs_kfree_rcu = true}), +}; + +int rdma_explicit_destroy(struct ib_uobject *uobject) +{ + int ret; + struct ib_ucontext *ucontext = uobject->context; + + /* Cleanup is running. Calling this should have been impossible */ + if (!down_read_trylock(&ucontext->cleanup_rwsem)) { + WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); + return 0; + } + lockdep_check(uobject, true); + ret = uobject->type->type_class->remove_commit(uobject, + RDMA_REMOVE_DESTROY); + if (ret) + return ret; + + uobject->type = &null_obj_type; + + up_read(&ucontext->cleanup_rwsem); + return 0; +} + static void alloc_commit_idr_uobject(struct ib_uobject *uobj) { uverbs_uobject_add(uobj); @@ -625,3 +707,100 @@ const struct uverbs_obj_type_class uverbs_fd_class = { .needs_kfree_rcu = false, }; +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id) +{ + switch (access) { + case UVERBS_ACCESS_READ: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, false); + case UVERBS_ACCESS_DESTROY: + case UVERBS_ACCESS_WRITE: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, true); + case UVERBS_ACCESS_NEW: + return rdma_alloc_begin_uobject(type_attrs, ucontext); + default: + WARN_ON(true); + return ERR_PTR(-EOPNOTSUPP); + } +} + +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit) +{ + int ret = 0; + + /* + * refcounts should be handled at the object level and not at the + * uobject level. Refcounts of the objects themselves are done in + * handlers. + */ + + switch (access) { + case UVERBS_ACCESS_READ: + rdma_lookup_put_uobject(uobj, false); + break; + case UVERBS_ACCESS_WRITE: + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_DESTROY: + if (commit) + ret = rdma_remove_commit_uobject(uobj); + else + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_NEW: + if (commit) + ret = rdma_alloc_commit_uobject(uobj); + else + rdma_alloc_abort_uobject(uobj); + break; + default: + WARN_ON(true); + ret = -EOPNOTSUPP; + } + + return ret; +} + +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < num; i++) { + struct uverbs_attr_bundle_hash *curr_bundle = + &attrs_bundle->hash[i]; + const struct uverbs_attr_spec_hash *curr_spec_bucket = + spec_hash[i]; + unsigned int j; + + for (j = 0; j < curr_bundle->num_attrs; j++) { + struct uverbs_attr *attr; + const struct uverbs_attr_spec *spec; + + if (!uverbs_attr_is_valid_in_hash(curr_bundle, j)) + continue; + + attr = &curr_bundle->attrs[j]; + spec = &curr_spec_bucket->attrs[j]; + + if (spec->type == UVERBS_ATTR_TYPE_IDR || + spec->type == UVERBS_ATTR_TYPE_FD) { + int current_ret; + + current_ret = uverbs_finalize_object(attr->obj_attr.uobject, + spec->obj.access, + commit); + if (!ret) + ret = current_ret; + } + } + } + return ret; +} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 1b82e7ff7fe8..1efcf93238dd 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -39,9 +39,15 @@ #include <linux/idr.h> #include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> #include <rdma/ib_verbs.h> #include <linux/mutex.h> +int uverbs_ns_idx(u16 *id, unsigned int ns_count); +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object); +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method); /* * These functions initialize the context and cleanups its uobjects. * The context has a list of objects which is protected by a mutex @@ -75,4 +81,40 @@ void uverbs_uobject_put(struct ib_uobject *uobject); */ void uverbs_close_fd(struct file *f); +/* + * Get an ib_uobject that corresponds to the given id from ucontext, assuming + * the object is from the given type. Lock it to the required access when + * applicable. + * This function could create (access == NEW), destroy (access == DESTROY) + * or unlock (access == READ || access == WRITE) objects if required. + * The action will be finalized only when uverbs_finalize_object or + * uverbs_finalize_objects are called. + */ +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id); +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit); +/* + * Note that certain finalize stages could return a status: + * (a) alloc_commit could return a failure if the object is committed at the + * same time when the context is destroyed. + * (b) remove_commit could fail if the object wasn't destroyed successfully. + * Since multiple objects could be finalized in one transaction, it is very NOT + * recommended to have several finalize actions which have side effects. + * For example, it's NOT recommended to have a certain action which has both + * a commit action and a destroy action or two destroy objects in the same + * action. The rule of thumb is to have one destroy or commit action with + * multiple lookups. + * The first non zero return value of finalize_object is returned from this + * function. For example, this could happen when we couldn't destroy an + * object. + */ +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit); + #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 94a9eefb3cfc..90e3889b7fbe 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -44,6 +44,8 @@ static struct workqueue_struct *gid_cache_wq; +static struct workqueue_struct *gid_cache_wq; + enum gid_op_type { GID_DEL = 0, GID_ADD diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 70fa4cabe48e..ab5e1024fea9 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -50,6 +50,7 @@ #include <uapi/rdma/ib_user_sa.h> #include <rdma/ib_marshall.h> #include <rdma/ib_addr.h> +#include <rdma/opa_addr.h> #include "sa.h" #include "core_priv.h" @@ -861,7 +862,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); if (!ret) ret = len; else @@ -1021,9 +1022,9 @@ static void ib_nl_request_timeout(struct work_struct *work) } int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; int timeout, delta, abs_delta; const struct nlattr *attr; unsigned long flags; @@ -1033,8 +1034,7 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, int ret; if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -1098,9 +1098,9 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) } int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; unsigned long flags; struct ib_sa_query *query; struct ib_mad_send_buf *send_buf; @@ -1109,8 +1109,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); @@ -1241,6 +1240,11 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->type = rdma_ah_find_type(device, port_num); rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE))) + rdma_ah_set_make_grd(ah_attr, true); + rdma_ah_set_sl(ah_attr, rec->sl); rdma_ah_set_path_bits(ah_attr, be32_to_cpu(sa_path_get_slid(rec)) & get_src_path_mask(device, port_num)); @@ -1420,7 +1424,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } @@ -2290,12 +2294,15 @@ static void update_sm_ah(struct work_struct *work) rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); rdma_ah_set_port_num(&ah_attr, port->port_num); if (port_attr.grh_required) { - rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); - - rdma_ah_set_subnet_prefix(&ah_attr, - cpu_to_be64(port_attr.subnet_prefix)); - rdma_ah_set_interface_id(&ah_attr, - cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA) { + rdma_ah_set_make_grd(&ah_attr, true); + } else { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + rdma_ah_set_subnet_prefix(&ah_attr, + cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + } } new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr); @@ -2410,8 +2417,7 @@ static void ib_sa_add_one(struct ib_device *device) */ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); - if (ib_register_event_handler(&sa_dev->event_handler)) - goto err; + ib_register_event_handler(&sa_dev->event_handler); for (i = 0; i <= e - s; ++i) { if (rdma_cap_ib_sa(device, i + 1)) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7ebe1ef23652..abc5ab581f82 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1210,8 +1210,8 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, { struct ib_device *dev = container_of(device, struct ib_device, dev); - ib_get_device_fw_str(dev, buf, PAGE_SIZE); - strlcat(buf, "\n", PAGE_SIZE); + ib_get_device_fw_str(dev, buf); + strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 112099c86a19..f2a7f62c2834 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -618,7 +618,7 @@ static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, if (result) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 276f0ef835bd..eb85b546e223 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -248,14 +248,15 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, dst->qp_num = src->qp_num; } -static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst, +static void ucma_copy_ud_event(struct ib_device *device, + struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } @@ -335,7 +336,8 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, uevent->resp.event = event->event; uevent->resp.status = event->status; if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); + ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, + &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); @@ -1157,7 +1159,7 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, if (ret) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 36a6f5c8914c..c1696e6084b2 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -229,7 +229,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 64d494a64daf..37c8903e7fd0 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,6 +100,7 @@ struct ib_uverbs_device { struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; struct list_head uverbs_events_file_list; + struct uverbs_root_spec *specs_root; }; struct ib_uverbs_event_queue { @@ -218,6 +219,8 @@ int uverbs_dealloc_mw(struct ib_mw *mw); void ib_uverbs_detach_umcast(struct ib_qp *qp, struct ib_uqp_object *uobj); +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); + struct ib_uverbs_flow_spec { union { union { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 739bd69ef1d4..e0cb99860934 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -91,9 +91,10 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, goto err; } - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); if (ret) @@ -275,8 +276,14 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.lid = attr.lid; - resp.sm_lid = attr.sm_lid; + + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); + } else { + resp.lid = ib_lid_cpu16(attr.lid); + resp.sm_lid = ib_lid_cpu16(attr.sm_lid); + } resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; @@ -313,9 +320,10 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); uobj = uobj_alloc(uobj_get_type(pd), file->ucontext); if (IS_ERR(uobj)) @@ -482,9 +490,10 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); mutex_lock(&file->device->xrcd_tree_mutex); @@ -646,9 +655,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; @@ -740,7 +750,8 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof(cmd), (unsigned long) cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) return -EINVAL; @@ -1080,7 +1091,8 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, INIT_UDATA(&uhw, buf + sizeof(cmd), (unsigned long)cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); memset(&cmd_ex, 0, sizeof(cmd_ex)); cmd_ex.user_handle = cmd.user_handle; @@ -1161,9 +1173,10 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); if (!cq) @@ -1185,7 +1198,8 @@ out: return ret ? ret : in_len; } -static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) +static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, + struct ib_wc *wc) { struct ib_uverbs_wc tmp; @@ -1199,7 +1213,10 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; - tmp.slid = wc->slid; + if (rdma_cap_opa_ah(ib_dev, wc->port_num)) + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + else + tmp.slid = ib_lid_cpu16(wc->slid); tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; @@ -1243,7 +1260,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (!ret) break; - ret = copy_wc_to_user(data_ptr, &wc); + ret = copy_wc_to_user(ib_dev, data_ptr, &wc); if (ret) goto out_put; @@ -1383,8 +1400,9 @@ static int create_qp(struct ib_uverbs_file *file, attr.rwq_ind_tbl = ind_tbl; } - if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + - sizeof(cmd->reserved1)) && cmd->reserved1) { + if (cmd_sz > sizeof(*cmd) && + !ib_is_udata_cleared(ucore, sizeof(*cmd), + cmd_sz - sizeof(*cmd))) { ret = -EOPNOTSUPP; goto err_put; } @@ -1420,7 +1438,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->is_srq) { srq = uobj_get_obj_read(srq, cmd->srq_handle, file->ucontext); - if (!srq || srq->srq_type != IB_SRQT_BASIC) { + if (!srq || srq->srq_type == IB_SRQT_XRC) { ret = -EINVAL; goto err_put; } @@ -1482,11 +1500,21 @@ static int create_qp(struct ib_uverbs_file *file, IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV | IB_QP_CREATE_SCATTER_FCS | - IB_QP_CREATE_CVLAN_STRIPPING)) { + IB_QP_CREATE_CVLAN_STRIPPING | + IB_QP_CREATE_SOURCE_QPN)) { ret = -EINVAL; goto err_put; } + if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + goto err_put; + } + + attr.source_qpn = cmd->source_qpn; + } + buf = (void *)cmd + sizeof(*cmd); if (cmd_sz > sizeof(*cmd)) if (!(buf[0] == 0 && !memcmp(buf, buf + 1, @@ -1722,9 +1750,10 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); obj = (struct ib_uqp_object *)uobj_alloc(uobj_get_type(qp), file->ucontext); @@ -1791,6 +1820,28 @@ err_put: return ret; } +static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr, + struct rdma_ah_attr *rdma_attr) +{ + const struct ib_global_route *grh; + + uverb_attr->dlid = rdma_ah_get_dlid(rdma_attr); + uverb_attr->sl = rdma_ah_get_sl(rdma_attr); + uverb_attr->src_path_bits = rdma_ah_get_path_bits(rdma_attr); + uverb_attr->static_rate = rdma_ah_get_static_rate(rdma_attr); + uverb_attr->is_global = !!(rdma_ah_get_ah_flags(rdma_attr) & + IB_AH_GRH); + if (uverb_attr->is_global) { + grh = rdma_ah_read_grh(rdma_attr); + memcpy(uverb_attr->dgid, grh->dgid.raw, 16); + uverb_attr->flow_label = grh->flow_label; + uverb_attr->sgid_index = grh->sgid_index; + uverb_attr->hop_limit = grh->hop_limit; + uverb_attr->traffic_class = grh->traffic_class; + } + uverb_attr->port_num = rdma_ah_get_port_num(rdma_attr); +} + ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, @@ -1801,7 +1852,6 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, struct ib_qp *qp; struct ib_qp_attr *attr; struct ib_qp_init_attr *init_attr; - const struct ib_global_route *grh; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -1851,39 +1901,8 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, resp.alt_port_num = attr->alt_port_num; resp.alt_timeout = attr->alt_timeout; - resp.dest.dlid = rdma_ah_get_dlid(&attr->ah_attr); - resp.dest.sl = rdma_ah_get_sl(&attr->ah_attr); - resp.dest.src_path_bits = rdma_ah_get_path_bits(&attr->ah_attr); - resp.dest.static_rate = rdma_ah_get_static_rate(&attr->ah_attr); - resp.dest.is_global = !!(rdma_ah_get_ah_flags(&attr->ah_attr) & - IB_AH_GRH); - if (resp.dest.is_global) { - grh = rdma_ah_read_grh(&attr->ah_attr); - memcpy(resp.dest.dgid, grh->dgid.raw, 16); - resp.dest.flow_label = grh->flow_label; - resp.dest.sgid_index = grh->sgid_index; - resp.dest.hop_limit = grh->hop_limit; - resp.dest.traffic_class = grh->traffic_class; - } - resp.dest.port_num = rdma_ah_get_port_num(&attr->ah_attr); - - resp.alt_dest.dlid = rdma_ah_get_dlid(&attr->alt_ah_attr); - resp.alt_dest.sl = rdma_ah_get_sl(&attr->alt_ah_attr); - resp.alt_dest.src_path_bits = rdma_ah_get_path_bits(&attr->alt_ah_attr); - resp.alt_dest.static_rate - = rdma_ah_get_static_rate(&attr->alt_ah_attr); - resp.alt_dest.is_global - = !!(rdma_ah_get_ah_flags(&attr->alt_ah_attr) & - IB_AH_GRH); - if (resp.alt_dest.is_global) { - grh = rdma_ah_read_grh(&attr->alt_ah_attr); - memcpy(resp.alt_dest.dgid, grh->dgid.raw, 16); - resp.alt_dest.flow_label = grh->flow_label; - resp.alt_dest.sgid_index = grh->sgid_index; - resp.alt_dest.hop_limit = grh->hop_limit; - resp.alt_dest.traffic_class = grh->traffic_class; - } - resp.alt_dest.port_num = rdma_ah_get_port_num(&attr->alt_ah_attr); + copy_ah_attr_to_uverbs(&resp.dest, &attr->ah_attr); + copy_ah_attr_to_uverbs(&resp.alt_dest, &attr->alt_ah_attr); resp.max_send_wr = init_attr->cap.max_send_wr; resp.max_recv_wr = init_attr->cap.max_recv_wr; @@ -1917,6 +1936,29 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask) } } +static void copy_ah_attr_from_uverbs(struct ib_device *dev, + struct rdma_ah_attr *rdma_attr, + struct ib_uverbs_qp_dest *uverb_attr) +{ + rdma_attr->type = rdma_ah_find_type(dev, uverb_attr->port_num); + if (uverb_attr->is_global) { + rdma_ah_set_grh(rdma_attr, NULL, + uverb_attr->flow_label, + uverb_attr->sgid_index, + uverb_attr->hop_limit, + uverb_attr->traffic_class); + rdma_ah_set_dgid_raw(rdma_attr, uverb_attr->dgid); + } else { + rdma_ah_set_ah_flags(rdma_attr, 0); + } + rdma_ah_set_dlid(rdma_attr, uverb_attr->dlid); + rdma_ah_set_sl(rdma_attr, uverb_attr->sl); + rdma_ah_set_path_bits(rdma_attr, uverb_attr->src_path_bits); + rdma_ah_set_static_rate(rdma_attr, uverb_attr->static_rate); + rdma_ah_set_port_num(rdma_attr, uverb_attr->port_num); + rdma_ah_set_make_grd(rdma_attr, false); +} + static int modify_qp(struct ib_uverbs_file *file, struct ib_uverbs_ex_modify_qp *cmd, struct ib_udata *udata) { @@ -1964,48 +2006,12 @@ static int modify_qp(struct ib_uverbs_file *file, attr->rate_limit = cmd->rate_limit; if (cmd->base.attr_mask & IB_QP_AV) - attr->ah_attr.type = rdma_ah_find_type(qp->device, - cmd->base.dest.port_num); - if (cmd->base.dest.is_global) { - rdma_ah_set_grh(&attr->ah_attr, NULL, - cmd->base.dest.flow_label, - cmd->base.dest.sgid_index, - cmd->base.dest.hop_limit, - cmd->base.dest.traffic_class); - rdma_ah_set_dgid_raw(&attr->ah_attr, cmd->base.dest.dgid); - } else { - rdma_ah_set_ah_flags(&attr->ah_attr, 0); - } - rdma_ah_set_dlid(&attr->ah_attr, cmd->base.dest.dlid); - rdma_ah_set_sl(&attr->ah_attr, cmd->base.dest.sl); - rdma_ah_set_path_bits(&attr->ah_attr, cmd->base.dest.src_path_bits); - rdma_ah_set_static_rate(&attr->ah_attr, cmd->base.dest.static_rate); - rdma_ah_set_port_num(&attr->ah_attr, - cmd->base.dest.port_num); + copy_ah_attr_from_uverbs(qp->device, &attr->ah_attr, + &cmd->base.dest); if (cmd->base.attr_mask & IB_QP_ALT_PATH) - attr->alt_ah_attr.type = - rdma_ah_find_type(qp->device, cmd->base.dest.port_num); - if (cmd->base.alt_dest.is_global) { - rdma_ah_set_grh(&attr->alt_ah_attr, NULL, - cmd->base.alt_dest.flow_label, - cmd->base.alt_dest.sgid_index, - cmd->base.alt_dest.hop_limit, - cmd->base.alt_dest.traffic_class); - rdma_ah_set_dgid_raw(&attr->alt_ah_attr, - cmd->base.alt_dest.dgid); - } else { - rdma_ah_set_ah_flags(&attr->alt_ah_attr, 0); - } - - rdma_ah_set_dlid(&attr->alt_ah_attr, cmd->base.alt_dest.dlid); - rdma_ah_set_sl(&attr->alt_ah_attr, cmd->base.alt_dest.sl); - rdma_ah_set_path_bits(&attr->alt_ah_attr, - cmd->base.alt_dest.src_path_bits); - rdma_ah_set_static_rate(&attr->alt_ah_attr, - cmd->base.alt_dest.static_rate); - rdma_ah_set_port_num(&attr->alt_ah_attr, - cmd->base.alt_dest.port_num); + copy_ah_attr_from_uverbs(qp->device, &attr->alt_ah_attr, + &cmd->base.alt_dest); ret = ib_modify_qp_with_udata(qp, attr, modify_qp_mask(qp->qp_type, @@ -2037,7 +2043,8 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, return -EOPNOTSUPP; INIT_UDATA(&udata, buf + sizeof(cmd.base), NULL, - in_len - sizeof(cmd.base), out_len); + in_len - sizeof(cmd.base) - sizeof(struct ib_uverbs_cmd_hdr), + out_len); ret = modify_qp(file, &cmd, &udata); if (ret) @@ -2543,7 +2550,8 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof(cmd), (unsigned long)cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); uobj = uobj_alloc(uobj_get_type(ah), file->ucontext); if (IS_ERR(uobj)) @@ -2556,6 +2564,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, } attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num); + rdma_ah_set_make_grd(&attr, false); rdma_ah_set_dlid(&attr, cmd.attr.dlid); rdma_ah_set_sl(&attr, cmd.attr.sl); rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits); @@ -3472,6 +3481,9 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, if (IS_ERR(obj)) return PTR_ERR(obj); + if (cmd->srq_type == IB_SRQT_TM) + attr.ext.tag_matching.max_num_tags = cmd->max_num_tags; + if (cmd->srq_type == IB_SRQT_XRC) { xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd->xrcd_handle, file->ucontext); @@ -3488,10 +3500,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); + } - attr.ext.xrc.cq = uobj_get_obj_read(cq, cmd->cq_handle, - file->ucontext); - if (!attr.ext.xrc.cq) { + if (ib_srq_has_cq(cmd->srq_type)) { + attr.ext.cq = uobj_get_obj_read(cq, cmd->cq_handle, + file->ucontext); + if (!attr.ext.cq) { ret = -EINVAL; goto err_put_xrcd; } @@ -3526,10 +3540,13 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; + if (ib_srq_has_cq(cmd->srq_type)) { + srq->ext.cq = attr.ext.cq; + atomic_inc(&attr.ext.cq->usecnt); + } + if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.cq = attr.ext.xrc.cq; srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.cq->usecnt); atomic_inc(&attr.ext.xrc.xrcd->usecnt); } @@ -3552,10 +3569,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, goto err_copy; } - if (cmd->srq_type == IB_SRQT_XRC) { + if (cmd->srq_type == IB_SRQT_XRC) uobj_put_read(xrcd_uobj); - uobj_put_obj_read(attr.ext.xrc.cq); - } + + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); + uobj_put_obj_read(pd); uobj_alloc_commit(&obj->uevent.uobject); @@ -3568,8 +3587,8 @@ err_put: uobj_put_obj_read(pd); err_put_cq: - if (cmd->srq_type == IB_SRQT_XRC) - uobj_put_obj_read(attr.ext.xrc.cq); + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { @@ -3599,6 +3618,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; + memset(&xcmd, 0, sizeof(xcmd)); xcmd.response = cmd.response; xcmd.user_handle = cmd.user_handle; xcmd.srq_type = IB_SRQT_BASIC; @@ -3607,10 +3627,10 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, xcmd.max_sge = cmd.max_sge; xcmd.srq_limit = cmd.srq_limit; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), - out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); if (ret) @@ -3634,10 +3654,10 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), - out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); if (ret) @@ -3848,6 +3868,16 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.xrq_caps)) + goto end; + + resp.xrq_caps.max_rndv_hdr_size = attr.xrq_caps.max_rndv_hdr_size; + resp.xrq_caps.max_num_tags = attr.xrq_caps.max_num_tags; + resp.xrq_caps.max_ops = attr.xrq_caps.max_ops; + resp.xrq_caps.max_sge = attr.xrq_caps.max_sge; + resp.xrq_caps.flags = attr.xrq_caps.flags; + resp.response_length += sizeof(resp.xrq_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c new file mode 100644 index 000000000000..5286ad57d903 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/rdma_user_ioctl.h> +#include <rdma/uverbs_ioctl.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_process_attr(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattr, + u16 attr_id, + const struct uverbs_attr_spec_hash *attr_spec_bucket, + struct uverbs_attr_bundle_hash *attr_bundle_h, + struct ib_uverbs_attr __user *uattr_ptr) +{ + const struct uverbs_attr_spec *spec; + struct uverbs_attr *e; + const struct uverbs_object_spec *object; + struct uverbs_obj_attr *o_attr; + struct uverbs_attr *elements = attr_bundle_h->attrs; + + if (uattr->reserved) + return -EINVAL; + + if (attr_id >= attr_spec_bucket->num_attrs) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) + return -EINVAL; + else + return 0; + } + + spec = &attr_spec_bucket->attrs[attr_id]; + e = &elements[attr_id]; + e->uattr = uattr_ptr; + + switch (spec->type) { + case UVERBS_ATTR_TYPE_PTR_IN: + case UVERBS_ATTR_TYPE_PTR_OUT: + if (uattr->len < spec->len || + (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ) && + uattr->len > spec->len)) + return -EINVAL; + + e->ptr_attr.data = uattr->data; + e->ptr_attr.len = uattr->len; + e->ptr_attr.flags = uattr->flags; + break; + + case UVERBS_ATTR_TYPE_IDR: + if (uattr->data >> 32) + return -EINVAL; + /* fall through */ + case UVERBS_ATTR_TYPE_FD: + if (uattr->len != 0 || !ucontext || uattr->data > INT_MAX) + return -EINVAL; + + o_attr = &e->obj_attr; + object = uverbs_get_object(ibdev, spec->obj.obj_type); + if (!object) + return -EINVAL; + o_attr->type = object->type_attrs; + + o_attr->id = (int)uattr->data; + o_attr->uobject = uverbs_get_uobject_from_context( + o_attr->type, + ucontext, + spec->obj.access, + o_attr->id); + + if (IS_ERR(o_attr->uobject)) + return PTR_ERR(o_attr->uobject); + + if (spec->obj.access == UVERBS_ACCESS_NEW) { + u64 id = o_attr->uobject->id; + + /* Copy the allocated id to the user-space */ + if (put_user(id, &e->uattr->data)) { + uverbs_finalize_object(o_attr->uobject, + UVERBS_ACCESS_NEW, + false); + return -EFAULT; + } + } + + break; + default: + return -EOPNOTSUPP; + } + + set_bit(attr_id, attr_bundle_h->valid_bitmap); + return 0; +} + +static int uverbs_uattrs_process(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + const struct uverbs_method_spec *method, + struct uverbs_attr_bundle *attr_bundle, + struct ib_uverbs_attr __user *uattr_ptr) +{ + size_t i; + int ret = 0; + int num_given_buckets = 0; + + for (i = 0; i < num_uattrs; i++) { + const struct ib_uverbs_attr *uattr = &uattrs[i]; + u16 attr_id = uattr->attr_id; + struct uverbs_attr_spec_hash *attr_spec_bucket; + + ret = uverbs_ns_idx(&attr_id, method->num_buckets); + if (ret < 0) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + continue; + } + + /* + * ret is the found ns, so increase num_given_buckets if + * necessary. + */ + if (ret >= num_given_buckets) + num_given_buckets = ret + 1; + + attr_spec_bucket = method->attr_buckets[ret]; + ret = uverbs_process_attr(ibdev, ucontext, uattr, attr_id, + attr_spec_bucket, &attr_bundle->hash[ret], + uattr_ptr++); + if (ret) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + } + + return num_given_buckets; +} + +static int uverbs_validate_kernel_mandatory(const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + unsigned int i; + + for (i = 0; i < attr_bundle->num_buckets; i++) { + struct uverbs_attr_spec_hash *attr_spec_bucket = + method_spec->attr_buckets[i]; + + if (!bitmap_subset(attr_spec_bucket->mandatory_attrs_bitmask, + attr_bundle->hash[i].valid_bitmap, + attr_spec_bucket->num_attrs)) + return -EINVAL; + } + + return 0; +} + +static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + struct ib_device *ibdev, + struct ib_uverbs_file *ufile, + const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + int ret; + int finalize_ret; + int num_given_buckets; + + num_given_buckets = uverbs_uattrs_process(ibdev, ufile->ucontext, uattrs, + num_uattrs, method_spec, + attr_bundle, uattr_ptr); + if (num_given_buckets <= 0) + return -EINVAL; + + attr_bundle->num_buckets = num_given_buckets; + ret = uverbs_validate_kernel_mandatory(method_spec, attr_bundle); + if (ret) + goto cleanup; + + ret = method_spec->handler(ibdev, ufile, attr_bundle); +cleanup: + finalize_ret = uverbs_finalize_objects(attr_bundle, + method_spec->attr_buckets, + attr_bundle->num_buckets, + !ret); + + return ret ? ret : finalize_ret; +} + +#define UVERBS_OPTIMIZE_USING_STACK_SZ 256 +static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct ib_uverbs_ioctl_hdr *hdr, + void __user *buf) +{ + const struct uverbs_object_spec *object_spec; + const struct uverbs_method_spec *method_spec; + long err = 0; + unsigned int i; + struct { + struct ib_uverbs_attr *uattrs; + struct uverbs_attr_bundle *uverbs_attr_bundle; + } *ctx = NULL; + struct uverbs_attr *curr_attr; + unsigned long *curr_bitmap; + size_t ctx_size; +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; +#endif + + if (hdr->reserved) + return -EINVAL; + + object_spec = uverbs_get_object(ib_dev, hdr->object_id); + if (!object_spec) + return -EOPNOTSUPP; + + method_spec = uverbs_get_method(object_spec, hdr->method_id); + if (!method_spec) + return -EOPNOTSUPP; + + if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) + return -EINVAL; + + ctx_size = sizeof(*ctx) + + sizeof(struct uverbs_attr_bundle) + + sizeof(struct uverbs_attr_bundle_hash) * method_spec->num_buckets + + sizeof(*ctx->uattrs) * hdr->num_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].attrs) * + method_spec->num_child_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].valid_bitmap) * + (method_spec->num_child_attrs / BITS_PER_LONG + + method_spec->num_buckets); + +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size <= UVERBS_OPTIMIZE_USING_STACK_SZ) + ctx = (void *)data; + + if (!ctx) +#endif + ctx = kmalloc(ctx_size, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->uverbs_attr_bundle = (void *)ctx + sizeof(*ctx); + ctx->uattrs = (void *)(ctx->uverbs_attr_bundle + 1) + + (sizeof(ctx->uverbs_attr_bundle->hash[0]) * + method_spec->num_buckets); + curr_attr = (void *)(ctx->uattrs + hdr->num_attrs); + curr_bitmap = (void *)(curr_attr + method_spec->num_child_attrs); + + /* + * We just fill the pointers and num_attrs here. The data itself will be + * filled at a later stage (uverbs_process_attr) + */ + for (i = 0; i < method_spec->num_buckets; i++) { + unsigned int curr_num_attrs = method_spec->attr_buckets[i]->num_attrs; + + ctx->uverbs_attr_bundle->hash[i].attrs = curr_attr; + curr_attr += curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].num_attrs = curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].valid_bitmap = curr_bitmap; + bitmap_zero(curr_bitmap, curr_num_attrs); + curr_bitmap += BITS_TO_LONGS(curr_num_attrs); + } + + err = copy_from_user(ctx->uattrs, buf, + sizeof(*ctx->uattrs) * hdr->num_attrs); + if (err) { + err = -EFAULT; + goto out; + } + + err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, + file, method_spec, ctx->uverbs_attr_bundle); +out: +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size > UVERBS_OPTIMIZE_USING_STACK_SZ) +#endif + kfree(ctx); + return err; +} + +#define IB_UVERBS_MAX_CMD_SZ 4096 + +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ioctl_hdr __user *user_hdr = + (struct ib_uverbs_ioctl_hdr __user *)arg; + struct ib_uverbs_ioctl_hdr hdr; + struct ib_device *ib_dev; + int srcu_key; + long err; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + err = -EIO; + goto out; + } + + if (cmd == RDMA_VERBS_IOCTL) { + err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + + if (err || hdr.length > IB_UVERBS_MAX_CMD_SZ || + hdr.length != sizeof(hdr) + hdr.num_attrs * sizeof(struct ib_uverbs_attr)) { + err = -EINVAL; + goto out; + } + + if (hdr.reserved) { + err = -EOPNOTSUPP; + goto out; + } + + err = ib_uverbs_cmd_verbs(ib_dev, file, &hdr, + (__user void *)arg + sizeof(hdr)); + } else { + err = -ENOIOCTLCMD; + } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + return err; +} diff --git a/drivers/infiniband/core/uverbs_ioctl_merge.c b/drivers/infiniband/core/uverbs_ioctl_merge.c new file mode 100644 index 000000000000..76ddb6564578 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl_merge.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/uverbs_ioctl.h> +#include <rdma/rdma_user_ioctl.h> +#include <linux/bitops.h> +#include "uverbs.h" + +#define UVERBS_NUM_NS (UVERBS_ID_NS_MASK >> UVERBS_ID_NS_SHIFT) +#define GET_NS_ID(idx) (((idx) & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT) +#define GET_ID(idx) ((idx) & ~UVERBS_ID_NS_MASK) + +#define _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset, \ + buckets_offset) \ + for (tmpj = 0, \ + elem = (*(const void ***)((hashes)[tmpi] + \ + (buckets_offset)))[0]; \ + tmpj < *(size_t *)((hashes)[tmpi] + (num_buckets_offset)); \ + tmpj++) \ + if ((elem = ((*(const void ***)(hashes[tmpi] + \ + (buckets_offset)))[tmpj]))) + +/* + * Iterate all elements of a few @hashes. The number of given hashes is + * indicated by @num_hashes. The offset of the number of buckets in the hash is + * represented by @num_buckets_offset, while the offset of the buckets array in + * the hash structure is represented by @buckets_offset. tmpi and tmpj are two + * short (or int) based indices that are given by the user. tmpi iterates over + * the different hashes. @elem points the current element in the hashes[tmpi] + * bucket we are looping on. To be honest, @hashes representation isn't exactly + * a hash, but more a collection of elements. These elements' ids are treated + * in a hash like manner, where the first upper bits are the bucket number. + * These elements are later mapped into a perfect-hash. + */ +#define for_each_element(elem, tmpi, tmpj, hashes, num_hashes, \ + num_buckets_offset, buckets_offset) \ + for (tmpi = 0; tmpi < (num_hashes); tmpi++) \ + _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset,\ + buckets_offset) + +#define get_elements_iterators_entry_above(iters, num_elements, elements, \ + num_objects_fld, objects_fld, bucket,\ + min_id) \ + get_elements_above_id((const void **)iters, num_elements, \ + (const void **)(elements), \ + offsetof(typeof(**elements), \ + num_objects_fld), \ + offsetof(typeof(**elements), objects_fld),\ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket, min_id) + +#define get_objects_above_id(iters, num_trees, trees, bucket, min_id) \ + get_elements_iterators_entry_above(iters, num_trees, trees, \ + num_objects, objects, bucket, min_id) + +#define get_methods_above_id(method_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(method_iters, num_iters, iters, \ + num_methods, methods, bucket, min_id) + +#define get_attrs_above_id(attrs_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(attrs_iters, num_iters, iters, \ + num_attrs, attrs, bucket, min_id) + +/* + * get_elements_above_id get a few hashes represented by @elements and + * @num_elements. The hashes fields are described by @num_offset, @data_offset + * and @id_offset in the same way as required by for_each_element. The function + * returns an array of @iters, represents an array of elements in the hashes + * buckets, which their ids are the smallest ids in all hashes but are all + * larger than the id given by min_id. Elements are only added to the iters + * array if their id belongs to the bucket @bucket. The number of elements in + * the returned array is returned by the function. @min_id is also updated to + * reflect the new min_id of all elements in iters. + */ +static size_t get_elements_above_id(const void **iters, + unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket, + short *min_id) +{ + size_t num_iters = 0; + short min = SHRT_MAX; + const void *elem; + int i, j, last_stored = -1; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) != bucket) + continue; + + if (GET_ID(id) < *min_id || + (min != SHRT_MAX && GET_ID(id) > min)) + continue; + + /* + * We first iterate all hashes represented by @elements. When + * we do, we try to find an element @elem in the bucket @bucket + * which its id is min. Since we can't ensure the user sorted + * the elements in increasing order, we override this hash's + * minimal id element we found, if a new element with a smaller + * id was just found. + */ + iters[last_stored == i ? num_iters - 1 : num_iters++] = elem; + last_stored = i; + min = GET_ID(id); + } + + /* + * We only insert to our iters array an element, if its id is smaller + * than all previous ids. Therefore, the final iters array is sorted so + * that smaller ids are in the end of the array. + * Therefore, we need to clean the beginning of the array to make sure + * all ids of final elements are equal to min. + */ + for (i = num_iters - 1; i >= 0 && + GET_ID(*(u16 *)(iters[i] + id_offset)) == min; i--) + ; + + num_iters -= i + 1; + memmove(iters, iters + i + 1, sizeof(*iters) * num_iters); + + *min_id = min; + return num_iters; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +static short find_max_element_ns_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset) +{ + short max_ns = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) > max_ns) + max_ns = GET_NS_ID(id); + } + + return max_ns; +} + +static short find_max_element_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket) +{ + short max_id = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) == bucket && + GET_ID(id) > max_id) + max_id = GET_ID(id); + } + return max_id; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +#define find_max_element_ns_entry_id(num_elements, elements, \ + num_objects_fld, objects_fld) \ + find_max_element_ns_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld),\ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id)) + +/* + * find_max_xxxx_ns_id gets a few elements. Each element is described by an id + * which its upper bits represents a namespace. It finds the max namespace. This + * could be used in order to know how many buckets do we need to allocate. If no + * elements exist, SHRT_MIN is returned. Namespace represents here different + * buckets. The common example is "common bucket" and "driver bucket". + * + * find_max_xxxx_id gets a few elements and a bucket. Each element is described + * by an id which its upper bits represent a namespace. It returns the max id + * which is contained in the same namespace defined in @bucket. This could be + * used in order to know how many elements do we need to allocate in the bucket. + * If no elements exist, SHRT_MIN is returned. + */ + +#define find_max_object_id(num_trees, trees, bucket) \ + find_max_element_entry_id(num_trees, trees, num_objects,\ + objects, bucket) +#define find_max_object_ns_id(num_trees, trees) \ + find_max_element_ns_entry_id(num_trees, trees, \ + num_objects, objects) + +#define find_max_method_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_methods,\ + methods, bucket) +#define find_max_method_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_methods, methods) + +#define find_max_attr_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_attrs, \ + attrs, bucket) +#define find_max_attr_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_attrs, attrs) + +static void free_method(struct uverbs_method_spec *method) +{ + unsigned int i; + + if (!method) + return; + + for (i = 0; i < method->num_buckets; i++) + kfree(method->attr_buckets[i]); + + kfree(method); +} + +#define IS_ATTR_OBJECT(attr) ((attr)->type == UVERBS_ATTR_TYPE_IDR || \ + (attr)->type == UVERBS_ATTR_TYPE_FD) + +/* + * This function gets array of size @num_method_defs which contains pointers to + * method definitions @method_defs. The function allocates an + * uverbs_method_spec structure and initializes its number of buckets and the + * elements in buckets to the correct attributes. While doing that, it + * validates that there aren't conflicts between attributes of different + * method_defs. + */ +static struct uverbs_method_spec *build_method_with_attrs(const struct uverbs_method_def **method_defs, + size_t num_method_defs) +{ + int bucket_idx; + int max_attr_buckets = 0; + size_t num_attr_buckets = 0; + int res = 0; + struct uverbs_method_spec *method = NULL; + const struct uverbs_attr_def **attr_defs; + unsigned int num_of_singularities = 0; + + max_attr_buckets = find_max_attr_ns_id(num_method_defs, method_defs); + if (max_attr_buckets >= 0) + num_attr_buckets = max_attr_buckets + 1; + + method = kzalloc(sizeof(*method) + + num_attr_buckets * sizeof(*method->attr_buckets), + GFP_KERNEL); + if (!method) + return ERR_PTR(-ENOMEM); + + method->num_buckets = num_attr_buckets; + attr_defs = kcalloc(num_method_defs, sizeof(*attr_defs), GFP_KERNEL); + if (!attr_defs) { + res = -ENOMEM; + goto free_method; + } + for (bucket_idx = 0; bucket_idx < method->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int attr_max_bucket = 0; + struct uverbs_attr_spec_hash *hash = NULL; + + attr_max_bucket = find_max_attr_id(num_method_defs, method_defs, + bucket_idx); + if (attr_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + ALIGN(sizeof(*hash->attrs) * (attr_max_bucket + 1), + sizeof(long)) + + BITS_TO_LONGS(attr_max_bucket) * sizeof(long), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_attrs = attr_max_bucket + 1; + method->num_child_attrs += hash->num_attrs; + hash->mandatory_attrs_bitmask = (void *)(hash + 1) + + ALIGN(sizeof(*hash->attrs) * + (attr_max_bucket + 1), + sizeof(long)); + + method->attr_buckets[bucket_idx] = hash; + + do { + size_t num_attr_defs; + struct uverbs_attr_spec *attr; + bool attr_obj_with_special_access; + + num_attr_defs = + get_attrs_above_id(attr_defs, + num_method_defs, + method_defs, + bucket_idx, + &min_id); + /* Last attr in bucket */ + if (!num_attr_defs) + break; + + if (num_attr_defs > 1) { + /* + * We don't allow two attribute definitions for + * the same attribute. This is usually a + * programmer error. If required, it's better to + * just add a new attribute to capture the new + * semantics. + */ + res = -EEXIST; + goto free; + } + + attr = &hash->attrs[min_id]; + memcpy(attr, &attr_defs[0]->attr, sizeof(*attr)); + + attr_obj_with_special_access = IS_ATTR_OBJECT(attr) && + (attr->obj.access == UVERBS_ACCESS_NEW || + attr->obj.access == UVERBS_ACCESS_DESTROY); + num_of_singularities += !!attr_obj_with_special_access; + if (WARN(num_of_singularities > 1, + "ib_uverbs: Method contains more than one object attr (%d) with new/destroy access\n", + min_id) || + WARN(attr_obj_with_special_access && + !(attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY), + "ib_uverbs: Tried to merge attr (%d) but it's an object with new/destroy aceess but isn't mandatory\n", + min_id) || + WARN(IS_ATTR_OBJECT(attr) && + attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ, + "ib_uverbs: Tried to merge attr (%d) but it's an object with min_sz flag\n", + min_id)) { + res = -EINVAL; + goto free; + } + + if (attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY) + set_bit(min_id, hash->mandatory_attrs_bitmask); + min_id++; + + } while (1); + } + kfree(attr_defs); + return method; + +free: + kfree(attr_defs); +free_method: + free_method(method); + return ERR_PTR(res); +} + +static void free_object(struct uverbs_object_spec *object) +{ + unsigned int i, j; + + if (!object) + return; + + for (i = 0; i < object->num_buckets; i++) { + struct uverbs_method_spec_hash *method_buckets = + object->method_buckets[i]; + + if (!method_buckets) + continue; + + for (j = 0; j < method_buckets->num_methods; j++) + free_method(method_buckets->methods[j]); + + kfree(method_buckets); + } + + kfree(object); +} + +/* + * This function gets array of size @num_object_defs which contains pointers to + * object definitions @object_defs. The function allocated an + * uverbs_object_spec structure and initialize its number of buckets and the + * elements in buckets to the correct methods. While doing that, it + * sorts out the correct relationship between conflicts in the same method. + */ +static struct uverbs_object_spec *build_object_with_methods(const struct uverbs_object_def **object_defs, + size_t num_object_defs) +{ + u16 bucket_idx; + int max_method_buckets = 0; + u16 num_method_buckets = 0; + int res = 0; + struct uverbs_object_spec *object = NULL; + const struct uverbs_method_def **method_defs; + + max_method_buckets = find_max_method_ns_id(num_object_defs, object_defs); + if (max_method_buckets >= 0) + num_method_buckets = max_method_buckets + 1; + + object = kzalloc(sizeof(*object) + + num_method_buckets * + sizeof(*object->method_buckets), GFP_KERNEL); + if (!object) + return ERR_PTR(-ENOMEM); + + object->num_buckets = num_method_buckets; + method_defs = kcalloc(num_object_defs, sizeof(*method_defs), GFP_KERNEL); + if (!method_defs) { + res = -ENOMEM; + goto free_object; + } + + for (bucket_idx = 0; bucket_idx < object->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int methods_max_bucket = 0; + struct uverbs_method_spec_hash *hash = NULL; + + methods_max_bucket = find_max_method_id(num_object_defs, object_defs, + bucket_idx); + if (methods_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->methods) * (methods_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + + hash->num_methods = methods_max_bucket + 1; + object->method_buckets[bucket_idx] = hash; + + do { + size_t num_method_defs; + struct uverbs_method_spec *method; + int i; + + num_method_defs = + get_methods_above_id(method_defs, + num_object_defs, + object_defs, + bucket_idx, + &min_id); + /* Last method in bucket */ + if (!num_method_defs) + break; + + method = build_method_with_attrs(method_defs, + num_method_defs); + if (IS_ERR(method)) { + res = PTR_ERR(method); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous method handler. + * Therefore, we iterate backwards and search for the + * first handler which != NULL. This also defines the + * set of flags used for this handler. + */ + for (i = num_object_defs - 1; + i >= 0 && !method_defs[i]->handler; i--) + ; + hash->methods[min_id++] = method; + /* NULL handler isn't allowed */ + if (WARN(i < 0, + "ib_uverbs: tried to merge function id %d, but all handlers are NULL\n", + min_id)) { + res = -EINVAL; + goto free; + } + method->handler = method_defs[i]->handler; + method->flags = method_defs[i]->flags; + + } while (1); + } + kfree(method_defs); + return object; + +free: + kfree(method_defs); +free_object: + free_object(object); + return ERR_PTR(res); +} + +void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ + unsigned int i, j; + + if (!root) + return; + + for (i = 0; i < root->num_buckets; i++) { + struct uverbs_object_spec_hash *object_hash = + root->object_buckets[i]; + + if (!object_hash) + continue; + + for (j = 0; j < object_hash->num_objects; j++) + free_object(object_hash->objects[j]); + + kfree(object_hash); + } + + kfree(root); +} +EXPORT_SYMBOL(uverbs_free_spec_tree); + +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + u16 bucket_idx; + short max_object_buckets = 0; + size_t num_objects_buckets = 0; + struct uverbs_root_spec *root_spec = NULL; + const struct uverbs_object_def **object_defs; + int i; + int res = 0; + + max_object_buckets = find_max_object_ns_id(num_trees, trees); + /* + * Devices which don't want to support ib_uverbs, should just allocate + * an empty parsing tree. Every user-space command won't hit any valid + * entry in the parsing tree and thus will fail. + */ + if (max_object_buckets >= 0) + num_objects_buckets = max_object_buckets + 1; + + root_spec = kzalloc(sizeof(*root_spec) + + num_objects_buckets * sizeof(*root_spec->object_buckets), + GFP_KERNEL); + if (!root_spec) + return ERR_PTR(-ENOMEM); + root_spec->num_buckets = num_objects_buckets; + + object_defs = kcalloc(num_trees, sizeof(*object_defs), + GFP_KERNEL); + if (!object_defs) { + res = -ENOMEM; + goto free_root; + } + + for (bucket_idx = 0; bucket_idx < root_spec->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + short objects_max_bucket; + struct uverbs_object_spec_hash *hash = NULL; + + objects_max_bucket = find_max_object_id(num_trees, trees, + bucket_idx); + if (objects_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->objects) * (objects_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_objects = objects_max_bucket + 1; + root_spec->object_buckets[bucket_idx] = hash; + + do { + size_t num_object_defs; + struct uverbs_object_spec *object; + + num_object_defs = get_objects_above_id(object_defs, + num_trees, + trees, + bucket_idx, + &min_id); + /* Last object in bucket */ + if (!num_object_defs) + break; + + object = build_object_with_methods(object_defs, + num_object_defs); + if (IS_ERR(object)) { + res = PTR_ERR(object); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous object's type_attrs. + * Therefore, we iterate backwards and search for the + * first type_attrs which != NULL. + */ + for (i = num_object_defs - 1; + i >= 0 && !object_defs[i]->type_attrs; i--) + ; + /* + * NULL is a valid type_attrs. It means an object we + * can't instantiate (like DEVICE). + */ + object->type_attrs = i < 0 ? NULL : + object_defs[i]->type_attrs; + + hash->objects[min_id++] = object; + } while (1); + } + + kfree(object_defs); + return root_spec; + +free: + kfree(object_defs); +free_root: + uverbs_free_spec_tree(root_spec); + return ERR_PTR(res); +} +EXPORT_SYMBOL(uverbs_alloc_spec_tree); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5e530d2bee44..dc2aed6fb21b 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -49,6 +49,7 @@ #include <linux/uaccess.h> #include <rdma/ib.h> +#include <rdma/uverbs_std_types.h> #include "uverbs.h" #include "core_priv.h" @@ -595,7 +596,6 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file { struct ib_uverbs_async_event_file *ev_file; struct file *filp; - int ret; ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); if (!ev_file) @@ -621,21 +621,11 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, ib_dev, ib_uverbs_event_handler); - ret = ib_register_event_handler(&uverbs_file->event_handler); - if (ret) - goto err_put_file; - + ib_register_event_handler(&uverbs_file->event_handler); /* At that point async file stuff was fully set */ return filp; -err_put_file: - fput(filp); - kref_put(&uverbs_file->async_file->ref, - ib_uverbs_release_async_event_file); - uverbs_file->async_file = NULL; - return ERR_PTR(ret); - err_put_refs: kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); kref_put(&ev_file->ref, ib_uverbs_release_async_event_file); @@ -949,6 +939,9 @@ static const struct file_operations uverbs_fops = { .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, +#if IS_ENABLED(CONFIG_INFINIBAND_EXP_USER_ACCESS) + .unlocked_ioctl = ib_uverbs_ioctl, +#endif }; static const struct file_operations uverbs_mmap_fops = { @@ -958,6 +951,9 @@ static const struct file_operations uverbs_mmap_fops = { .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, +#if IS_ENABLED(CONFIG_INFINIBAND_EXP_USER_ACCESS) + .unlocked_ioctl = ib_uverbs_ioctl, +#endif }; static struct ib_client uverbs_client = { @@ -1108,6 +1104,18 @@ static void ib_uverbs_add_one(struct ib_device *device) if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; + if (!device->specs_root) { + const struct uverbs_object_tree_def *default_root[] = { + uverbs_default_get_objects()}; + + uverbs_dev->specs_root = uverbs_alloc_spec_tree(1, + default_root); + if (IS_ERR(uverbs_dev->specs_root)) + goto err_class; + + device->specs_root = uverbs_dev->specs_root; + } + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; @@ -1239,6 +1247,11 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); + if (uverbs_dev->specs_root) { + uverbs_free_spec_tree(uverbs_dev->specs_root); + device->specs_root = NULL; + } + kobject_put(&uverbs_dev->kobj); } diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 94fd989c9060..bd0acf376af0 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -33,10 +33,47 @@ #include <linux/export.h> #include <rdma/ib_marshall.h> -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, - struct rdma_ah_attr *src) +#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +static int rdma_ah_conv_opa_to_ib(struct ib_device *dev, + struct rdma_ah_attr *ib, + struct rdma_ah_attr *opa) { + struct ib_port_attr port_attr; + int ret = 0; + + /* Do structure copy and the over-write fields */ + *ib = *opa; + + ib->type = RDMA_AH_ATTR_TYPE_IB; + rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0); + + if (ib_query_port(dev, opa->port_num, &port_attr)) { + /* Set to default subnet to indicate error */ + rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX); + ret = -EINVAL; + } else { + rdma_ah_set_subnet_prefix(ib, + cpu_to_be64(port_attr.subnet_prefix)); + } + rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa))); + return ret; +} + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *ah_attr) +{ + struct rdma_ah_attr *src = ah_attr; + struct rdma_ah_attr conv_ah; + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr))) + src = &conv_ah; + dst->dlid = rdma_ah_get_dlid(src); dst->sl = rdma_ah_get_sl(src); dst->src_path_bits = rdma_ah_get_path_bits(src); @@ -57,7 +94,8 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { dst->qp_state = src->qp_state; @@ -76,8 +114,8 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, dst->max_recv_sge = src->cap.max_recv_sge; dst->max_inline_data = src->cap.max_inline_data; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); - ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr); dst->pkey_index = src->pkey_index; dst->alt_pkey_index = src->alt_pkey_index; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index ef293379f37a..0a98579700ec 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -209,67 +209,244 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; -const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel = { - .type = UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), 0), - .context_closed = uverbs_hot_unplug_completion_event_file, - .fops = &uverbs_event_fops, - .name = "[infinibandevent]", - .flags = O_RDONLY, -}; +/* + * This spec is used in order to pass information to the hardware driver in a + * legacy way. Every verb that could get driver specific data should get this + * spec. + */ +static const struct uverbs_attr_def uverbs_uhw_compat_in = + UVERBS_ATTR_PTR_IN_SZ(UVERBS_UHW_IN, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); +static const struct uverbs_attr_def uverbs_uhw_compat_out = + UVERBS_ATTR_PTR_OUT_SZ(UVERBS_UHW_OUT, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); -const struct uverbs_obj_idr_type uverbs_type_attrs_cq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0), - .destroy_object = uverbs_free_cq, -}; +static void create_udata(struct uverbs_attr_bundle *ctx, + struct ib_udata *udata) +{ + /* + * This is for ease of conversion. The purpose is to convert all drivers + * to use uverbs_attr_bundle instead of ib_udata. + * Assume attr == 0 is input and attr == 1 is output. + */ + void __user *inbuf; + size_t inbuf_len = 0; + void __user *outbuf; + size_t outbuf_len = 0; + const struct uverbs_attr *uhw_in = + uverbs_attr_get(ctx, UVERBS_UHW_IN); + const struct uverbs_attr *uhw_out = + uverbs_attr_get(ctx, UVERBS_UHW_OUT); + + if (!IS_ERR(uhw_in)) { + inbuf = uhw_in->ptr_attr.ptr; + inbuf_len = uhw_in->ptr_attr.len; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_qp = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0), - .destroy_object = uverbs_free_qp, -}; + if (!IS_ERR(uhw_out)) { + outbuf = uhw_out->ptr_attr.ptr; + outbuf_len = uhw_out->ptr_attr.len; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_mw = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_mw, -}; + INIT_UDATA_BUF_OR_NULL(udata, inbuf, outbuf, inbuf_len, outbuf_len); +} -const struct uverbs_obj_idr_type uverbs_type_attrs_mr = { - /* 1 is used in order to free the MR after all the MWs */ - .type = UVERBS_TYPE_ALLOC_IDR(1), - .destroy_object = uverbs_free_mr, -}; +static int uverbs_create_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_ucq_object *obj; + struct ib_udata uhw; + int ret; + u64 user_handle; + struct ib_cq_init_attr attr = {}; + struct ib_cq *cq; + struct ib_uverbs_completion_event_file *ev_file = NULL; + const struct uverbs_attr *ev_file_attr; + struct ib_uobject *ev_file_uobj; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.comp_vector, attrs, CREATE_CQ_COMP_VECTOR); + if (!ret) + ret = uverbs_copy_from(&attr.cqe, attrs, CREATE_CQ_CQE); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, CREATE_CQ_USER_HANDLE); + if (ret) + return ret; -const struct uverbs_obj_idr_type uverbs_type_attrs_srq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0), - .destroy_object = uverbs_free_srq, -}; + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + if (uverbs_copy_from(&attr.flags, attrs, CREATE_CQ_FLAGS) == -EFAULT) + return -EFAULT; -const struct uverbs_obj_idr_type uverbs_type_attrs_ah = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_ah, -}; + ev_file_attr = uverbs_attr_get(attrs, CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_attr)) { + ev_file_uobj = ev_file_attr->obj_attr.uobject; -const struct uverbs_obj_idr_type uverbs_type_attrs_flow = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_flow, -}; + ev_file = container_of(ev_file_uobj, + struct ib_uverbs_completion_event_file, + uobj_file.uobj); + uverbs_uobject_get(ev_file_uobj); + } -const struct uverbs_obj_idr_type uverbs_type_attrs_wq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0), - .destroy_object = uverbs_free_wq, -}; + if (attr.comp_vector >= ucontext->ufile->device->num_comp_vectors) { + ret = -EINVAL; + goto err_event_file; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_rwq_ind_tbl, -}; + obj = container_of(uverbs_attr_get(attrs, CREATE_CQ_HANDLE)->obj_attr.uobject, + typeof(*obj), uobject); + obj->uverbs_file = ucontext->ufile; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + /* Temporary, only until drivers get the new uverbs_attr_bundle */ + create_udata(attrs, &uhw); + + cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_event_file; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0), - .destroy_object = uverbs_free_xrcd, -}; + cq->device = ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = &ev_file->ev_queue; + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; + atomic_set(&cq->usecnt, 0); + + ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe); + if (ret) + goto err_cq; -const struct uverbs_obj_idr_type uverbs_type_attrs_pd = { - /* 2 is used in order to free the PD after MRs */ - .type = UVERBS_TYPE_ALLOC_IDR(2), - .destroy_object = uverbs_free_pd, + return 0; +err_cq: + ib_destroy_cq(cq); + +err_event_file: + if (ev_file) + uverbs_uobject_put(ev_file_uobj); + return ret; }; + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_create, UVERBS_CQ_CREATE, uverbs_create_cq_handler, + &UVERBS_ATTR_IDR(CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_USER_HANDLE, u64, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_FD(CREATE_CQ_COMP_CHANNEL, UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_COMP_VECTOR, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_FLAGS, u32), + &UVERBS_ATTR_PTR_OUT(CREATE_CQ_RESP_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); + +static int uverbs_destroy_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, DESTROY_CQ_HANDLE)->obj_attr.uobject; + struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, + uobject); + int ret; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) + return -EOPNOTSUPP; + + ret = rdma_explicit_destroy(uobj); + if (ret) + return ret; + + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + return uverbs_copy_to(attrs, DESTROY_CQ_RESP, &resp); +} + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_destroy, UVERBS_CQ_DESTROY, uverbs_destroy_cq_handler, + &UVERBS_ATTR_IDR(DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(DESTROY_CQ_RESP, struct ib_uverbs_destroy_cq_resp, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, + UVERBS_OBJECT_COMP_CHANNEL, + &UVERBS_TYPE_ALLOC_FD(0, + sizeof(struct ib_uverbs_completion_event_file), + uverbs_hot_unplug_completion_event_file, + &uverbs_event_fops, + "[infinibandevent]", O_RDONLY)); + +DECLARE_UVERBS_OBJECT(uverbs_object_cq, UVERBS_OBJECT_CQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, + uverbs_free_cq), + &uverbs_method_cq_create, + &uverbs_method_cq_destroy); + +DECLARE_UVERBS_OBJECT(uverbs_object_qp, UVERBS_OBJECT_QP, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, + uverbs_free_qp)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mw, UVERBS_OBJECT_MW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mr, UVERBS_OBJECT_MR, + /* 1 is used in order to free the MR after all the MWs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr)); + +DECLARE_UVERBS_OBJECT(uverbs_object_srq, UVERBS_OBJECT_SRQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, + uverbs_free_srq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_ah, UVERBS_OBJECT_AH, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); + +DECLARE_UVERBS_OBJECT(uverbs_object_flow, UVERBS_OBJECT_FLOW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow)); + +DECLARE_UVERBS_OBJECT(uverbs_object_wq, UVERBS_OBJECT_WQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, + uverbs_free_wq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_rwq_ind_table, + UVERBS_OBJECT_RWQ_IND_TBL, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); + +DECLARE_UVERBS_OBJECT(uverbs_object_xrcd, UVERBS_OBJECT_XRCD, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, + uverbs_free_xrcd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_pd, UVERBS_OBJECT_PD, + /* 2 is used in order to free the PD after MRs */ + &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_device, UVERBS_OBJECT_DEVICE, NULL); + +DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, + &uverbs_object_device, + &uverbs_object_pd, + &uverbs_object_mr, + &uverbs_object_comp_channel, + &uverbs_object_cq, + &uverbs_object_qp, + &uverbs_object_ah, + &uverbs_object_mw, + &uverbs_object_srq, + &uverbs_object_flow, + &uverbs_object_wq, + &uverbs_object_rwq_ind_table, + &uverbs_object_xrcd); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b456e3ca1876..ee9e27dc799b 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -180,39 +180,29 @@ EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { - switch (node_type) { - case RDMA_NODE_IB_CA: - case RDMA_NODE_IB_SWITCH: - case RDMA_NODE_IB_ROUTER: - return RDMA_TRANSPORT_IB; - case RDMA_NODE_RNIC: - return RDMA_TRANSPORT_IWARP; - case RDMA_NODE_USNIC: + + if (node_type == RDMA_NODE_USNIC) return RDMA_TRANSPORT_USNIC; - case RDMA_NODE_USNIC_UDP: + if (node_type == RDMA_NODE_USNIC_UDP) return RDMA_TRANSPORT_USNIC_UDP; - default: - BUG(); - return 0; - } + if (node_type == RDMA_NODE_RNIC) + return RDMA_TRANSPORT_IWARP; + + return RDMA_TRANSPORT_IB; } EXPORT_SYMBOL(rdma_node_get_transport); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) { + enum rdma_transport_type lt; if (device->get_link_layer) return device->get_link_layer(device, port_num); - switch (rdma_node_get_transport(device->node_type)) { - case RDMA_TRANSPORT_IB: + lt = rdma_node_get_transport(device->node_type); + if (lt == RDMA_TRANSPORT_IB) return IB_LINK_LAYER_INFINIBAND; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_USNIC: - case RDMA_TRANSPORT_USNIC_UDP: - return IB_LINK_LAYER_ETHERNET; - default: - return IB_LINK_LAYER_UNSPECIFIED; - } + + return IB_LINK_LAYER_ETHERNET; } EXPORT_SYMBOL(rdma_port_get_link_layer); @@ -478,6 +468,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, union ib_gid dgid; union ib_gid sgid; + might_sleep(); + memset(ah_attr, 0, sizeof *ah_attr); ah_attr->type = rdma_ah_find_type(device, port_num); if (rdma_cap_eth_ah(device, port_num)) { @@ -632,11 +624,13 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; - srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; atomic_inc(&srq->ext.xrc.xrcd->usecnt); - atomic_inc(&srq->ext.xrc.cq->usecnt); } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); @@ -677,18 +671,18 @@ int ib_destroy_srq(struct ib_srq *srq) pd = srq->pd; srq_type = srq->srq_type; - if (srq_type == IB_SRQT_XRC) { + if (ib_srq_has_cq(srq_type)) + cq = srq->ext.cq; + if (srq_type == IB_SRQT_XRC) xrcd = srq->ext.xrc.xrcd; - cq = srq->ext.xrc.cq; - } ret = srq->device->destroy_srq(srq); if (!ret) { atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) { + if (srq_type == IB_SRQT_XRC) atomic_dec(&xrcd->usecnt); + if (ib_srq_has_cq(srq_type)) atomic_dec(&cq->usecnt); - } } return ret; @@ -1244,6 +1238,18 @@ int ib_resolve_eth_dmac(struct ib_device *device, if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) { rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, ah_attr->roce.dmac); + return 0; + } + if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { + if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { + __be32 addr = 0; + + memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4); + ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac); + } else { + ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw, + (char *)ah_attr->roce.dmac); + } } else { union ib_gid sgid; struct ib_gid_attr sgid_attr; @@ -1306,6 +1312,61 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, } EXPORT_SYMBOL(ib_modify_qp_with_udata); +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +{ + int rc; + u32 netdev_speed; + struct net_device *netdev; + struct ethtool_link_ksettings lksettings; + + if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + if (!dev->get_netdev) + return -EOPNOTSUPP; + + netdev = dev->get_netdev(dev, port_num); + if (!netdev) + return -ENODEV; + + rtnl_lock(); + rc = __ethtool_get_link_ksettings(netdev, &lksettings); + rtnl_unlock(); + + dev_put(netdev); + + if (!rc) { + netdev_speed = lksettings.base.speed; + } else { + netdev_speed = SPEED_1000; + pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name, + netdev_speed); + } + + if (netdev_speed <= SPEED_1000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_SDR; + } else if (netdev_speed <= SPEED_10000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_20000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_DDR; + } else if (netdev_speed <= SPEED_25000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_40000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } + + return 0; +} +EXPORT_SYMBOL(ib_get_eth_speed); + int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) @@ -1573,15 +1634,53 @@ EXPORT_SYMBOL(ib_dealloc_fmr); /* Multicast groups */ +static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) +{ + struct ib_qp_init_attr init_attr = {}; + struct ib_qp_attr attr = {}; + int num_eth_ports = 0; + int port; + + /* If QP state >= init, it is assigned to a port and we can check this + * port only. + */ + if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { + if (attr.qp_state >= IB_QPS_INIT) { + if (qp->device->get_link_layer(qp->device, attr.port_num) != + IB_LINK_LAYER_INFINIBAND) + return true; + goto lid_check; + } + } + + /* Can't get a quick answer, iterate over all ports */ + for (port = 0; port < qp->device->phys_port_cnt; port++) + if (qp->device->get_link_layer(qp->device, port) != + IB_LINK_LAYER_INFINIBAND) + num_eth_ports++; + + /* If we have at lease one Ethernet port, RoCE annex declares that + * multicast LID should be ignored. We can't tell at this step if the + * QP belongs to an IB or Ethernet port. + */ + if (num_eth_ports) + return true; + + /* If all the ports are IB, we can check according to IB spec. */ +lid_check: + return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + lid == be16_to_cpu(IB_LID_PERMISSIVE)); +} + int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { int ret; if (!qp->device->attach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || - lid == be16_to_cpu(IB_LID_PERMISSIVE)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->attach_mcast(qp, gid, lid); @@ -1597,9 +1696,9 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->detach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || - lid == be16_to_cpu(IB_LID_PERMISSIVE)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->detach_mcast(qp, gid, lid); diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile index 036f84efbc73..afbaa0e20670 100644 --- a/drivers/infiniband/hw/bnxt_re/Makefile +++ b/drivers/infiniband/hw/bnxt_re/Makefile @@ -3,4 +3,4 @@ ccflags-y := -Idrivers/net/ethernet/broadcom/bnxt obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o bnxt_re-y := main.o ib_verbs.o \ qplib_res.o qplib_rcfw.o \ - qplib_sp.o qplib_fp.o + qplib_sp.o qplib_fp.o hw_counters.o diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 85527532c49d..b3ad37fec578 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -85,7 +85,7 @@ struct bnxt_re_sqp_entries { }; #define BNXT_RE_MIN_MSIX 2 -#define BNXT_RE_MAX_MSIX 16 +#define BNXT_RE_MAX_MSIX 9 #define BNXT_RE_AEQ_IDX 0 #define BNXT_RE_NQ_IDX 1 @@ -116,7 +116,7 @@ struct bnxt_re_dev { struct bnxt_qplib_rcfw rcfw; /* NQ */ - struct bnxt_qplib_nq nq; + struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; /* Device Resources */ struct bnxt_qplib_dev_attr dev_attr; @@ -140,6 +140,7 @@ struct bnxt_re_dev { struct bnxt_re_qp *qp1_sqp; struct bnxt_re_ah *sqp_ah; struct bnxt_re_sqp_entries sqp_tbl[1024]; + atomic_t nq_alloc_cnt; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c new file mode 100644 index 000000000000..7b28219eba46 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -0,0 +1,114 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Statistics + * + */ + +#include <linux/interrupt.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/prefetch.h> +#include <linux/delay.h> + +#include <rdma/ib_addr.h> + +#include "bnxt_ulp.h" +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" +#include "bnxt_re.h" +#include "hw_counters.h" + +static const char * const bnxt_re_stat_name[] = { + [BNXT_RE_ACTIVE_QP] = "active_qps", + [BNXT_RE_ACTIVE_SRQ] = "active_srqs", + [BNXT_RE_ACTIVE_CQ] = "active_cqs", + [BNXT_RE_ACTIVE_MR] = "active_mrs", + [BNXT_RE_ACTIVE_MW] = "active_mws", + [BNXT_RE_RX_PKTS] = "rx_pkts", + [BNXT_RE_RX_BYTES] = "rx_bytes", + [BNXT_RE_TX_PKTS] = "tx_pkts", + [BNXT_RE_TX_BYTES] = "tx_bytes", + [BNXT_RE_RECOVERABLE_ERRORS] = "recoverable_errors" +}; + +int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct ctx_hw_stats *bnxt_re_stats = rdev->qplib_ctx.stats.dma; + + if (!port || !stats) + return -EINVAL; + + stats->value[BNXT_RE_ACTIVE_QP] = atomic_read(&rdev->qp_count); + stats->value[BNXT_RE_ACTIVE_SRQ] = atomic_read(&rdev->srq_count); + stats->value[BNXT_RE_ACTIVE_CQ] = atomic_read(&rdev->cq_count); + stats->value[BNXT_RE_ACTIVE_MR] = atomic_read(&rdev->mr_count); + stats->value[BNXT_RE_ACTIVE_MW] = atomic_read(&rdev->mw_count); + if (bnxt_re_stats) { + stats->value[BNXT_RE_RECOVERABLE_ERRORS] = + le64_to_cpu(bnxt_re_stats->tx_bcast_pkts); + stats->value[BNXT_RE_RX_PKTS] = + le64_to_cpu(bnxt_re_stats->rx_ucast_pkts); + stats->value[BNXT_RE_RX_BYTES] = + le64_to_cpu(bnxt_re_stats->rx_ucast_bytes); + stats->value[BNXT_RE_TX_PKTS] = + le64_to_cpu(bnxt_re_stats->tx_ucast_pkts); + stats->value[BNXT_RE_TX_BYTES] = + le64_to_cpu(bnxt_re_stats->tx_ucast_bytes); + } + return ARRAY_SIZE(bnxt_re_stat_name); +} + +struct rdma_hw_stats *bnxt_re_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(bnxt_re_stat_name) != BNXT_RE_NUM_COUNTERS); + /* We support only per port stats */ + if (!port_num) + return NULL; + + return rdma_alloc_hw_stats_struct(bnxt_re_stat_name, + ARRAY_SIZE(bnxt_re_stat_name), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h new file mode 100644 index 000000000000..be0dc0093b58 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h @@ -0,0 +1,62 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Statistics (header) + * + */ + +#ifndef __BNXT_RE_HW_STATS_H__ +#define __BNXT_RE_HW_STATS_H__ + +enum bnxt_re_hw_stats { + BNXT_RE_ACTIVE_QP, + BNXT_RE_ACTIVE_SRQ, + BNXT_RE_ACTIVE_CQ, + BNXT_RE_ACTIVE_MR, + BNXT_RE_ACTIVE_MW, + BNXT_RE_RX_PKTS, + BNXT_RE_RX_BYTES, + BNXT_RE_TX_PKTS, + BNXT_RE_TX_BYTES, + BNXT_RE_RECOVERABLE_ERRORS, + BNXT_RE_NUM_COUNTERS +}; + +struct rdma_hw_stats *bnxt_re_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num); +int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index); +#endif /* __BNXT_RE_HW_STATS_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index f0e01b3ac711..01eee15bbd65 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -223,50 +223,6 @@ int bnxt_re_modify_device(struct ib_device *ibdev, return 0; } -static void __to_ib_speed_width(struct net_device *netdev, u8 *speed, u8 *width) -{ - struct ethtool_link_ksettings lksettings; - u32 espeed; - - if (netdev->ethtool_ops && netdev->ethtool_ops->get_link_ksettings) { - memset(&lksettings, 0, sizeof(lksettings)); - rtnl_lock(); - netdev->ethtool_ops->get_link_ksettings(netdev, &lksettings); - rtnl_unlock(); - espeed = lksettings.base.speed; - } else { - espeed = SPEED_UNKNOWN; - } - switch (espeed) { - case SPEED_1000: - *speed = IB_SPEED_SDR; - *width = IB_WIDTH_1X; - break; - case SPEED_10000: - *speed = IB_SPEED_QDR; - *width = IB_WIDTH_1X; - break; - case SPEED_20000: - *speed = IB_SPEED_DDR; - *width = IB_WIDTH_4X; - break; - case SPEED_25000: - *speed = IB_SPEED_EDR; - *width = IB_WIDTH_1X; - break; - case SPEED_40000: - *speed = IB_SPEED_QDR; - *width = IB_WIDTH_4X; - break; - case SPEED_50000: - break; - default: - *speed = IB_SPEED_SDR; - *width = IB_WIDTH_1X; - break; - } -} - /* Port */ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, struct ib_port_attr *port_attr) @@ -308,25 +264,9 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, * IB stack to avoid race in the NETDEV_UNREG path */ if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) - __to_ib_speed_width(rdev->netdev, &port_attr->active_speed, - &port_attr->active_width); - return 0; -} - -int bnxt_re_modify_port(struct ib_device *ibdev, u8 port_num, - int port_modify_mask, - struct ib_port_modify *port_modify) -{ - switch (port_modify_mask) { - case IB_PORT_SHUTDOWN: - break; - case IB_PORT_INIT_TYPE: - break; - case IB_PORT_RESET_QKEY_CNTR: - break; - default: - break; - } + if (ib_get_eth_speed(ibdev, port_num, &port_attr->active_speed, + &port_attr->active_width)) + return -EINVAL; return 0; } @@ -846,6 +786,7 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp) struct bnxt_re_dev *rdev = qp->rdev; int rc; + bnxt_qplib_del_flush_qp(&qp->qplib_qp); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to destroy HW QP"); @@ -860,6 +801,7 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp) return rc; } + bnxt_qplib_del_flush_qp(&qp->qplib_qp); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &rdev->qp1_sqp->qplib_qp); if (rc) { @@ -969,7 +911,6 @@ static struct bnxt_re_ah *bnxt_re_create_shadow_qp_ah if (!ah) return NULL; - memset(ah, 0, sizeof(*ah)); ah->rdev = rdev; ah->qplib_ah.pd = &pd->qplib_pd; @@ -1016,7 +957,6 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp if (!qp) return NULL; - memset(qp, 0, sizeof(*qp)); qp->rdev = rdev; /* Initialize the shadow QP structure from the QP1 values */ @@ -1404,6 +1344,21 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, } qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE; qp->qplib_qp.state = __from_ib_qp_state(qp_attr->qp_state); + + if (!qp->sumem && + qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { + dev_dbg(rdev_to_dev(rdev), + "Move QP = %p to flush list\n", + qp); + bnxt_qplib_add_flush_qp(&qp->qplib_qp); + } + if (!qp->sumem && + qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_RESET) { + dev_dbg(rdev_to_dev(rdev), + "Move QP = %p out of flush list\n", + qp); + bnxt_qplib_del_flush_qp(&qp->qplib_qp); + } } if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { qp->qplib_qp.modify_flags |= @@ -2333,6 +2288,7 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq) struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); struct bnxt_re_dev *rdev = cq->rdev; int rc; + struct bnxt_qplib_nq *nq = cq->qplib_cq.nq; rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); if (rc) { @@ -2347,7 +2303,7 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq) kfree(cq); } atomic_dec(&rdev->cq_count); - rdev->nq.budget--; + nq->budget--; return 0; } @@ -2361,6 +2317,8 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, struct bnxt_re_cq *cq = NULL; int rc, entries; int cqe = attr->cqe; + struct bnxt_qplib_nq *nq = NULL; + unsigned int nq_alloc_cnt; /* Validate CQ fields */ if (cqe < 1 || cqe > dev_attr->max_cq_wqes) { @@ -2412,8 +2370,15 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, cq->qplib_cq.sghead = NULL; cq->qplib_cq.nmap = 0; } + /* + * Allocating the NQ in a round robin fashion. nq_alloc_cnt is a + * used for getting the NQ index. + */ + nq_alloc_cnt = atomic_inc_return(&rdev->nq_alloc_cnt); + nq = &rdev->nq[nq_alloc_cnt % (rdev->num_msix - 1)]; cq->qplib_cq.max_wqe = entries; - cq->qplib_cq.cnq_hw_ring_id = rdev->nq.ring_id; + cq->qplib_cq.cnq_hw_ring_id = nq->ring_id; + cq->qplib_cq.nq = nq; rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq); if (rc) { @@ -2423,7 +2388,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, cq->ib_cq.cqe = entries; cq->cq_period = cq->qplib_cq.period; - rdev->nq.budget++; + nq->budget++; atomic_inc(&rdev->cq_count); @@ -2921,6 +2886,10 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) sq->send_phantom = false; } } + if (ncqe < budget) + ncqe += bnxt_qplib_process_flush_list(&cq->qplib_cq, + cqe + ncqe, + budget - ncqe); if (!ncqe) break; @@ -3410,7 +3379,7 @@ int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) &rdev->qplib_res.dpi_tbl, &uctx->dpi); if (rc) - dev_err(rdev_to_dev(rdev), "Deallocte HW DPI failed!"); + dev_err(rdev_to_dev(rdev), "Deallocate HW DPI failed!"); /* Don't fail, continue*/ uctx->dpi.dbr = NULL; } diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index a0bb7e33d7ca..1df11ed272ea 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -141,9 +141,6 @@ int bnxt_re_modify_device(struct ib_device *ibdev, struct ib_device_modify *device_modify); int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, struct ib_port_attr *port_attr); -int bnxt_re_modify_port(struct ib_device *ibdev, u8 port_num, - int port_modify_mask, - struct ib_port_modify *port_modify); int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable); int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index ceae2d92fb08..82d1cbc27aee 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -64,13 +64,14 @@ #include "ib_verbs.h" #include <rdma/bnxt_re-abi.h> #include "bnxt.h" +#include "hw_counters.h" + static char version[] = BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n"; MODULE_AUTHOR("Eddie Wai <eddie.wai@broadcom.com>"); MODULE_DESCRIPTION(BNXT_RE_DESC " Driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(ROCE_DRV_MODULE_VERSION); /* globals */ static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list); @@ -162,7 +163,7 @@ static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait) static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) { - int rc = 0, num_msix_want = BNXT_RE_MIN_MSIX, num_msix_got; + int rc = 0, num_msix_want = BNXT_RE_MAX_MSIX, num_msix_got; struct bnxt_en_dev *en_dev; if (!rdev) @@ -170,6 +171,8 @@ static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) en_dev = rdev->en_dev; + num_msix_want = min_t(u32, BNXT_RE_MAX_MSIX, num_online_cpus()); + rtnl_lock(); num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP, rdev->msix_entries, @@ -474,7 +477,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->modify_device = bnxt_re_modify_device; ibdev->query_port = bnxt_re_query_port; - ibdev->modify_port = bnxt_re_modify_port; ibdev->get_port_immutable = bnxt_re_get_port_immutable; ibdev->query_pkey = bnxt_re_query_pkey; ibdev->query_gid = bnxt_re_query_gid; @@ -513,6 +515,8 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->alloc_ucontext = bnxt_re_alloc_ucontext; ibdev->dealloc_ucontext = bnxt_re_dealloc_ucontext; ibdev->mmap = bnxt_re_mmap; + ibdev->get_hw_stats = bnxt_re_ib_get_hw_stats; + ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats; return ib_register_device(ibdev, NULL); } @@ -653,8 +657,12 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) { - if (rdev->nq.hwq.max_elements) - bnxt_qplib_disable_nq(&rdev->nq); + int i; + + if (rdev->nq[0].hwq.max_elements) { + for (i = 1; i < rdev->num_msix; i++) + bnxt_qplib_disable_nq(&rdev->nq[i - 1]); + } if (rdev->qplib_res.rcfw) bnxt_qplib_cleanup_res(&rdev->qplib_res); @@ -662,31 +670,41 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) static int bnxt_re_init_res(struct bnxt_re_dev *rdev) { - int rc = 0; + int rc = 0, i; bnxt_qplib_init_res(&rdev->qplib_res); - if (rdev->msix_entries[BNXT_RE_NQ_IDX].vector <= 0) - return -EINVAL; + for (i = 1; i < rdev->num_msix ; i++) { + rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], + i - 1, rdev->msix_entries[i].vector, + rdev->msix_entries[i].db_offset, + &bnxt_re_cqn_handler, NULL); - rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq, - rdev->msix_entries[BNXT_RE_NQ_IDX].vector, - rdev->msix_entries[BNXT_RE_NQ_IDX].db_offset, - &bnxt_re_cqn_handler, - NULL); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to enable NQ with rc = 0x%x", rc); + goto fail; + } + } + return 0; +fail: + return rc; +} - if (rc) - dev_err(rdev_to_dev(rdev), "Failed to enable NQ: %#x", rc); +static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev, bool lock_wait) +{ + int i; - return rc; + for (i = 0; i < rdev->num_msix - 1; i++) { + bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, lock_wait); + bnxt_qplib_free_nq(&rdev->nq[i]); + } } static void bnxt_re_free_res(struct bnxt_re_dev *rdev, bool lock_wait) { - if (rdev->nq.hwq.max_elements) { - bnxt_re_net_ring_free(rdev, rdev->nq.ring_id, lock_wait); - bnxt_qplib_free_nq(&rdev->nq); - } + bnxt_re_free_nq_res(rdev, lock_wait); + if (rdev->qplib_res.dpi_tbl.max) { bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &rdev->qplib_res.dpi_tbl, @@ -700,7 +718,7 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev, bool lock_wait) static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) { - int rc = 0; + int rc = 0, i; /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; @@ -717,30 +735,42 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) &rdev->dpi_privileged, rdev); if (rc) - goto fail; + goto dealloc_res; - rdev->nq.hwq.max_elements = BNXT_RE_MAX_CQ_COUNT + - BNXT_RE_MAX_SRQC_COUNT + 2; - rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq); - if (rc) { - dev_err(rdev_to_dev(rdev), - "Failed to allocate NQ memory: %#x", rc); - goto fail; - } - rc = bnxt_re_net_ring_alloc - (rdev, rdev->nq.hwq.pbl[PBL_LVL_0].pg_map_arr, - rdev->nq.hwq.pbl[rdev->nq.hwq.level].pg_count, - HWRM_RING_ALLOC_CMPL, BNXT_QPLIB_NQE_MAX_CNT - 1, - rdev->msix_entries[BNXT_RE_NQ_IDX].ring_idx, - &rdev->nq.ring_id); - if (rc) { - dev_err(rdev_to_dev(rdev), - "Failed to allocate NQ ring: %#x", rc); - goto free_nq; + for (i = 0; i < rdev->num_msix - 1; i++) { + rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT + + BNXT_RE_MAX_SRQC_COUNT + 2; + rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq[i]); + if (rc) { + dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x", + i, rc); + goto dealloc_dpi; + } + rc = bnxt_re_net_ring_alloc + (rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr, + rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count, + HWRM_RING_ALLOC_CMPL, + BNXT_QPLIB_NQE_MAX_CNT - 1, + rdev->msix_entries[i + 1].ring_idx, + &rdev->nq[i].ring_id); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to allocate NQ fw id with rc = 0x%x", + rc); + goto free_nq; + } } return 0; free_nq: - bnxt_qplib_free_nq(&rdev->nq); + for (i = 0; i < rdev->num_msix - 1; i++) + bnxt_qplib_free_nq(&rdev->nq[i]); +dealloc_dpi: + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, + &rdev->qplib_res.dpi_tbl, + &rdev->dpi_privileged); +dealloc_res: + bnxt_qplib_free_res(&rdev->qplib_res); + fail: rdev->qplib_res.rcfw = NULL; return rc; @@ -835,6 +865,42 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) mutex_unlock(&rdev->qp_lock); } +static int bnxt_re_update_gid(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; + struct bnxt_qplib_gid gid; + u16 gid_idx, index; + int rc = 0; + + if (!test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) + return 0; + + if (!sgid_tbl) { + dev_err(rdev_to_dev(rdev), "QPLIB: SGID table not allocated"); + return -EINVAL; + } + + for (index = 0; index < sgid_tbl->active; index++) { + gid_idx = sgid_tbl->hw_id[index]; + + if (!memcmp(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero, + sizeof(bnxt_qplib_gid_zero))) + continue; + /* need to modify the VLAN enable setting of non VLAN GID only + * as setting is done for VLAN GID while adding GID + */ + if (sgid_tbl->vlan[index]) + continue; + + memcpy(&gid, &sgid_tbl->tbl[index], sizeof(gid)); + + rc = bnxt_qplib_update_sgid(sgid_tbl, &gid, gid_idx, + rdev->qplib_res.netdev->dev_addr); + } + + return rc; +} + static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev) { u32 prio_map = 0, tmp_map = 0; @@ -854,8 +920,6 @@ static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev) tmp_map = dcb_ieee_getapp_mask(netdev, &app); prio_map |= tmp_map; - if (!prio_map) - prio_map = -EFAULT; return prio_map; } @@ -881,10 +945,7 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) int rc; /* Get priority for roce */ - rc = bnxt_re_get_priority_mask(rdev); - if (rc < 0) - return rc; - prio_map = (u8)rc; + prio_map = bnxt_re_get_priority_mask(rdev); if (prio_map == rdev->cur_prio_map) return 0; @@ -906,6 +967,16 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) return rc; } + /* Actual priorities are not programmed as they are already + * done by L2 driver; just enable or disable priority vlan tagging + */ + if ((prio_map == 0 && rdev->qplib_res.prio) || + (prio_map != 0 && !rdev->qplib_res.prio)) { + rdev->qplib_res.prio = prio_map ? true : false; + + bnxt_re_update_gid(rdev); + } + return 0; } @@ -998,7 +1069,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) /* Establish RCFW Communication Channel to initialize the context * memory for the function and all child VFs */ - rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw); + rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw, + BNXT_RE_MAX_QPC_COUNT); if (rc) goto fail; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 9af1514e5944..e8afc47f8949 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -51,6 +51,168 @@ #include "qplib_fp.h" static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq); +static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp); + +static void bnxt_qplib_cancel_phantom_processing(struct bnxt_qplib_qp *qp) +{ + qp->sq.condition = false; + qp->sq.send_phantom = false; + qp->sq.single = false; +} + +/* Flush list */ +static void __bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_cq *scq, *rcq; + + scq = qp->scq; + rcq = qp->rcq; + + if (!qp->sq.flushed) { + dev_dbg(&scq->hwq.pdev->dev, + "QPLIB: FP: Adding to SQ Flush list = %p", + qp); + bnxt_qplib_cancel_phantom_processing(qp); + list_add_tail(&qp->sq_flush, &scq->sqf_head); + qp->sq.flushed = true; + } + if (!qp->srq) { + if (!qp->rq.flushed) { + dev_dbg(&rcq->hwq.pdev->dev, + "QPLIB: FP: Adding to RQ Flush list = %p", + qp); + list_add_tail(&qp->rq_flush, &rcq->rqf_head); + qp->rq.flushed = true; + } + } +} + +void bnxt_qplib_acquire_cq_locks(struct bnxt_qplib_qp *qp, + unsigned long *flags) + __acquires(&qp->scq->hwq.lock) __acquires(&qp->rcq->hwq.lock) +{ + spin_lock_irqsave(&qp->scq->hwq.lock, *flags); + if (qp->scq == qp->rcq) + __acquire(&qp->rcq->hwq.lock); + else + spin_lock(&qp->rcq->hwq.lock); +} + +void bnxt_qplib_release_cq_locks(struct bnxt_qplib_qp *qp, + unsigned long *flags) + __releases(&qp->scq->hwq.lock) __releases(&qp->rcq->hwq.lock) +{ + if (qp->scq == qp->rcq) + __release(&qp->rcq->hwq.lock); + else + spin_unlock(&qp->rcq->hwq.lock); + spin_unlock_irqrestore(&qp->scq->hwq.lock, *flags); +} + +static struct bnxt_qplib_cq *bnxt_qplib_find_buddy_cq(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_cq *cq) +{ + struct bnxt_qplib_cq *buddy_cq = NULL; + + if (qp->scq == qp->rcq) + buddy_cq = NULL; + else if (qp->scq == cq) + buddy_cq = qp->rcq; + else + buddy_cq = qp->scq; + return buddy_cq; +} + +static void bnxt_qplib_lock_buddy_cq(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_cq *cq) + __acquires(&buddy_cq->hwq.lock) +{ + struct bnxt_qplib_cq *buddy_cq = NULL; + + buddy_cq = bnxt_qplib_find_buddy_cq(qp, cq); + if (!buddy_cq) + __acquire(&cq->hwq.lock); + else + spin_lock(&buddy_cq->hwq.lock); +} + +static void bnxt_qplib_unlock_buddy_cq(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_cq *cq) + __releases(&buddy_cq->hwq.lock) +{ + struct bnxt_qplib_cq *buddy_cq = NULL; + + buddy_cq = bnxt_qplib_find_buddy_cq(qp, cq); + if (!buddy_cq) + __release(&cq->hwq.lock); + else + spin_unlock(&buddy_cq->hwq.lock); +} + +void bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp) +{ + unsigned long flags; + + bnxt_qplib_acquire_cq_locks(qp, &flags); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_release_cq_locks(qp, &flags); +} + +static void __bnxt_qplib_del_flush_qp(struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_cq *scq, *rcq; + + scq = qp->scq; + rcq = qp->rcq; + + if (qp->sq.flushed) { + qp->sq.flushed = false; + list_del(&qp->sq_flush); + } + if (!qp->srq) { + if (qp->rq.flushed) { + qp->rq.flushed = false; + list_del(&qp->rq_flush); + } + } +} + +void bnxt_qplib_del_flush_qp(struct bnxt_qplib_qp *qp) +{ + unsigned long flags; + + bnxt_qplib_acquire_cq_locks(qp, &flags); + __clean_cq(qp->scq, (u64)(unsigned long)qp); + qp->sq.hwq.prod = 0; + qp->sq.hwq.cons = 0; + __clean_cq(qp->rcq, (u64)(unsigned long)qp); + qp->rq.hwq.prod = 0; + qp->rq.hwq.cons = 0; + + __bnxt_qplib_del_flush_qp(qp); + bnxt_qplib_release_cq_locks(qp, &flags); +} + +static void bnxt_qpn_cqn_sched_task(struct work_struct *work) +{ + struct bnxt_qplib_nq_work *nq_work = + container_of(work, struct bnxt_qplib_nq_work, work); + + struct bnxt_qplib_cq *cq = nq_work->cq; + struct bnxt_qplib_nq *nq = nq_work->nq; + + if (cq && nq) { + spin_lock_bh(&cq->compl_lock); + if (atomic_read(&cq->arm_state) && nq->cqn_handler) { + dev_dbg(&nq->pdev->dev, + "%s:Trigger cq = %p event nq = %p\n", + __func__, cq, nq); + nq->cqn_handler(nq, cq); + } + spin_unlock_bh(&cq->compl_lock); + } + kfree(nq_work); +} static void bnxt_qplib_free_qp_hdr_buf(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) @@ -119,6 +281,7 @@ static void bnxt_qplib_service_nq(unsigned long data) struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data; struct bnxt_qplib_hwq *hwq = &nq->hwq; struct nq_base *nqe, **nq_ptr; + struct bnxt_qplib_cq *cq; int num_cqne_processed = 0; u32 sw_cons, raw_cons; u16 type; @@ -143,15 +306,17 @@ static void bnxt_qplib_service_nq(unsigned long data) q_handle = le32_to_cpu(nqcne->cq_handle_low); q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high) << 32; - bnxt_qplib_arm_cq_enable((struct bnxt_qplib_cq *) - ((unsigned long)q_handle)); - if (!nq->cqn_handler(nq, (struct bnxt_qplib_cq *) - ((unsigned long)q_handle))) + cq = (struct bnxt_qplib_cq *)(unsigned long)q_handle; + bnxt_qplib_arm_cq_enable(cq); + spin_lock_bh(&cq->compl_lock); + atomic_set(&cq->arm_state, 0); + if (!nq->cqn_handler(nq, (cq))) num_cqne_processed++; else dev_warn(&nq->pdev->dev, "QPLIB: cqn - type 0x%x not handled", type); + spin_unlock_bh(&cq->compl_lock); break; } case NQ_BASE_TYPE_DBQ_EVENT: @@ -190,12 +355,17 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance) void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) { + if (nq->cqn_wq) { + destroy_workqueue(nq->cqn_wq); + nq->cqn_wq = NULL; + } /* Make sure the HW is stopped! */ synchronize_irq(nq->vector); tasklet_disable(&nq->worker); tasklet_kill(&nq->worker); if (nq->requested) { + irq_set_affinity_hint(nq->vector, NULL); free_irq(nq->vector, nq); nq->requested = false; } @@ -209,14 +379,14 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) } int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, - int msix_vector, int bar_reg_offset, + int nq_idx, int msix_vector, int bar_reg_offset, int (*cqn_handler)(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *), int (*srqn_handler)(struct bnxt_qplib_nq *nq, void *, u8 event)) { resource_size_t nq_base; - int rc; + int rc = -1; nq->pdev = pdev; nq->vector = msix_vector; @@ -227,14 +397,31 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, tasklet_init(&nq->worker, bnxt_qplib_service_nq, (unsigned long)nq); + /* Have a task to schedule CQ notifiers in post send case */ + nq->cqn_wq = create_singlethread_workqueue("bnxt_qplib_nq"); + if (!nq->cqn_wq) + goto fail; + nq->requested = false; - rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, "bnxt_qplib_nq", nq); + memset(nq->name, 0, 32); + sprintf(nq->name, "bnxt_qplib_nq-%d", nq_idx); + rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, nq->name, nq); if (rc) { dev_err(&nq->pdev->dev, "Failed to request IRQ for NQ: %#x", rc); bnxt_qplib_disable_nq(nq); goto fail; } + + cpumask_clear(&nq->mask); + cpumask_set_cpu(nq_idx, &nq->mask); + rc = irq_set_affinity_hint(nq->vector, &nq->mask); + if (rc) { + dev_warn(&nq->pdev->dev, + "QPLIB: set affinity failed; vector: %d nq_idx: %d\n", + nq->vector, nq_idx); + } + nq->requested = true; nq->bar_reg = NQ_CONS_PCI_BAR_REGION; nq->bar_reg_off = bar_reg_offset; @@ -258,8 +445,10 @@ fail: void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq) { - if (nq->hwq.max_elements) + if (nq->hwq.max_elements) { bnxt_qplib_free_hwq(nq->pdev, &nq->hwq); + nq->hwq.max_elements = 0; + } } int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq) @@ -401,8 +590,8 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->id = le32_to_cpu(resp.xid); qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET; - sq->flush_in_progress = false; - rq->flush_in_progress = false; + rcfw->qp_tbl[qp->id].qp_id = qp->id; + rcfw->qp_tbl[qp->id].qp_handle = (void *)qp; return 0; @@ -615,8 +804,10 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->id = le32_to_cpu(resp.xid); qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET; - sq->flush_in_progress = false; - rq->flush_in_progress = false; + INIT_LIST_HEAD(&qp->sq_flush); + INIT_LIST_HEAD(&qp->rq_flush); + rcfw->qp_tbl[qp->id].qp_id = qp->id; + rcfw->qp_tbl[qp->id].qp_handle = (void *)qp; return 0; @@ -963,13 +1154,19 @@ int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, u16 cmd_flags = 0; int rc; + rcfw->qp_tbl[qp->id].qp_id = BNXT_QPLIB_QP_ID_INVALID; + rcfw->qp_tbl[qp->id].qp_handle = NULL; + RCFW_CMD_PREP(req, DESTROY_QP, cmd_flags); req.qp_cid = cpu_to_le32(qp->id); rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, NULL, 0); - if (rc) + if (rc) { + rcfw->qp_tbl[qp->id].qp_id = qp->id; + rcfw->qp_tbl[qp->id].qp_handle = qp; return rc; + } /* Must walk the associated CQs to nullified the QP ptr */ spin_lock_irqsave(&qp->scq->hwq.lock, flags); @@ -1074,14 +1271,21 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, struct bnxt_qplib_swq *swq; struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr; struct sq_sge *hw_sge; + struct bnxt_qplib_nq_work *nq_work = NULL; + bool sch_handler = false; u32 sw_prod; u8 wqe_size16; int i, rc = 0, data_len = 0, pkt_num = 0; __le32 temp32; if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) { - rc = -EINVAL; - goto done; + if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { + sch_handler = true; + dev_dbg(&sq->hwq.pdev->dev, + "%s Error QP. Scheduling for poll_cq\n", + __func__); + goto queue_err; + } } if (bnxt_qplib_queue_full(sq)) { @@ -1301,12 +1505,35 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) & SQ_PSN_SEARCH_NEXT_PSN_MASK)); } - +queue_err: + if (sch_handler) { + /* Store the ULP info in the software structures */ + sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); + swq = &sq->swq[sw_prod]; + swq->wr_id = wqe->wr_id; + swq->type = wqe->type; + swq->flags = wqe->flags; + if (qp->sig_type) + swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP; + swq->start_psn = sq->psn & BTH_PSN_MASK; + } sq->hwq.prod++; - qp->wqe_cnt++; done: + if (sch_handler) { + nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC); + if (nq_work) { + nq_work->cq = qp->scq; + nq_work->nq = qp->scq->nq; + INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task); + queue_work(qp->scq->nq->cqn_wq, &nq_work->work); + } else { + dev_err(&sq->hwq.pdev->dev, + "QPLIB: FP: Failed to allocate SQ nq_work!"); + rc = -ENOMEM; + } + } return rc; } @@ -1334,15 +1561,17 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, struct bnxt_qplib_q *rq = &qp->rq; struct rq_wqe *rqe, **rqe_ptr; struct sq_sge *hw_sge; + struct bnxt_qplib_nq_work *nq_work = NULL; + bool sch_handler = false; u32 sw_prod; int i, rc = 0; if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { - dev_err(&rq->hwq.pdev->dev, - "QPLIB: FP: QP (0x%x) is in the 0x%x state", - qp->id, qp->state); - rc = -EINVAL; - goto done; + sch_handler = true; + dev_dbg(&rq->hwq.pdev->dev, + "%s Error QP. Scheduling for poll_cq\n", + __func__); + goto queue_err; } if (bnxt_qplib_queue_full(rq)) { dev_err(&rq->hwq.pdev->dev, @@ -1378,7 +1607,27 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, /* Supply the rqe->wr_id index to the wr_id_tbl for now */ rqe->wr_id[0] = cpu_to_le32(sw_prod); +queue_err: + if (sch_handler) { + /* Store the ULP info in the software structures */ + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + rq->swq[sw_prod].wr_id = wqe->wr_id; + } + rq->hwq.prod++; + if (sch_handler) { + nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC); + if (nq_work) { + nq_work->cq = qp->rcq; + nq_work->nq = qp->rcq->nq; + INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task); + queue_work(qp->rcq->nq->cqn_wq, &nq_work->work); + } else { + dev_err(&rq->hwq.pdev->dev, + "QPLIB: FP: Failed to allocate RQ nq_work!"); + rc = -ENOMEM; + } + } done: return rc; } @@ -1471,6 +1720,9 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) cq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem; cq->period = BNXT_QPLIB_QUEUE_START_PERIOD; init_waitqueue_head(&cq->waitq); + INIT_LIST_HEAD(&cq->sqf_head); + INIT_LIST_HEAD(&cq->rqf_head); + spin_lock_init(&cq->compl_lock); bnxt_qplib_arm_cq_enable(cq); return 0; @@ -1513,9 +1765,13 @@ static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp, while (*budget) { sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq); if (sw_cons == sw_prod) { - sq->flush_in_progress = false; break; } + /* Skip the FENCE WQE completions */ + if (sq->swq[sw_cons].wr_id == BNXT_QPLIB_FENCE_WRID) { + bnxt_qplib_cancel_phantom_processing(qp); + goto skip_compl; + } memset(cqe, 0, sizeof(*cqe)); cqe->status = CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR; cqe->opcode = CQ_BASE_CQE_TYPE_REQ; @@ -1525,6 +1781,7 @@ static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp, cqe->type = sq->swq[sw_cons].type; cqe++; (*budget)--; +skip_compl: sq->hwq.cons++; } *pcqe = cqe; @@ -1536,11 +1793,24 @@ static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp, } static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp, - int opcode, struct bnxt_qplib_cqe **pcqe, int *budget) + struct bnxt_qplib_cqe **pcqe, int *budget) { struct bnxt_qplib_cqe *cqe; u32 sw_prod, sw_cons; int rc = 0; + int opcode = 0; + + switch (qp->type) { + case CMDQ_CREATE_QP1_TYPE_GSI: + opcode = CQ_BASE_CQE_TYPE_RES_RAWETH_QP1; + break; + case CMDQ_CREATE_QP_TYPE_RC: + opcode = CQ_BASE_CQE_TYPE_RES_RC; + break; + case CMDQ_CREATE_QP_TYPE_UD: + opcode = CQ_BASE_CQE_TYPE_RES_UD; + break; + } /* Flush the rest of the RQ */ sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); @@ -1567,6 +1837,21 @@ static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp, return rc; } +void bnxt_qplib_mark_qp_error(void *qp_handle) +{ + struct bnxt_qplib_qp *qp = qp_handle; + + if (!qp) + return; + + /* Must block new posting of SQ and RQ */ + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + bnxt_qplib_cancel_phantom_processing(qp); + + /* Add qp to flush list of the CQ */ + __bnxt_qplib_add_flush_qp(qp); +} + /* Note: SQE is valid from sw_sq_cons up to cqe_sq_cons (exclusive) * CQE is track from sw_cq_cons to max_element but valid only if VALID=1 */ @@ -1694,10 +1979,12 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, cqe_sq_cons, sq->hwq.max_elements); return -EINVAL; } - /* If we were in the middle of flushing the SQ, continue */ - if (sq->flush_in_progress) - goto flush; + if (qp->sq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto done; + } /* Require to walk the sq's swq to fabricate CQEs for all previously * signaled SWQEs due to CQE aggregation from the current sq cons * to the cqe_sq_cons @@ -1733,11 +2020,9 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, sw_sq_cons, cqe->wr_id, cqe->status); cqe++; (*budget)--; - sq->flush_in_progress = true; - /* Must block new posting of SQ and RQ */ - qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; - sq->condition = false; - sq->single = false; + bnxt_qplib_lock_buddy_cq(qp, cq); + bnxt_qplib_mark_qp_error(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); } else { if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { /* Before we complete, do WA 9060 */ @@ -1768,15 +2053,6 @@ out: * the WC for this CQE */ sq->single = false; - if (!sq->flush_in_progress) - goto done; -flush: - /* Require to walk the sq's swq to fabricate CQEs for all - * previously posted SWQEs due to the error CQE received - */ - rc = __flush_sq(sq, qp, pcqe, budget); - if (!rc) - sq->flush_in_progress = false; done: return rc; } @@ -1798,6 +2074,12 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL"); return -EINVAL; } + if (qp->rq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto done; + } + cqe = *pcqe; cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; cqe->length = le32_to_cpu(hwcqe->length); @@ -1817,8 +2099,6 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, wr_id_idx, rq->hwq.max_elements); return -EINVAL; } - if (rq->flush_in_progress) - goto flush_rq; cqe->wr_id = rq->swq[wr_id_idx].wr_id; cqe++; @@ -1827,12 +2107,13 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, *pcqe = cqe; if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - rq->flush_in_progress = true; -flush_rq: - rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_RC, pcqe, budget); - if (!rc) - rq->flush_in_progress = false; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); } + +done: return rc; } @@ -1853,6 +2134,11 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL"); return -EINVAL; } + if (qp->rq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto done; + } cqe = *pcqe; cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; cqe->length = le32_to_cpu(hwcqe->length); @@ -1876,8 +2162,6 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, wr_id_idx, rq->hwq.max_elements); return -EINVAL; } - if (rq->flush_in_progress) - goto flush_rq; cqe->wr_id = rq->swq[wr_id_idx].wr_id; cqe++; @@ -1886,12 +2170,12 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, *pcqe = cqe; if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - rq->flush_in_progress = true; -flush_rq: - rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_UD, pcqe, budget); - if (!rc) - rq->flush_in_progress = false; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); } +done: return rc; } @@ -1932,6 +2216,11 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, "QPLIB: process_cq Raw/QP1 qp is NULL"); return -EINVAL; } + if (qp->rq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto done; + } cqe = *pcqe; cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; cqe->flags = le16_to_cpu(hwcqe->flags); @@ -1960,8 +2249,6 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, wr_id_idx, rq->hwq.max_elements); return -EINVAL; } - if (rq->flush_in_progress) - goto flush_rq; cqe->wr_id = rq->swq[wr_id_idx].wr_id; cqe++; @@ -1970,13 +2257,13 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, *pcqe = cqe; if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - rq->flush_in_progress = true; -flush_rq: - rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_RAWETH_QP1, pcqe, - budget); - if (!rc) - rq->flush_in_progress = false; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); } + +done: return rc; } @@ -1990,7 +2277,6 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe; u32 sw_cons = 0, cqe_cons; int rc = 0; - u8 opcode = 0; /* Check the Status */ if (hwcqe->status != CQ_TERMINAL_STATUS_OK) @@ -2005,6 +2291,7 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, "QPLIB: FP: CQ Process terminal qp is NULL"); return -EINVAL; } + /* Must block new posting of SQ and RQ */ qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; @@ -2023,9 +2310,12 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, cqe_cons, sq->hwq.max_elements); goto do_rq; } - /* If we were in the middle of flushing, continue */ - if (sq->flush_in_progress) - goto flush_sq; + + if (qp->sq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto sq_done; + } /* Terminal CQE can also include aggregated successful CQEs prior. * So we must complete all CQEs from the current sq's cons to the @@ -2055,11 +2345,6 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, rc = -EAGAIN; goto sq_done; } - sq->flush_in_progress = true; -flush_sq: - rc = __flush_sq(sq, qp, pcqe, budget); - if (!rc) - sq->flush_in_progress = false; sq_done: if (rc) return rc; @@ -2075,26 +2360,23 @@ do_rq: cqe_cons, rq->hwq.max_elements); goto done; } + + if (qp->rq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + rc = 0; + goto done; + } + /* Terminal CQE requires all posted RQEs to complete with FLUSHED_ERR * from the current rq->cons to the rq->prod regardless what the * rq->cons the terminal CQE indicates */ - rq->flush_in_progress = true; - switch (qp->type) { - case CMDQ_CREATE_QP1_TYPE_GSI: - opcode = CQ_BASE_CQE_TYPE_RES_RAWETH_QP1; - break; - case CMDQ_CREATE_QP_TYPE_RC: - opcode = CQ_BASE_CQE_TYPE_RES_RC; - break; - case CMDQ_CREATE_QP_TYPE_UD: - opcode = CQ_BASE_CQE_TYPE_RES_UD; - break; - } - rc = __flush_rq(rq, qp, opcode, pcqe, budget); - if (!rc) - rq->flush_in_progress = false; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); done: return rc; } @@ -2115,6 +2397,33 @@ static int bnxt_qplib_cq_process_cutoff(struct bnxt_qplib_cq *cq, return 0; } +int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq, + struct bnxt_qplib_cqe *cqe, + int num_cqes) +{ + struct bnxt_qplib_qp *qp = NULL; + u32 budget = num_cqes; + unsigned long flags; + + spin_lock_irqsave(&cq->hwq.lock, flags); + list_for_each_entry(qp, &cq->sqf_head, sq_flush) { + dev_dbg(&cq->hwq.pdev->dev, + "QPLIB: FP: Flushing SQ QP= %p", + qp); + __flush_sq(&qp->sq, qp, &cqe, &budget); + } + + list_for_each_entry(qp, &cq->rqf_head, rq_flush) { + dev_dbg(&cq->hwq.pdev->dev, + "QPLIB: FP: Flushing RQ QP= %p", + qp); + __flush_rq(&qp->rq, qp, &cqe, &budget); + } + spin_unlock_irqrestore(&cq->hwq.lock, flags); + + return num_cqes - budget; +} + int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, int num_cqes, struct bnxt_qplib_qp **lib_qp) { @@ -2205,6 +2514,7 @@ void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type) spin_lock_irqsave(&cq->hwq.lock, flags); if (arm_type) bnxt_qplib_arm_cq(cq, arm_type); - + /* Using cq->arm_state variable to track whether to issue cq handler */ + atomic_set(&cq->arm_state, 1); spin_unlock_irqrestore(&cq->hwq.lock, flags); } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 19176e06c98a..8ead70ca1c1d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -220,19 +220,20 @@ struct bnxt_qplib_q { u16 q_full_delta; u16 max_sge; u32 psn; - bool flush_in_progress; bool condition; bool single; bool send_phantom; u32 phantom_wqe_cnt; u32 phantom_cqe_cnt; u32 next_cq_cons; + bool flushed; }; struct bnxt_qplib_qp { struct bnxt_qplib_pd *pd; struct bnxt_qplib_dpi *dpi; u64 qp_handle; +#define BNXT_QPLIB_QP_ID_INVALID 0xFFFFFFFF u32 id; u8 type; u8 sig_type; @@ -296,6 +297,8 @@ struct bnxt_qplib_qp { dma_addr_t sq_hdr_buf_map; void *rq_hdr_buf; dma_addr_t rq_hdr_buf_map; + struct list_head sq_flush; + struct list_head rq_flush; }; #define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE sizeof(struct cq_base) @@ -351,6 +354,7 @@ struct bnxt_qplib_cq { u16 period; struct bnxt_qplib_hwq hwq; u32 cnq_hw_ring_id; + struct bnxt_qplib_nq *nq; bool resize_in_progress; struct scatterlist *sghead; u32 nmap; @@ -360,6 +364,9 @@ struct bnxt_qplib_cq { unsigned long flags; #define CQ_FLAGS_RESIZE_IN_PROG 1 wait_queue_head_t waitq; + struct list_head sqf_head, rqf_head; + atomic_t arm_state; + spinlock_t compl_lock; /* synch CQ handlers */ }; #define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE sizeof(struct xrrq_irrq) @@ -400,6 +407,7 @@ struct bnxt_qplib_nq { struct pci_dev *pdev; int vector; + cpumask_t mask; int budget; bool requested; struct tasklet_struct worker; @@ -417,11 +425,19 @@ struct bnxt_qplib_nq { (struct bnxt_qplib_nq *nq, void *srq, u8 event); + struct workqueue_struct *cqn_wq; + char name[32]; +}; + +struct bnxt_qplib_nq_work { + struct work_struct work; + struct bnxt_qplib_nq *nq; + struct bnxt_qplib_cq *cq; }; void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq); int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, - int msix_vector, int bar_reg_offset, + int nq_idx, int msix_vector, int bar_reg_offset, int (*cqn_handler)(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *cq), int (*srqn_handler)(struct bnxt_qplib_nq *nq, @@ -453,4 +469,13 @@ bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq); void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type); void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq); int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq); +void bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp); +void bnxt_qplib_del_flush_qp(struct bnxt_qplib_qp *qp); +void bnxt_qplib_acquire_cq_locks(struct bnxt_qplib_qp *qp, + unsigned long *flags); +void bnxt_qplib_release_cq_locks(struct bnxt_qplib_qp *qp, + unsigned long *flags); +int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq, + struct bnxt_qplib_cqe *cqe, + int num_cqes); #endif /* __BNXT_QPLIB_FP_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 16e42754dbec..391bb7006e8f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -44,6 +44,9 @@ #include "roce_hsi.h" #include "qplib_res.h" #include "qplib_rcfw.h" +#include "qplib_sp.h" +#include "qplib_fp.h" + static void bnxt_qplib_service_creq(unsigned long data); /* Hardware communication channel */ @@ -279,16 +282,29 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, struct creq_qp_event *qp_event) { struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq; + struct creq_qp_error_notification *err_event; struct bnxt_qplib_crsq *crsqe; unsigned long flags; + struct bnxt_qplib_qp *qp; u16 cbit, blocked = 0; u16 cookie; __le16 mcookie; + u32 qp_id; switch (qp_event->event) { case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: + err_event = (struct creq_qp_error_notification *)qp_event; + qp_id = le32_to_cpu(err_event->xid); + qp = rcfw->qp_tbl[qp_id].qp_handle; dev_dbg(&rcfw->pdev->dev, "QPLIB: Received QP error notification"); + dev_dbg(&rcfw->pdev->dev, + "QPLIB: qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", + qp_id, err_event->req_err_state_reason, + err_event->res_err_state_reason); + bnxt_qplib_acquire_cq_locks(qp, &flags); + bnxt_qplib_mark_qp_error(qp); + bnxt_qplib_release_cq_locks(qp, &flags); break; default: /* Command Response */ @@ -507,6 +523,7 @@ skip_ctx_setup: void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { + kfree(rcfw->qp_tbl); kfree(rcfw->crsqe_tbl); bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->cmdq); bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->creq); @@ -514,7 +531,8 @@ void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) } int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, - struct bnxt_qplib_rcfw *rcfw) + struct bnxt_qplib_rcfw *rcfw, + int qp_tbl_sz) { rcfw->pdev = pdev; rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT; @@ -541,6 +559,12 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, if (!rcfw->crsqe_tbl) goto fail; + rcfw->qp_tbl_size = qp_tbl_sz; + rcfw->qp_tbl = kcalloc(qp_tbl_sz, sizeof(struct bnxt_qplib_qp_node), + GFP_KERNEL); + if (!rcfw->qp_tbl) + goto fail; + return 0; fail: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 09ce121770cd..0ed312f17c8d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -148,6 +148,11 @@ struct bnxt_qplib_rcfw_sbuf { u32 size; }; +struct bnxt_qplib_qp_node { + u32 qp_id; /* QP id */ + void *qp_handle; /* ptr to qplib_qp */ +}; + /* RCFW Communication Channels */ struct bnxt_qplib_rcfw { struct pci_dev *pdev; @@ -181,11 +186,13 @@ struct bnxt_qplib_rcfw { /* Actual Cmd and Resp Queues */ struct bnxt_qplib_hwq cmdq; struct bnxt_qplib_crsq *crsqe_tbl; + int qp_tbl_size; + struct bnxt_qplib_qp_node *qp_tbl; }; void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, - struct bnxt_qplib_rcfw *rcfw); + struct bnxt_qplib_rcfw *rcfw, int qp_tbl_sz); void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, struct bnxt_qplib_rcfw *rcfw, @@ -207,4 +214,5 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx, int is_virtfn); +void bnxt_qplib_mark_qp_error(void *qp_handle); #endif /* __BNXT_QPLIB_RCFW_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 62447b3badec..4e101704e801 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -468,9 +468,11 @@ static void bnxt_qplib_free_sgid_tbl(struct bnxt_qplib_res *res, kfree(sgid_tbl->tbl); kfree(sgid_tbl->hw_id); kfree(sgid_tbl->ctx); + kfree(sgid_tbl->vlan); sgid_tbl->tbl = NULL; sgid_tbl->hw_id = NULL; sgid_tbl->ctx = NULL; + sgid_tbl->vlan = NULL; sgid_tbl->max = 0; sgid_tbl->active = 0; } @@ -491,8 +493,15 @@ static int bnxt_qplib_alloc_sgid_tbl(struct bnxt_qplib_res *res, if (!sgid_tbl->ctx) goto out_free2; + sgid_tbl->vlan = kcalloc(max, sizeof(u8), GFP_KERNEL); + if (!sgid_tbl->vlan) + goto out_free3; + sgid_tbl->max = max; return 0; +out_free3: + kfree(sgid_tbl->ctx); + sgid_tbl->ctx = NULL; out_free2: kfree(sgid_tbl->hw_id); sgid_tbl->hw_id = NULL; @@ -514,6 +523,7 @@ static void bnxt_qplib_cleanup_sgid_tbl(struct bnxt_qplib_res *res, } memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max); memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max); + memset(sgid_tbl->vlan, 0, sizeof(u8) * sgid_tbl->max); sgid_tbl->active = 0; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 2e4855509719..e87207526d2c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -116,6 +116,7 @@ struct bnxt_qplib_sgid_tbl { u16 max; u16 active; void *ctx; + u8 *vlan; }; struct bnxt_qplib_pkey_tbl { @@ -188,6 +189,7 @@ struct bnxt_qplib_res { struct bnxt_qplib_sgid_tbl sgid_tbl; struct bnxt_qplib_pkey_tbl pkey_tbl; struct bnxt_qplib_dpi_tbl dpi_tbl; + bool prio; }; #define to_bnxt_qplib(ptr, type, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index ef91ab786dd4..e277e54a05eb 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -213,6 +213,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, } memcpy(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero, sizeof(bnxt_qplib_gid_zero)); + sgid_tbl->vlan[index] = 0; sgid_tbl->active--; dev_dbg(&res->pdev->dev, "QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x", @@ -265,28 +266,32 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct cmdq_add_gid req; struct creq_add_gid_resp resp; u16 cmd_flags = 0; - u32 temp32[4]; - u16 temp16[3]; int rc; RCFW_CMD_PREP(req, ADD_GID, cmd_flags); - memcpy(temp32, gid->data, sizeof(struct bnxt_qplib_gid)); - req.gid[0] = cpu_to_be32(temp32[3]); - req.gid[1] = cpu_to_be32(temp32[2]); - req.gid[2] = cpu_to_be32(temp32[1]); - req.gid[3] = cpu_to_be32(temp32[0]); - if (vlan_id != 0xFFFF) - req.vlan = cpu_to_le16((vlan_id & - CMDQ_ADD_GID_VLAN_VLAN_ID_MASK) | - CMDQ_ADD_GID_VLAN_TPID_TPID_8100 | - CMDQ_ADD_GID_VLAN_VLAN_EN); + req.gid[0] = cpu_to_be32(((u32 *)gid->data)[3]); + req.gid[1] = cpu_to_be32(((u32 *)gid->data)[2]); + req.gid[2] = cpu_to_be32(((u32 *)gid->data)[1]); + req.gid[3] = cpu_to_be32(((u32 *)gid->data)[0]); + /* + * driver should ensure that all RoCE traffic is always VLAN + * tagged if RoCE traffic is running on non-zero VLAN ID or + * RoCE traffic is running on non-zero Priority. + */ + if ((vlan_id != 0xFFFF) || res->prio) { + if (vlan_id != 0xFFFF) + req.vlan = cpu_to_le16 + (vlan_id & CMDQ_ADD_GID_VLAN_VLAN_ID_MASK); + req.vlan |= cpu_to_le16 + (CMDQ_ADD_GID_VLAN_TPID_TPID_8100 | + CMDQ_ADD_GID_VLAN_VLAN_EN); + } /* MAC in network format */ - memcpy(temp16, smac, 6); - req.src_mac[0] = cpu_to_be16(temp16[0]); - req.src_mac[1] = cpu_to_be16(temp16[1]); - req.src_mac[2] = cpu_to_be16(temp16[2]); + req.src_mac[0] = cpu_to_be16(((u16 *)smac)[0]); + req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]); + req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]); rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, NULL, 0); @@ -297,6 +302,9 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, /* Add GID to the sgid_tbl */ memcpy(&sgid_tbl->tbl[free_idx], gid, sizeof(*gid)); sgid_tbl->active++; + if (vlan_id != 0xFFFF) + sgid_tbl->vlan[free_idx] = 1; + dev_dbg(&res->pdev->dev, "QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x", free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active); @@ -306,6 +314,43 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, return 0; } +int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, u16 gid_idx, + u8 *smac) +{ + struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, + struct bnxt_qplib_res, + sgid_tbl); + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct creq_modify_gid_resp resp; + struct cmdq_modify_gid req; + int rc; + u16 cmd_flags = 0; + + RCFW_CMD_PREP(req, MODIFY_GID, cmd_flags); + + req.gid[0] = cpu_to_be32(((u32 *)gid->data)[3]); + req.gid[1] = cpu_to_be32(((u32 *)gid->data)[2]); + req.gid[2] = cpu_to_be32(((u32 *)gid->data)[1]); + req.gid[3] = cpu_to_be32(((u32 *)gid->data)[0]); + if (res->prio) { + req.vlan |= cpu_to_le16 + (CMDQ_ADD_GID_VLAN_TPID_TPID_8100 | + CMDQ_ADD_GID_VLAN_VLAN_EN); + } + + /* MAC in network format */ + req.src_mac[0] = cpu_to_be16(((u16 *)smac)[0]); + req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]); + req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]); + + req.gid_index = cpu_to_le16(gid_idx); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + return rc; +} + /* pkeys */ int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res, struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 2ce7e2a32cf0..11322582f5e4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -135,6 +135,8 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, u8 *mac, u16 vlan_id, bool update, u32 *index); +int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, u16 gid_idx, u8 *smac); int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res, struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index, u16 *pkey); diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index fc23477ac52f..eeb55b2db57e 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -1473,8 +1473,8 @@ struct cmdq_modify_gid { u8 resp_size; u8 reserved8; __le64 resp_addr; - __le32 gid[4]; - __le16 src_mac[3]; + __be32 gid[4]; + __be16 src_mac[3]; __le16 vlan; #define CMDQ_MODIFY_GID_VLAN_VLAN_ID_MASK 0xfffUL #define CMDQ_MODIFY_GID_VLAN_VLAN_ID_SFT 0 diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c index 47b2ce2ef203..591de319c178 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.c +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -45,7 +45,6 @@ MODULE_AUTHOR("Boyd Faulkner, Steve Wise"); MODULE_DESCRIPTION("Chelsio T3 RDMA Driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); static void open_rnic_dev(struct t3cdev *); static void close_rnic_dev(struct t3cdev *); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 0cd0c1fa27d4..099e76f3758a 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1336,8 +1336,7 @@ static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) { struct iwch_dev *iwch_dev = to_iwch_dev(ibdev); struct ethtool_drvinfo info; @@ -1345,7 +1344,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str, pr_debug("%s dev 0x%p\n", __func__, iwch_dev); lldev->ethtool_ops->get_drvinfo(lldev, &info); - snprintf(str, str_len, "%s", info.fw_version); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version); } int iwch_register_device(struct iwch_dev *dev) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index e49b34c3b136..ceaa2fa54d32 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2871,7 +2871,6 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) return 0; pr_debug("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - BUG_ON(!ep); /* The cm_id may be null if we failed to connect */ mutex_lock(&ep->com.mutex); diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index ae0b79aeea2e..fc886f81b885 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -44,7 +44,6 @@ MODULE_AUTHOR("Steve Wise"); MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); static int allow_db_fc_on_t5; module_param(allow_db_fc_on_t5, int, 0644); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0771e9a4d061..346e8334279a 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -517,14 +517,13 @@ static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *dev, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *dev, char *str) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev); pr_debug("%s dev 0x%p\n", __func__, dev); - snprintf(str, str_len, "%u.%u.%u.%u", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u.%u", FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers), FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers), FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers), diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig index f6ea0881765a..7b146b67a80f 100644 --- a/drivers/infiniband/hw/hfi1/Kconfig +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -13,13 +13,6 @@ config HFI1_DEBUG_SDMA_ORDER ---help--- This is a debug flag to test for out of order sdma completions for unit testing -config HFI1_VERBS_31BIT_PSN - bool "HFI1 enable 31 bit PSN" - depends on INFINIBAND_HFI1 - default y - ---help--- - Setting this enables 31 BIT PSN - For verbs RC/UC config SDMA_VERBOSITY bool "Config SDMA Verbosity" depends on INFINIBAND_HFI1 diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 88085f65432e..66d538c033b0 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ - eprom.o file_ops.o firmware.o \ + eprom.o exp_rcv.o file_ops.o firmware.o \ init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index e2cd2cd3b28a..a97055dd4fbd 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -335,10 +335,10 @@ static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu) sde->cpu = cpu; cpumask_clear(&msix->mask); cpumask_set_cpu(cpu, &msix->mask); - dd_dev_dbg(dd, "IRQ vector: %u, type %s engine %u -> cpu: %d\n", - msix->msix.vector, irq_type_names[msix->type], + dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n", + msix->irq, irq_type_names[msix->type], sde->this_idx, cpu); - irq_set_affinity_hint(msix->msix.vector, &msix->mask); + irq_set_affinity_hint(msix->irq, &msix->mask); /* * Set the new cpu in the hfi1_affinity_node and clean @@ -387,7 +387,7 @@ static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix) { struct irq_affinity_notify *notify = &msix->notify; - notify->irq = msix->msix.vector; + notify->irq = msix->irq; notify->notify = hfi1_irq_notifier_notify; notify->release = hfi1_irq_notifier_release; @@ -472,10 +472,10 @@ static int get_irq_affinity(struct hfi1_devdata *dd, } cpumask_set_cpu(cpu, &msix->mask); - dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n", - msix->msix.vector, irq_type_names[msix->type], + dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n", + msix->irq, irq_type_names[msix->type], extra, cpu); - irq_set_affinity_hint(msix->msix.vector, &msix->mask); + irq_set_affinity_hint(msix->irq, &msix->mask); if (msix->type == IRQ_SDMA) { sde->cpu = cpu; @@ -533,7 +533,7 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, } } - irq_set_affinity_hint(msix->msix.vector, NULL); + irq_set_affinity_hint(msix->irq, NULL); cpumask_clear(&msix->mask); mutex_unlock(&node_affinity.lock); } diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index e78c7aa094e0..2a1e374169c0 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -75,24 +75,26 @@ struct hfi1_msix_entry; /* Initialize non-HT cpu cores mask */ void init_real_cpu_mask(void); /* Initialize driver affinity data */ -int hfi1_dev_affinity_init(struct hfi1_devdata *); +int hfi1_dev_affinity_init(struct hfi1_devdata *dd); /* * Set IRQ affinity to a CPU. The function will determine the * CPU and set the affinity to it. */ -int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *); +int hfi1_get_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix); /* * Remove the IRQ's CPU affinity. This function also updates * any internal CPU tracking data */ -void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *); +void hfi1_put_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix); /* * Determine a CPU affinity for a user process, if the process does not * have an affinity set yet. */ -int hfi1_get_proc_affinity(int); +int hfi1_get_proc_affinity(int node); /* Release a CPU used by a user process. */ -void hfi1_put_proc_affinity(int); +void hfi1_put_proc_affinity(int cpu); struct hfi1_affinity_node { int node; diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h index 794e6814a531..522b40ed9937 100644 --- a/drivers/infiniband/hw/hfi1/aspm.h +++ b/drivers/infiniband/hw/hfi1/aspm.h @@ -237,14 +237,17 @@ static inline void aspm_disable_all(struct hfi1_devdata *dd) { struct hfi1_ctxtdata *rcd; unsigned long flags; - unsigned i; + u16 i; for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { - rcd = dd->rcd[i]; - del_timer_sync(&rcd->aspm_timer); - spin_lock_irqsave(&rcd->aspm_lock, flags); - rcd->aspm_intr_enable = false; - spin_unlock_irqrestore(&rcd->aspm_lock, flags); + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) { + del_timer_sync(&rcd->aspm_timer); + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = false; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + hfi1_rcd_put(rcd); + } } aspm_disable(dd); @@ -256,7 +259,7 @@ static inline void aspm_enable_all(struct hfi1_devdata *dd) { struct hfi1_ctxtdata *rcd; unsigned long flags; - unsigned i; + u16 i; aspm_enable(dd); @@ -264,11 +267,14 @@ static inline void aspm_enable_all(struct hfi1_devdata *dd) return; for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { - rcd = dd->rcd[i]; - spin_lock_irqsave(&rcd->aspm_lock, flags); - rcd->aspm_intr_enable = true; - rcd->aspm_enabled = true; - spin_unlock_irqrestore(&rcd->aspm_lock, flags); + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) { + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = true; + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + hfi1_rcd_put(rcd); + } } } @@ -284,13 +290,18 @@ static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd) static inline void aspm_init(struct hfi1_devdata *dd) { - unsigned i; + struct hfi1_ctxtdata *rcd; + u16 i; spin_lock_init(&dd->aspm_lock); dd->aspm_supported = aspm_hw_l1_supported(dd); - for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) - aspm_ctx_init(dd->rcd[i]); + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + aspm_ctx_init(rcd); + hfi1_rcd_put(rcd); + } /* Start with ASPM disabled */ aspm_hw_set_l1_ent_latency(dd); diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 94b54850ec75..b2ed4b9cda6e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1012,14 +1012,15 @@ static struct flag_table dc8051_info_err_flags[] = { */ static struct flag_table dc8051_info_host_msg_flags[] = { FLAG_ENTRY0("Host request done", 0x0001), - FLAG_ENTRY0("BC SMA message", 0x0002), - FLAG_ENTRY0("BC PWR_MGM message", 0x0004), + FLAG_ENTRY0("BC PWR_MGM message", 0x0002), + FLAG_ENTRY0("BC SMA message", 0x0004), FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008), FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010), FLAG_ENTRY0("External device config request", 0x0020), FLAG_ENTRY0("VerifyCap all frames received", 0x0040), FLAG_ENTRY0("LinkUp achieved", 0x0080), FLAG_ENTRY0("Link going down", 0x0100), + FLAG_ENTRY0("Link width downgraded", 0x0200), }; static u32 encoded_size(u32 size); @@ -1064,8 +1065,13 @@ static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); static int thermal_init(struct hfi1_devdata *dd); +static void update_statusp(struct hfi1_pportdata *ppd, u32 state); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); +static void log_state_transition(struct hfi1_pportdata *ppd, u32 state); +static void log_physical_state(struct hfi1_pportdata *ppd, u32 state); +static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs); static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); static void handle_temp_err(struct hfi1_devdata *dd); @@ -1294,25 +1300,71 @@ CNTR_ELEM(#name, \ CNTR_SYNTH, \ access_ibp_##cntr) +/** + * hfi_addr_from_offset - return addr for readq/writeq + * @dd - the dd device + * @offset - the offset of the CSR within bar0 + * + * This routine selects the appropriate base address + * based on the indicated offset. + */ +static inline void __iomem *hfi1_addr_from_offset( + const struct hfi1_devdata *dd, + u32 offset) +{ + if (offset >= dd->base2_start) + return dd->kregbase2 + (offset - dd->base2_start); + return dd->kregbase1 + offset; +} + +/** + * read_csr - read CSR at the indicated offset + * @dd - the dd device + * @offset - the offset of the CSR within bar0 + * + * Return: the value read or all FF's if there + * is no mapping + */ u64 read_csr(const struct hfi1_devdata *dd, u32 offset) { - if (dd->flags & HFI1_PRESENT) { - return readq((void __iomem *)dd->kregbase + offset); - } + if (dd->flags & HFI1_PRESENT) + return readq(hfi1_addr_from_offset(dd, offset)); return -1; } +/** + * write_csr - write CSR at the indicated offset + * @dd - the dd device + * @offset - the offset of the CSR within bar0 + * @value - value to write + */ void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value) { - if (dd->flags & HFI1_PRESENT) - writeq(value, (void __iomem *)dd->kregbase + offset); + if (dd->flags & HFI1_PRESENT) { + void __iomem *base = hfi1_addr_from_offset(dd, offset); + + /* avoid write to RcvArray */ + if (WARN_ON(offset >= RCV_ARRAY && offset < dd->base2_start)) + return; + writeq(value, base); + } } +/** + * get_csr_addr - return te iomem address for offset + * @dd - the dd device + * @offset - the offset of the CSR within bar0 + * + * Return: The iomem address to use in subsequent + * writeq/readq operations. + */ void __iomem *get_csr_addr( - struct hfi1_devdata *dd, + const struct hfi1_devdata *dd, u32 offset) { - return (void __iomem *)dd->kregbase + offset; + if (dd->flags & HFI1_PRESENT) + return hfi1_addr_from_offset(dd, offset); + return NULL; } static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr, @@ -5496,7 +5548,7 @@ static void update_rcverr_timer(unsigned long opaque) set_link_down_reason( ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN); - queue_work(ppd->hfi1_wq, &ppd->link_bounce_work); + queue_work(ppd->link_wq, &ppd->link_bounce_work); } dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt; @@ -6051,7 +6103,7 @@ static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) * will not happen. We have to do it here * before turning the DC off. */ - queue_work(ppd->hfi1_wq, &ppd->link_down_work); + queue_work(ppd->link_wq, &ppd->link_down_work); } } else { dd_dev_info(dd, "%s: QSFP module inserted\n", @@ -6086,7 +6138,7 @@ static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) /* Schedule the QSFP work only if there is a cable attached. */ if (qsfp_mod_present(ppd)) - queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work); + queue_work(ppd->link_wq, &ppd->qsfp_info.qsfp_work); } static int request_host_lcb_access(struct hfi1_devdata *dd) @@ -6735,13 +6787,17 @@ static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze) static void rxe_freeze(struct hfi1_devdata *dd) { int i; + struct hfi1_ctxtdata *rcd; /* disable port */ clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); /* disable all receive contexts */ - for (i = 0; i < dd->num_rcv_contexts; i++) - hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i); + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, rcd); + hfi1_rcd_put(rcd); + } } /* @@ -6753,21 +6809,24 @@ static void rxe_freeze(struct hfi1_devdata *dd) static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) { u32 rcvmask; - int i; + u16 i; + struct hfi1_ctxtdata *rcd; /* enable all kernel contexts */ for (i = 0; i < dd->num_rcv_contexts; i++) { - struct hfi1_ctxtdata *rcd = dd->rcd[i]; + rcd = hfi1_rcd_get_by_index(dd, i); /* Ensure all non-user contexts(including vnic) are enabled */ - if (!rcd || !rcd->sc || (rcd->sc->type == SC_USER)) + if (!rcd || !rcd->sc || (rcd->sc->type == SC_USER)) { + hfi1_rcd_put(rcd); continue; - + } rcvmask = HFI1_RCVCTRL_CTXT_ENB; /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */ - rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ? + rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; - hfi1_rcvctrl(dd, rcvmask, i); + hfi1_rcvctrl(dd, rcvmask, rcd); + hfi1_rcd_put(rcd); } /* enable port */ @@ -6906,7 +6965,7 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) static const char * const link_down_reason_strs[] = { [OPA_LINKDOWN_REASON_NONE] = "None", - [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0", + [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0", [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length", [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long", [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short", @@ -6996,6 +7055,7 @@ void handle_link_down(struct work_struct *work) /* Go offline first, then deal with reading/writing through 8051 */ was_up = !!(ppd->host_link_state & HLS_UP); set_link_state(ppd, HLS_DN_OFFLINE); + xchg(&ppd->is_link_down_queued, 0); if (was_up) { lcl_reason = 0; @@ -7330,7 +7390,7 @@ void handle_verify_cap(struct work_struct *work) struct hfi1_devdata *dd = ppd->dd; u64 reg; u8 power_management; - u8 continious; + u8 continuous; u8 vcu; u8 vau; u8 z; @@ -7349,7 +7409,7 @@ void handle_verify_cap(struct work_struct *work) lcb_shutdown(dd, 0); adjust_lcb_for_fpga_serdes(dd); - read_vc_remote_phy(dd, &power_management, &continious); + read_vc_remote_phy(dd, &power_management, &continuous); read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf, &partner_supported_crc); read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths); @@ -7363,7 +7423,7 @@ void handle_verify_cap(struct work_struct *work) get_link_widths(dd, &active_tx, &active_rx); dd_dev_info(dd, "Peer PHY: power management 0x%x, continuous updates 0x%x\n", - (int)power_management, (int)continious); + (int)power_management, (int)continuous); dd_dev_info(dd, "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n", (int)vau, (int)z, (int)vcu, (int)vl15buf, @@ -7689,12 +7749,12 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) host_msg &= ~(u64)HOST_REQ_DONE; } if (host_msg & BC_SMA_MSG) { - queue_work(ppd->hfi1_wq, &ppd->sma_message_work); + queue_work(ppd->link_wq, &ppd->sma_message_work); host_msg &= ~(u64)BC_SMA_MSG; } if (host_msg & LINKUP_ACHIEVED) { dd_dev_info(dd, "8051: Link up\n"); - queue_work(ppd->hfi1_wq, &ppd->link_up_work); + queue_work(ppd->link_wq, &ppd->link_up_work); host_msg &= ~(u64)LINKUP_ACHIEVED; } if (host_msg & EXT_DEVICE_CFG_REQ) { @@ -7702,7 +7762,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; } if (host_msg & VERIFY_CAP_FRAME) { - queue_work(ppd->hfi1_wq, &ppd->link_vc_work); + queue_work(ppd->link_wq, &ppd->link_vc_work); host_msg &= ~(u64)VERIFY_CAP_FRAME; } if (host_msg & LINK_GOING_DOWN) { @@ -7717,7 +7777,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) host_msg &= ~(u64)LINK_GOING_DOWN; } if (host_msg & LINK_WIDTH_DOWNGRADED) { - queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work); + queue_work(ppd->link_wq, &ppd->link_downgrade_work); host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; } if (host_msg) { @@ -7752,15 +7812,22 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) if (queue_link_down) { /* * if the link is already going down or disabled, do not - * queue another + * queue another. If there's a link down entry already + * queued, don't queue another one. */ if ((ppd->host_link_state & (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) || ppd->link_enabled == 0) { - dd_dev_info(dd, "%s: not queuing link down\n", - __func__); + dd_dev_info(dd, "%s: not queuing link down. host_link_state %x, link_enabled %x\n", + __func__, ppd->host_link_state, + ppd->link_enabled); } else { - queue_work(ppd->hfi1_wq, &ppd->link_down_work); + if (xchg(&ppd->is_link_down_queued, 1) == 1) + dd_dev_info(dd, + "%s: link down request already queued\n", + __func__); + else + queue_work(ppd->link_wq, &ppd->link_down_work); } } } @@ -7968,7 +8035,7 @@ static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) dd_dev_info_ratelimited(dd, "%s: PortErrorAction bounce\n", __func__); set_link_down_reason(ppd, lcl_reason, 0, lcl_reason); - queue_work(ppd->hfi1_wq, &ppd->link_bounce_work); + queue_work(ppd->link_wq, &ppd->link_bounce_work); } } @@ -8052,7 +8119,7 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) char *err_detail; if (likely(source < dd->num_rcv_contexts)) { - rcd = dd->rcd[source]; + rcd = hfi1_rcd_get_by_index(dd, source); if (rcd) { /* Check for non-user contexts, including vnic */ if ((source < dd->first_dyn_alloc_ctxt) || @@ -8060,6 +8127,8 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) rcd->do_interrupt(rcd, 0); else handle_user_interrupt(rcd); + + hfi1_rcd_put(rcd); return; /* OK */ } /* received an interrupt, but no rcd */ @@ -8081,12 +8150,14 @@ static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source) char *err_detail; if (likely(source < dd->num_rcv_contexts)) { - rcd = dd->rcd[source]; + rcd = hfi1_rcd_get_by_index(dd, source); if (rcd) { /* only pay attention to user urgent interrupts */ if ((source >= dd->first_dyn_alloc_ctxt) && (!rcd->sc || (rcd->sc->type == SC_USER))) handle_user_interrupt(rcd); + + hfi1_rcd_put(rcd); return; /* OK */ } /* received an interrupt, but no rcd */ @@ -8219,8 +8290,8 @@ static irqreturn_t sdma_interrupt(int irq, void *data) /* handle the interrupt(s) */ sdma_engine_interrupt(sde, status); } else { - dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n", - sde->this_idx); + dd_dev_err_ratelimited(dd, "SDMA engine %u interrupt, but no status bits set\n", + sde->this_idx); } return IRQ_HANDLED; } @@ -8291,7 +8362,7 @@ static irqreturn_t receive_context_interrupt(int irq, void *data) int disposition; int present; - trace_hfi1_receive_interrupt(dd, rcd->ctxt); + trace_hfi1_receive_interrupt(dd, rcd); this_cpu_inc(*dd->int_counter); aspm_ctx_disable(rcd); @@ -8781,6 +8852,20 @@ static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, & REMOTE_DEVICE_REV_MASK; } +int write_host_interface_version(struct hfi1_devdata *dd, u8 version) +{ + u32 frame; + u32 mask; + + mask = (HOST_INTERFACE_VERSION_MASK << HOST_INTERFACE_VERSION_SHIFT); + read_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, &frame); + /* Clear, then set field */ + frame &= ~mask; + frame |= ((u32)version << HOST_INTERFACE_VERSION_SHIFT); + return load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, + frame); +} + void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, u8 *ver_patch) { @@ -9257,12 +9342,6 @@ int start_link(struct hfi1_pportdata *ppd) */ tune_serdes(ppd); - if (!ppd->link_enabled) { - dd_dev_info(ppd->dd, - "%s: stopping link start because link is disabled\n", - __func__); - return 0; - } if (!ppd->driver_link_ready) { dd_dev_info(ppd->dd, "%s: stopping link start because driver is not ready\n", @@ -9373,13 +9452,13 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) || (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING)) - dd_dev_info(dd, "%s: QSFP cable temperature too high\n", - __func__); + dd_dev_err(dd, "%s: QSFP cable temperature too high\n", + __func__); if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) || (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING)) - dd_dev_info(dd, "%s: QSFP cable temperature too low\n", - __func__); + dd_dev_err(dd, "%s: QSFP cable temperature too low\n", + __func__); /* * The remaining alarms/warnings don't matter if the link is down. @@ -9389,75 +9468,75 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) || (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING)) - dd_dev_info(dd, "%s: QSFP supply voltage too high\n", - __func__); + dd_dev_err(dd, "%s: QSFP supply voltage too high\n", + __func__); if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) || (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING)) - dd_dev_info(dd, "%s: QSFP supply voltage too low\n", - __func__); + dd_dev_err(dd, "%s: QSFP supply voltage too low\n", + __func__); /* Byte 2 is vendor specific */ if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) || (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING)) - dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n", - __func__); + dd_dev_err(dd, "%s: Cable RX channel 1/2 power too high\n", + __func__); if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) || (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING)) - dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n", - __func__); + dd_dev_err(dd, "%s: Cable RX channel 1/2 power too low\n", + __func__); if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) || (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING)) - dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n", - __func__); + dd_dev_err(dd, "%s: Cable RX channel 3/4 power too high\n", + __func__); if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) || (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING)) - dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n", - __func__); + dd_dev_err(dd, "%s: Cable RX channel 3/4 power too low\n", + __func__); if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) || (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING)) - dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n", - __func__); + dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too high\n", + __func__); if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) || (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING)) - dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n", - __func__); + dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too low\n", + __func__); if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) || (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING)) - dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n", - __func__); + dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too high\n", + __func__); if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) || (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING)) - dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n", - __func__); + dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too low\n", + __func__); if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) || (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING)) - dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n", - __func__); + dd_dev_err(dd, "%s: Cable TX channel 1/2 power too high\n", + __func__); if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) || (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING)) - dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n", - __func__); + dd_dev_err(dd, "%s: Cable TX channel 1/2 power too low\n", + __func__); if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) || (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING)) - dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n", - __func__); + dd_dev_err(dd, "%s: Cable TX channel 3/4 power too high\n", + __func__); if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) || (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING)) - dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n", - __func__); + dd_dev_err(dd, "%s: Cable TX channel 3/4 power too low\n", + __func__); /* Bytes 9-10 and 11-12 are reserved */ /* Bytes 13-15 are vendor specific */ @@ -9480,6 +9559,13 @@ void qsfp_event(struct work_struct *work) if (!qsfp_mod_present(ppd)) return; + if (ppd->host_link_state == HLS_DN_DISABLE) { + dd_dev_info(ppd->dd, + "%s: stopping link start because link is disabled\n", + __func__); + return; + } + /* * Turn DC back on after cable has been re-inserted. Up until * now, the DC has been in reset to save power. @@ -9635,7 +9721,7 @@ static void try_start_link(struct hfi1_pportdata *ppd) "QSFP not responding, waiting and retrying %d\n", (int)ppd->qsfp_retry_count); ppd->qsfp_retry_count++; - queue_delayed_work(ppd->hfi1_wq, &ppd->start_link_work, + queue_delayed_work(ppd->link_wq, &ppd->start_link_work, msecs_to_jiffies(QSFP_RETRY_WAIT)); return; } @@ -9742,17 +9828,6 @@ static inline int init_cpu_counters(struct hfi1_devdata *dd) return 0; } -static const char * const pt_names[] = { - "expected", - "eager", - "invalid" -}; - -static const char *pt_name(u32 type) -{ - return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type]; -} - /* * index is the index into the receive array */ @@ -9760,35 +9835,34 @@ void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, u32 type, unsigned long pa, u16 order) { u64 reg; - void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc : - (dd->kregbase + RCV_ARRAY)); if (!(dd->flags & HFI1_PRESENT)) goto done; - if (type == PT_INVALID) { + if (type == PT_INVALID || type == PT_INVALID_FLUSH) { pa = 0; + order = 0; } else if (type > PT_INVALID) { dd_dev_err(dd, "unexpected receive array type %u for index %u, not handled\n", type, index); goto done; } - - hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx", - pt_name(type), index, pa, (unsigned long)order); + trace_hfi1_put_tid(dd, index, type, pa, order); #define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */ reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK) << RCV_ARRAY_RT_ADDR_SHIFT; - writeq(reg, base + (index * 8)); + trace_hfi1_write_rcvarray(dd->rcvarray_wc + (index * 8), reg); + writeq(reg, dd->rcvarray_wc + (index * 8)); - if (type == PT_EAGER) + if (type == PT_EAGER || type == PT_INVALID_FLUSH || (index & 3) == 3) /* - * Eager entries are written one-by-one so we have to push them - * after we write the entry. + * Eager entries are written and flushed + * + * Expected entries are flushed every 4 writes */ flush_wc(); done: @@ -9810,15 +9884,6 @@ void hfi1_clear_tids(struct hfi1_ctxtdata *rcd) hfi1_put_tid(dd, i, PT_INVALID, 0, 0); } -struct ib_header *hfi1_get_msgheader( - struct hfi1_devdata *dd, __le32 *rhf_addr) -{ - u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr)); - - return (struct ib_header *) - (rhf_addr - dd->rhf_offset + offset); -} - static const char * const ib_cfg_name_strings[] = { "HFI1_IB_CFG_LIDLMC", "HFI1_IB_CFG_LWID_DG_ENB", @@ -10010,10 +10075,16 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) struct hfi1_devdata *dd = ppd->dd; u32 mask = ~((1U << ppd->lmc) - 1); u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1); + u32 lid; + /* + * Program 0 in CSR if port lid is extended. This prevents + * 9B packets being sent out for large lids. + */ + lid = (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) ? 0 : ppd->lid; c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); - c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) + c1 |= ((lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) | ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK) << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT); @@ -10024,7 +10095,7 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) */ sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) << SEND_CTXT_CHECK_SLID_MASK_SHIFT) | - (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << + (((lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << SEND_CTXT_CHECK_SLID_VALUE_SHIFT); for (i = 0; i < dd->chip_send_contexts; i++) { @@ -10034,29 +10105,7 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) } /* Now we have to do the same thing for the sdma engines */ - sdma_update_lmc(dd, mask, ppd->lid); -} - -static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs) -{ - unsigned long timeout; - u32 curr_state; - - timeout = jiffies + msecs_to_jiffies(msecs); - while (1) { - curr_state = read_physical_state(dd); - if (curr_state == state) - break; - if (time_after(jiffies, timeout)) { - dd_dev_err(dd, - "timeout waiting for phy link state 0x%x, current state is 0x%x\n", - state, curr_state); - return -ETIMEDOUT; - } - usleep_range(1950, 2050); /* sleep 2ms-ish */ - } - - return 0; + sdma_update_lmc(dd, mask, lid); } static const char *state_completed_string(u32 completed) @@ -10238,8 +10287,10 @@ static void force_logical_link_state_down(struct hfi1_pportdata *ppd) write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, 0); write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, 0); - /* call again to adjust ppd->statusp, if needed */ - get_logical_state(ppd); + /* adjust ppd->statusp, if needed */ + update_statusp(ppd, IB_PORT_DOWN); + + dd_dev_info(ppd->dd, "logical state forced to LINK_DOWN\n"); } /* @@ -10253,49 +10304,35 @@ static void force_logical_link_state_down(struct hfi1_pportdata *ppd) static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) { struct hfi1_devdata *dd = ppd->dd; - u32 pstate, previous_state; + u32 previous_state; int ret; - int do_transition; - int do_wait; update_lcb_cache(dd); previous_state = ppd->host_link_state; ppd->host_link_state = HLS_GOING_OFFLINE; - pstate = read_physical_state(dd); - if (pstate == PLS_OFFLINE) { - do_transition = 0; /* in right state */ - do_wait = 0; /* ...no need to wait */ - } else if ((pstate & 0xf0) == PLS_OFFLINE) { - do_transition = 0; /* in an offline transient state */ - do_wait = 1; /* ...wait for it to settle */ - } else { - do_transition = 1; /* need to move to offline */ - do_wait = 1; /* ...will need to wait */ - } - if (do_transition) { - ret = set_physical_link_state(dd, - (rem_reason << 8) | PLS_OFFLINE); + /* start offline transition */ + ret = set_physical_link_state(dd, (rem_reason << 8) | PLS_OFFLINE); - if (ret != HCMD_SUCCESS) { - dd_dev_err(dd, - "Failed to transition to Offline link state, return %d\n", - ret); - return -EINVAL; - } - if (ppd->offline_disabled_reason == - HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) - ppd->offline_disabled_reason = - HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Offline link state, return %d\n", + ret); + return -EINVAL; } + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); - if (do_wait) { - /* it can take a while for the link to go down */ - ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000); - if (ret < 0) - return ret; - } + /* + * Wait for offline transition. It can take a while for + * the link to go down. + */ + ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 10000); + if (ret < 0) + return ret; /* * Now in charge of LCB - must be after the physical state is @@ -10415,11 +10452,11 @@ static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state) } /* - * driver_physical_state - convert the driver's notion of a port's + * driver_pstate - convert the driver's notion of a port's * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*). * Return -1 (converted to a u32) to indicate error. */ -u32 driver_physical_state(struct hfi1_pportdata *ppd) +u32 driver_pstate(struct hfi1_pportdata *ppd) { switch (ppd->host_link_state) { case HLS_UP_INIT: @@ -10449,11 +10486,11 @@ u32 driver_physical_state(struct hfi1_pportdata *ppd) } /* - * driver_logical_state - convert the driver's notion of a port's + * driver_lstate - convert the driver's notion of a port's * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1 * (converted to a u32) to indicate error. */ -u32 driver_logical_state(struct hfi1_pportdata *ppd) +u32 driver_lstate(struct hfi1_pportdata *ppd) { if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN)) return IB_PORT_DOWN; @@ -10484,6 +10521,14 @@ void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, } /* + * Verify if BCT for data VLs is non-zero. + */ +static inline bool data_vls_operational(struct hfi1_pportdata *ppd) +{ + return !!ppd->actual_vls_operational; +} + +/* * Change the physical and/or logical link state. * * Do not call this routine while inside an interrupt. It contains @@ -10545,38 +10590,58 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) goto unexpected; } + /* + * Wait for Link_Up physical state. + * Physical and Logical states should already be + * be transitioned to LinkUp and LinkInit respectively. + */ + ret = wait_physical_linkstate(ppd, PLS_LINKUP, 1000); + if (ret) { + dd_dev_err(dd, + "%s: physical state did not change to LINK-UP\n", + __func__); + break; + } + ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000); if (ret) { dd_dev_err(dd, "%s: logical state did not change to INIT\n", __func__); - } else { - /* clear old transient LINKINIT_REASON code */ - if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR) - ppd->linkinit_reason = - OPA_LINKINIT_REASON_LINKUP; + break; + } - /* enable the port */ - add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + /* clear old transient LINKINIT_REASON code */ + if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR) + ppd->linkinit_reason = + OPA_LINKINIT_REASON_LINKUP; - handle_linkup_change(dd, 1); - ppd->host_link_state = HLS_UP_INIT; - } + /* enable the port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + handle_linkup_change(dd, 1); + ppd->host_link_state = HLS_UP_INIT; break; case HLS_UP_ARMED: if (ppd->host_link_state != HLS_UP_INIT) goto unexpected; - ppd->host_link_state = HLS_UP_ARMED; + if (!data_vls_operational(ppd)) { + dd_dev_err(dd, + "%s: data VLs not operational\n", __func__); + ret = -EINVAL; + break; + } + set_logical_state(dd, LSTATE_ARMED); ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000); if (ret) { - /* logical state didn't change, stay at init */ - ppd->host_link_state = HLS_UP_INIT; dd_dev_err(dd, "%s: logical state did not change to ARMED\n", __func__); + break; } + ppd->host_link_state = HLS_UP_ARMED; /* * The simulator does not currently implement SMA messages, * so neighbor_normal is not set. Set it here when we first @@ -10589,18 +10654,16 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) if (ppd->host_link_state != HLS_UP_ARMED) goto unexpected; - ppd->host_link_state = HLS_UP_ACTIVE; set_logical_state(dd, LSTATE_ACTIVE); ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000); if (ret) { - /* logical state didn't change, stay at armed */ - ppd->host_link_state = HLS_UP_ARMED; dd_dev_err(dd, "%s: logical state did not change to ACTIVE\n", __func__); } else { /* tell all engines to go running */ sdma_all_running(dd); + ppd->host_link_state = HLS_UP_ACTIVE; /* Signal the IB layer that the port has went active */ event.device = &dd->verbs_dev.rdi.ibdev; @@ -10658,6 +10721,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) */ if (ret) goto_offline(ppd, 0); + else + log_physical_state(ppd, PLS_POLLING); break; case HLS_DN_DISABLE: /* link is disabled */ @@ -10682,6 +10747,13 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) ret = -EINVAL; break; } + ret = wait_physical_linkstate(ppd, PLS_DISABLED, 10000); + if (ret) { + dd_dev_err(dd, + "%s: physical state did not change to DISABLED\n", + __func__); + break; + } dc_shutdown(dd); } ppd->host_link_state = HLS_DN_DISABLE; @@ -10699,6 +10771,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) if (ppd->host_link_state != HLS_DN_POLL) goto unexpected; ppd->host_link_state = HLS_VERIFY_CAP; + log_physical_state(ppd, PLS_CONFIGPHY_VERIFYCAP); break; case HLS_GOING_UP: if (ppd->host_link_state != HLS_VERIFY_CAP) @@ -11693,16 +11766,18 @@ static u32 encoded_size(u32 size) return 0x1; /* if invalid, go with the minimum size */ } -void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, + struct hfi1_ctxtdata *rcd) { - struct hfi1_ctxtdata *rcd; u64 rcvctrl, reg; int did_enable = 0; + u16 ctxt; - rcd = dd->rcd[ctxt]; if (!rcd) return; + ctxt = rcd->ctxt; + hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op); rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL); @@ -12604,20 +12679,8 @@ const char *opa_pstate_name(u32 pstate) return "unknown"; } -/* - * Read the hardware link state and set the driver's cached value of it. - * Return the (new) current value. - */ -u32 get_logical_state(struct hfi1_pportdata *ppd) +static void update_statusp(struct hfi1_pportdata *ppd, u32 state) { - u32 new_state; - - new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd)); - if (new_state != ppd->lstate) { - dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", - opa_lstate_name(new_state), new_state); - ppd->lstate = new_state; - } /* * Set port status flags in the page mapped into userspace * memory. Do it here to ensure a reliable state - this is @@ -12627,7 +12690,7 @@ u32 get_logical_state(struct hfi1_pportdata *ppd) * function. */ if (ppd->statusp) { - switch (ppd->lstate) { + switch (state) { case IB_PORT_DOWN: case IB_PORT_INIT: *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | @@ -12641,10 +12704,9 @@ u32 get_logical_state(struct hfi1_pportdata *ppd) break; } } - return ppd->lstate; } -/** +/* * wait_logical_linkstate - wait for an IB link state change to occur * @ppd: port device * @state: the state to wait for @@ -12658,35 +12720,88 @@ static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs) { unsigned long timeout; + u32 new_state; timeout = jiffies + msecs_to_jiffies(msecs); while (1) { - if (get_logical_state(ppd) == state) - return 0; - if (time_after(jiffies, timeout)) + new_state = chip_to_opa_lstate(ppd->dd, + read_logical_state(ppd->dd)); + if (new_state == state) break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for link state 0x%x\n", + state); + return -ETIMEDOUT; + } msleep(20); } - dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state); - return -ETIMEDOUT; + update_statusp(ppd, state); + dd_dev_info(ppd->dd, + "logical state changed to %s (0x%x)\n", + opa_lstate_name(state), + state); + return 0; } -u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd) +static void log_state_transition(struct hfi1_pportdata *ppd, u32 state) { - u32 pstate; - u32 ib_pstate; + u32 ib_pstate = chip_to_opa_pstate(ppd->dd, state); - pstate = read_physical_state(ppd->dd); - ib_pstate = chip_to_opa_pstate(ppd->dd, pstate); - if (ppd->last_pstate != ib_pstate) { - dd_dev_info(ppd->dd, - "%s: physical state changed to %s (0x%x), phy 0x%x\n", - __func__, opa_pstate_name(ib_pstate), ib_pstate, - pstate); - ppd->last_pstate = ib_pstate; + dd_dev_info(ppd->dd, + "physical state changed to %s (0x%x), phy 0x%x\n", + opa_pstate_name(ib_pstate), ib_pstate, state); +} + +/* + * Read the physical hardware link state and check if it matches host + * drivers anticipated state. + */ +static void log_physical_state(struct hfi1_pportdata *ppd, u32 state) +{ + u32 read_state = read_physical_state(ppd->dd); + + if (read_state == state) { + log_state_transition(ppd, state); + } else { + dd_dev_err(ppd->dd, + "anticipated phy link state 0x%x, read 0x%x\n", + state, read_state); } - return ib_pstate; +} + +/* + * wait_physical_linkstate - wait for an physical link state change to occur + * @ppd: port device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for physical link state change to occur. + * Returns 0 if state reached, otherwise -ETIMEDOUT. + */ +static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if (read_state == state) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link state 0x%x\n", + state); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, state); + return 0; } #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ @@ -12809,30 +12924,24 @@ static void clean_up_interrupts(struct hfi1_devdata *dd) for (i = 0; i < dd->num_msix_entries; i++, me++) { if (!me->arg) /* => no irq, no affinity */ continue; - hfi1_put_irq_affinity(dd, &dd->msix_entries[i]); - free_irq(me->msix.vector, me->arg); + hfi1_put_irq_affinity(dd, me); + free_irq(me->irq, me->arg); } + + /* clean structures */ + kfree(dd->msix_entries); + dd->msix_entries = NULL; + dd->num_msix_entries = 0; } else { /* INTx */ if (dd->requested_intx_irq) { free_irq(dd->pcidev->irq, dd); dd->requested_intx_irq = 0; } - } - - /* turn off interrupts */ - if (dd->num_msix_entries) { - /* MSI-X */ - pci_disable_msix(dd->pcidev); - } else { - /* INTx */ disable_intx(dd->pcidev); } - /* clean structures */ - kfree(dd->msix_entries); - dd->msix_entries = NULL; - dd->num_msix_entries = 0; + pci_free_irq_vectors(dd->pcidev); } /* @@ -12953,7 +13062,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd) me->type = IRQ_SDMA; } else if (first_rx <= i && i < last_rx) { idx = i - first_rx; - rcd = dd->rcd[idx]; + rcd = hfi1_rcd_get_by_index(dd, idx); if (rcd) { /* * Set the interrupt register and mask for this @@ -12972,6 +13081,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd) remap_intr(dd, IS_RCVAVAIL_START + idx, i); me->type = IRQ_RCVCTXT; rcd->msix_intr = i; + hfi1_rcd_put(rcd); } } else { /* not in our expected range - complain, then @@ -12986,13 +13096,21 @@ static int request_msix_irqs(struct hfi1_devdata *dd) continue; /* make sure the name is terminated */ me->name[sizeof(me->name) - 1] = 0; + me->irq = pci_irq_vector(dd->pcidev, i); + /* + * On err return me->irq. Don't need to clear this + * because 'arg' has not been set, and cleanup will + * do the right thing. + */ + if (me->irq < 0) + return me->irq; - ret = request_threaded_irq(me->msix.vector, handler, thread, 0, + ret = request_threaded_irq(me->irq, handler, thread, 0, me->name, arg); if (ret) { dd_dev_err(dd, - "unable to allocate %s interrupt, vector %d, index %d, err %d\n", - err_info, me->msix.vector, idx, ret); + "unable to allocate %s interrupt, irq %d, index %d, err %d\n", + err_info, me->irq, idx, ret); return ret; } /* @@ -13003,8 +13121,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd) ret = hfi1_get_irq_affinity(dd, me); if (ret) - dd_dev_err(dd, - "unable to pin IRQ %d\n", ret); + dd_dev_err(dd, "unable to pin IRQ %d\n", ret); } return ret; @@ -13023,7 +13140,7 @@ void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd) struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - synchronize_irq(me->msix.vector); + synchronize_irq(me->irq); } } @@ -13036,7 +13153,7 @@ void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd) return; hfi1_put_irq_affinity(dd, me); - free_irq(me->msix.vector, me->arg); + free_irq(me->irq, me->arg); me->arg = NULL; } @@ -13064,14 +13181,19 @@ void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd) DRIVER_NAME "_%d kctxt%d", dd->unit, idx); me->name[sizeof(me->name) - 1] = 0; me->type = IRQ_RCVCTXT; - + me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr); + if (me->irq < 0) { + dd_dev_err(dd, "vnic irq vector request (idx %d) fail %d\n", + idx, me->irq); + return; + } remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr); - ret = request_threaded_irq(me->msix.vector, receive_context_interrupt, + ret = request_threaded_irq(me->irq, receive_context_interrupt, receive_context_thread, 0, me->name, arg); if (ret) { - dd_dev_err(dd, "vnic irq request (vector %d, idx %d) fail %d\n", - me->msix.vector, idx, ret); + dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n", + me->irq, idx, ret); return; } /* @@ -13084,7 +13206,7 @@ void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd) if (ret) { dd_dev_err(dd, "unable to pin IRQ %d\n", ret); - free_irq(me->msix.vector, me->arg); + free_irq(me->irq, me->arg); } } @@ -13107,9 +13229,8 @@ static void reset_interrupts(struct hfi1_devdata *dd) static int set_up_interrupts(struct hfi1_devdata *dd) { - struct hfi1_msix_entry *entries; - u32 total, request; - int i, ret; + u32 total; + int ret, request; int single_interrupt = 0; /* we expect to have all the interrupts */ /* @@ -13121,39 +13242,31 @@ static int set_up_interrupts(struct hfi1_devdata *dd) */ total = 1 + dd->num_sdma + dd->n_krcv_queues + HFI1_NUM_VNIC_CTXT; - entries = kcalloc(total, sizeof(*entries), GFP_KERNEL); - if (!entries) { - ret = -ENOMEM; - goto fail; - } - /* 1-1 MSI-X entry assignment */ - for (i = 0; i < total; i++) - entries[i].msix.entry = i; - /* ask for MSI-X interrupts */ - request = total; - request_msix(dd, &request, entries); - - if (request == 0) { + request = request_msix(dd, total); + if (request < 0) { + ret = request; + goto fail; + } else if (request == 0) { /* using INTx */ /* dd->num_msix_entries already zero */ - kfree(entries); single_interrupt = 1; dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n"); + } else if (request < total) { + /* using MSI-X, with reduced interrupts */ + dd_dev_err(dd, "reduced interrupt found, wanted %u, got %u\n", + total, request); + ret = -EINVAL; + goto fail; } else { - /* using MSI-X */ - dd->num_msix_entries = request; - dd->msix_entries = entries; - - if (request != total) { - /* using MSI-X, with reduced interrupts */ - dd_dev_err( - dd, - "cannot handle reduced interrupt case, want %u, got %u\n", - total, request); - ret = -EINVAL; + dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries), + GFP_KERNEL); + if (!dd->msix_entries) { + ret = -ENOMEM; goto fail; } + /* using MSI-X */ + dd->num_msix_entries = total; dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); } @@ -13396,8 +13509,7 @@ static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) /* RcvArray */ for (i = 0; i < dd->chip_rcv_array_count; i++) - write_csr(dd, RCV_ARRAY + (8 * i), - RCV_ARRAY_RT_WRITE_ENABLE_SMASK); + hfi1_put_tid(dd, i, PT_INVALID_FLUSH, 0, 0); /* RcvQPMapTable */ for (i = 0; i < 32; i++) @@ -13831,9 +13943,10 @@ static void init_sc2vl_tables(struct hfi1_devdata *dd) * a reset following the (possible) FLR in this routine. * */ -static void init_chip(struct hfi1_devdata *dd) +static int init_chip(struct hfi1_devdata *dd) { int i; + int ret = 0; /* * Put the HFI CSRs in a known state. @@ -13881,12 +13994,22 @@ static void init_chip(struct hfi1_devdata *dd) pcie_flr(dd->pcidev); /* restore command and BARs */ - restore_pci_variables(dd); + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return ret; + } if (is_ax(dd)) { dd_dev_info(dd, "Resetting CSRs with FLR\n"); pcie_flr(dd->pcidev); - restore_pci_variables(dd); + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return ret; + } } } else { dd_dev_info(dd, "Resetting CSRs with writes\n"); @@ -13914,6 +14037,7 @@ static void init_chip(struct hfi1_devdata *dd) write_csr(dd, ASIC_QSFP1_OUT, 0x1f); write_csr(dd, ASIC_QSFP2_OUT, 0x1f); init_chip_resources(dd); + return ret; } static void init_early_variables(struct hfi1_devdata *dd) @@ -14365,6 +14489,7 @@ void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd) static void init_rxe(struct hfi1_devdata *dd) { struct rsm_map_table *rmt; + u64 val; /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); @@ -14389,6 +14514,11 @@ static void init_rxe(struct hfi1_devdata *dd) * (64 bytes). Max_Payload_Size is possibly modified upward in * tune_pcie_caps() which is called after this routine. */ + + /* Have 16 bytes (4DW) of bypass header available in header queue */ + val = read_csr(dd, RCV_BYPASS); + val |= (4ull << 16); + write_csr(dd, RCV_BYPASS, val); } static void init_other(struct hfi1_devdata *dd) @@ -14470,99 +14600,86 @@ static void init_txe(struct hfi1_devdata *dd) write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE); } -int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey) +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 jkey) { - struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; - unsigned sctxt; - int ret = 0; + u8 hw_ctxt; u64 reg; - if (!rcd || !rcd->sc) { - ret = -EINVAL; - goto done; - } - sctxt = rcd->sc->hw_context; + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */ ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) << SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT); /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */ if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY)) reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK; - write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg); + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, reg); /* * Enable send-side J_KEY integrity check, unless this is A0 h/w */ if (!is_ax(dd)) { - reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; - write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); } /* Enable J_KEY check on receive context. */ reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK | ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) << RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT); - write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg); -done: - return ret; + write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, reg); + + return 0; } -int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt) +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) { - struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; - unsigned sctxt; - int ret = 0; + u8 hw_ctxt; u64 reg; - if (!rcd || !rcd->sc) { - ret = -EINVAL; - goto done; - } - sctxt = rcd->sc->hw_context; - write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0); + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, 0); /* * Disable send-side J_KEY integrity check, unless this is A0 h/w. * This check would not have been enabled for A0 h/w, see * set_ctxt_jkey(). */ if (!is_ax(dd)) { - reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; - write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); } /* Turn off the J_KEY on the receive side */ - write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0); -done: - return ret; + write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, 0); + + return 0; } -int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey) +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 pkey) { - struct hfi1_ctxtdata *rcd; - unsigned sctxt; - int ret = 0; + u8 hw_ctxt; u64 reg; - if (ctxt < dd->num_rcv_contexts) { - rcd = dd->rcd[ctxt]; - } else { - ret = -EINVAL; - goto done; - } - if (!rcd || !rcd->sc) { - ret = -EINVAL; - goto done; - } - sctxt = rcd->sc->hw_context; + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) << SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT; - write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); - reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK; - write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); -done: - return ret; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + + return 0; } int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt) @@ -14573,9 +14690,6 @@ int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt) if (!ctxt || !ctxt->sc) return -EINVAL; - if (ctxt->ctxt >= dd->num_rcv_contexts) - return -EINVAL; - hw_ctxt = ctxt->sc->hw_context; reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; @@ -14773,7 +14887,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, } ppd->vls_supported = num_vls; ppd->vls_operational = ppd->vls_supported; - ppd->actual_vls_operational = ppd->vls_supported; /* Set the default MTU. */ for (vl = 0; vl < num_vls; vl++) dd->vld[vl].mtu = hfi1_max_mtu; @@ -14782,7 +14895,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, * Set the initial values to reasonable default, will be set * for real when link is up. */ - ppd->lstate = IB_PORT_DOWN; ppd->overrun_threshold = 0x4; ppd->phy_error_threshold = 0xf; ppd->port_crc_mode_enabled = link_crc_mask; @@ -14793,7 +14905,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* start in offline */ ppd->host_link_state = HLS_DN_OFFLINE; init_vl_arb_caches(ppd); - ppd->last_pstate = 0xff; /* invalid value */ } dd->link_default = HLS_DN_POLL; @@ -14807,6 +14918,11 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret < 0) goto bail_free; + /* Save PCI space registers to rewrite after device reset */ + ret = save_pci_variables(dd); + if (ret < 0) + goto bail_cleanup; + /* verify that reads actually work, save revision for reset check */ dd->revision = read_csr(dd, CCE_REVISION); if (dd->revision == ~(u64)0) { @@ -14899,7 +15015,9 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, goto bail_cleanup; /* obtain chip sizes, reset chip CSRs */ - init_chip(dd); + ret = init_chip(dd); + if (ret) + goto bail_cleanup; /* read in the PCIe link speed information */ ret = pcie_speeds(dd); @@ -14974,10 +15092,16 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_cleanup; - ret = hfi1_create_ctxts(dd); + ret = hfi1_create_kctxts(dd); if (ret) goto bail_cleanup; + /* + * Initialize aspm, to be done after gen3 transition and setting up + * contexts and before enabling interrupts + */ + aspm_init(dd); + dd->rcvhdrsize = DEFAULT_RCVHDRSIZE; /* * rcd[0] is guaranteed to be valid by this point. Also, all @@ -14996,7 +15120,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, goto bail_cleanup; } - /* use contexts created by hfi1_create_ctxts */ + /* use contexts created by hfi1_create_kctxts */ ret = set_up_interrupts(dd); if (ret) goto bail_cleanup; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index cbe455d9ab8b..b8345a60a0fb 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -384,6 +384,7 @@ #define VERIFY_CAP_LOCAL_FABRIC 0x08 #define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09 #define LOCAL_DEVICE_ID 0x0a +#define RESERVED_REGISTERS 0x0b #define LOCAL_LNI_INFO 0x0c #define REMOTE_LNI_INFO 0x0d #define MISC_STATUS 0x0e @@ -506,6 +507,9 @@ #define DOWN_REMOTE_REASON_SHIFT 16 #define DOWN_REMOTE_REASON_MASK 0xff +#define HOST_INTERFACE_VERSION_SHIFT 16 +#define HOST_INTERFACE_VERSION_MASK 0xff + /* verify capability PHY power management bits */ #define PWRM_BER_CONTROL 0x1 #define PWRM_BANDWIDTH_CONTROL 0x2 @@ -605,11 +609,11 @@ int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data); int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data); void __iomem *get_csr_addr( - struct hfi1_devdata *dd, + const struct hfi1_devdata *dd, u32 offset); static inline void __iomem *get_kctxt_csr_addr( - struct hfi1_devdata *dd, + const struct hfi1_devdata *dd, int ctxt, u32 offset0) { @@ -644,7 +648,6 @@ u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, #define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */ extern const u8 pcie_serdes_broadcast[]; extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES]; -extern uint platform_config_load; /* SBus commands */ #define RESET_SBUS_RECEIVER 0x20 @@ -704,6 +707,7 @@ int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result); /* chip.c */ void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, u8 *ver_patch); +int write_host_interface_version(struct hfi1_devdata *dd, u8 version); void read_guid(struct hfi1_devdata *dd); int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout); void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, @@ -743,11 +747,10 @@ int is_ax(struct hfi1_devdata *dd); int is_bx(struct hfi1_devdata *dd); u32 read_physical_state(struct hfi1_devdata *dd); u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); -u32 get_logical_state(struct hfi1_pportdata *ppd); const char *opa_lstate_name(u32 lstate); const char *opa_pstate_name(u32 pstate); -u32 driver_physical_state(struct hfi1_pportdata *ppd); -u32 driver_logical_state(struct hfi1_pportdata *ppd); +u32 driver_pstate(struct hfi1_pportdata *ppd); +u32 driver_lstate(struct hfi1_pportdata *ppd); int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok); int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok); @@ -1347,21 +1350,21 @@ enum { u64 get_all_cpu_total(u64 __percpu *cntr); void hfi1_start_cleanup(struct hfi1_devdata *dd); void hfi1_clear_tids(struct hfi1_ctxtdata *rcd); -struct ib_header *hfi1_get_msgheader( - struct hfi1_devdata *dd, __le32 *rhf_addr); void hfi1_init_ctxt(struct send_context *sc); void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, u32 type, unsigned long pa, u16 order); void hfi1_quiet_serdes(struct hfi1_pportdata *ppd); -void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt); +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, + struct hfi1_ctxtdata *rcd); u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp); u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp); -u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd); int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which); int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val); -int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey); -int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt); -int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey); +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 jkey); +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt); +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt, + u16 pkey); int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt); void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); void hfi1_init_vnic_rsm(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index 995d62c7f9a7..3e27794ec750 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -325,22 +325,15 @@ struct diag_pkt { #define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ /* misc. */ +#define SC15_PACKET 0xF #define SIZE_OF_CRC 1 +#define SIZE_OF_LT 1 #define LIM_MGMT_P_KEY 0x7FFF #define FULL_MGMT_P_KEY 0xFFFF #define DEFAULT_P_KEY LIM_MGMT_P_KEY -/** - * 0xF8 - 4 bits of multicast range and 1 bit for collective range - * Example: For 24 bit LID space, - * Multicast range: 0xF00000 to 0xF7FFFF - * Collective range: 0xF80000 to 0xFFFFFE - */ -#define HFI1_MCAST_NR 0x4 /* Number of top bits set */ -#define HFI1_COLLECTIVE_NR 0x1 /* Number of bits after MCAST_NR */ - #define HFI1_PSM_IOC_BASE_SEQ 0x0 static inline __u64 rhf_to_cpu(const __le32 *rbuf) diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index e9fa3c293e42..36ae1fd86502 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -1,4 +1,3 @@ -#ifdef CONFIG_DEBUG_FS /* * Copyright(c) 2015-2017 Intel Corporation. * @@ -173,12 +172,15 @@ static int _opcode_stats_seq_show(struct seq_file *s, void *v) u64 n_packets = 0, n_bytes = 0; struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { - if (!dd->rcd[j]) - continue; - n_packets += dd->rcd[j]->opstats->stats[i].n_packets; - n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes; + rcd = hfi1_rcd_get_by_index(dd, j); + if (rcd) { + n_packets += rcd->opstats->stats[i].n_packets; + n_bytes += rcd->opstats->stats[i].n_bytes; + } + hfi1_rcd_put(rcd); } if (!n_packets && !n_bytes) return SEQ_SKIP; @@ -231,6 +233,7 @@ static int _ctx_stats_seq_show(struct seq_file *s, void *v) u64 n_packets = 0; struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; if (v == SEQ_START_TOKEN) { seq_puts(s, "Ctx:npkts\n"); @@ -240,11 +243,14 @@ static int _ctx_stats_seq_show(struct seq_file *s, void *v) spos = v; i = *spos; - if (!dd->rcd[i]) + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) return SEQ_SKIP; - for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++) - n_packets += dd->rcd[i]->opstats->stats[j].n_packets; + for (j = 0; j < ARRAY_SIZE(rcd->opstats->stats); j++) + n_packets += rcd->opstats->stats[j].n_packets; + + hfi1_rcd_put(rcd); if (!n_packets) return SEQ_SKIP; @@ -260,10 +266,10 @@ DEBUGFS_FILE_OPS(ctx_stats); static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) __acquires(RCU) { - struct qp_iter *iter; + struct rvt_qp_iter *iter; loff_t n = *pos; - iter = qp_iter_init(s->private); + iter = rvt_qp_iter_init(s->private, 0, NULL); /* stop calls rcu_read_unlock */ rcu_read_lock(); @@ -272,7 +278,7 @@ static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) return NULL; do { - if (qp_iter_next(iter)) { + if (rvt_qp_iter_next(iter)) { kfree(iter); return NULL; } @@ -285,11 +291,11 @@ static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, loff_t *pos) __must_hold(RCU) { - struct qp_iter *iter = iter_ptr; + struct rvt_qp_iter *iter = iter_ptr; (*pos)++; - if (qp_iter_next(iter)) { + if (rvt_qp_iter_next(iter)) { kfree(iter); return NULL; } @@ -305,7 +311,7 @@ static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) { - struct qp_iter *iter = iter_ptr; + struct rvt_qp_iter *iter = iter_ptr; if (!iter) return 0; @@ -361,6 +367,52 @@ DEBUGFS_SEQ_FILE_OPS(sdes); DEBUGFS_SEQ_FILE_OPEN(sdes) DEBUGFS_FILE_OPS(sdes); +static void *_rcds_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd; + struct hfi1_devdata *dd; + + ibd = (struct hfi1_ibdev *)s->private; + dd = dd_from_dev(ibd); + if (!dd->rcd || *pos >= dd->n_krcv_queues) + return NULL; + return pos; +} + +static void *_rcds_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + ++*pos; + if (!dd->rcd || *pos >= dd->n_krcv_queues) + return NULL; + return pos; +} + +static void _rcds_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _rcds_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + loff_t *spos = v; + loff_t i = *spos; + + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + seqfile_dump_rcd(s, rcd); + hfi1_rcd_put(rcd); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(rcds); +DEBUGFS_SEQ_FILE_OPEN(rcds) +DEBUGFS_FILE_OPS(rcds); + /* read the per-device counters */ static ssize_t dev_counters_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -1098,12 +1150,15 @@ static int _fault_stats_seq_show(struct seq_file *s, void *v) u64 n_packets = 0, n_bytes = 0; struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { - if (!dd->rcd[j]) - continue; - n_packets += dd->rcd[j]->opstats->stats[i].n_packets; - n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes; + rcd = hfi1_rcd_get_by_index(dd, j); + if (rcd) { + n_packets += rcd->opstats->stats[i].n_packets; + n_bytes += rcd->opstats->stats[i].n_bytes; + } + hfi1_rcd_put(rcd); } if (!n_packets && !n_bytes) return SEQ_SKIP; @@ -1311,6 +1366,7 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd); DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd); DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(rcds, ibd->hfi1_ibdev_dbg, ibd); DEBUGFS_SEQ_FILE_CREATE(sdma_cpu_list, ibd->hfi1_ibdev_dbg, ibd); /* dev counter files */ for (i = 0; i < ARRAY_SIZE(cntr_ops); i++) @@ -1478,5 +1534,3 @@ void hfi1_dbg_exit(void) debugfs_remove_recursive(hfi1_dbg_root); hfi1_dbg_root = NULL; } - -#endif diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index a50870e455a3..7372cc00cb2d 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -96,7 +96,6 @@ MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("Intel Omni-Path Architecture driver"); -MODULE_VERSION(HFI1_DRIVER_VERSION); /* * MAX_PKT_RCV is the max # if packets processed per receive interrupt. @@ -196,7 +195,7 @@ int hfi1_count_active_units(void) spin_lock_irqsave(&hfi1_devs_lock, flags); list_for_each_entry(dd, &hfi1_dev_list, list) { - if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase) + if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1) continue; for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -224,6 +223,27 @@ static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf, (offset * RCV_BUF_BLOCK_SIZE)); } +static inline void *hfi1_get_header(struct hfi1_devdata *dd, + __le32 *rhf_addr) +{ + u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr)); + + return (void *)(rhf_addr - dd->rhf_offset + offset); +} + +static inline struct ib_header *hfi1_get_msgheader(struct hfi1_devdata *dd, + __le32 *rhf_addr) +{ + return (struct ib_header *)hfi1_get_header(dd, rhf_addr); +} + +static inline struct hfi1_16b_header + *hfi1_get_16B_header(struct hfi1_devdata *dd, + __le32 *rhf_addr) +{ + return (struct hfi1_16b_header *)hfi1_get_header(dd, rhf_addr); +} + /* * Validate and encode the a given RcvArray Buffer size. * The function will check whether the given size falls within @@ -249,7 +269,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, { struct ib_header *rhdr = packet->hdr; u32 rte = rhf_rcv_type_err(packet->rhf); - int lnh = ib_get_lnh(rhdr); + u32 mlid_base; struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct hfi1_devdata *dd = ppd->dd; struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; @@ -257,37 +277,47 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR)) return; + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + goto drop; + } else { + u8 lnh = ib_get_lnh(rhdr); + + mlid_base = be16_to_cpu(IB_MULTICAST_LID_BASE); + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &rhdr->u.oth; + } else if (lnh == HFI1_LRH_GRH) { + packet->ohdr = &rhdr->u.l.oth; + packet->grh = &rhdr->u.l.grh; + } else { + goto drop; + } + } + if (packet->rhf & RHF_TID_ERR) { /* For TIDERR and RC QPs preemptively schedule a NAK */ - struct ib_other_headers *ohdr = NULL; u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ - u16 lid = ib_get_dlid(rhdr); + u32 dlid = ib_get_dlid(rhdr); u32 qp_num; - u32 rcv_flags = 0; /* Sanity check packet */ if (tlen < 24) goto drop; /* Check for GRH */ - if (lnh == HFI1_LRH_BTH) { - ohdr = &rhdr->u.oth; - } else if (lnh == HFI1_LRH_GRH) { + if (packet->grh) { u32 vtf; + struct ib_grh *grh = packet->grh; - ohdr = &rhdr->u.l.oth; - if (rhdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + if (grh->next_hdr != IB_GRH_NEXT_HDR) goto drop; - vtf = be32_to_cpu(rhdr->u.l.grh.version_tclass_flow); + vtf = be32_to_cpu(grh->version_tclass_flow); if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) goto drop; - rcv_flags |= HFI1_HAS_GRH; - } else { - goto drop; } + /* Get the destination QP number. */ - qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) { + qp_num = ib_bth_get_qpn(packet->ohdr); + if (dlid < mlid_base) { struct rvt_qp *qp; unsigned long flags; @@ -312,11 +342,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, switch (qp->ibqp.qp_type) { case IB_QPT_RC: - hfi1_rc_hdrerr( - rcd, - rhdr, - rcv_flags, - qp); + hfi1_rc_hdrerr(rcd, packet, qp); break; default: /* For now don't handle any other QP types */ @@ -332,9 +358,8 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, switch (rte) { case RHF_RTE_ERROR_OP_CODE_ERR: { - u32 opcode; void *ebuf = NULL; - __be32 *bth = NULL; + u8 opcode; if (rhf_use_egr_bfr(packet->rhf)) ebuf = packet->ebuf; @@ -342,16 +367,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, if (!ebuf) goto drop; /* this should never happen */ - if (lnh == HFI1_LRH_BTH) - bth = (__be32 *)ebuf; - else if (lnh == HFI1_LRH_GRH) - bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh)); - else - goto drop; - - opcode = be32_to_cpu(bth[0]) >> 24; - opcode &= 0xff; - + opcode = ib_bth_get_opcode(packet->ohdr); if (opcode == IB_OPCODE_CNP) { /* * Only in pre-B0 h/w is the CNP_OPCODE handled @@ -365,7 +381,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, sc5 = hfi1_9B_get_sc5(rhdr, packet->rhf); sl = ibp->sc_to_sl[sc5]; - lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK; + lqpn = ib_bth_get_qpn(packet->ohdr); rcu_read_lock(); qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn); if (!qp) { @@ -415,33 +431,39 @@ static inline void init_packet(struct hfi1_ctxtdata *rcd, packet->rhf = rhf_to_cpu(packet->rhf_addr); packet->rhqoff = rcd->head; packet->numpkt = 0; - packet->rcv_flags = 0; } void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, bool do_cnp) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - struct ib_header *hdr = pkt->hdr; struct ib_other_headers *ohdr = pkt->ohdr; - struct ib_grh *grh = NULL; + struct ib_grh *grh = pkt->grh; u32 rqpn = 0, bth1; - u16 rlid, dlid = ib_get_dlid(hdr); - u8 sc, svc_type; + u16 pkey, rlid, dlid = ib_get_dlid(pkt->hdr); + u8 hdr_type, sc, svc_type; bool is_mcast = false; - if (pkt->rcv_flags & HFI1_HAS_GRH) - grh = &hdr->u.l.grh; + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + is_mcast = hfi1_is_16B_mcast(dlid); + pkey = hfi1_16B_get_pkey(pkt->hdr); + sc = hfi1_16B_get_sc(pkt->hdr); + hdr_type = HFI1_PKT_TYPE_16B; + } else { + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); + pkey = ib_bth_get_pkey(ohdr); + sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf); + hdr_type = HFI1_PKT_TYPE_9B; + } switch (qp->ibqp.qp_type) { case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - rlid = ib_get_slid(hdr); - rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; + rlid = ib_get_slid(pkt->hdr); + rqpn = ib_get_sqpn(pkt->ohdr); svc_type = IB_CC_SVCTYPE_UD; - is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && - (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); break; case IB_QPT_UC: rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); @@ -457,14 +479,11 @@ void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, return; } - sc = hfi1_9B_get_sc5(hdr, pkt->rhf); - bth1 = be32_to_cpu(ohdr->bth[1]); - if (do_cnp && (bth1 & IB_FECN_SMASK)) { - u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); - - return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh); - } + /* Call appropriate CNP handler */ + if (do_cnp && (bth1 & IB_FECN_SMASK)) + hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey, + dlid, rlid, sc, grh); if (!is_mcast && (bth1 & IB_BECN_SMASK)) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); @@ -591,9 +610,10 @@ static void __prescan_rxq(struct hfi1_packet *packet) if (lnh == HFI1_LRH_BTH) { packet->ohdr = &hdr->u.oth; + packet->grh = NULL; } else if (lnh == HFI1_LRH_GRH) { packet->ohdr = &hdr->u.l.oth; - packet->rcv_flags |= HFI1_HAS_GRH; + packet->grh = &hdr->u.l.grh; } else { goto next; /* just in case */ } @@ -698,10 +718,8 @@ static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) { int ret; - packet->hdr = hfi1_get_msgheader(packet->rcd->dd, - packet->rhf_addr); - packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; packet->etype = rhf_rcv_type(packet->rhf); + /* total length */ packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */ /* retrieve eager buffer details */ @@ -759,7 +777,7 @@ static inline void process_rcv_update(int last, struct hfi1_packet *packet) packet->etail, 0, 0); packet->updegr = 0; } - packet->rcv_flags = 0; + packet->grh = NULL; } static inline void finish_packet(struct hfi1_packet *packet) @@ -837,9 +855,10 @@ bail: return last; } -static inline void set_nodma_rtail(struct hfi1_devdata *dd, u8 ctxt) +static inline void set_nodma_rtail(struct hfi1_devdata *dd, u16 ctxt) { - int i; + struct hfi1_ctxtdata *rcd; + u16 i; /* * For dynamically allocated kernel contexts (like vnic) switch @@ -847,19 +866,28 @@ static inline void set_nodma_rtail(struct hfi1_devdata *dd, u8 ctxt) * interrupt handler for all statically allocated kernel contexts. */ if (ctxt >= dd->first_dyn_alloc_ctxt) { - dd->rcd[ctxt]->do_interrupt = - &handle_receive_interrupt_nodma_rtail; + rcd = hfi1_rcd_get_by_index(dd, ctxt); + if (rcd) { + rcd->do_interrupt = + &handle_receive_interrupt_nodma_rtail; + hfi1_rcd_put(rcd); + } return; } - for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++) - dd->rcd[i]->do_interrupt = - &handle_receive_interrupt_nodma_rtail; + for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + rcd->do_interrupt = + &handle_receive_interrupt_nodma_rtail; + hfi1_rcd_put(rcd); + } } -static inline void set_dma_rtail(struct hfi1_devdata *dd, u8 ctxt) +static inline void set_dma_rtail(struct hfi1_devdata *dd, u16 ctxt) { - int i; + struct hfi1_ctxtdata *rcd; + u16 i; /* * For dynamically allocated kernel contexts (like vnic) switch @@ -867,27 +895,39 @@ static inline void set_dma_rtail(struct hfi1_devdata *dd, u8 ctxt) * interrupt handler for all statically allocated kernel contexts. */ if (ctxt >= dd->first_dyn_alloc_ctxt) { - dd->rcd[ctxt]->do_interrupt = - &handle_receive_interrupt_dma_rtail; + rcd = hfi1_rcd_get_by_index(dd, ctxt); + if (rcd) { + rcd->do_interrupt = + &handle_receive_interrupt_dma_rtail; + hfi1_rcd_put(rcd); + } return; } - for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++) - dd->rcd[i]->do_interrupt = - &handle_receive_interrupt_dma_rtail; + for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + rcd->do_interrupt = + &handle_receive_interrupt_dma_rtail; + hfi1_rcd_put(rcd); + } } void set_all_slowpath(struct hfi1_devdata *dd) { - int i; + struct hfi1_ctxtdata *rcd; + u16 i; /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */ for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) { - struct hfi1_ctxtdata *rcd = dd->rcd[i]; - + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; if ((i < dd->first_dyn_alloc_ctxt) || - (rcd && rcd->sc && (rcd->sc->type == SC_KERNEL))) + (rcd->sc && (rcd->sc->type == SC_KERNEL))) { rcd->do_interrupt = &handle_receive_interrupt; + } + hfi1_rcd_put(rcd); } } @@ -896,20 +936,30 @@ static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd, struct hfi1_devdata *dd) { struct work_struct *lsaw = &rcd->ppd->linkstate_active_work; - struct ib_header *hdr = hfi1_get_msgheader(packet->rcd->dd, - packet->rhf_addr); u8 etype = rhf_rcv_type(packet->rhf); + u8 sc = SC15_PACKET; + + if (etype == RHF_RCV_TYPE_IB) { + struct ib_header *hdr = hfi1_get_msgheader(packet->rcd->dd, + packet->rhf_addr); + sc = hfi1_9B_get_sc5(hdr, packet->rhf); + } else if (etype == RHF_RCV_TYPE_BYPASS) { + struct hfi1_16b_header *hdr = hfi1_get_16B_header( + packet->rcd->dd, + packet->rhf_addr); + sc = hfi1_16B_get_sc(hdr); + } + if (sc != SC15_PACKET) { + int hwstate = driver_lstate(rcd->ppd); - if (etype == RHF_RCV_TYPE_IB && - hfi1_9B_get_sc5(hdr, packet->rhf) != 0xf) { - int hwstate = read_logical_state(dd); - - if (hwstate != LSTATE_ACTIVE) { - dd_dev_info(dd, "Unexpected link state %d\n", hwstate); + if (hwstate != IB_PORT_ACTIVE) { + dd_dev_info(dd, + "Unexpected link state %s\n", + opa_lstate_name(hwstate)); return 0; } - queue_work(rcd->ppd->hfi1_wq, lsaw); + queue_work(rcd->ppd->link_wq, lsaw); return 1; } return 0; @@ -1063,7 +1113,8 @@ void receive_interrupt_work(struct work_struct *work) struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, linkstate_active_work); struct hfi1_devdata *dd = ppd->dd; - int i; + struct hfi1_ctxtdata *rcd; + u16 i; /* Received non-SC15 packet implies neighbor_normal */ ppd->neighbor_normal = 1; @@ -1073,8 +1124,12 @@ void receive_interrupt_work(struct work_struct *work) * Interrupt all statically allocated kernel contexts that could * have had an interrupt during auto activation. */ - for (i = HFI1_CTRL_CTXT; i < dd->first_dyn_alloc_ctxt; i++) - force_recv_intr(dd->rcd[i]); + for (i = HFI1_CTRL_CTXT; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + force_recv_intr(rcd); + hfi1_rcd_put(rcd); + } } /* @@ -1264,10 +1319,9 @@ void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, */ int hfi1_reset_device(int unit) { - int ret, i; + int ret; struct hfi1_devdata *dd = hfi1_lookup(unit); struct hfi1_pportdata *ppd; - unsigned long flags; int pidx; if (!dd) { @@ -1277,7 +1331,7 @@ int hfi1_reset_device(int unit) dd_dev_info(dd, "Reset on unit %u requested\n", unit); - if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) { + if (!dd->kregbase1 || !(dd->flags & HFI1_PRESENT)) { dd_dev_info(dd, "Invalid unit number %u or not initialized or not present\n", unit); @@ -1285,17 +1339,15 @@ int hfi1_reset_device(int unit) goto bail; } - spin_lock_irqsave(&dd->uctxt_lock, flags); + /* If there are any user/vnic contexts, we cannot reset */ + mutex_lock(&hfi1_mutex); if (dd->rcd) - for (i = dd->first_dyn_alloc_ctxt; - i < dd->num_rcv_contexts; i++) { - if (!dd->rcd[i]) - continue; - spin_unlock_irqrestore(&dd->uctxt_lock, flags); + if (hfi1_stats.sps_ctxts) { + mutex_unlock(&hfi1_mutex); ret = -EBUSY; goto bail; } - spin_unlock_irqrestore(&dd->uctxt_lock, flags); + mutex_unlock(&hfi1_mutex); for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -1321,6 +1373,162 @@ bail: return ret; } +static inline void hfi1_setup_ib_header(struct hfi1_packet *packet) +{ + packet->hdr = (struct hfi1_ib_message_header *) + hfi1_get_msgheader(packet->rcd->dd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; +} + +static int hfi1_bypass_ingress_pkt_check(struct hfi1_packet *packet) +{ + struct hfi1_pportdata *ppd = packet->rcd->ppd; + + /* slid and dlid cannot be 0 */ + if ((!packet->slid) || (!packet->dlid)) + return -EINVAL; + + /* Compare port lid with incoming packet dlid */ + if ((!(hfi1_is_16B_mcast(packet->dlid))) && + (packet->dlid != + opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))) { + if (packet->dlid != ppd->lid) + return -EINVAL; + } + + /* No multicast packets with SC15 */ + if ((hfi1_is_16B_mcast(packet->dlid)) && (packet->sc == 0xF)) + return -EINVAL; + + /* Packets with permissive DLID always on SC15 */ + if ((packet->dlid == opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), + 16B)) && + (packet->sc != 0xF)) + return -EINVAL; + + return 0; +} + +static int hfi1_setup_9B_packet(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct ib_header *hdr; + u8 lnh; + + hfi1_setup_ib_header(packet); + hdr = packet->hdr; + + lnh = ib_get_lnh(hdr); + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &hdr->u.oth; + packet->grh = NULL; + } else if (lnh == HFI1_LRH_GRH) { + u32 vtf; + + packet->ohdr = &hdr->u.l.oth; + packet->grh = &hdr->u.l.grh; + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(packet->grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else { + goto drop; + } + + /* Query commonly used fields from packet header */ + packet->payload = packet->ebuf; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->slid = ib_get_slid(hdr); + packet->dlid = ib_get_dlid(hdr); + if (unlikely((packet->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (packet->dlid != be16_to_cpu(IB_LID_PERMISSIVE)))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + be16_to_cpu(IB_MULTICAST_LID_BASE); + packet->sl = ib_get_sl(hdr); + packet->sc = hfi1_9B_get_sc5(hdr, packet->rhf); + packet->pad = ib_bth_get_pad(packet->ohdr); + packet->extra_byte = 0; + packet->fecn = ib_bth_get_fecn(packet->ohdr); + packet->becn = ib_bth_get_becn(packet->ohdr); + + return 0; +drop: + ibp->rvp.n_pkt_drops++; + return -EINVAL; +} + +static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) +{ + /* + * Bypass packets have a different header/payload split + * compared to an IB packet. + * Current split is set such that 16 bytes of the actual + * header is in the header buffer and the remining is in + * the eager buffer. We chose 16 since hfi1 driver only + * supports 16B bypass packets and we will be able to + * receive the entire LRH with such a split. + */ + + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + u8 l4; + u8 grh_len; + + packet->hdr = (struct hfi1_16b_header *) + hfi1_get_16B_header(packet->rcd->dd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; + + l4 = hfi1_16B_get_l4(packet->hdr); + if (l4 == OPA_16B_L4_IB_LOCAL) { + grh_len = 0; + packet->ohdr = packet->ebuf; + packet->grh = NULL; + } else if (l4 == OPA_16B_L4_IB_GLOBAL) { + u32 vtf; + + grh_len = sizeof(struct ib_grh); + packet->ohdr = packet->ebuf + grh_len; + packet->grh = packet->ebuf; + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(packet->grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else { + goto drop; + } + + /* Query commonly used fields from packet header */ + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->hlen = hdr_len_by_opcode[packet->opcode] + 8 + grh_len; + packet->payload = packet->ebuf + packet->hlen - (4 * sizeof(u32)); + packet->slid = hfi1_16B_get_slid(packet->hdr); + packet->dlid = hfi1_16B_get_dlid(packet->hdr); + if (unlikely(hfi1_is_16B_mcast(packet->dlid))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), + 16B); + packet->sc = hfi1_16B_get_sc(packet->hdr); + packet->sl = ibp->sc_to_sl[packet->sc]; + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + packet->extra_byte = SIZE_OF_LT; + packet->fecn = hfi1_16B_get_fecn(packet->hdr); + packet->becn = hfi1_16B_get_becn(packet->hdr); + + if (hfi1_bypass_ingress_pkt_check(packet)) + goto drop; + + return 0; +drop: + hfi1_cdbg(PKT, "%s: packet dropped\n", __func__); + ibp->rvp.n_pkt_drops++; + return -EINVAL; +} + void handle_eflags(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; @@ -1351,6 +1559,9 @@ int process_receive_ib(struct hfi1_packet *packet) if (unlikely(hfi1_dbg_fault_packet(packet))) return RHF_RCV_CONTINUE; + if (hfi1_setup_9B_packet(packet)) + return RHF_RCV_CONTINUE; + trace_hfi1_rcvhdr(packet->rcd->ppd->dd, packet->rcd->ctxt, rhf_err_flags(packet->rhf), @@ -1380,8 +1591,8 @@ static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet) if (packet->rcd->is_vnic) return true; - if ((HFI1_GET_L2_TYPE(packet->ebuf) == OPA_VNIC_L2_TYPE) && - (HFI1_GET_L4_TYPE(packet->ebuf) == OPA_VNIC_L4_ETHR)) + if ((hfi1_16B_get_l2(packet->ebuf) == OPA_16B_L2_TYPE) && + (hfi1_16B_get_l4(packet->ebuf) == OPA_16B_L4_ETHR)) return true; return false; @@ -1391,25 +1602,38 @@ int process_receive_bypass(struct hfi1_packet *packet) { struct hfi1_devdata *dd = packet->rcd->dd; - if (unlikely(rhf_err_flags(packet->rhf))) { - handle_eflags(packet); - } else if (hfi1_is_vnic_packet(packet)) { + if (hfi1_is_vnic_packet(packet)) { hfi1_vnic_bypass_rcv(packet); return RHF_RCV_CONTINUE; } - dd_dev_err(dd, "Unsupported bypass packet. Dropping\n"); - incr_cntr64(&dd->sw_rcv_bypass_packet_errors); - if (!(dd->err_info_rcvport.status_and_code & OPA_EI_STATUS_SMASK)) { - u64 *flits = packet->ebuf; + if (hfi1_setup_bypass_packet(packet)) + return RHF_RCV_CONTINUE; + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return RHF_RCV_CONTINUE; + } - if (flits && !(packet->rhf & RHF_LEN_ERR)) { - dd->err_info_rcvport.packet_flit1 = flits[0]; - dd->err_info_rcvport.packet_flit2 = - packet->tlen > sizeof(flits[0]) ? flits[1] : 0; + if (hfi1_16B_get_l2(packet->hdr) == 0x2) { + hfi1_16B_rcv(packet); + } else { + dd_dev_err(dd, + "Bypass packets other than 16B are not supported in normal operation. Dropping\n"); + incr_cntr64(&dd->sw_rcv_bypass_packet_errors); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + u64 *flits = packet->ebuf; + + if (flits && !(packet->rhf & RHF_LEN_ERR)) { + dd->err_info_rcvport.packet_flit1 = flits[0]; + dd->err_info_rcvport.packet_flit2 = + packet->tlen > sizeof(flits[0]) ? + flits[1] : 0; + } + dd->err_info_rcvport.status_and_code |= + (OPA_EI_STATUS_SMASK | BAD_L2_ERR); } - dd->err_info_rcvport.status_and_code |= - (OPA_EI_STATUS_SMASK | BAD_L2_ERR); } return RHF_RCV_CONTINUE; } @@ -1422,6 +1646,7 @@ int process_receive_error(struct hfi1_packet *packet) rhf_rcv_type_err(packet->rhf) == 3)) return RHF_RCV_CONTINUE; + hfi1_setup_ib_header(packet); handle_eflags(packet); if (unlikely(rhf_err_flags(packet->rhf))) @@ -1435,6 +1660,8 @@ int kdeth_process_expected(struct hfi1_packet *packet) { if (unlikely(hfi1_dbg_fault_packet(packet))) return RHF_RCV_CONTINUE; + + hfi1_setup_ib_header(packet); if (unlikely(rhf_err_flags(packet->rhf))) handle_eflags(packet); @@ -1445,6 +1672,7 @@ int kdeth_process_expected(struct hfi1_packet *packet) int kdeth_process_eager(struct hfi1_packet *packet) { + hfi1_setup_ib_header(packet); if (unlikely(rhf_err_flags(packet->rhf))) handle_eflags(packet); if (unlikely(hfi1_dbg_fault_packet(packet))) @@ -1461,3 +1689,62 @@ int process_receive_invalid(struct hfi1_packet *packet) rhf_rcv_type(packet->rhf)); return RHF_RCV_CONTINUE; } + +void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_packet packet; + struct ps_mdata mdata; + + seq_printf(s, "Rcd %u: RcvHdr cnt %u entsize %u %s head %llu tail %llu\n", + rcd->ctxt, rcd->rcvhdrq_cnt, rcd->rcvhdrqentsize, + HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? + "dma_rtail" : "nodma_rtail", + read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) & + RCV_HDR_HEAD_HEAD_MASK, + read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL)); + + init_packet(rcd, &packet); + init_ps_mdata(&mdata, &packet); + + while (1) { + struct hfi1_devdata *dd = rcd->dd; + __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + + dd->rhf_offset; + struct ib_header *hdr; + u64 rhf = rhf_to_cpu(rhf_addr); + u32 etype = rhf_rcv_type(rhf), qpn; + u8 opcode; + u32 psn; + u8 lnh; + + if (ps_done(&mdata, rhf, rcd)) + break; + + if (ps_skip(&mdata, rhf, rcd)) + goto next; + + if (etype > RHF_RCV_TYPE_IB) + goto next; + + packet.hdr = hfi1_get_msgheader(dd, rhf_addr); + hdr = packet.hdr; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + + if (lnh == HFI1_LRH_BTH) + packet.ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) + packet.ohdr = &hdr->u.l.oth; + else + goto next; /* just in case */ + + opcode = (be32_to_cpu(packet.ohdr->bth[0]) >> 24); + qpn = be32_to_cpu(packet.ohdr->bth[1]) & RVT_QPN_MASK; + psn = mask_psn(be32_to_cpu(packet.ohdr->bth[2])); + + seq_printf(s, "\tEnt %u: opcode 0x%x, qpn 0x%x, psn 0x%x\n", + mdata.ps_head, opcode, qpn, psn); +next: + update_ps_mdata(&mdata, rcd); + } +} diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c index 26da124c88e2..d46b17107901 100644 --- a/drivers/infiniband/hw/hfi1/eprom.c +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -250,7 +250,6 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, { void *buffer; void *p; - u32 length; int ret; buffer = kmalloc(P1_SIZE, GFP_KERNEL); @@ -265,13 +264,13 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, /* scan for image magic that may trail the actual data */ p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); - if (p) - length = p - buffer; - else - length = P1_SIZE; + if (!p) { + kfree(buffer); + return -ENOENT; + } *data = buffer; - *size = length; + *size = p - buffer; return 0; } diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c new file mode 100644 index 000000000000..0af91675acc6 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/exp_rcv.c @@ -0,0 +1,114 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "exp_rcv.h" +#include "trace.h" + +/** + * exp_tid_group_init - initialize exp_tid_set + * @set - the set + */ +void hfi1_exp_tid_group_init(struct exp_tid_set *set) +{ + INIT_LIST_HEAD(&set->list); + set->count = 0; +} + +/** + * alloc_ctxt_rcv_groups - initialize expected receive groups + * @rcd - the context to add the groupings to + */ +int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 tidbase; + struct tid_group *grp; + int i; + + tidbase = rcd->expected_base; + for (i = 0; i < rcd->expected_count / + dd->rcv_entries.group_size; i++) { + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) + goto bail; + grp->size = dd->rcv_entries.group_size; + grp->base = tidbase; + tid_group_add_tail(grp, &rcd->tid_group_list); + tidbase += dd->rcv_entries.group_size; + } + + return 0; +bail: + hfi1_free_ctxt_rcv_groups(rcd); + return -ENOMEM; +} + +/** + * free_ctxt_rcv_groups - free expected receive groups + * @rcd - the context to free + * + * The routine dismantles the expect receive linked + * list and clears any tids associated with the receive + * context. + * + * This should only be called for kernel contexts and the + * a base user context. + */ +void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) +{ + struct tid_group *grp, *gptr; + + WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list)); + WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list)); + + list_for_each_entry_safe(grp, gptr, &rcd->tid_group_list.list, list) { + tid_group_remove(grp, &rcd->tid_group_list); + kfree(grp); + } + + hfi1_clear_tids(rcd); +} diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h new file mode 100644 index 000000000000..08719047628a --- /dev/null +++ b/drivers/infiniband/hw/hfi1/exp_rcv.h @@ -0,0 +1,190 @@ +#ifndef _HFI1_EXP_RCV_H +#define _HFI1_EXP_RCV_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" + +#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) + +#define EXP_TID_TIDLEN_MASK 0x7FFULL +#define EXP_TID_TIDLEN_SHIFT 0 +#define EXP_TID_TIDCTRL_MASK 0x3ULL +#define EXP_TID_TIDCTRL_SHIFT 20 +#define EXP_TID_TIDIDX_MASK 0x3FFULL +#define EXP_TID_TIDIDX_SHIFT 22 +#define EXP_TID_GET(tid, field) \ + (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK) + +#define EXP_TID_SET(field, value) \ + (((value) & EXP_TID_TID##field##_MASK) << \ + EXP_TID_TID##field##_SHIFT) +#define EXP_TID_CLEAR(tid, field) ({ \ + (tid) &= ~(EXP_TID_TID##field##_MASK << \ + EXP_TID_TID##field##_SHIFT); \ + }) +#define EXP_TID_RESET(tid, field, value) do { \ + EXP_TID_CLEAR(tid, field); \ + (tid) |= EXP_TID_SET(field, (value)); \ + } while (0) + +/* + * Define fields in the KDETH header so we can update the header + * template. + */ +#define KDETH_OFFSET_SHIFT 0 +#define KDETH_OFFSET_MASK 0x7fff +#define KDETH_OM_SHIFT 15 +#define KDETH_OM_MASK 0x1 +#define KDETH_TID_SHIFT 16 +#define KDETH_TID_MASK 0x3ff +#define KDETH_TIDCTRL_SHIFT 26 +#define KDETH_TIDCTRL_MASK 0x3 +#define KDETH_INTR_SHIFT 28 +#define KDETH_INTR_MASK 0x1 +#define KDETH_SH_SHIFT 29 +#define KDETH_SH_MASK 0x1 +#define KDETH_KVER_SHIFT 30 +#define KDETH_KVER_MASK 0x3 +#define KDETH_JKEY_SHIFT 0x0 +#define KDETH_JKEY_MASK 0xff +#define KDETH_HCRC_UPPER_SHIFT 16 +#define KDETH_HCRC_UPPER_MASK 0xff +#define KDETH_HCRC_LOWER_SHIFT 24 +#define KDETH_HCRC_LOWER_MASK 0xff + +#define KDETH_GET(val, field) \ + (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) +#define KDETH_SET(dw, field, val) do { \ + u32 dwval = le32_to_cpu(dw); \ + dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ + dwval |= (((val) & KDETH_##field##_MASK) << \ + KDETH_##field##_SHIFT); \ + dw = cpu_to_le32(dwval); \ + } while (0) + +#define KDETH_RESET(dw, field, val) ({ dw = 0; KDETH_SET(dw, field, val); }) + +/* KDETH OM multipliers and switch over point */ +#define KDETH_OM_SMALL 4 +#define KDETH_OM_SMALL_SHIFT 2 +#define KDETH_OM_LARGE 64 +#define KDETH_OM_LARGE_SHIFT 6 +#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) + +struct tid_group { + struct list_head list; + u32 base; + u8 size; + u8 used; + u8 map; +}; + +/* + * Write an "empty" RcvArray entry. + * This function exists so the TID registaration code can use it + * to write to unused/unneeded entries and still take advantage + * of the WC performance improvements. The HFI will ignore this + * write to the RcvArray entry. + */ +static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) +{ + /* + * Doing the WC fill writes only makes sense if the device is + * present and the RcvArray has been mapped as WC memory. + */ + if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) { + writeq(0, dd->rcvarray_wc + (index * 8)); + if ((index & 3) == 3) + flush_wc(); + } +} + +static inline void tid_group_add_tail(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_add_tail(&grp->list, &set->list); + set->count++; +} + +static inline void tid_group_remove(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_del_init(&grp->list); + set->count--; +} + +static inline void tid_group_move(struct tid_group *group, + struct exp_tid_set *s1, + struct exp_tid_set *s2) +{ + tid_group_remove(group, s1); + tid_group_add_tail(group, s2); +} + +static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) +{ + struct tid_group *grp = + list_first_entry(&set->list, struct tid_group, list); + list_del_init(&grp->list); + set->count--; + return grp; +} + +static inline u32 rcventry2tidinfo(u32 rcventry) +{ + u32 pair = rcventry & ~0x1; + + return EXP_TID_SET(IDX, pair >> 1) | + EXP_TID_SET(CTRL, 1 << (rcventry - pair)); +} + +int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); +void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); +void hfi1_exp_tid_group_init(struct exp_tid_set *set); + +#endif /* _HFI1_EXP_RCV_H */ diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 3158128d57e8..2bc89260235a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -58,10 +58,10 @@ #include "device.h" #include "common.h" #include "trace.h" +#include "mmu_rb.h" #include "user_sdma.h" #include "user_exp_rcv.h" #include "aspm.h" -#include "mmu_rb.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -79,21 +79,25 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma); static u64 kvirt_to_phys(void *addr); static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo); -static int init_subctxts(struct hfi1_ctxtdata *uctxt, - const struct hfi1_user_info *uinfo); -static int init_user_ctxt(struct hfi1_filedata *fd); +static void init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo); +static int init_user_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); static void user_init(struct hfi1_ctxtdata *uctxt); static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase, __u32 len); static int get_base_info(struct hfi1_filedata *fd, void __user *ubase, __u32 len); -static int setup_base_ctxt(struct hfi1_filedata *fd); +static int setup_base_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); static int setup_subctxt(struct hfi1_ctxtdata *uctxt); static int find_sub_ctxt(struct hfi1_filedata *fd, const struct hfi1_user_info *uinfo); static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, - struct hfi1_user_info *uinfo); + struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata **cd); +static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt); static unsigned int poll_urgent(struct file *fp, struct poll_table_struct *pt); static unsigned int poll_next(struct file *fp, struct poll_table_struct *pt); static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt, @@ -116,7 +120,7 @@ static const struct file_operations hfi1_file_ops = { .llseek = noop_llseek, }; -static struct vm_operations_struct vm_ops = { +static const struct vm_operations_struct vm_ops = { .fault = vma_fault, }; @@ -181,7 +185,7 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) struct hfi1_devdata, user_cdev); - if (!((dd->flags & HFI1_PRESENT) && dd->kregbase)) + if (!((dd->flags & HFI1_PRESENT) && dd->kregbase1)) return -EINVAL; if (!atomic_inc_not_zero(&dd->user_refcount)) @@ -267,12 +271,14 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, /* * Copy the number of tidlist entries we used * and the length of the buffer we registered. - * These fields are adjacent in the structure so - * we can copy them at the same time. */ addr = arg + offsetof(struct hfi1_tid_info, tidcnt); if (copy_to_user((void __user *)addr, &tinfo.tidcnt, - sizeof(tinfo.tidcnt) + + sizeof(tinfo.tidcnt))) + return -EFAULT; + + addr = arg + offsetof(struct hfi1_tid_info, length); + if (copy_to_user((void __user *)addr, &tinfo.length, sizeof(tinfo.length))) ret = -EFAULT; } @@ -388,8 +394,7 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, sc_disable(sc); ret = sc_enable(sc); - hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, - uctxt->ctxt); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, uctxt); } else { ret = sc_restart(sc); } @@ -425,8 +430,7 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) if (!iter_is_iovec(from) || !dim) return -EINVAL; - hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)", - fd->uctxt->ctxt, fd->subctxt, dim); + trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim); if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) return -ENOSPC; @@ -752,12 +756,11 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) if (!uctxt) goto done; - hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); - mutex_lock(&hfi1_mutex); + hfi1_cdbg(PROC, "closing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); flush_wc(); /* drain user sdma queue */ - hfi1_user_sdma_free_queues(fdata); + hfi1_user_sdma_free_queues(fdata, uctxt); /* release the cpu */ hfi1_put_proc_affinity(fdata->rec_cpu_num); @@ -766,6 +769,13 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) hfi1_user_exp_rcv_free(fdata); /* + * fdata->uctxt is used in the above cleanup. It is not ready to be + * removed until here. + */ + fdata->uctxt = NULL; + hfi1_rcd_put(uctxt); + + /* * Clear any left over, unhandled events so the next process that * gets this context doesn't get confused. */ @@ -773,13 +783,14 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) HFI1_MAX_SHARED_CTXTS) + fdata->subctxt; *ev = 0; + spin_lock_irqsave(&dd->uctxt_lock, flags); __clear_bit(fdata->subctxt, uctxt->in_use_ctxts); if (!bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) { - mutex_unlock(&hfi1_mutex); + spin_unlock_irqrestore(&dd->uctxt_lock, flags); goto done; } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); - spin_lock_irqsave(&dd->uctxt_lock, flags); /* * Disable receive context and interrupt available, reset all * RcvCtxtCtrl bits to default values. @@ -790,34 +801,24 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_ONE_PKT_EGR_DIS | HFI1_RCVCTRL_NO_RHQ_DROP_DIS | - HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt); + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); /* Clear the context's J_KEY */ - hfi1_clear_ctxt_jkey(dd, uctxt->ctxt); + hfi1_clear_ctxt_jkey(dd, uctxt); /* - * Reset context integrity checks to default. - * (writes to CSRs probably belong in chip.c) + * If a send context is allocated, reset context integrity + * checks to default and disable the send context. */ - write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE, - hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type)); - sc_disable(uctxt->sc); - spin_unlock_irqrestore(&dd->uctxt_lock, flags); - - dd->rcd[uctxt->ctxt] = NULL; + if (uctxt->sc) { + set_pio_integrity(uctxt->sc); + sc_disable(uctxt->sc); + } - hfi1_user_exp_rcv_grp_free(uctxt); + hfi1_free_ctxt_rcv_groups(uctxt); hfi1_clear_ctxt_pkey(dd, uctxt); - uctxt->rcvwait_to = 0; - uctxt->piowait_to = 0; - uctxt->rcvnowait = 0; - uctxt->pionowait = 0; uctxt->event_flags = 0; - hfi1_stats.sps_ctxts--; - if (++dd->freectxts == dd->num_user_contexts) - aspm_enable_all(dd); - mutex_unlock(&hfi1_mutex); - hfi1_free_ctxtdata(dd, uctxt); + deallocate_ctxt(uctxt); done: mmdrop(fdata->mm); kobject_put(&dd->kobj); @@ -845,135 +846,211 @@ static u64 kvirt_to_phys(void *addr) return paddr; } +/** + * complete_subctxt + * @fd: valid filedata pointer + * + * Sub-context info can only be set up after the base context + * has been completed. This is indicated by the clearing of the + * HFI1_CTXT_BASE_UINIT bit. + * + * Wait for the bit to be cleared, and then complete the subcontext + * initialization. + * + */ +static int complete_subctxt(struct hfi1_filedata *fd) +{ + int ret; + unsigned long flags; + + /* + * sub-context info can only be set up after the base context + * has been completed. + */ + ret = wait_event_interruptible( + fd->uctxt->wait, + !test_bit(HFI1_CTXT_BASE_UNINIT, &fd->uctxt->event_flags)); + + if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags)) + ret = -ENOMEM; + + /* Finish the sub-context init */ + if (!ret) { + fd->rec_cpu_num = hfi1_get_proc_affinity(fd->uctxt->numa_id); + ret = init_user_ctxt(fd, fd->uctxt); + } + + if (ret) { + hfi1_rcd_put(fd->uctxt); + fd->uctxt = NULL; + spin_lock_irqsave(&fd->dd->uctxt_lock, flags); + __clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts); + spin_unlock_irqrestore(&fd->dd->uctxt_lock, flags); + } + + return ret; +} + static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo) { int ret; unsigned int swmajor, swminor; + struct hfi1_ctxtdata *uctxt = NULL; swmajor = uinfo->userversion >> 16; if (swmajor != HFI1_USER_SWMAJOR) return -ENODEV; + if (uinfo->subctxt_cnt > HFI1_MAX_SHARED_CTXTS) + return -EINVAL; + swminor = uinfo->userversion & 0xffff; + /* + * Acquire the mutex to protect against multiple creations of what + * could be a shared base context. + */ mutex_lock(&hfi1_mutex); /* - * Get a sub context if necessary. + * Get a sub context if available (fd->uctxt will be set). * ret < 0 error, 0 no context, 1 sub-context found */ - ret = 0; - if (uinfo->subctxt_cnt) { - ret = find_sub_ctxt(fd, uinfo); - if (ret > 0) - fd->rec_cpu_num = - hfi1_get_proc_affinity(fd->uctxt->numa_id); - } + ret = find_sub_ctxt(fd, uinfo); /* - * Allocate a base context if context sharing is not required or we - * couldn't find a sub context. + * Allocate a base context if context sharing is not required or a + * sub context wasn't found. */ if (!ret) - ret = allocate_ctxt(fd, fd->dd, uinfo); + ret = allocate_ctxt(fd, fd->dd, uinfo, &uctxt); mutex_unlock(&hfi1_mutex); - /* Depending on the context type, do the appropriate init */ - if (ret > 0) { - /* - * sub-context info can only be set up after the base - * context has been completed. - */ - ret = wait_event_interruptible(fd->uctxt->wait, !test_bit( - HFI1_CTXT_BASE_UNINIT, - &fd->uctxt->event_flags)); - if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags)) { - clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts); - return -ENOMEM; - } - /* The only thing a sub context needs is the user_xxx stuff */ - if (!ret) - ret = init_user_ctxt(fd); - - if (ret) - clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts); - } else if (!ret) { - ret = setup_base_ctxt(fd); - if (fd->uctxt->subctxt_cnt) { - /* If there is an error, set the failed bit. */ - if (ret) - set_bit(HFI1_CTXT_BASE_FAILED, - &fd->uctxt->event_flags); + /* Depending on the context type, finish the appropriate init */ + switch (ret) { + case 0: + ret = setup_base_ctxt(fd, uctxt); + if (uctxt->subctxt_cnt) { /* - * Base context is done, notify anybody using a - * sub-context that is waiting for this completion + * Base context is done (successfully or not), notify + * anybody using a sub-context that is waiting for + * this completion. */ - clear_bit(HFI1_CTXT_BASE_UNINIT, - &fd->uctxt->event_flags); - wake_up(&fd->uctxt->wait); + clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); } + break; + case 1: + ret = complete_subctxt(fd); + break; + default: + break; } return ret; } -/* - * The hfi1_mutex must be held when this function is called. It is - * necessary to ensure serialized access to the bitmask in_use_ctxts. +/** + * match_ctxt + * @fd: valid filedata pointer + * @uinfo: user info to compare base context with + * @uctxt: context to compare uinfo to. + * + * Compare the given context with the given information to see if it + * can be used for a sub context. */ -static int find_sub_ctxt(struct hfi1_filedata *fd, - const struct hfi1_user_info *uinfo) +static int match_ctxt(struct hfi1_filedata *fd, + const struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata *uctxt) { - int i; struct hfi1_devdata *dd = fd->dd; + unsigned long flags; u16 subctxt; - for (i = dd->first_dyn_alloc_ctxt; i < dd->num_rcv_contexts; i++) { - struct hfi1_ctxtdata *uctxt = dd->rcd[i]; + /* Skip dynamically allocated kernel contexts */ + if (uctxt->sc && (uctxt->sc->type == SC_KERNEL)) + return 0; - /* Skip ctxts which are not yet open */ - if (!uctxt || - bitmap_empty(uctxt->in_use_ctxts, - HFI1_MAX_SHARED_CTXTS)) - continue; + /* Skip ctxt if it doesn't match the requested one */ + if (memcmp(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)) || + uctxt->jkey != generate_jkey(current_uid()) || + uctxt->subctxt_id != uinfo->subctxt_id || + uctxt->subctxt_cnt != uinfo->subctxt_cnt) + return 0; - /* Skip dynamically allocted kernel contexts */ - if (uctxt->sc && (uctxt->sc->type == SC_KERNEL)) - continue; + /* Verify the sharing process matches the base */ + if (uctxt->userversion != uinfo->userversion) + return -EINVAL; - /* Skip ctxt if it doesn't match the requested one */ - if (memcmp(uctxt->uuid, uinfo->uuid, - sizeof(uctxt->uuid)) || - uctxt->jkey != generate_jkey(current_uid()) || - uctxt->subctxt_id != uinfo->subctxt_id || - uctxt->subctxt_cnt != uinfo->subctxt_cnt) - continue; + /* Find an unused sub context */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) { + /* context is being closed, do not use */ + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + return 0; + } - /* Verify the sharing process matches the master */ - if (uctxt->userversion != uinfo->userversion) - return -EINVAL; + subctxt = find_first_zero_bit(uctxt->in_use_ctxts, + HFI1_MAX_SHARED_CTXTS); + if (subctxt >= uctxt->subctxt_cnt) { + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + return -EBUSY; + } - /* Find an unused context */ - subctxt = find_first_zero_bit(uctxt->in_use_ctxts, - HFI1_MAX_SHARED_CTXTS); - if (subctxt >= uctxt->subctxt_cnt) - return -EBUSY; + fd->subctxt = subctxt; + __set_bit(fd->subctxt, uctxt->in_use_ctxts); + spin_unlock_irqrestore(&dd->uctxt_lock, flags); - fd->uctxt = uctxt; - fd->subctxt = subctxt; - __set_bit(fd->subctxt, uctxt->in_use_ctxts); + fd->uctxt = uctxt; + hfi1_rcd_get(uctxt); + + return 1; +} - return 1; +/** + * find_sub_ctxt + * @fd: valid filedata pointer + * @uinfo: matching info to use to find a possible context to share. + * + * The hfi1_mutex must be held when this function is called. It is + * necessary to ensure serialized creation of shared contexts. + * + * Return: + * 0 No sub-context found + * 1 Subcontext found and allocated + * errno EINVAL (incorrect parameters) + * EBUSY (all sub contexts in use) + */ +static int find_sub_ctxt(struct hfi1_filedata *fd, + const struct hfi1_user_info *uinfo) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd = fd->dd; + u16 i; + int ret; + + if (!uinfo->subctxt_cnt) + return 0; + + for (i = dd->first_dyn_alloc_ctxt; i < dd->num_rcv_contexts; i++) { + uctxt = hfi1_rcd_get_by_index(dd, i); + if (uctxt) { + ret = match_ctxt(fd, uinfo, uctxt); + hfi1_rcd_put(uctxt); + /* value of != 0 will return */ + if (ret) + return ret; + } } return 0; } static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, - struct hfi1_user_info *uinfo) + struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata **rcd) { struct hfi1_ctxtdata *uctxt; - unsigned int ctxt; int ret, numa; if (dd->flags & HFI1_FROZEN) { @@ -987,22 +1064,9 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, return -EIO; } - /* - * This check is sort of redundant to the next EBUSY error. It would - * also indicate an inconsistancy in the driver if this value was - * zero, but there were still contexts available. - */ if (!dd->freectxts) return -EBUSY; - for (ctxt = dd->first_dyn_alloc_ctxt; - ctxt < dd->num_rcv_contexts; ctxt++) - if (!dd->rcd[ctxt]) - break; - - if (ctxt == dd->num_rcv_contexts) - return -EBUSY; - /* * If we don't have a NUMA node requested, preference is towards * device NUMA node. @@ -1012,11 +1076,10 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, numa = cpu_to_node(fd->rec_cpu_num); else numa = numa_node_id(); - uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa); - if (!uctxt) { - dd_dev_err(dd, - "Unable to allocate ctxtdata memory, failing open\n"); - return -ENOMEM; + ret = hfi1_create_ctxtdata(dd->pport, numa, &uctxt); + if (ret < 0) { + dd_dev_err(dd, "user ctxtdata allocation failed\n"); + return ret; } hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)", uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num, @@ -1025,8 +1088,7 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, /* * Allocate and enable a PIO send context. */ - uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, - uctxt->dd->node); + uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, dd->node); if (!uctxt->sc) { ret = -ENOMEM; goto ctxdata_free; @@ -1038,28 +1100,19 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, goto ctxdata_free; /* - * Setup sub context resources if the user-level has requested + * Setup sub context information if the user-level has requested * sub contexts. * This has to be done here so the rest of the sub-contexts find the - * proper master. + * proper base context. */ - if (uinfo->subctxt_cnt) { - ret = init_subctxts(uctxt, uinfo); - /* - * On error, we don't need to disable and de-allocate the - * send context because it will be done during file close - */ - if (ret) - goto ctxdata_free; - } + if (uinfo->subctxt_cnt) + init_subctxts(uctxt, uinfo); uctxt->userversion = uinfo->userversion; uctxt->flags = hfi1_cap_mask; /* save current flag state */ init_waitqueue_head(&uctxt->wait); strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)); uctxt->jkey = generate_jkey(current_uid()); - INIT_LIST_HEAD(&uctxt->sdma_queues); - spin_lock_init(&uctxt->sdma_qlock); hfi1_stats.sps_ctxts++; /* * Disable ASPM when there are open user/PSM contexts to avoid @@ -1067,31 +1120,33 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, */ if (dd->freectxts-- == dd->num_user_contexts) aspm_disable_all(dd); - fd->uctxt = uctxt; + + *rcd = uctxt; return 0; ctxdata_free: - dd->rcd[ctxt] = NULL; - hfi1_free_ctxtdata(dd, uctxt); + hfi1_free_ctxt(uctxt); return ret; } -static int init_subctxts(struct hfi1_ctxtdata *uctxt, - const struct hfi1_user_info *uinfo) +static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt) { - u16 num_subctxts; + mutex_lock(&hfi1_mutex); + hfi1_stats.sps_ctxts--; + if (++uctxt->dd->freectxts == uctxt->dd->num_user_contexts) + aspm_enable_all(uctxt->dd); + mutex_unlock(&hfi1_mutex); - num_subctxts = uinfo->subctxt_cnt; - if (num_subctxts > HFI1_MAX_SHARED_CTXTS) - return -EINVAL; + hfi1_free_ctxt(uctxt); +} +static void init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo) +{ uctxt->subctxt_cnt = uinfo->subctxt_cnt; uctxt->subctxt_id = uinfo->subctxt_id; - uctxt->redirect_seq_cnt = 1; set_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); - - return 0; } static int setup_subctxt(struct hfi1_ctxtdata *uctxt) @@ -1153,7 +1208,7 @@ static void user_init(struct hfi1_ctxtdata *uctxt) clear_rcvhdrtail(uctxt); /* Setup J_KEY before enabling the context */ - hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey); + hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey); rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) @@ -1179,7 +1234,7 @@ static void user_init(struct hfi1_ctxtdata *uctxt) rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; else rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS; - hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt); + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt); } static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase, @@ -1223,23 +1278,25 @@ static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase, return ret; } -static int init_user_ctxt(struct hfi1_filedata *fd) +static int init_user_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) { - struct hfi1_ctxtdata *uctxt = fd->uctxt; int ret; ret = hfi1_user_sdma_alloc_queues(uctxt, fd); if (ret) return ret; - ret = hfi1_user_exp_rcv_init(fd); + ret = hfi1_user_exp_rcv_init(fd, uctxt); + if (ret) + hfi1_user_sdma_free_queues(fd, uctxt); return ret; } -static int setup_base_ctxt(struct hfi1_filedata *fd) +static int setup_base_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) { - struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; int ret = 0; @@ -1260,20 +1317,27 @@ static int setup_base_ctxt(struct hfi1_filedata *fd) if (ret) goto setup_failed; - ret = hfi1_user_exp_rcv_grp_init(fd); + ret = hfi1_alloc_ctxt_rcv_groups(uctxt); if (ret) goto setup_failed; - ret = init_user_ctxt(fd); + ret = init_user_ctxt(fd, uctxt); if (ret) goto setup_failed; user_init(uctxt); + /* Now that the context is set up, the fd can get a reference. */ + fd->uctxt = uctxt; + hfi1_rcd_get(uctxt); + return 0; setup_failed: - hfi1_free_ctxtdata(dd, uctxt); + /* Set the failed bit so sub-context init can do the right thing */ + set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); + deallocate_ctxt(uctxt); + return ret; } @@ -1390,7 +1454,7 @@ static unsigned int poll_next(struct file *fp, spin_lock_irq(&dd->uctxt_lock); if (hdrqempty(uctxt)) { set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags); - hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt); pollflag = 0; } else { pollflag = POLLIN | POLLRDNORM; @@ -1409,19 +1473,14 @@ int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit) { struct hfi1_ctxtdata *uctxt; struct hfi1_devdata *dd = ppd->dd; - unsigned ctxt; - int ret = 0; - unsigned long flags; + u16 ctxt; - if (!dd->events) { - ret = -EINVAL; - goto done; - } + if (!dd->events) + return -EINVAL; - spin_lock_irqsave(&dd->uctxt_lock, flags); for (ctxt = dd->first_dyn_alloc_ctxt; ctxt < dd->num_rcv_contexts; ctxt++) { - uctxt = dd->rcd[ctxt]; + uctxt = hfi1_rcd_get_by_index(dd, ctxt); if (uctxt) { unsigned long *evs = dd->events + (uctxt->ctxt - dd->first_dyn_alloc_ctxt) * @@ -1434,11 +1493,11 @@ int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit) set_bit(evtbit, evs); for (i = 1; i < uctxt->subctxt_cnt; i++) set_bit(evtbit, evs + i); + hfi1_rcd_put(uctxt); } } - spin_unlock_irqrestore(&dd->uctxt_lock, flags); -done: - return ret; + + return 0; } /** @@ -1475,7 +1534,7 @@ static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt, } else { rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS; } - hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt); + hfi1_rcvctrl(dd, rcvctrl_op, uctxt); /* always; new head should be equal to new tail; see above */ bail: return 0; @@ -1525,7 +1584,7 @@ static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, u16 subctxt, u16 pkey) } if (intable) - ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey); + ret = hfi1_set_ctxt_pkey(dd, uctxt, pkey); done: return ret; } diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c index 4042c11b2742..5aea8f47e670 100644 --- a/drivers/infiniband/hw/hfi1/firmware.c +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -64,30 +64,22 @@ #define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw" #define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw" #define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw" -#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat" #define ALT_FW_8051_NAME_ASIC "hfi1_dc8051_d.fw" #define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw" #define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw" #define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw" +#define HOST_INTERFACE_VERSION 1 static uint fw_8051_load = 1; static uint fw_fabric_serdes_load = 1; static uint fw_pcie_serdes_load = 1; static uint fw_sbus_load = 1; -/* - * Access required in platform.c - * Maintains state of whether the platform config was fetched via the - * fallback option - */ -uint platform_config_load; - /* Firmware file names get set in hfi1_firmware_init() based on the above */ static char *fw_8051_name; static char *fw_fabric_serdes_name; static char *fw_sbus_name; static char *fw_pcie_serdes_name; -static char *platform_config_name; #define SBUS_MAX_POLL_COUNT 100 #define SBUS_COUNTER(reg, name) \ @@ -177,7 +169,6 @@ static struct firmware_details fw_8051; static struct firmware_details fw_fabric; static struct firmware_details fw_pcie; static struct firmware_details fw_sbus; -static const struct firmware *platform_config; /* flags for turn_off_spicos() */ #define SPICO_SBUS 0x1 @@ -615,6 +606,14 @@ retry: fw_fabric_serdes_name = ALT_FW_FABRIC_NAME; fw_sbus_name = ALT_FW_SBUS_NAME; fw_pcie_serdes_name = ALT_FW_PCIE_NAME; + + /* + * Add a delay before obtaining and loading debug firmware. + * Authorization will fail if the delay between firmware + * authorization events is shorter than 50us. Add 100us to + * make a delay time safe. + */ + usleep_range(100, 120); } if (fw_sbus_load) { @@ -675,7 +674,6 @@ done: static int obtain_firmware(struct hfi1_devdata *dd) { unsigned long timeout; - int err = 0; mutex_lock(&fw_mutex); @@ -699,38 +697,11 @@ static int obtain_firmware(struct hfi1_devdata *dd) } /* not in FW_TRY state */ - if (fw_state == FW_FINAL) { - if (platform_config) { - dd->platform_config.data = platform_config->data; - dd->platform_config.size = platform_config->size; - } - goto done; /* already acquired */ - } else if (fw_state == FW_ERR) { - goto done; /* already tried and failed */ - } - /* fw_state is FW_EMPTY */ - /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */ - __obtain_firmware(dd); - - if (platform_config_load) { - platform_config = NULL; - err = request_firmware(&platform_config, platform_config_name, - &dd->pcidev->dev); - if (err) { - platform_config = NULL; - dd_dev_err(dd, - "%s: No default platform config file found\n", - __func__); - goto done; - } - dd->platform_config.data = platform_config->data; - dd->platform_config.size = platform_config->size; - } + if (fw_state == FW_EMPTY) + __obtain_firmware(dd); -done: mutex_unlock(&fw_mutex); - return fw_err; } @@ -752,9 +723,6 @@ void dispose_firmware(void) dispose_one_firmware(&fw_pcie); dispose_one_firmware(&fw_sbus); - release_firmware(platform_config); - platform_config = NULL; - /* retain the error state, otherwise revert to empty */ if (fw_state != FW_ERR) fw_state = FW_EMPTY; @@ -1079,6 +1047,13 @@ static int load_8051_firmware(struct hfi1_devdata *dd, dd_dev_info(dd, "8051 firmware version %d.%d.%d\n", (int)ver_major, (int)ver_minor, (int)ver_patch); dd->dc8051_ver = dc8051_ver(ver_major, ver_minor, ver_patch); + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + return -EIO; + } return 0; } @@ -1709,10 +1684,8 @@ int hfi1_firmware_init(struct hfi1_devdata *dd) } /* no 8051 or QSFP on simulator */ - if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) fw_8051_load = 0; - platform_config_load = 0; - } if (!fw_8051_name) { if (dd->icode == ICODE_RTL_SILICON) @@ -1726,8 +1699,6 @@ int hfi1_firmware_init(struct hfi1_devdata *dd) fw_sbus_name = DEFAULT_FW_SBUS_NAME; if (!fw_pcie_serdes_name) fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME; - if (!platform_config_name) - platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME; return obtain_firmware(dd); } @@ -1773,6 +1744,7 @@ static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table) int parse_platform_config(struct hfi1_devdata *dd) { struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + struct hfi1_pportdata *ppd = dd->pport; u32 *ptr = NULL; u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0; u32 record_idx = 0, table_type = 0, table_length_dwords = 0; @@ -1784,7 +1756,7 @@ int parse_platform_config(struct hfi1_devdata *dd) * scratch register bitmap, thus there is no platform config to parse. * Skip parsing in these situations. */ - if (is_integrated(dd) && !platform_config_load) + if (ppd->config_from_scratch) return 0; if (!dd->platform_config.data) { @@ -2073,13 +2045,14 @@ int get_platform_config_field(struct hfi1_devdata *dd, int ret = 0, wlen = 0, seek = 0; u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL; struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + struct hfi1_pportdata *ppd = dd->pport; if (data) memset(data, 0, len); else return -EINVAL; - if (is_integrated(dd) && !platform_config_load) { + if (ppd->config_from_scratch) { /* * Use saved configuration from ppd for integrated platforms */ diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 414a04a481c2..3ac9c307a285 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -66,9 +66,11 @@ #include <linux/i2c.h> #include <linux/i2c-algo-bit.h> #include <rdma/ib_hdrs.h> +#include <rdma/opa_addr.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> #include <rdma/rdma_vt.h> +#include <rdma/opa_addr.h> #include "chip_registers.h" #include "common.h" @@ -213,13 +215,11 @@ struct hfi1_ctxtdata { /* dynamic receive available interrupt timeout */ u32 rcvavail_timeout; - /* - * number of opens (including slave sub-contexts) on this instance - * (ignoring forks, dup, etc. for now) - */ - int cnt; + /* Reference count the base context usage */ + struct kref kref; + /* Device context index */ - unsigned ctxt; + u16 ctxt; /* * non-zero if ctxt can be shared, and defines the maximum number of * sub-contexts for this device context. @@ -245,24 +245,10 @@ struct hfi1_ctxtdata { /* lock protecting all Expected TID data */ struct mutex exp_lock; - /* number of pio bufs for this ctxt (all procs, if shared) */ - u32 piocnt; - /* first pio buffer for this ctxt */ - u32 pio_base; - /* chip offset of PIO buffers for this ctxt */ - u32 piobufs; /* per-context configuration flags */ unsigned long flags; /* per-context event flags for fileops/intr communication */ unsigned long event_flags; - /* WAIT_RCV that timed out, no interrupt */ - u32 rcvwait_to; - /* WAIT_PIO that timed out, no interrupt */ - u32 piowait_to; - /* WAIT_RCV already happened, no wait */ - u32 rcvnowait; - /* WAIT_PIO already happened, no wait */ - u32 pionowait; /* total number of polled urgent packets */ u32 urgent; /* saved total number of polled urgent packets for poll edge trigger */ @@ -289,10 +275,8 @@ struct hfi1_ctxtdata { u16 poll_type; /* receive packet sequence counter */ u8 seq_cnt; - u8 redirect_seq_cnt; /* ctxt rcvhdrq head offset */ u32 head; - u32 pkt_count; /* QPs waiting for context processing */ struct list_head qp_wait_list; /* interrupt handling */ @@ -301,15 +285,6 @@ struct hfi1_ctxtdata { unsigned numa_id; /* numa node of this context */ /* verbs stats per CTX */ struct hfi1_opcode_stats_perctx *opstats; - /* - * This is the kernel thread that will keep making - * progress on the user sdma requests behind the scenes. - * There is one per context (shared contexts use the master's). - */ - struct task_struct *progress; - struct list_head sdma_queues; - /* protect sdma queues */ - spinlock_t sdma_qlock; /* Is ASPM interrupt supported for this context */ bool aspm_intr_supported; @@ -352,23 +327,150 @@ struct hfi1_ctxtdata { struct hfi1_packet { void *ebuf; void *hdr; + void *payload; struct hfi1_ctxtdata *rcd; __le32 *rhf_addr; struct rvt_qp *qp; struct ib_other_headers *ohdr; + struct ib_grh *grh; u64 rhf; u32 maxcnt; u32 rhqoff; + u32 dlid; + u32 slid; u16 tlen; s16 etail; u8 hlen; u8 numpkt; u8 rsize; u8 updegr; - u8 rcv_flags; u8 etype; + u8 extra_byte; + u8 pad; + u8 sc; + u8 sl; + u8 opcode; + bool becn; + bool fecn; }; +/* Packet types */ +#define HFI1_PKT_TYPE_9B 0 +#define HFI1_PKT_TYPE_16B 1 + +/* + * OPA 16B Header + */ +#define OPA_16B_L4_MASK 0xFFull +#define OPA_16B_SC_MASK 0x1F00000ull +#define OPA_16B_SC_SHIFT 20 +#define OPA_16B_LID_MASK 0xFFFFFull +#define OPA_16B_DLID_MASK 0xF000ull +#define OPA_16B_DLID_SHIFT 20 +#define OPA_16B_DLID_HIGH_SHIFT 12 +#define OPA_16B_SLID_MASK 0xF00ull +#define OPA_16B_SLID_SHIFT 20 +#define OPA_16B_SLID_HIGH_SHIFT 8 +#define OPA_16B_BECN_MASK 0x80000000ull +#define OPA_16B_BECN_SHIFT 31 +#define OPA_16B_FECN_MASK 0x10000000ull +#define OPA_16B_FECN_SHIFT 28 +#define OPA_16B_L2_MASK 0x60000000ull +#define OPA_16B_L2_SHIFT 29 +#define OPA_16B_PKEY_MASK 0xFFFF0000ull +#define OPA_16B_PKEY_SHIFT 16 +#define OPA_16B_LEN_MASK 0x7FF00000ull +#define OPA_16B_LEN_SHIFT 20 +#define OPA_16B_RC_MASK 0xE000000ull +#define OPA_16B_RC_SHIFT 25 +#define OPA_16B_AGE_MASK 0xFF0000ull +#define OPA_16B_AGE_SHIFT 16 +#define OPA_16B_ENTROPY_MASK 0xFFFFull + +/* + * OPA 16B L2/L4 Encodings + */ +#define OPA_16B_L2_TYPE 0x02 +#define OPA_16B_L4_IB_LOCAL 0x09 +#define OPA_16B_L4_IB_GLOBAL 0x0A +#define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR + +static inline u8 hfi1_16B_get_l4(struct hfi1_16b_header *hdr) +{ + return (u8)(hdr->lrh[2] & OPA_16B_L4_MASK); +} + +static inline u8 hfi1_16B_get_sc(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_SC_MASK) >> OPA_16B_SC_SHIFT); +} + +static inline u32 hfi1_16B_get_dlid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[1] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_DLID_MASK) >> + OPA_16B_DLID_HIGH_SHIFT) << OPA_16B_DLID_SHIFT)); +} + +static inline u32 hfi1_16B_get_slid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[0] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_SLID_MASK) >> + OPA_16B_SLID_HIGH_SHIFT) << OPA_16B_SLID_SHIFT)); +} + +static inline u8 hfi1_16B_get_becn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[0] & OPA_16B_BECN_MASK) >> OPA_16B_BECN_SHIFT); +} + +static inline u8 hfi1_16B_get_fecn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_FECN_MASK) >> OPA_16B_FECN_SHIFT); +} + +static inline u8 hfi1_16B_get_l2(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_L2_MASK) >> OPA_16B_L2_SHIFT); +} + +static inline u16 hfi1_16B_get_pkey(struct hfi1_16b_header *hdr) +{ + return (u16)((hdr->lrh[2] & OPA_16B_PKEY_MASK) >> OPA_16B_PKEY_SHIFT); +} + +static inline u8 hfi1_16B_get_rc(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_RC_MASK) >> OPA_16B_RC_SHIFT); +} + +static inline u8 hfi1_16B_get_age(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[3] & OPA_16B_AGE_MASK) >> OPA_16B_AGE_SHIFT); +} + +static inline u16 hfi1_16B_get_len(struct hfi1_16b_header *hdr) +{ + return (u16)((hdr->lrh[0] & OPA_16B_LEN_MASK) >> OPA_16B_LEN_SHIFT); +} + +static inline u16 hfi1_16B_get_entropy(struct hfi1_16b_header *hdr) +{ + return (u16)(hdr->lrh[3] & OPA_16B_ENTROPY_MASK); +} + +#define OPA_16B_MAKE_QW(low_dw, high_dw) (((u64)(high_dw) << 32) | (low_dw)) + +/* + * BTH + */ +#define OPA_16B_BTH_PAD_MASK 7 +static inline u8 hfi1_16B_bth_get_pad(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_PAD_SHIFT) & + OPA_16B_BTH_PAD_MASK); +} + struct rvt_sge_state; /* @@ -512,7 +614,7 @@ static inline void incr_cntr32(u32 *cntr) #define MAX_NAME_SIZE 64 struct hfi1_msix_entry { enum irq_type type; - struct msix_entry msix; + int irq; void *arg; char name[MAX_NAME_SIZE]; cpumask_t mask; @@ -575,6 +677,9 @@ struct hfi1_pportdata { u8 default_atten; u8 max_power_class; + /* did we read platform config from scratch registers? */ + bool config_from_scratch; + /* GUIDs for this interface, in host order, guids[0] is a port guid */ u64 guids[HFI1_GUIDS_PER_PORT]; @@ -593,6 +698,7 @@ struct hfi1_pportdata { /* SendDMA related entries */ struct workqueue_struct *hfi1_wq; + struct workqueue_struct *link_wq; /* move out of interrupt context */ struct work_struct link_vc_work; @@ -607,8 +713,6 @@ struct hfi1_pportdata { struct mutex hls_lock; u32 host_link_state; - u32 lstate; /* logical link state */ - /* these are the "32 bit" regs */ u32 ibmtu; /* The MTU programmed for this unit */ @@ -619,7 +723,7 @@ struct hfi1_pportdata { u32 ibmaxlen; u32 current_egress_rate; /* units [10^6 bits/sec] */ /* LID programmed for this instance */ - u16 lid; + u32 lid; /* list of pkeys programmed; 0 if not set */ u16 pkeys[MAX_PKEY_VALUES]; u16 link_width_supported; @@ -654,12 +758,12 @@ struct hfi1_pportdata { u8 link_enabled; /* link enabled? */ u8 linkinit_reason; u8 local_tx_rate; /* rate given to 8051 firmware */ - u8 last_pstate; /* info only */ u8 qsfp_retry_count; /* placeholders for IB MAD packet settings */ u8 overrun_threshold; u8 phy_error_threshold; + unsigned int is_link_down_queued; /* Used to override LED behavior for things like maintenance beaconing*/ /* @@ -756,6 +860,10 @@ struct hfi1_pportdata { typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); typedef void (*opcode_handler)(struct hfi1_packet *packet); +typedef void (*hfi1_make_req)(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + /* return values for the RHF receive functions */ #define RHF_RCV_CONTINUE 0 /* keep going */ @@ -860,12 +968,15 @@ struct hfi1_devdata { struct device *diag_device; struct device *ui_device; - /* mem-mapped pointer to base of chip regs */ - u8 __iomem *kregbase; - /* end of mem-mapped chip space excluding sendbuf and user regs */ - u8 __iomem *kregend; - /* physical address of chip for io_remap, etc. */ + /* first mapping up to RcvArray */ + u8 __iomem *kregbase1; resource_size_t physaddr; + + /* second uncached mapping from RcvArray to pio send buffers */ + u8 __iomem *kregbase2; + /* for detecting offset above kregbase2 address */ + u32 base2_start; + /* Per VL data. Enough for all VLs but not all elements are set/used. */ struct per_vl_data vld[PER_VL_SEND_CONTEXTS]; /* send context data */ @@ -953,8 +1064,7 @@ struct hfi1_devdata { u64 __iomem *egrtidbase; spinlock_t sendctrl_lock; /* protect changes to SendCtrl */ spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */ - /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */ - spinlock_t uctxt_lock; /* rcd and user context changes */ + spinlock_t uctxt_lock; /* protect rcd changes */ struct mutex dc8051_lock; /* exclusive access to 8051 */ struct workqueue_struct *update_cntr_wq; struct work_struct update_cntr_work; @@ -1229,9 +1339,10 @@ static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare) #define dc8051_ver_patch(a) ((a) & 0x0000ff) /* f_put_tid types */ -#define PT_EXPECTED 0 -#define PT_EAGER 1 -#define PT_INVALID 2 +#define PT_EXPECTED 0 +#define PT_EAGER 1 +#define PT_INVALID_FLUSH 2 +#define PT_INVALID 3 struct tid_rb_node; struct mmu_rb_node; @@ -1276,13 +1387,16 @@ void handle_user_interrupt(struct hfi1_ctxtdata *rcd); int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd); int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd); -int hfi1_create_ctxts(struct hfi1_devdata *dd); -struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, - int numa); +int hfi1_create_kctxts(struct hfi1_devdata *dd); +int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, + struct hfi1_ctxtdata **rcd); +void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd); void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, struct hfi1_devdata *dd, u8 hw_pidx, u8 port); void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd); - +int hfi1_rcd_put(struct hfi1_ctxtdata *rcd); +void hfi1_rcd_get(struct hfi1_ctxtdata *rcd); +struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt); int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread); @@ -1292,6 +1406,13 @@ void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd); void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd); extern const struct pci_device_id hfi1_pci_tbl[]; +void hfi1_make_ud_req_9B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); /* receive packet handler dispositions */ #define RCV_PKT_OK 0x0 /* keep going */ @@ -1306,21 +1427,6 @@ static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd) int hfi1_reset_device(int); -/* return the driver's idea of the logical OPA port state */ -static inline u32 driver_lstate(struct hfi1_pportdata *ppd) -{ - /* - * The driver does some processing from the time the logical - * link state is at INIT to the time the SM can be notified - * as such. Return IB_PORT_DOWN until the software state - * is ready. - */ - if (ppd->lstate == IB_PORT_INIT && !(ppd->host_link_state & HLS_UP)) - return IB_PORT_DOWN; - else - return ppd->lstate; -} - void receive_interrupt_work(struct work_struct *work); /* extract service channel from header and rhf */ @@ -1413,13 +1519,25 @@ static inline u32 egress_cycles(u32 len, u32 rate) } void set_link_ipg(struct hfi1_pportdata *ppd); -void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, u32 rqpn, u8 svc_type); void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh); +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); +typedef void (*hfi1_handle_cnp)(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); + +/* We support only two types - 9B and 16B for now */ +static const hfi1_handle_cnp hfi1_handle_cnp_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &return_cnp, + [HFI1_PKT_TYPE_16B] = &return_cnp_16B +}; #define PKEY_CHECK_INVALID -1 -int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, +int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey, u8 sc5, int8_t s_pkey_index); #define PACKET_EGRESS_TIMEOUT 350 @@ -1522,9 +1640,9 @@ static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey, * by HW and rcv_pkey_check function should be called instead. */ static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, - u8 sc5, u8 idx, u16 slid) + u8 sc5, u8 idx, u32 slid, bool force) { - if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + if (!(force) && !(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) return 0; /* If SC15, pkey[0:14] must be 0x7fff */ @@ -1658,12 +1776,22 @@ static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt, bool do_cnp) { struct ib_other_headers *ohdr = pkt->ohdr; - u32 bth1; - bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & (IB_BECN_SMASK | IB_FECN_SMASK))) { + u32 bth1; + bool becn = false; + bool fecn = false; + + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + fecn = hfi1_16B_get_fecn(pkt->hdr); + becn = hfi1_16B_get_becn(pkt->hdr); + } else { + bth1 = be32_to_cpu(ohdr->bth[1]); + fecn = bth1 & IB_FECN_SMASK; + becn = bth1 & IB_BECN_SMASK; + } + if (unlikely(fecn || becn)) { hfi1_process_ecn_slowpath(qp, pkt, do_cnp); - return !!(bth1 & IB_FECN_SMASK); + return fecn; } return false; } @@ -1829,10 +1957,9 @@ void hfi1_pcie_cleanup(struct pci_dev *pdev); int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev); void hfi1_pcie_ddcleanup(struct hfi1_devdata *); int pcie_speeds(struct hfi1_devdata *dd); -void request_msix(struct hfi1_devdata *dd, u32 *nent, - struct hfi1_msix_entry *entry); -void hfi1_enable_intx(struct pci_dev *pdev); -void restore_pci_variables(struct hfi1_devdata *dd); +int request_msix(struct hfi1_devdata *dd, u32 msireq); +int restore_pci_variables(struct hfi1_devdata *dd); +int save_pci_variables(struct hfi1_devdata *dd); int do_pcie_gen3_transition(struct hfi1_devdata *dd); int parse_platform_config(struct hfi1_devdata *dd); int get_platform_config_field(struct hfi1_devdata *dd, @@ -1860,6 +1987,7 @@ int process_receive_error(struct hfi1_packet *packet); int kdeth_process_expected(struct hfi1_packet *packet); int kdeth_process_eager(struct hfi1_packet *packet); int process_receive_invalid(struct hfi1_packet *packet); +void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd); /* global module parameter variables */ extern unsigned int hfi1_max_mtu; @@ -1991,9 +2119,15 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) #define dd_dev_emerg(dd, fmt, ...) \ dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ get_unit_name((dd)->unit), ##__VA_ARGS__) + #define dd_dev_err(dd, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_err_ratelimited(dd, fmt, ...) \ + dev_err_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + #define dd_dev_warn(dd, fmt, ...) \ dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ get_unit_name((dd)->unit), ##__VA_ARGS__) @@ -2087,52 +2221,220 @@ int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); #define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) #define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) -#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } -#define show_packettype(etype) \ -__print_symbolic(etype, \ - packettype_name(EXPECTED), \ - packettype_name(EAGER), \ - packettype_name(IB), \ - packettype_name(ERROR), \ - packettype_name(BYPASS)) - -#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } -#define show_ib_opcode(opcode) \ -__print_symbolic(opcode, \ - ib_opcode_name(RC_SEND_FIRST), \ - ib_opcode_name(RC_SEND_MIDDLE), \ - ib_opcode_name(RC_SEND_LAST), \ - ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(RC_SEND_ONLY), \ - ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_WRITE_FIRST), \ - ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ - ib_opcode_name(RC_RDMA_WRITE_LAST), \ - ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_WRITE_ONLY), \ - ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_READ_REQUEST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ - ib_opcode_name(RC_ACKNOWLEDGE), \ - ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ - ib_opcode_name(RC_COMPARE_SWAP), \ - ib_opcode_name(RC_FETCH_ADD), \ - ib_opcode_name(UC_SEND_FIRST), \ - ib_opcode_name(UC_SEND_MIDDLE), \ - ib_opcode_name(UC_SEND_LAST), \ - ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(UC_SEND_ONLY), \ - ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(UC_RDMA_WRITE_FIRST), \ - ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ - ib_opcode_name(UC_RDMA_WRITE_LAST), \ - ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(UC_RDMA_WRITE_ONLY), \ - ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(UD_SEND_ONLY), \ - ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(CNP)) +static inline void hfi1_update_ah_attr(struct ib_device *ibdev, + struct rdma_ah_attr *attr) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid = rdma_ah_get_dlid(attr); + + /* + * Kernel clients may not have setup GRH information + * Set that here. + */ + ibp = to_iport(ibdev, rdma_ah_get_port_num(attr)); + ppd = ppd_from_ibp(ibp); + if ((((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (!(rdma_ah_get_ah_flags(attr) & IB_AH_GRH))) || + (rdma_ah_get_make_grd(attr))) { + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + rdma_ah_set_interface_id(attr, OPA_MAKE_ID(dlid)); + rdma_ah_set_subnet_prefix(attr, ibp->rvp.gid_prefix); + } +} + +/* + * hfi1_check_mcast- Check if the given lid is + * in the OPA multicast range. + * + * The LID might either reside in ah.dlid or might be + * in the GRH of the address handle as DGID if extended + * addresses are in use. + */ +static inline bool hfi1_check_mcast(u32 lid) +{ + return ((lid >= opa_get_mcast_base(OPA_MCAST_NR)) && + (lid != be32_to_cpu(OPA_LID_PERMISSIVE))); +} + +#define opa_get_lid(lid, format) \ + __opa_get_lid(lid, OPA_PORT_PACKET_FORMAT_##format) + +/* Convert a lid to a specific lid space */ +static inline u32 __opa_get_lid(u32 lid, u8 format) +{ + bool is_mcast = hfi1_check_mcast(lid); + + switch (format) { + case OPA_PORT_PACKET_FORMAT_8B: + case OPA_PORT_PACKET_FORMAT_10B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF0000); + return lid & 0xFFFFF; + case OPA_PORT_PACKET_FORMAT_16B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF00000); + return lid & 0xFFFFFF; + case OPA_PORT_PACKET_FORMAT_9B: + if (is_mcast) + return (lid - + opa_get_mcast_base(OPA_MCAST_NR) + + be16_to_cpu(IB_MULTICAST_LID_BASE)); + else + return lid & 0xFFFF; + default: + return lid; + } +} + +/* Return true if the given lid is the OPA 16B multicast range */ +static inline bool hfi1_is_16B_mcast(u32 lid) +{ + return ((lid >= + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 16B)) && + (lid != opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))); +} + +static inline void hfi1_make_opa_lid(struct rdma_ah_attr *attr) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + u32 dlid = rdma_ah_get_dlid(attr); + + /* Modify ah_attr.dlid to be in the 32 bit LID space. + * This is how the address will be laid out: + * Assuming MCAST_NR to be 4, + * 32 bit permissive LID = 0xFFFFFFFF + * Multicast LID range = 0xFFFFFFFE to 0xF0000000 + * Unicast LID range = 0xEFFFFFFF to 1 + * Invalid LID = 0 + */ + if (ib_is_opa_gid(&grh->dgid)) + dlid = opa_get_lid_from_gid(&grh->dgid); + else if ((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE))) + dlid = dlid - be16_to_cpu(IB_MULTICAST_LID_BASE) + + opa_get_mcast_base(OPA_MCAST_NR); + else if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) + dlid = be32_to_cpu(OPA_LID_PERMISSIVE); + + rdma_ah_set_dlid(attr, dlid); +} + +static inline u8 hfi1_get_packet_type(u32 lid) +{ + /* 9B if lid > 0xF0000000 */ + if (lid >= opa_get_mcast_base(OPA_MCAST_NR)) + return HFI1_PKT_TYPE_9B; + + /* 16B if lid > 0xC000 */ + if (lid >= opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 9B)) + return HFI1_PKT_TYPE_16B; + + return HFI1_PKT_TYPE_9B; +} + +static inline bool hfi1_get_hdr_type(u32 lid, struct rdma_ah_attr *attr) +{ + /* + * If there was an incoming 16B packet with permissive + * LIDs, OPA GIDs would have been programmed when those + * packets were received. A 16B packet will have to + * be sent in response to that packet. Return a 16B + * header type if that's the case. + */ + if (rdma_ah_get_dlid(attr) == be32_to_cpu(OPA_LID_PERMISSIVE)) + return (ib_is_opa_gid(&rdma_ah_read_grh(attr)->dgid)) ? + HFI1_PKT_TYPE_16B : HFI1_PKT_TYPE_9B; + + /* + * Return a 16B header type if either the the destination + * or source lid is extended. + */ + if (hfi1_get_packet_type(rdma_ah_get_dlid(attr)) == HFI1_PKT_TYPE_16B) + return HFI1_PKT_TYPE_16B; + + return hfi1_get_packet_type(lid); +} + +static inline void hfi1_make_ext_grh(struct hfi1_packet *packet, + struct ib_grh *grh, u32 slid, + u32 dlid) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (!ibp) + return; + + grh->hop_limit = 1; + grh->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + if (slid == opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B)) + grh->sgid.global.interface_id = + OPA_MAKE_ID(be32_to_cpu(OPA_LID_PERMISSIVE)); + else + grh->sgid.global.interface_id = OPA_MAKE_ID(slid); + + /* + * Upper layers (like mad) may compare the dgid in the + * wc that is obtained here with the sgid_index in + * the wr. Since sgid_index in wr is always 0 for + * extended lids, set the dgid here to the default + * IB gid. + */ + grh->dgid.global.subnet_prefix = ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); +} + +static inline int hfi1_get_16b_padding(u32 hdr_size, u32 payload) +{ + return -(hdr_size + payload + (SIZE_OF_CRC << 2) + + SIZE_OF_LT) & 0x7; +} + +static inline void hfi1_make_ib_hdr(struct ib_header *hdr, + u16 lrh0, u16 len, + u16 dlid, u16 slid) +{ + hdr->lrh[0] = cpu_to_be16(lrh0); + hdr->lrh[1] = cpu_to_be16(dlid); + hdr->lrh[2] = cpu_to_be16(len); + hdr->lrh[3] = cpu_to_be16(slid); +} + +static inline void hfi1_make_16b_hdr(struct hfi1_16b_header *hdr, + u32 slid, u32 dlid, + u16 len, u16 pkey, + u8 becn, u8 fecn, u8 l4, + u8 sc) +{ + u32 lrh0 = 0; + u32 lrh1 = 0x40000000; + u32 lrh2 = 0; + u32 lrh3 = 0; + + lrh0 = (lrh0 & ~OPA_16B_BECN_MASK) | (becn << OPA_16B_BECN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LEN_MASK) | (len << OPA_16B_LEN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LID_MASK) | (slid & OPA_16B_LID_MASK); + lrh1 = (lrh1 & ~OPA_16B_FECN_MASK) | (fecn << OPA_16B_FECN_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_SC_MASK) | (sc << OPA_16B_SC_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_LID_MASK) | (dlid & OPA_16B_LID_MASK); + lrh2 = (lrh2 & ~OPA_16B_SLID_MASK) | + ((slid >> OPA_16B_SLID_SHIFT) << OPA_16B_SLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_DLID_MASK) | + ((dlid >> OPA_16B_DLID_SHIFT) << OPA_16B_DLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_PKEY_MASK) | (pkey << OPA_16B_PKEY_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_L4_MASK) | l4; + + hdr->lrh[0] = lrh0; + hdr->lrh[1] = lrh1; + hdr->lrh[2] = lrh2; + hdr->lrh[3] = lrh3; +} #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 4a11d4da4c92..fba77001c3a7 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -67,6 +67,7 @@ #include "aspm.h" #include "affinity.h" #include "vnic.h" +#include "exp_rcv.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -125,85 +126,198 @@ static struct idr hfi1_unit_table; u32 hfi1_cpulist_count; unsigned long *hfi1_cpulist; -/* - * Common code for creating the receive context array. - */ -int hfi1_create_ctxts(struct hfi1_devdata *dd) +static int hfi1_create_kctxt(struct hfi1_devdata *dd, + struct hfi1_pportdata *ppd) { - unsigned i; + struct hfi1_ctxtdata *rcd; int ret; /* Control context has to be always 0 */ BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); + ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd); + if (ret < 0) { + dd_dev_err(dd, "Kernel receive context allocation failed\n"); + return ret; + } + + /* + * Set up the kernel context flags here and now because they use + * default values for all receive side memories. User contexts will + * be handled as they are created. + */ + rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + + /* Control context must use DMA_RTAIL */ + if (rcd->ctxt == HFI1_CTRL_CTXT) + rcd->flags |= HFI1_CAP_DMA_RTAIL; + rcd->seq_cnt = 1; + + rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); + if (!rcd->sc) { + dd_dev_err(dd, "Kernel send context allocation failed\n"); + return -ENOMEM; + } + hfi1_init_ctxt(rcd->sc); + + return 0; +} + +/* + * Create the receive context array and one or more kernel contexts + */ +int hfi1_create_kctxts(struct hfi1_devdata *dd) +{ + u16 i; + int ret; + dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd), GFP_KERNEL, dd->node); if (!dd->rcd) - goto nomem; + return -ENOMEM; - /* create one or more kernel contexts */ for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { - struct hfi1_pportdata *ppd; - struct hfi1_ctxtdata *rcd; + ret = hfi1_create_kctxt(dd, dd->pport); + if (ret) + goto bail; + } - ppd = dd->pport + (i % dd->num_pports); + return 0; +bail: + for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) + hfi1_free_ctxt(dd->rcd[i]); - /* dd->rcd[i] gets assigned inside the callee */ - rcd = hfi1_create_ctxtdata(ppd, i, dd->node); - if (!rcd) { - dd_dev_err(dd, - "Unable to allocate kernel receive context, failing\n"); - goto nomem; - } - /* - * Set up the kernel context flags here and now because they - * use default values for all receive side memories. User - * contexts will be handled as they are created. - */ - rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | - HFI1_CAP_KGET(NODROP_RHQ_FULL) | - HFI1_CAP_KGET(NODROP_EGR_FULL) | - HFI1_CAP_KGET(DMA_RTAIL); - - /* Control context must use DMA_RTAIL */ - if (rcd->ctxt == HFI1_CTRL_CTXT) - rcd->flags |= HFI1_CAP_DMA_RTAIL; - rcd->seq_cnt = 1; - - rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); - if (!rcd->sc) { - dd_dev_err(dd, - "Unable to allocate kernel send context, failing\n"); - goto nomem; - } + /* All the contexts should be freed, free the array */ + kfree(dd->rcd); + dd->rcd = NULL; + return ret; +} + +/* + * Helper routines for the receive context reference count (rcd and uctxt). + */ +static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd) +{ + kref_init(&rcd->kref); +} - hfi1_init_ctxt(rcd->sc); +/** + * hfi1_rcd_free - When reference is zero clean up. + * @kref: pointer to an initialized rcd data structure + * + */ +static void hfi1_rcd_free(struct kref *kref) +{ + unsigned long flags; + struct hfi1_ctxtdata *rcd = + container_of(kref, struct hfi1_ctxtdata, kref); + + hfi1_free_ctxtdata(rcd->dd, rcd); + + spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); + rcd->dd->rcd[rcd->ctxt] = NULL; + spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); + + kfree(rcd); +} + +/** + * hfi1_rcd_put - decrement reference for rcd + * @rcd: pointer to an initialized rcd data structure + * + * Use this to put a reference after the init. + */ +int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) +{ + if (rcd) + return kref_put(&rcd->kref, hfi1_rcd_free); + + return 0; +} + +/** + * hfi1_rcd_get - increment reference for rcd + * @rcd: pointer to an initialized rcd data structure + * + * Use this to get a reference after the init. + */ +void hfi1_rcd_get(struct hfi1_ctxtdata *rcd) +{ + kref_get(&rcd->kref); +} + +/** + * allocate_rcd_index - allocate an rcd index from the rcd array + * @dd: pointer to a valid devdata structure + * @rcd: rcd data structure to assign + * @index: pointer to index that is allocated + * + * Find an empty index in the rcd array, and assign the given rcd to it. + * If the array is full, we are EBUSY. + * + */ +static int allocate_rcd_index(struct hfi1_devdata *dd, + struct hfi1_ctxtdata *rcd, u16 *index) +{ + unsigned long flags; + u16 ctxt; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++) + if (!dd->rcd[ctxt]) + break; + + if (ctxt < dd->num_rcv_contexts) { + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; + hfi1_rcd_init(rcd); } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); - /* - * Initialize aspm, to be done after gen3 transition and setting up - * contexts and before enabling interrupts - */ - aspm_init(dd); + if (ctxt >= dd->num_rcv_contexts) + return -EBUSY; + + *index = ctxt; return 0; -nomem: - ret = -ENOMEM; +} - if (dd->rcd) { - for (i = 0; i < dd->num_rcv_contexts; ++i) - hfi1_free_ctxtdata(dd, dd->rcd[i]); +/** + * hfi1_rcd_get_by_index + * @dd: pointer to a valid devdata structure + * @ctxt: the index of an possilbe rcd + * + * We need to protect access to the rcd array. If access is needed to + * one or more index, get the protecting spinlock and then increment the + * kref. + * + * The caller is responsible for making the _put(). + * + */ +struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt) +{ + unsigned long flags; + struct hfi1_ctxtdata *rcd = NULL; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd[ctxt]) { + rcd = dd->rcd[ctxt]; + hfi1_rcd_get(rcd); } - kfree(dd->rcd); - dd->rcd = NULL; - return ret; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + return rcd; } /* - * Common code for user and kernel context setup. + * Common code for user and kernel context create and setup. + * NOTE: the initial kref is done here (hf1_rcd_init()). */ -struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, - int numa) +int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, + struct hfi1_ctxtdata **context) { struct hfi1_devdata *dd = ppd->dd; struct hfi1_ctxtdata *rcd; @@ -217,20 +331,30 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa); if (rcd) { u32 rcvtids, max_entries; - - hfi1_cdbg(PROC, "setting up context %u\n", ctxt); + u16 ctxt; + int ret; + + ret = allocate_rcd_index(dd, rcd, &ctxt); + if (ret) { + *context = NULL; + kfree(rcd); + return ret; + } INIT_LIST_HEAD(&rcd->qp_wait_list); + hfi1_exp_tid_group_init(&rcd->tid_group_list); + hfi1_exp_tid_group_init(&rcd->tid_used_list); + hfi1_exp_tid_group_init(&rcd->tid_full_list); rcd->ppd = ppd; rcd->dd = dd; __set_bit(0, rcd->in_use_ctxts); - rcd->ctxt = ctxt; - dd->rcd[ctxt] = rcd; rcd->numa_id = numa; rcd->rcv_array_groups = dd->rcv_entries.ngroups; mutex_init(&rcd->exp_lock); + hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); + /* * Calculate the context's RcvArray entry starting point. * We do this here because we have to take into account all @@ -328,14 +452,30 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, if (!rcd->opstats) goto bail; } + + *context = rcd; + return 0; } - return rcd; + bail: - dd->rcd[ctxt] = NULL; - kfree(rcd->egrbufs.rcvtids); - kfree(rcd->egrbufs.buffers); - kfree(rcd); - return NULL; + *context = NULL; + hfi1_free_ctxt(rcd); + return -ENOMEM; +} + +/** + * hfi1_free_ctxt + * @rcd: pointer to an initialized rcd data structure + * + * This wrapper is the free function that matches hfi1_create_ctxtdata(). + * When a context is done being used (kernel or user), this function is called + * for the "final" put to match the kref init from hf1i_create_ctxtdata(). + * Other users of the context do a get/put sequence to make sure that the + * structure isn't removed while in use. + */ +void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd) +{ + hfi1_rcd_put(rcd); } /* @@ -483,7 +623,6 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; ppd->part_enforce |= HFI1_PART_ENFORCE_IN; - ppd->part_enforce |= HFI1_PART_ENFORCE_OUT; if (loopback) { hfi1_early_err(&pdev->dev, @@ -559,16 +698,19 @@ static int loadtime_init(struct hfi1_devdata *dd) static int init_after_reset(struct hfi1_devdata *dd) { int i; - + struct hfi1_ctxtdata *rcd; /* * Ensure chip does no sends or receives, tail updates, or * pioavail updates while we re-initialize. This is mostly * for the driver data structures, not chip registers. */ - for (i = 0; i < dd->num_rcv_contexts; i++) + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | - HFI1_RCVCTRL_INTRAVAIL_DIS | - HFI1_RCVCTRL_TAILUPD_DIS, i); + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS, rcd); + hfi1_rcd_put(rcd); + } pio_send_control(dd, PSC_GLOBAL_DISABLE); for (i = 0; i < dd->num_send_contexts; i++) sc_disable(dd->send_contexts[i].sc); @@ -578,8 +720,9 @@ static int init_after_reset(struct hfi1_devdata *dd) static void enable_chip(struct hfi1_devdata *dd) { + struct hfi1_ctxtdata *rcd; u32 rcvmask; - u32 i; + u16 i; /* enable PIO send */ pio_send_control(dd, PSC_GLOBAL_ENABLE); @@ -589,17 +732,21 @@ static void enable_chip(struct hfi1_devdata *dd) * Other ctxts done as user opens and initializes them. */ for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; - rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ? + rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; - if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR)) + if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; - if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL)) + if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL)) rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; - if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL)) + if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; - hfi1_rcvctrl(dd, rcvmask, i); - sc_enable(dd->rcd[i]->sc); + hfi1_rcvctrl(dd, rcvmask, rcd); + sc_enable(rcd->sc); + hfi1_rcd_put(rcd); } } @@ -624,6 +771,20 @@ static int create_workqueues(struct hfi1_devdata *dd) if (!ppd->hfi1_wq) goto wq_error; } + if (!ppd->link_wq) { + /* + * Make the link workqueue single-threaded to enforce + * serialization. + */ + ppd->link_wq = + alloc_workqueue( + "hfi_link_%d_%d", + WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, + 1, /* max_active */ + dd->unit, pidx); + if (!ppd->link_wq) + goto wq_error; + } } return 0; wq_error: @@ -634,6 +795,10 @@ wq_error: destroy_workqueue(ppd->hfi1_wq); ppd->hfi1_wq = NULL; } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } } return -ENOMEM; } @@ -656,7 +821,8 @@ wq_error: int hfi1_init(struct hfi1_devdata *dd, int reinit) { int ret = 0, pidx, lastfail = 0; - unsigned i, len; + unsigned long len; + u16 i; struct hfi1_ctxtdata *rcd; struct hfi1_pportdata *ppd; @@ -725,7 +891,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) * existing, and re-allocate. * Need to re-create rest of ctxt 0 ctxtdata as well. */ - rcd = dd->rcd[i]; + rcd = hfi1_rcd_get_by_index(dd, i); if (!rcd) continue; @@ -739,6 +905,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); ret = lastfail; } + hfi1_rcd_put(rcd); } /* Allocate enough memory for user event notification. */ @@ -858,6 +1025,7 @@ static void stop_timers(struct hfi1_devdata *dd) static void shutdown_device(struct hfi1_devdata *dd) { struct hfi1_pportdata *ppd; + struct hfi1_ctxtdata *rcd; unsigned pidx; int i; @@ -876,12 +1044,15 @@ static void shutdown_device(struct hfi1_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; - for (i = 0; i < dd->num_rcv_contexts; i++) + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | - HFI1_RCVCTRL_CTXT_DIS | - HFI1_RCVCTRL_INTRAVAIL_DIS | - HFI1_RCVCTRL_PKEY_DIS | - HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i); + HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_PKEY_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd); + hfi1_rcd_put(rcd); + } /* * Gracefully stop all sends allowing any in progress to * trickle out first. @@ -917,6 +1088,10 @@ static void shutdown_device(struct hfi1_devdata *dd) destroy_workqueue(ppd->hfi1_wq); ppd->hfi1_wq = NULL; } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } } sdma_exit(dd); } @@ -927,14 +1102,11 @@ static void shutdown_device(struct hfi1_devdata *dd) * @rcd: the ctxtdata structure * * free up any allocated data for a context - * This should not touch anything that would affect a simultaneous - * re-allocation of context data, because it is called after hfi1_mutex - * is released (and can be called from reinit as well). * It should never change any chip state, or global driver state. */ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) { - unsigned e; + u32 e; if (!rcd) return; @@ -953,6 +1125,7 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) /* all the RcvArray entries should have been cleared by now */ kfree(rcd->egrbufs.rcvtids); + rcd->egrbufs.rcvtids = NULL; for (e = 0; e < rcd->egrbufs.alloced; e++) { if (rcd->egrbufs.buffers[e].dma) @@ -962,13 +1135,21 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) rcd->egrbufs.buffers[e].dma); } kfree(rcd->egrbufs.buffers); + rcd->egrbufs.alloced = 0; + rcd->egrbufs.buffers = NULL; sc_free(rcd->sc); + rcd->sc = NULL; + vfree(rcd->subctxt_uregbase); vfree(rcd->subctxt_rcvegrbuf); vfree(rcd->subctxt_rcvhdr_base); kfree(rcd->opstats); - kfree(rcd); + + rcd->subctxt_uregbase = NULL; + rcd->subctxt_rcvegrbuf = NULL; + rcd->subctxt_rcvhdr_base = NULL; + rcd->opstats = NULL; } /* @@ -1311,8 +1492,6 @@ static void cleanup_device_data(struct hfi1_devdata *dd) { int ctxt; int pidx; - struct hfi1_ctxtdata **tmp; - unsigned long flags; /* users can't do anything more with chip */ for (pidx = 0; pidx < dd->num_pports; ++pidx) { @@ -1337,18 +1516,6 @@ static void cleanup_device_data(struct hfi1_devdata *dd) free_credit_return(dd); - /* - * Free any resources still in use (usually just kernel contexts) - * at unload; we do for ctxtcnt, because that's what we allocate. - * We acquire lock to be really paranoid that rcd isn't being - * accessed from some interrupt-related code (that should not happen, - * but best to be sure). - */ - spin_lock_irqsave(&dd->uctxt_lock, flags); - tmp = dd->rcd; - dd->rcd = NULL; - spin_unlock_irqrestore(&dd->uctxt_lock, flags); - if (dd->rcvhdrtail_dummy_kvaddr) { dma_free_coherent(&dd->pcidev->dev, sizeof(u64), (void *)dd->rcvhdrtail_dummy_kvaddr, @@ -1356,16 +1523,22 @@ static void cleanup_device_data(struct hfi1_devdata *dd) dd->rcvhdrtail_dummy_kvaddr = NULL; } - for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) { - struct hfi1_ctxtdata *rcd = tmp[ctxt]; + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + */ + for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) { + struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; - tmp[ctxt] = NULL; /* debugging paranoia */ if (rcd) { hfi1_clear_tids(rcd); - hfi1_free_ctxtdata(dd, rcd); + hfi1_free_ctxt(rcd); } } - kfree(tmp); + + kfree(dd->rcd); + dd->rcd = NULL; + free_pio_map(dd); /* must follow rcv context free - need to remove rcv's hooks */ for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) @@ -1532,6 +1705,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) destroy_workqueue(ppd->hfi1_wq); ppd->hfi1_wq = NULL; } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } } if (!j) hfi1_device_remove(dd); diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c index 04a5082d5ac5..96845dfed5c5 100644 --- a/drivers/infiniband/hw/hfi1/intr.c +++ b/drivers/infiniband/hw/hfi1/intr.c @@ -164,6 +164,7 @@ void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup) ppd->linkup = 0; /* clear HW details of the previous connection */ + ppd->actual_vls_operational = 0; reset_link_credits(dd); /* freeze after a link down to guarantee a clean egress */ @@ -196,7 +197,7 @@ void handle_user_interrupt(struct hfi1_ctxtdata *rcd) if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) { wake_up_interruptible(&rcd->wait); - hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd); } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG, &rcd->event_flags)) { rcd->urgent++; diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index d9740ddea6f1..591697d85eed 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -106,7 +106,9 @@ struct iowait { struct sdma_engine *sde, struct iowait *wait, struct sdma_txreq *tx, - unsigned seq); + uint seq, + bool pkts_sent + ); void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); seqlock_t *lock; @@ -118,6 +120,7 @@ struct iowait { u32 count; u32 tx_limit; u32 tx_count; + u8 starved_cnt; }; #define SDMA_AVAIL_REASON 0 @@ -143,7 +146,8 @@ static inline void iowait_init( struct sdma_engine *sde, struct iowait *wait, struct sdma_txreq *tx, - unsigned seq), + uint seq, + bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), void (*sdma_drained)(struct iowait *wait)) { @@ -305,4 +309,66 @@ static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) return tx; } +/** + * iowait_queue - Put the iowait on a wait queue + * @pkts_sent: have some packets been sent before queuing? + * @w: the iowait struct + * @wait_head: the wait queue + * + * This function is called to insert an iowait struct into a + * wait queue after a resource (eg, sdma decriptor or pio + * buffer) is run out. + */ +static inline void iowait_queue(bool pkts_sent, struct iowait *w, + struct list_head *wait_head) +{ + /* + * To play fair, insert the iowait at the tail of the wait queue if it + * has already sent some packets; Otherwise, put it at the head. + */ + if (pkts_sent) { + list_add_tail(&w->list, wait_head); + w->starved_cnt = 0; + } else { + list_add(&w->list, wait_head); + w->starved_cnt++; + } +} + +/** + * iowait_starve_clear - clear the wait queue's starve count + * @pkts_sent: have some packets been sent? + * @w: the iowait struct + * + * This function is called to clear the starve count. If no + * packets have been sent, the starve count will not be cleared. + */ +static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w) +{ + if (pkts_sent) + w->starved_cnt = 0; +} + +/** + * iowait_starve_find_max - Find the maximum of the starve count + * @w: the iowait struct + * @max: a variable containing the max starve count + * @idx: the index of the current iowait in an array + * @max_idx: a variable containing the array index for the + * iowait entry that has the max starve count + * + * This function is called to compare the starve count of a + * given iowait with the given max starve count. The max starve + * count and the index will be updated if the iowait's start + * count is larger. + */ +static inline void iowait_starve_find_max(struct iowait *w, u8 *max, + uint idx, uint *max_idx) +{ + if (w->starved_cnt > *max) { + *max = w->starved_cnt; + *max_idx = idx; + } +} + #endif diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 5977673a52d4..f4c0ffc040cc 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -46,6 +46,7 @@ */ #include <linux/net.h> +#include <rdma/opa_addr.h> #define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \ / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16))) @@ -59,6 +60,24 @@ #define OPA_LINK_WIDTH_RESET_OLD 0x0fff #define OPA_LINK_WIDTH_RESET 0xffff +struct trap_node { + struct list_head list; + struct opa_mad_notice_attr data; + __be64 tid; + int len; + u32 retry; + u8 in_use; + u8 repress; +}; + +static int smp_length_check(u32 data_size, u32 request_len) +{ + if (unlikely(request_len < data_size)) + return -EINVAL; + + return 0; +} + static int reply(struct ib_mad_hdr *smp) { /* @@ -89,28 +108,222 @@ void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port) ib_dispatch_event(&event); } -static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) +/* + * If the port is down, clean up all pending traps. We need to be careful + * with the given trap, because it may be queued. + */ +static void cleanup_traps(struct hfi1_ibport *ibp, struct trap_node *trap) +{ + struct trap_node *node, *q; + unsigned long flags; + struct list_head trap_list; + int i; + + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + list_replace_init(&ibp->rvp.trap_lists[i].list, &trap_list); + ibp->rvp.trap_lists[i].list_len = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + /* + * Remove all items from the list, freeing all the non-given + * traps. + */ + list_for_each_entry_safe(node, q, &trap_list, list) { + list_del(&node->list); + if (node != trap) + kfree(node); + } + } + + /* + * If this wasn't on one of the lists it would not be freed. If it + * was on the list, it is now safe to free. + */ + kfree(trap); +} + +static struct trap_node *check_and_add_trap(struct hfi1_ibport *ibp, + struct trap_node *trap) +{ + struct trap_node *node; + struct trap_list *trap_list; + unsigned long flags; + unsigned long timeout; + int found = 0; + unsigned int queue_id; + static int trap_count; + + queue_id = trap->data.generic_type & 0x0F; + if (queue_id >= RVT_MAX_TRAP_LISTS) { + trap_count++; + pr_err_ratelimited("hfi1: Invalid trap 0x%0x dropped. Total dropped: %d\n", + trap->data.generic_type, trap_count); + kfree(trap); + return NULL; + } + + /* + * Since the retry (handle timeout) does not remove a trap request + * from the list, all we have to do is compare the node. + */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + trap_list = &ibp->rvp.trap_lists[queue_id]; + + list_for_each_entry(node, &trap_list->list, list) { + if (node == trap) { + node->retry++; + found = 1; + break; + } + } + + /* If it is not on the list, add it, limited to RVT-MAX_TRAP_LEN. */ + if (!found) { + if (trap_list->list_len < RVT_MAX_TRAP_LEN) { + trap_list->list_len++; + list_add_tail(&trap->list, &trap_list->list); + } else { + pr_warn_ratelimited("hfi1: Maximum trap limit reached for 0x%0x traps\n", + trap->data.generic_type); + kfree(trap); + } + } + + /* + * Next check to see if there is a timer pending. If not, set it up + * and get the first trap from the list. + */ + node = NULL; + if (!timer_pending(&ibp->rvp.trap_timer)) { + /* + * o14-2 + * If the time out is set we have to wait until it expires + * before the trap can be sent. + * This should be > RVT_TRAP_TIMEOUT + */ + timeout = (RVT_TRAP_TIMEOUT * + (1UL << ibp->rvp.subnet_timeout)) / 1000; + mod_timer(&ibp->rvp.trap_timer, + jiffies + usecs_to_jiffies(timeout)); + node = list_first_entry(&trap_list->list, struct trap_node, + list); + node->in_use = 1; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + return node; +} + +static void subn_handle_opa_trap_repress(struct hfi1_ibport *ibp, + struct opa_smp *smp) +{ + struct trap_list *trap_list; + struct trap_node *trap; + unsigned long flags; + int i; + + if (smp->attr_id != IB_SMP_ATTR_NOTICE) + return; + + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + trap_list = &ibp->rvp.trap_lists[i]; + trap = list_first_entry_or_null(&trap_list->list, + struct trap_node, list); + if (trap && trap->tid == smp->tid) { + if (trap->in_use) { + trap->repress = 1; + } else { + trap_list->list_len--; + list_del(&trap->list); + kfree(trap); + } + break; + } + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); +} + +static void hfi1_update_sm_ah_attr(struct hfi1_ibport *ibp, + struct rdma_ah_attr *attr, u32 dlid) +{ + rdma_ah_set_dlid(attr, dlid); + rdma_ah_set_port_num(attr, ppd_from_ibp(ibp)->port); + if (dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + grh->sgid_index = 0; + grh->hop_limit = 1; + grh->dgid.global.subnet_prefix = + ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = OPA_MAKE_ID(dlid); + } +} + +static int hfi1_modify_qp0_ah(struct hfi1_ibport *ibp, + struct rvt_ah *ah, u32 dlid) +{ + struct rdma_ah_attr attr; + struct rvt_qp *qp0; + int ret = -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.type = ah->ibah.type; + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ret = rdma_modify_ah(&ah->ibah, &attr); + rcu_read_unlock(); + return ret; +} + +static struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u32 dlid) +{ + struct rdma_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct rvt_qp *qp0; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ppd(ppd); + u8 port_num = ppd->port; + + memset(&attr, 0, sizeof(attr)); + attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ah = rdma_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + +static void send_trap(struct hfi1_ibport *ibp, struct trap_node *trap) { struct ib_mad_send_buf *send_buf; struct ib_mad_agent *agent; struct opa_smp *smp; - int ret; unsigned long flags; - unsigned long timeout; int pkey_idx; u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp; agent = ibp->rvp.send_agent; - if (!agent) + if (!agent) { + cleanup_traps(ibp, trap); return; + } /* o14-3.2.1 */ - if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE) + if (driver_lstate(ppd_from_ibp(ibp)) != IB_PORT_ACTIVE) { + cleanup_traps(ibp, trap); return; + } - /* o14-2 */ - if (ibp->rvp.trap_timeout && time_before(jiffies, - ibp->rvp.trap_timeout)) + /* Add the trap to the list if necessary and see if we can send it */ + trap = check_and_add_trap(ibp, trap); + if (!trap) return; pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); @@ -131,11 +344,21 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; smp->class_version = OPA_SM_CLASS_VERSION; smp->method = IB_MGMT_METHOD_TRAP; - ibp->rvp.tid++; - smp->tid = cpu_to_be64(ibp->rvp.tid); + + /* Only update the transaction ID for new traps (o13-5). */ + if (trap->tid == 0) { + ibp->rvp.tid++; + /* make sure that tid != 0 */ + if (ibp->rvp.tid == 0) + ibp->rvp.tid++; + trap->tid = cpu_to_be64(ibp->rvp.tid); + } + smp->tid = trap->tid; + smp->attr_id = IB_SMP_ATTR_NOTICE; /* o14-1: smp->mkey = 0; */ - memcpy(smp->route.lid.data, data, len); + + memcpy(smp->route.lid.data, &trap->data, trap->len); spin_lock_irqsave(&ibp->rvp.lock, flags); if (!ibp->rvp.sm_ah) { @@ -144,65 +367,101 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid); if (IS_ERR(ah)) { - ret = PTR_ERR(ah); - } else { - send_buf->ah = ah; - ibp->rvp.sm_ah = ibah_to_rvtah(ah); - ret = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; } + send_buf->ah = ah; + ibp->rvp.sm_ah = ibah_to_rvtah(ah); } else { - ret = -EINVAL; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; } } else { send_buf->ah = &ibp->rvp.sm_ah->ibah; - ret = 0; } + + /* + * If the trap was repressed while things were getting set up, don't + * bother sending it. This could happen for a retry. + */ + if (trap->repress) { + list_del(&trap->list); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + kfree(trap); + ib_free_send_mad(send_buf); + return; + } + + trap->in_use = 0; spin_unlock_irqrestore(&ibp->rvp.lock, flags); - if (!ret) - ret = ib_post_send_mad(send_buf, NULL); - if (!ret) { - /* 4.096 usec. */ - timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000; - ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout); - } else { + if (ib_post_send_mad(send_buf, NULL)) ib_free_send_mad(send_buf); - ibp->rvp.trap_timeout = 0; +} + +void hfi1_handle_trap_timer(unsigned long data) +{ + struct hfi1_ibport *ibp = (struct hfi1_ibport *)data; + struct trap_node *trap = NULL; + unsigned long flags; + int i; + + /* Find the trap with the highest priority */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; !trap && i < RVT_MAX_TRAP_LISTS; i++) { + trap = list_first_entry_or_null(&ibp->rvp.trap_lists[i].list, + struct trap_node, list); } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (trap) + send_trap(ibp, trap); +} + +static struct trap_node *create_trap_node(u8 type, __be16 trap_num, u32 lid) +{ + struct trap_node *trap; + + trap = kzalloc(sizeof(*trap), GFP_ATOMIC); + if (!trap) + return NULL; + + INIT_LIST_HEAD(&trap->list); + trap->data.generic_type = type; + trap->data.prod_type_lsb = IB_NOTICE_PROD_CA; + trap->data.trap_num = trap_num; + trap->data.issuer_lid = cpu_to_be32(lid); + + return trap; } /* - * Send a bad [PQ]_Key trap (ch. 14.3.8). + * Send a bad P_Key trap (ch. 14.3.8). */ -void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, - u32 qp1, u32 qp2, u16 lid1, u16 lid2) +void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, u32 lid1, u32 lid2) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - u32 _lid1 = lid1; - u32 _lid2 = lid2; - - memset(&data, 0, sizeof(data)); - if (trap_num == OPA_TRAP_BAD_P_KEY) - ibp->rvp.pkey_violations++; - else - ibp->rvp.qkey_violations++; ibp->rvp.n_pkt_drops++; + ibp->rvp.pkey_violations++; + + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_P_KEY, + lid); + if (!trap) + return; /* Send violation trap */ - data.generic_type = IB_NOTICE_TYPE_SECURITY; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = trap_num; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_257_258.lid1 = cpu_to_be32(_lid1); - data.ntc_257_258.lid2 = cpu_to_be32(_lid2); - data.ntc_257_258.key = cpu_to_be32(key); - data.ntc_257_258.sl = sl << 3; - data.ntc_257_258.qp1 = cpu_to_be32(qp1); - data.ntc_257_258.qp2 = cpu_to_be32(qp2); - - send_trap(ibp, &data, sizeof(data)); + trap->data.ntc_257_258.lid1 = cpu_to_be32(lid1); + trap->data.ntc_257_258.lid2 = cpu_to_be32(lid2); + trap->data.ntc_257_258.key = cpu_to_be32(key); + trap->data.ntc_257_258.sl = sl << 3; + trap->data.ntc_257_258.qp1 = cpu_to_be32(qp1); + trap->data.ntc_257_258.qp2 = cpu_to_be32(qp2); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -211,34 +470,36 @@ void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_M_KEY, + lid); + if (!trap) + return; + /* Send violation trap */ - data.generic_type = IB_NOTICE_TYPE_SECURITY; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_BAD_M_KEY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_256.lid = data.issuer_lid; - data.ntc_256.method = mad->method; - data.ntc_256.attr_id = mad->attr_id; - data.ntc_256.attr_mod = mad->attr_mod; - data.ntc_256.mkey = mkey; + trap->data.ntc_256.lid = trap->data.issuer_lid; + trap->data.ntc_256.method = mad->method; + trap->data.ntc_256.attr_id = mad->attr_id; + trap->data.ntc_256.attr_mod = mad->attr_mod; + trap->data.ntc_256.mkey = mkey; if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - data.ntc_256.dr_slid = dr_slid; - data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; - if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) { - data.ntc_256.dr_trunc_hop |= + trap->data.ntc_256.dr_slid = dr_slid; + trap->data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + if (hop_cnt > ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path)) { + trap->data.ntc_256.dr_trunc_hop |= IB_NOTICE_TRAP_DR_TRUNC; - hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path); + hop_cnt = ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path); } - data.ntc_256.dr_trunc_hop |= hop_cnt; - memcpy(data.ntc_256.dr_rtn_path, return_path, + trap->data.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(trap->data.ntc_256.dr_rtn_path, return_path, hop_cnt); } - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + + send_trap(ibp, trap); } /* @@ -246,22 +507,24 @@ static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, */ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) { - struct opa_mad_notice_attr data; + struct trap_node *trap; struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); struct hfi1_devdata *dd = dd_from_dev(verbs_dev); struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_144.lid = data.issuer_lid; - data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + trap->data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -269,19 +532,19 @@ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) */ void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, OPA_TRAP_CHANGE_SYSGUID, + lid); + if (!trap) + return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_SYSGUID; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; - data.ntc_145.lid = data.issuer_lid; + trap->data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; + trap->data.ntc_145.lid = trap->data.issuer_lid; - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -289,29 +552,30 @@ void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) */ void hfi1_node_desc_chg(struct hfi1_ibport *ibp) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_144.lid = data.issuer_lid; - data.ntc_144.change_flags = + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.change_flags = cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG); - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, - u8 port, u32 *resp_len) + u8 port, u32 *resp_len, u32 max_len) { struct opa_node_description *nd; - if (am) { + if (am || smp_length_check(sizeof(*nd), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -328,7 +592,7 @@ static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am, static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct opa_node_info *ni; struct hfi1_devdata *dd = dd_from_ibdev(ibdev); @@ -338,6 +602,7 @@ static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data, /* GUID 0 is illegal */ if (am || pidx >= dd->num_pports || ibdev->node_guid == 0 || + smp_length_check(sizeof(*ni), max_len) || get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); @@ -519,7 +784,7 @@ void read_ltp_rtt(struct hfi1_devdata *dd) static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { int i; struct hfi1_devdata *dd; @@ -535,7 +800,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, u32 buffer_units; u64 tmp = 0; - if (num_ports != 1) { + if (num_ports != 1 || smp_length_check(sizeof(*pi), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -605,7 +870,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, ppd->offline_disabled_reason; pi->port_states.portphysstate_portstate = - (hfi1_ibphys_portstate(ppd) << 4) | state; + (driver_pstate(ppd) << 4) | state; pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc; @@ -704,13 +969,9 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT; pi->buffer_units = cpu_to_be32(buffer_units); - pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported | - OPA_CAP_MASK3_IsEthOnFabricSupported); - /* Driver does not support mcast/collective configuration */ - pi->opa_cap_mask &= - cpu_to_be16(~OPA_CAP_MASK3_IsAddrRangeConfigSupported); - pi->collectivemask_multicastmask = ((HFI1_COLLECTIVE_NR & 0x7) - << 3 | (HFI1_MCAST_NR & 0x7)); + pi->opa_cap_mask = cpu_to_be16(ibp->rvp.port_cap3_flags); + pi->collectivemask_multicastmask = ((OPA_COLLECTIVE_NR & 0x7) + << 3 | (OPA_MCAST_NR & 0x7)); /* HFI supports a replay buffer 128 LTPs in size */ pi->replay_depth.buffer = 0x80; @@ -748,7 +1009,7 @@ static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_devdata *dd = dd_from_ibdev(ibdev); u32 n_blocks_req = OPA_AM_NBLK(am); @@ -771,6 +1032,11 @@ static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16); + if (smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + if (start_block + n_blocks_req > n_blocks_avail || n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) { pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; " @@ -915,8 +1181,8 @@ static int physical_transition_allowed(int old, int new) static int port_states_transition_allowed(struct hfi1_pportdata *ppd, u32 logical_new, u32 physical_new) { - u32 physical_old = driver_physical_state(ppd); - u32 logical_old = driver_logical_state(ppd); + u32 physical_old = driver_pstate(ppd); + u32 logical_old = driver_lstate(ppd); int ret, logical_allowed, physical_allowed; ret = logical_transition_allowed(logical_old, logical_new); @@ -1074,7 +1340,7 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, */ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct opa_port_info *pi = (struct opa_port_info *)data; struct ib_event event; @@ -1083,8 +1349,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct hfi1_ibport *ibp; u8 clientrereg; unsigned long flags; - u32 smlid, opa_lid; /* tmp vars to hold LID values */ - u16 lid; + u32 smlid; + u32 lid; u8 ls_old, ls_new, ps_new; u8 vls; u8 msl; @@ -1095,27 +1361,26 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, int ret, i, invalid = 0, call_set_mtu = 0; int call_link_downgrade_policy = 0; - if (num_ports != 1) { + if (num_ports != 1 || + smp_length_check(sizeof(*pi), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } - opa_lid = be32_to_cpu(pi->lid); - if (opa_lid & 0xFFFF0000) { - pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid); + lid = be32_to_cpu(pi->lid); + if (lid & 0xFF000000) { + pr_warn("OPA_PortInfo lid out of range: %X\n", lid); smp->status |= IB_SMP_INVALID_FIELD; goto get_only; } - lid = (u16)(opa_lid & 0x0000FFFF); smlid = be32_to_cpu(pi->sm_lid); - if (smlid & 0xFFFF0000) { + if (smlid & 0xFF000000) { pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid); smp->status |= IB_SMP_INVALID_FIELD; goto get_only; } - smlid &= 0x0000FFFF; clientrereg = (pi->clientrereg_subnettimeout & OPA_PI_MASK_CLIENT_REREGISTER); @@ -1130,12 +1395,16 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, ls_old = driver_lstate(ppd); ibp->rvp.mkey = pi->mkey; - ibp->rvp.gid_prefix = pi->subnet_prefix; + if (ibp->rvp.gid_prefix != pi->subnet_prefix) { + ibp->rvp.gid_prefix = pi->subnet_prefix; + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); /* Must be a valid unicast LID address. */ if ((lid == 0 && ls_old > IB_PORT_INIT) || - lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + (hfi1_is_16B_mcast(lid))) { smp->status |= IB_SMP_INVALID_FIELD; pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n", lid); @@ -1148,6 +1417,16 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC); event.event = IB_EVENT_LID_CHANGE; ib_dispatch_event(&event); + + if (HFI1_PORT_GUID_INDEX + 1 < HFI1_GUIDS_PER_PORT) { + /* Manufacture GID from LID to support extended + * addresses + */ + ppd->guids[HFI1_PORT_GUID_INDEX + 1] = + be64_to_cpu(OPA_MAKE_ID(lid)); + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } } msl = pi->smsl & OPA_PI_MASK_SMSL; @@ -1158,7 +1437,7 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, /* Must be a valid unicast LID address. */ if ((smlid == 0 && ls_old > IB_PORT_INIT) || - smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + (hfi1_is_16B_mcast(smlid))) { smp->status |= IB_SMP_INVALID_FIELD; pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid); } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { @@ -1166,7 +1445,7 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, spin_lock_irqsave(&ibp->rvp.lock, flags); if (ibp->rvp.sm_ah) { if (smlid != ibp->rvp.sm_lid) - rdma_ah_set_dlid(&ibp->rvp.sm_ah->attr, smlid); + hfi1_modify_qp0_ah(ibp, ibp->rvp.sm_ah, smlid); if (msl != ibp->rvp.sm_sl) rdma_ah_set_sl(&ibp->rvp.sm_ah->attr, msl); } @@ -1346,7 +1625,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, if (ret) return ret; - ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len); + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len, + max_len); /* restore re-reg bit per o14-12.2.1 */ pi->clientrereg_subnettimeout |= clientrereg; @@ -1363,7 +1643,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, return ret; get_only: - return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len, + max_len); } /** @@ -1424,7 +1705,7 @@ static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_devdata *dd = dd_from_ibdev(ibdev); u32 n_blocks_sent = OPA_AM_NBLK(am); @@ -1434,6 +1715,7 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, int i; u16 n_blocks_avail; unsigned npkeys = hfi1_get_npkeys(dd); + u32 size = 0; if (n_blocks_sent == 0) { pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n", @@ -1444,6 +1726,13 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; + size = sizeof(u16) * (n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE); + + if (smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + if (start_block + n_blocks_sent > n_blocks_avail || n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) { pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n", @@ -1461,7 +1750,8 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len, + max_len); } #define ILLEGAL_VL 12 @@ -1522,14 +1812,14 @@ static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data) static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_ibport *ibp = to_iport(ibdev, port); u8 *p = data; size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */ unsigned i; - if (am) { + if (am || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1545,14 +1835,15 @@ static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_ibport *ibp = to_iport(ibdev, port); u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sl_to_sc); int i; u8 sc; - if (am) { + if (am || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1567,19 +1858,20 @@ static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, } } - return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len, + max_len); } static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_ibport *ibp = to_iport(ibdev, port); u8 *p = data; size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */ unsigned i; - if (am) { + if (am || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1595,13 +1887,14 @@ static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_ibport *ibp = to_iport(ibdev, port); + size_t size = ARRAY_SIZE(ibp->sc_to_sl); u8 *p = data; int i; - if (am) { + if (am || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1609,19 +1902,20 @@ static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) ibp->sc_to_sl[i] = *p++; - return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len, + max_len); } static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { u32 n_blocks = OPA_AM_NBLK(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); void *vp = (void *)data; size_t size = 4 * sizeof(u64); - if (n_blocks != 1) { + if (n_blocks != 1 || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1636,7 +1930,7 @@ static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { u32 n_blocks = OPA_AM_NBLK(am); int async_update = OPA_AM_ASYNC(am); @@ -1644,8 +1938,15 @@ static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, void *vp = (void *)data; struct hfi1_pportdata *ppd; int lstate; + /* + * set_sc2vlt_tables writes the information contained in *data + * to four 64-bit registers SendSC2VLt[0-3]. We need to make + * sure *max_len is not greater than the total size of the four + * SendSC2VLt[0-3] registers. + */ + size_t size = 4 * sizeof(u64); - if (n_blocks != 1 || async_update) { + if (n_blocks != 1 || async_update || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1665,27 +1966,28 @@ static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, set_sc2vlt_tables(dd, vp); - return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len, + max_len); } static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { u32 n_blocks = OPA_AM_NPORT(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd; void *vp = (void *)data; - int size; + int size = sizeof(struct sc2vlnt); - if (n_blocks != 1) { + if (n_blocks != 1 || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } ppd = dd->pport + (port - 1); - size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp); + fm_get_table(ppd, FM_TBL_SC2VLNT, vp); if (resp_len) *resp_len += size; @@ -1695,15 +1997,16 @@ static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { u32 n_blocks = OPA_AM_NPORT(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd; void *vp = (void *)data; int lstate; + int size = sizeof(struct sc2vlnt); - if (n_blocks != 1) { + if (n_blocks != 1 || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1721,12 +2024,12 @@ static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, fm_set_table(ppd, FM_TBL_SC2VLNT, vp); return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); } static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { u32 nports = OPA_AM_NPORT(am); u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); @@ -1735,7 +2038,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, struct hfi1_pportdata *ppd; struct opa_port_state_info *psi = (struct opa_port_state_info *)data; - if (nports != 1) { + if (nports != 1 || smp_length_check(sizeof(*psi), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1755,7 +2058,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, ppd->offline_disabled_reason; psi->port_states.portphysstate_portstate = - (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf); + (driver_pstate(ppd) << 4) | (lstate & 0xf); psi->link_width_downgrade_tx_active = cpu_to_be16(ppd->link_width_downgrade_tx_active); psi->link_width_downgrade_rx_active = @@ -1768,7 +2071,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { u32 nports = OPA_AM_NPORT(am); u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); @@ -1779,7 +2082,7 @@ static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, struct opa_port_state_info *psi = (struct opa_port_state_info *)data; int ret, invalid = 0; - if (nports != 1) { + if (nports != 1 || smp_length_check(sizeof(*psi), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1809,19 +2112,21 @@ static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, if (invalid) smp->status |= IB_SMP_INVALID_FIELD; - return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len, + max_len); } static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_devdata *dd = dd_from_ibdev(ibdev); u32 addr = OPA_AM_CI_ADDR(am); u32 len = OPA_AM_CI_LEN(am) + 1; int ret; - if (dd->pport->port_type != PORT_TYPE_QSFP) { + if (dd->pport->port_type != PORT_TYPE_QSFP || + smp_length_check(len, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1864,21 +2169,22 @@ static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data, } static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data, - struct ib_device *ibdev, u8 port, u32 *resp_len) + struct ib_device *ibdev, u8 port, u32 *resp_len, + u32 max_len) { u32 num_ports = OPA_AM_NPORT(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd; struct buffer_control *p = (struct buffer_control *)data; - int size; + int size = sizeof(struct buffer_control); - if (num_ports != 1) { + if (num_ports != 1 || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } ppd = dd->pport + (port - 1); - size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p); + fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p); trace_bct_get(dd, p); if (resp_len) *resp_len += size; @@ -1887,14 +2193,15 @@ static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data, } static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data, - struct ib_device *ibdev, u8 port, u32 *resp_len) + struct ib_device *ibdev, u8 port, u32 *resp_len, + u32 max_len) { u32 num_ports = OPA_AM_NPORT(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd; struct buffer_control *p = (struct buffer_control *)data; - if (num_ports != 1) { + if (num_ports != 1 || smp_length_check(sizeof(*p), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1905,41 +2212,43 @@ static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len, + max_len); } static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); u32 num_ports = OPA_AM_NPORT(am); u8 section = (am & 0x00ff0000) >> 16; u8 *p = data; - int size = 0; + int size = 256; - if (num_ports != 1) { + if (num_ports != 1 || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } switch (section) { case OPA_VLARB_LOW_ELEMENTS: - size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p); + fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p); break; case OPA_VLARB_HIGH_ELEMENTS: - size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p); + fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p); break; case OPA_VLARB_PREEMPT_ELEMENTS: - size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p); + fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p); break; case OPA_VLARB_PREEMPT_MATRIX: - size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p); + fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p); break; default: pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n", be32_to_cpu(smp->attr_mod)); smp->status |= IB_SMP_INVALID_FIELD; + size = 0; break; } @@ -1951,14 +2260,15 @@ static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); u32 num_ports = OPA_AM_NPORT(am); u8 section = (am & 0x00ff0000) >> 16; u8 *p = data; + int size = 256; - if (num_ports != 1) { + if (num_ports != 1 || smp_length_check(size, max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1986,7 +2296,8 @@ static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, break; } - return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len, + max_len); } struct opa_pma_mad { @@ -3282,13 +3593,18 @@ struct opa_congestion_info_attr { static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct opa_congestion_info_attr *p = (struct opa_congestion_info_attr *)data; struct hfi1_ibport *ibp = to_iport(ibdev, port); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + p->congestion_info = 0; p->control_table_cap = ppd->cc_max_table_entries; p->congestion_log_length = OPA_CONG_LOG_ELEMS; @@ -3301,7 +3617,7 @@ static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data, static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, - u8 port, u32 *resp_len) + u8 port, u32 *resp_len, u32 max_len) { int i; struct opa_congestion_setting_attr *p = @@ -3311,6 +3627,11 @@ static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, struct opa_congestion_setting_entry_shadow *entries; struct cc_state *cc_state; + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + rcu_read_lock(); cc_state = get_cc_state(ppd); @@ -3385,7 +3706,7 @@ static void apply_cc_state(struct hfi1_pportdata *ppd) static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct opa_congestion_setting_attr *p = (struct opa_congestion_setting_attr *)data; @@ -3394,6 +3715,11 @@ static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, struct opa_congestion_setting_entry_shadow *entries; int i; + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + /* * Save details from packet into the ppd. Hold the cc_state_lock so * our information is consistent with anyone trying to apply the state. @@ -3415,12 +3741,12 @@ static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, apply_cc_state(ppd); return __subn_get_opa_cong_setting(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); } static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, - u8 port, u32 *resp_len) + u8 port, u32 *resp_len, u32 max_len) { struct hfi1_ibport *ibp = to_iport(ibdev, port); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); @@ -3428,7 +3754,7 @@ static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am, s64 ts; int i; - if (am != 0) { + if (am || smp_length_check(sizeof(*cong_log), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -3486,7 +3812,7 @@ static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am, static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct ib_cc_table_attr *cc_table_attr = (struct ib_cc_table_attr *)data; @@ -3498,9 +3824,10 @@ static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, int i, j; u32 sentry, eentry; struct cc_state *cc_state; + u32 size = sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); /* sanity check n_blocks, start_block */ - if (n_blocks == 0 || + if (n_blocks == 0 || smp_length_check(size, max_len) || start_block + n_blocks > ppd->cc_max_table_entries) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); @@ -3530,14 +3857,14 @@ static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, rcu_read_unlock(); if (resp_len) - *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); + *resp_len += size; return reply((struct ib_mad_hdr *)smp); } static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -3548,9 +3875,10 @@ static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, int i, j; u32 sentry, eentry; u16 ccti_limit; + u32 size = sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); /* sanity check n_blocks, start_block */ - if (n_blocks == 0 || + if (n_blocks == 0 || smp_length_check(size, max_len) || start_block + n_blocks > ppd->cc_max_table_entries) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); @@ -3581,7 +3909,8 @@ static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, /* now apply the information */ apply_cc_state(ppd); - return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len, + max_len); } struct opa_led_info { @@ -3594,7 +3923,7 @@ struct opa_led_info { static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd = dd->pport; @@ -3602,7 +3931,7 @@ static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, u32 nport = OPA_AM_NPORT(am); u32 is_beaconing_active; - if (nport != 1) { + if (nport != 1 || smp_length_check(sizeof(*p), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -3624,14 +3953,14 @@ static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct opa_led_info *p = (struct opa_led_info *)data; u32 nport = OPA_AM_NPORT(am); int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK); - if (nport != 1) { + if (nport != 1 || smp_length_check(sizeof(*p), max_len)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -3641,12 +3970,13 @@ static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, else shutdown_led_override(dd->pport); - return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len); + return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len, + max_len); } static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { int ret; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -3654,71 +3984,71 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, switch (attr_id) { case IB_SMP_ATTR_NODE_DESC: ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_NODE_INFO: ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_PORT_INFO: ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_PKEY_TABLE: ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_SL_TO_SC_MAP: ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_SC_TO_SL_MAP: ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_SC_TO_VLT_MAP: ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_PORT_STATE_INFO: ret = __subn_get_opa_psi(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: ret = __subn_get_opa_bct(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_CABLE_INFO: ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_VL_ARB_TABLE: ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_CONGESTION_INFO: ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: ret = __subn_get_opa_cong_setting(smp, am, data, ibdev, - port, resp_len); + port, resp_len, max_len); break; case OPA_ATTRIB_ID_HFI_CONGESTION_LOG: ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev, - port, resp_len); + port, resp_len, max_len); break; case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_LED_INFO: ret = __subn_get_opa_led_info(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_SM_INFO: if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) @@ -3736,7 +4066,7 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, u32 max_len) { int ret; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -3744,51 +4074,51 @@ static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, switch (attr_id) { case IB_SMP_ATTR_PORT_INFO: ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_PKEY_TABLE: ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_SL_TO_SC_MAP: ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_SC_TO_SL_MAP: ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_SC_TO_VLT_MAP: ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_PORT_STATE_INFO: ret = __subn_set_opa_psi(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: ret = __subn_set_opa_bct(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_VL_ARB_TABLE: ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: ret = __subn_set_opa_cong_setting(smp, am, data, ibdev, - port, resp_len); + port, resp_len, max_len); break; case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_LED_INFO: ret = __subn_set_opa_led_info(smp, am, data, ibdev, port, - resp_len); + resp_len, max_len); break; case IB_SMP_ATTR_SM_INFO: if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) @@ -3844,7 +4174,10 @@ static int subn_get_opa_aggregate(struct opa_smp *smp, memset(next_smp + sizeof(*agg), 0, agg_data_len); (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data, - ibdev, port, NULL); + ibdev, port, NULL, (u32)agg_data_len); + + if (smp->status & IB_SMP_INVALID_FIELD) + break; if (smp->status & ~IB_SMP_DIRECTION) { set_aggr_error(agg); return reply((struct ib_mad_hdr *)smp); @@ -3887,7 +4220,9 @@ static int subn_set_opa_aggregate(struct opa_smp *smp, } (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data, - ibdev, port, NULL); + ibdev, port, NULL, (u32)agg_data_len); + if (smp->status & IB_SMP_INVALID_FIELD) + break; if (smp->status & ~IB_SMP_DIRECTION) { set_aggr_error(agg); return reply((struct ib_mad_hdr *)smp); @@ -3958,7 +4293,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp, const struct ib_wc *in_wc) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u16 slid = in_wc->slid; + u16 slid = ib_lid_cpu16(in_wc->slid); u16 pkey; if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) @@ -3997,12 +4332,13 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, struct opa_smp *smp = (struct opa_smp *)out_mad; struct hfi1_ibport *ibp = to_iport(ibdev, port); u8 *data; - u32 am; + u32 am, data_size; __be16 attr_id; int ret; *out_mad = *in_mad; data = opa_get_smp_data(smp); + data_size = (u32)opa_get_smp_data_size(smp); am = be32_to_cpu(smp->attr_mod); attr_id = smp->attr_id; @@ -4046,7 +4382,8 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, default: clear_opa_smp_data(smp); ret = subn_get_opa_sma(attr_id, smp, am, data, - ibdev, port, resp_len); + ibdev, port, resp_len, + data_size); break; case OPA_ATTRIB_ID_AGGREGATE: ret = subn_get_opa_aggregate(smp, ibdev, port, @@ -4058,7 +4395,8 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, switch (attr_id) { default: ret = subn_set_opa_sma(attr_id, smp, am, data, - ibdev, port, resp_len); + ibdev, port, resp_len, + data_size); break; case OPA_ATTRIB_ID_AGGREGATE: ret = subn_set_opa_aggregate(smp, ibdev, port, @@ -4077,6 +4415,11 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, */ ret = IB_MAD_RESULT_SUCCESS; break; + case IB_MGMT_METHOD_TRAP_REPRESS: + subn_handle_opa_trap_repress(ibp, smp); + /* Always successful */ + ret = IB_MAD_RESULT_SUCCESS; + break; default: smp->status |= IB_SMP_UNSUP_METHOD; ret = reply((struct ib_mad_hdr *)smp); diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index 5aa3fd1be653..4c1245072093 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -115,7 +115,7 @@ struct opa_mad_notice_attr { __be32 lid; /* LID where change occurred */ __be32 new_cap_mask; /* new capability mask */ __be16 reserved2; - __be16 cap_mask; + __be16 cap_mask3; __be16 change_flags; /* low 4 bits only */ } __packed ntc_144; @@ -428,5 +428,6 @@ struct sc2vlnt { COUNTER_MASK(1, 4)) void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port); +void hfi1_handle_trap_timer(unsigned long data); #endif /* _HFI1_MAD_H */ diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e4b56a0dd6d0..2f0d285dc278 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -169,9 +169,8 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, unsigned long flags; int ret = 0; + trace_hfi1_mmu_rb_insert(mnode->addr, mnode->len); spin_lock_irqsave(&handler->lock, flags); - hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr, - mnode->len); node = __mmu_rb_search(handler, mnode->addr, mnode->len); if (node) { ret = -EINVAL; @@ -197,7 +196,7 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, { struct mmu_rb_node *node = NULL; - hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len); + trace_hfi1_mmu_rb_search(addr, len); if (!handler->ops->filter) { node = __mmu_int_rb_iter_first(&handler->root, addr, (addr + len) - 1); @@ -214,21 +213,27 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, return node; } -struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, - unsigned long addr, unsigned long len) +bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler, + unsigned long addr, unsigned long len, + struct mmu_rb_node **rb_node) { struct mmu_rb_node *node; unsigned long flags; + bool ret = false; spin_lock_irqsave(&handler->lock, flags); node = __mmu_rb_search(handler, addr, len); if (node) { + if (node->addr == addr && node->len == len) + goto unlock; __mmu_int_rb_remove(node, &handler->root); list_del(&node->list); /* remove from LRU list */ + ret = true; } +unlock: spin_unlock_irqrestore(&handler->lock, flags); - - return node; + *rb_node = node; + return ret; } void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) @@ -272,8 +277,7 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, unsigned long flags; /* Validity of handler and node pointers has been checked by caller. */ - hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr, - node->len); + trace_hfi1_mmu_rb_remove(node->addr, node->len); spin_lock_irqsave(&handler->lock, flags); __mmu_int_rb_remove(node, &handler->root); list_del(&node->list); /* remove from LRU list */ @@ -306,8 +310,7 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, node; node = ptr) { /* Guard against node removal. */ ptr = __mmu_int_rb_iter_next(node, start, end - 1); - hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u", - node->addr, node->len); + trace_hfi1_mmu_mem_invalidate(node->addr, node->len); if (handler->ops->invalidate(handler->ops_arg, node)) { __mmu_int_rb_remove(node, root); /* move from LRU list to delete list */ diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 754f6ebf13fb..f04cec1e99d1 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -81,7 +81,8 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, struct mmu_rb_node *mnode); -struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, - unsigned long addr, unsigned long len); +bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler, + unsigned long addr, unsigned long len, + struct mmu_rb_node **rb_node); #endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/infiniband/hw/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h index 6ef3c1cbdcd7..774215b95df5 100644 --- a/drivers/infiniband/hw/hfi1/opa_compat.h +++ b/drivers/infiniband/hw/hfi1/opa_compat.h @@ -84,7 +84,8 @@ static inline u8 port_states_to_phys_state(struct opa_port_states *ps) /* * OPA port physical states * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState - * values. + * values are the same in OmniPath Architecture. OPA leverages some of the same + * concepts as InfiniBand, but has a few other states as well. * * When writing, only values 0-3 are valid, other values are ignored. * When reading, 0 is reserved. @@ -92,6 +93,8 @@ static inline u8 port_states_to_phys_state(struct opa_port_states *ps) * Returned by the ibphys_portstate() routine. */ enum opa_port_phys_state { + /* Values 0-7 have the same meaning in OPA as in InfiniBand. */ + IB_PORTPHYSSTATE_NOP = 0, /* 1 is reserved */ IB_PORTPHYSSTATE_POLLING = 2, @@ -101,9 +104,23 @@ enum opa_port_phys_state { IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6, IB_PORTPHYSSTATE_PHY_TEST = 7, /* 8 is reserved */ + + /* + * Offline: Port is quiet (transmitters disabled) due to lack of + * physical media, unsupported media, or transition between link up + * and next link up attempt + */ OPA_PORTPHYSSTATE_OFFLINE = 9, - OPA_PORTPHYSSTATE_GANGED = 10, + + /* 10 is reserved */ + + /* + * Phy_Test: Specific test patterns are transmitted, and receiver BER + * can be monitored. This facilitates signal integrity testing for the + * physical layer of the port. + */ OPA_PORTPHYSSTATE_TEST = 11, + OPA_PORTPHYSSTATE_MAX = 11, /* values 12-15 are reserved/ignored */ }; diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 6a9f6f9819e1..82447b7cdda1 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -68,7 +68,7 @@ /* * Code to adjust PCIe capabilities. */ -static void tune_pcie_caps(struct hfi1_devdata *); +static int tune_pcie_caps(struct hfi1_devdata *); /* * Do all the common PCIe setup and initialization. @@ -161,6 +161,7 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) { unsigned long len; resource_size_t addr; + int ret = 0; dd->pcidev = pdev; pci_set_drvdata(pdev, dd); @@ -179,47 +180,54 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) return -EINVAL; } - dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND); - if (!dd->kregbase) + dd->kregbase1 = ioremap_nocache(addr, RCV_ARRAY); + if (!dd->kregbase1) { + dd_dev_err(dd, "UC mapping of kregbase1 failed\n"); return -ENOMEM; + } + dd_dev_info(dd, "UC base1: %p for %x\n", dd->kregbase1, RCV_ARRAY); + dd->chip_rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT); + dd_dev_info(dd, "RcvArray count: %u\n", dd->chip_rcv_array_count); + dd->base2_start = RCV_ARRAY + dd->chip_rcv_array_count * 8; + + dd->kregbase2 = ioremap_nocache( + addr + dd->base2_start, + TXE_PIO_SEND - dd->base2_start); + if (!dd->kregbase2) { + dd_dev_err(dd, "UC mapping of kregbase2 failed\n"); + goto nomem; + } + dd_dev_info(dd, "UC base2: %p for %x\n", dd->kregbase2, + TXE_PIO_SEND - dd->base2_start); dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE); if (!dd->piobase) { - iounmap(dd->kregbase); - return -ENOMEM; + dd_dev_err(dd, "WC mapping of send buffers failed\n"); + goto nomem; } + dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE); - dd->flags |= HFI1_PRESENT; /* now register routines work */ - - dd->kregend = dd->kregbase + TXE_PIO_SEND; dd->physaddr = addr; /* used for io_remap, etc. */ /* - * Re-map the chip's RcvArray as write-combining to allow us + * Map the chip's RcvArray as write-combining to allow us * to write an entire cacheline worth of entries in one shot. - * If this re-map fails, just continue - the RcvArray programming - * function will handle both cases. */ - dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT); dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY, dd->chip_rcv_array_count * 8); - dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc); - /* - * Save BARs and command to rewrite after device reset. - */ - pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, &dd->pcibar0); - pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, &dd->pcibar1); - pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom); - pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command); - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl); - pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl); - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2, - &dd->pcie_devctl2); - pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0); - pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3); - pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2); + if (!dd->rcvarray_wc) { + dd_dev_err(dd, "WC mapping of receive array failed\n"); + goto nomem; + } + dd_dev_info(dd, "WC RcvArray: %p for %x\n", + dd->rcvarray_wc, dd->chip_rcv_array_count * 8); + dd->flags |= HFI1_PRESENT; /* chip.c CSR routines now work */ return 0; +nomem: + ret = -ENOMEM; + hfi1_pcie_ddcleanup(dd); + return ret; } /* @@ -229,59 +237,19 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) */ void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd) { - u64 __iomem *base = (void __iomem *)dd->kregbase; - dd->flags &= ~HFI1_PRESENT; - dd->kregbase = NULL; - iounmap(base); + if (dd->kregbase1) + iounmap(dd->kregbase1); + dd->kregbase1 = NULL; + if (dd->kregbase2) + iounmap(dd->kregbase2); + dd->kregbase2 = NULL; if (dd->rcvarray_wc) iounmap(dd->rcvarray_wc); + dd->rcvarray_wc = NULL; if (dd->piobase) iounmap(dd->piobase); -} - -static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt, - struct hfi1_msix_entry *hfi1_msix_entry) -{ - int ret; - int nvec = *msixcnt; - struct msix_entry *msix_entry; - int i; - - /* - * We can't pass hfi1_msix_entry array to msix_setup - * so use a dummy msix_entry array and copy the allocated - * irq back to the hfi1_msix_entry array. - */ - msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL); - if (!msix_entry) { - ret = -ENOMEM; - goto do_intx; - } - - for (i = 0; i < nvec; i++) - msix_entry[i] = hfi1_msix_entry[i].msix; - - ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); - if (ret < 0) - goto free_msix_entry; - nvec = ret; - - for (i = 0; i < nvec; i++) - hfi1_msix_entry[i].msix = msix_entry[i]; - - kfree(msix_entry); - *msixcnt = nvec; - return; - -free_msix_entry: - kfree(msix_entry); - -do_intx: - dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n", - nvec, ret); - *msixcnt = 0; - hfi1_enable_intx(dd->pcidev); + dd->piobase = NULL; } /* return the PCIe link speed from the given link status */ @@ -314,8 +282,14 @@ static u32 extract_width(u16 linkstat) static void update_lbus_info(struct hfi1_devdata *dd) { u16 linkstat; + int ret; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return; + } - pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); dd->lbus_width = extract_width(linkstat); dd->lbus_speed = extract_speed(linkstat); snprintf(dd->lbus_info, sizeof(dd->lbus_info), @@ -330,6 +304,7 @@ int pcie_speeds(struct hfi1_devdata *dd) { u32 linkcap; struct pci_dev *parent = dd->pcidev->bus->self; + int ret; if (!pci_is_pcie(dd->pcidev)) { dd_dev_err(dd, "Can't find PCI Express capability!\n"); @@ -339,7 +314,12 @@ int pcie_speeds(struct hfi1_devdata *dd) /* find if our max speed is Gen3 and parent supports Gen3 speeds */ dd->link_gen3_capable = 1; - pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap); + ret = pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return ret; + } + if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) { dd_dev_info(dd, "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n", @@ -364,49 +344,150 @@ int pcie_speeds(struct hfi1_devdata *dd) } /* - * Returns in *nent: - * - actual number of interrupts allocated + * Returns: + * - actual number of interrupts allocated or * - 0 if fell back to INTx. + * - error */ -void request_msix(struct hfi1_devdata *dd, u32 *nent, - struct hfi1_msix_entry *entry) +int request_msix(struct hfi1_devdata *dd, u32 msireq) { - int pos; + int nvec, ret; - pos = dd->pcidev->msix_cap; - if (*nent && pos) { - msix_setup(dd, pos, nent, entry); - /* did it, either MSI-X or INTx */ - } else { - *nent = 0; - hfi1_enable_intx(dd->pcidev); + nvec = pci_alloc_irq_vectors(dd->pcidev, 1, msireq, + PCI_IRQ_MSIX | PCI_IRQ_LEGACY); + if (nvec < 0) { + dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec); + return nvec; } - tune_pcie_caps(dd); + ret = tune_pcie_caps(dd); + if (ret) { + dd_dev_err(dd, "tune_pcie_caps() failed: %d\n", ret); + pci_free_irq_vectors(dd->pcidev); + return ret; + } + + /* check for legacy IRQ */ + if (nvec == 1 && !dd->pcidev->msix_enabled) + return 0; + + return nvec; } -void hfi1_enable_intx(struct pci_dev *pdev) +/* restore command and BARs after a reset has wiped them out */ +int restore_pci_variables(struct hfi1_devdata *dd) { - /* first, turn on INTx */ - pci_intx(pdev, 1); - /* then turn off MSI-X */ - pci_disable_msix(pdev); + int ret = 0; + + ret = pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + dd->pcibar0); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + dd->pcibar1); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, + dd->pcie_devctl); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, + dd->pcie_lnkctl); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2, + dd->pcie_devctl2); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, + dd->pci_lnkctl3); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2); + if (ret) + goto error; + + return 0; + +error: + dd_dev_err(dd, "Unable to write to PCI config\n"); + return ret; } -/* restore command and BARs after a reset has wiped them out */ -void restore_pci_variables(struct hfi1_devdata *dd) +/* Save BARs and command to rewrite after device reset */ +int save_pci_variables(struct hfi1_devdata *dd) { - pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command); - pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0); - pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1); - pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom); - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl); - pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl); - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2, - dd->pcie_devctl2); - pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0); - pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3); - pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2); + int ret = 0; + + ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + &dd->pcibar0); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + &dd->pcibar1); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom); + if (ret) + goto error; + + ret = pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, + &dd->pcie_devctl); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, + &dd->pcie_lnkctl); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2, + &dd->pcie_devctl2); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, + &dd->pci_lnkctl3); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2); + if (ret) + goto error; + + return 0; + +error: + dd_dev_err(dd, "Unable to read from PCI config\n"); + return ret; } /* @@ -421,21 +502,33 @@ uint aspm_mode = ASPM_MODE_DISABLED; module_param_named(aspm, aspm_mode, uint, S_IRUGO); MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); -static void tune_pcie_caps(struct hfi1_devdata *dd) +static int tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; u16 rc_mpss, rc_mps, ep_mpss, ep_mps; u16 rc_mrrs, ep_mrrs, max_mrrs, ectl; + int ret; /* * Turn on extended tags in DevCtl in case the BIOS has turned it off * to improve WFR SDMA bandwidth */ - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + ret = pcie_capability_read_word(dd->pcidev, + PCI_EXP_DEVCTL, &ectl); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return ret; + } + if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { dd_dev_info(dd, "Enabling PCIe extended tags\n"); ectl |= PCI_EXP_DEVCTL_EXT_TAG; - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); + ret = pcie_capability_write_word(dd->pcidev, + PCI_EXP_DEVCTL, ectl); + if (ret) { + dd_dev_err(dd, "Unable to write to PCI config\n"); + return ret; + } } /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; @@ -444,14 +537,14 @@ static void tune_pcie_caps(struct hfi1_devdata *dd) * access to the upstream component. */ if (!parent) - return; + return -EINVAL; if (!pci_is_root_bus(parent->bus)) { dd_dev_info(dd, "Parent not root\n"); - return; + return -EINVAL; } if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) - return; + return -EINVAL; rc_mpss = parent->pcie_mpss; rc_mps = ffs(pcie_get_mps(parent)) - 8; /* Find out supported and configured values for endpoint (us) */ @@ -497,6 +590,8 @@ static void tune_pcie_caps(struct hfi1_devdata *dd) ep_mrrs = max_mrrs; pcie_set_readrq(dd->pcidev, ep_mrrs); } + + return 0; } /* End of PCIe capability tuning */ @@ -728,6 +823,7 @@ static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs, u32 violation; u32 i; u8 c_minus1, c0, c_plus1; + int ret; for (i = 0; i < 11; i++) { /* set index */ @@ -739,8 +835,14 @@ static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs, pci_write_config_dword(pdev, PCIE_CFG_REG_PL102, eq_value(c_minus1, c0, c_plus1)); /* check if these coefficients violate EQ rules */ - pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105, - &violation); + ret = pci_read_config_dword(dd->pcidev, + PCIE_CFG_REG_PL105, &violation); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + hit_error = 1; + break; + } + if (violation & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){ if (hit_error == 0) { @@ -1194,7 +1296,13 @@ retry: * that it is Gen3 capable earlier. */ dd_dev_info(dd, "%s: setting parent target link speed\n", __func__); - pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2); + ret = pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, (u32)lnkctl2); /* only write to parent if target is not as high as ours */ @@ -1203,20 +1311,37 @@ retry: lnkctl2 |= target_vector; dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, (u32)lnkctl2); - pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2); + ret = pcie_capability_write_word(parent, + PCI_EXP_LNKCTL2, lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to write to PCI config\n"); + return_error = 1; + goto done; + } } else { dd_dev_info(dd, "%s: ..target speed is OK\n", __func__); } dd_dev_info(dd, "%s: setting target link speed\n", __func__); - pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, (u32)lnkctl2); lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; lnkctl2 |= target_vector; dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, (u32)lnkctl2); - pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to write to PCI config\n"); + return_error = 1; + goto done; + } /* step 5h: arm gasket logic */ /* hold DC in reset across the SBR */ @@ -1266,7 +1391,14 @@ retry: /* restore PCI space registers we know were reset */ dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__); - restore_pci_variables(dd); + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return_error = 1; + goto done; + } + /* restore firmware control */ write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl); @@ -1296,7 +1428,13 @@ retry: setextled(dd, 0); /* check for any per-lane errors */ - pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, ®32); + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, ®32); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32); /* extract status, look for our HFI */ diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index ed72b5aca139..7108a4b5e94c 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1012,7 +1012,7 @@ static void sc_wait_for_packet_egress(struct send_context *sc, int pause) "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n", __func__, sc->sw_index, sc->hw_context, (u32)reg); - queue_work(dd->pport->hfi1_wq, + queue_work(dd->pport->link_wq, &dd->pport->link_bounce_work); break; } @@ -1568,7 +1568,8 @@ static void sc_piobufavail(struct send_context *sc) struct rvt_qp *qp; struct hfi1_qp_priv *priv; unsigned long flags; - unsigned i, n = 0; + uint i, n = 0, max_idx = 0; + u8 max_starved_cnt = 0; if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && dd->send_contexts[sc->sw_index].type != SC_VL15) @@ -1591,6 +1592,7 @@ static void sc_piobufavail(struct send_context *sc) priv = qp->priv; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; + iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx); /* refcount held until actual wake up */ qps[n++] = qp; } @@ -1605,9 +1607,14 @@ static void sc_piobufavail(struct send_context *sc) } write_sequnlock_irqrestore(&dev->iowait_lock, flags); - for (i = 0; i < n; i++) - hfi1_qp_wakeup(qps[i], + /* Wake up the most starved one first */ + if (n) + hfi1_qp_wakeup(qps[max_idx], RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN); + for (i = 0; i < n; i++) + if (i != max_idx) + hfi1_qp_wakeup(qps[i], + RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN); } /* translate a send credit update to a bit code of reasons */ diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c index 838fe84e285a..a8af96d2b1b0 100644 --- a/drivers/infiniband/hw/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -45,10 +45,14 @@ * */ +#include <linux/firmware.h> + #include "hfi.h" #include "efivar.h" #include "eprom.h" +#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat" + static int validate_scratch_checksum(struct hfi1_devdata *dd) { u64 checksum = 0, temp_scratch = 0; @@ -58,8 +62,13 @@ static int validate_scratch_checksum(struct hfi1_devdata *dd) version = (temp_scratch & BITMAP_VERSION_SMASK) >> BITMAP_VERSION_SHIFT; /* Prevent power on default of all zeroes from passing checksum */ - if (!version) + if (!version) { + dd_dev_err(dd, "%s: Config bitmap uninitialized\n", __func__); + dd_dev_err(dd, + "%s: Please update your BIOS to support active channels\n", + __func__); return 0; + } /* * ASIC scratch 0 only contains the checksum and bitmap version as @@ -84,6 +93,8 @@ static int validate_scratch_checksum(struct hfi1_devdata *dd) if (checksum + temp_scratch == 0xFFFF) return 1; + + dd_dev_err(dd, "%s: Configuration bitmap corrupted\n", __func__); return 0; } @@ -131,25 +142,22 @@ static void save_platform_config_fields(struct hfi1_devdata *dd) ppd->max_power_class = (temp_scratch & QSFP_MAX_POWER_SMASK) >> QSFP_MAX_POWER_SHIFT; + + ppd->config_from_scratch = true; } void get_platform_config(struct hfi1_devdata *dd) { int ret = 0; - unsigned long size = 0; u8 *temp_platform_config = NULL; u32 esize; + const struct firmware *platform_config_file = NULL; if (is_integrated(dd)) { if (validate_scratch_checksum(dd)) { save_platform_config_fields(dd); return; } - dd_dev_err(dd, "%s: Config bitmap corrupted/uninitialized\n", - __func__); - dd_dev_err(dd, - "%s: Please update your BIOS to support active channels\n", - __func__); } else { ret = eprom_read_platform_config(dd, (void **)&temp_platform_config, @@ -160,36 +168,37 @@ void get_platform_config(struct hfi1_devdata *dd) dd->platform_config.size = esize; return; } - /* fail, try EFI variable */ - - ret = read_hfi1_efi_var(dd, "configuration", &size, - (void **)&temp_platform_config); - if (!ret) { - dd->platform_config.data = temp_platform_config; - dd->platform_config.size = size; - return; - } } dd_dev_err(dd, "%s: Failed to get platform config, falling back to sub-optimal default file\n", __func__); - /* fall back to request firmware */ - platform_config_load = 1; -} -void free_platform_config(struct hfi1_devdata *dd) -{ - if (!platform_config_load) { - /* - * was loaded from EFI or the EPROM, release memory - * allocated by read_efi_var/eprom_read_platform_config - */ - kfree(dd->platform_config.data); + ret = request_firmware(&platform_config_file, + DEFAULT_PLATFORM_CONFIG_NAME, + &dd->pcidev->dev); + if (ret) { + dd_dev_err(dd, + "%s: No default platform config file found\n", + __func__); + return; } + /* - * else do nothing, dispose_firmware will release - * struct firmware platform_config on driver exit + * Allocate separate memory block to store data and free firmware + * structure. This allows free_platform_config to treat EPROM and + * fallback configs in the same manner. */ + dd->platform_config.data = kmemdup(platform_config_file->data, + platform_config_file->size, + GFP_KERNEL); + dd->platform_config.size = platform_config_file->size; + release_firmware(platform_config_file); +} + +void free_platform_config(struct hfi1_devdata *dd) +{ + /* Release memory allocated for eprom or fallback file read. */ + kfree(dd->platform_config.data); } void get_port_type(struct hfi1_pportdata *ppd) @@ -242,7 +251,7 @@ static int qual_power(struct hfi1_pportdata *ppd) if (ppd->offline_disabled_reason == HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) { - dd_dev_info( + dd_dev_err( ppd->dd, "%s: Port disabled due to system power restrictions\n", __func__); @@ -268,7 +277,7 @@ static int qual_bitrate(struct hfi1_pportdata *ppd) if (ppd->offline_disabled_reason == HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) { - dd_dev_info( + dd_dev_err( ppd->dd, "%s: Cable failed bitrate check, disabling port\n", __func__); @@ -709,15 +718,15 @@ static void apply_tunings( ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, GENERAL_CONFIG, config_data); if (ret != HCMD_SUCCESS) - dd_dev_info(ppd->dd, - "%s: Failed set ext device config params\n", - __func__); + dd_dev_err(ppd->dd, + "%s: Failed set ext device config params\n", + __func__); } if (tx_preset_index == OPA_INVALID_INDEX) { if (ppd->port_type == PORT_TYPE_QSFP && limiting_active) - dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n", - __func__); + dd_dev_err(ppd->dd, "%s: Invalid Tx preset index\n", + __func__); return; } @@ -900,7 +909,7 @@ static int tune_qsfp(struct hfi1_pportdata *ppd, case 0xD: /* fallthrough */ case 0xF: default: - dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n", + dd_dev_warn(ppd->dd, "%s: Unknown/unsupported cable\n", __func__); break; } @@ -935,6 +944,21 @@ void tune_serdes(struct hfi1_pportdata *ppd) if (loopback != LOOPBACK_NONE || ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { ppd->driver_link_ready = 1; + + if (qsfp_mod_present(ppd)) { + ret = acquire_chip_resource(ppd->dd, + qsfp_resource(ppd->dd), + QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + goto bail; + } + + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + } + return; } @@ -942,7 +966,7 @@ void tune_serdes(struct hfi1_pportdata *ppd) case PORT_TYPE_DISCONNECTED: ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED); - dd_dev_info(dd, "%s: Port disconnected, disabling port\n", + dd_dev_warn(dd, "%s: Port disconnected, disabling port\n", __func__); goto bail; case PORT_TYPE_FIXED: @@ -1027,7 +1051,7 @@ void tune_serdes(struct hfi1_pportdata *ppd) } break; default: - dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__); + dd_dev_warn(ppd->dd, "%s: Unknown port type\n", __func__); ppd->port_type = PORT_TYPE_UNKNOWN; tuning_method = OPA_UNKNOWN_TUNING; total_atten = 0; diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 1a7af9f60c13..4b01ccd895b4 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -68,17 +68,12 @@ static int iowait_sleep( struct sdma_engine *sde, struct iowait *wait, struct sdma_txreq *stx, - unsigned seq); + unsigned int seq, + bool pkts_sent); static void iowait_wakeup(struct iowait *wait, int reason); static void iowait_sdma_drained(struct iowait *wait); static void qp_pio_drain(struct rvt_qp *qp); -static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, - struct rvt_qpn_map *map, unsigned off) -{ - return (map - qpt->map) * RVT_BITS_PER_PAGE + off; -} - const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { [IB_WR_RDMA_WRITE] = { .length = sizeof(struct ib_rdma_wr), @@ -237,6 +232,31 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, return 0; } +/* + * qp_set_16b - Set the hdr_type based on whether the slid or the + * dlid in the connection is extended. Only applicable for RC and UC + * QPs. UD QPs determine this on the fly from the ah in the wqe + */ +static inline void qp_set_16b(struct rvt_qp *qp) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct hfi1_qp_priv *priv = qp->priv; + + /* Update ah_attr to account for extended LIDs */ + hfi1_update_ah_attr(qp->ibqp.device, &qp->remote_ah_attr); + + /* Create 32 bit LIDs */ + hfi1_make_opa_lid(&qp->remote_ah_attr); + + if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) + return; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, &qp->remote_ah_attr); +} + void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -247,6 +267,7 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); } if (attr_mask & IB_QP_PATH_MIG_STATE && @@ -256,6 +277,7 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); } } @@ -377,7 +399,8 @@ static int iowait_sleep( struct sdma_engine *sde, struct iowait *wait, struct sdma_txreq *stx, - unsigned seq) + uint seq, + bool pkts_sent) { struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq); struct rvt_qp *qp; @@ -408,7 +431,8 @@ static int iowait_sleep( ibp->rvp.n_dmawait++; qp->s_flags |= RVT_S_WAIT_DMA_DESC; - list_add_tail(&priv->s_iowait.list, &sde->dmawait); + iowait_queue(pkts_sent, &priv->s_iowait, + &sde->dmawait); priv->s_iowait.lock = &dev->iowait_lock; trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC); rvt_get_qp(qp); @@ -506,82 +530,6 @@ struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5) sc5); } -struct qp_iter { - struct hfi1_ibdev *dev; - struct rvt_qp *qp; - int specials; - int n; -}; - -struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev) -{ - struct qp_iter *iter; - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return NULL; - - iter->dev = dev; - iter->specials = dev->rdi.ibdev.phys_port_cnt * 2; - - return iter; -} - -int qp_iter_next(struct qp_iter *iter) -{ - struct hfi1_ibdev *dev = iter->dev; - int n = iter->n; - int ret = 1; - struct rvt_qp *pqp = iter->qp; - struct rvt_qp *qp; - - /* - * The approach is to consider the special qps - * as an additional table entries before the - * real hash table. Since the qp code sets - * the qp->next hash link to NULL, this works just fine. - * - * iter->specials is 2 * # ports - * - * n = 0..iter->specials is the special qp indices - * - * n = iter->specials..dev->rdi.qp_dev->qp_table_size+iter->specials are - * the potential hash bucket entries - * - */ - for (; n < dev->rdi.qp_dev->qp_table_size + iter->specials; n++) { - if (pqp) { - qp = rcu_dereference(pqp->next); - } else { - if (n < iter->specials) { - struct hfi1_pportdata *ppd; - struct hfi1_ibport *ibp; - int pidx; - - pidx = n % dev->rdi.ibdev.phys_port_cnt; - ppd = &dd_from_dev(dev)->pport[pidx]; - ibp = &ppd->ibport_data; - - if (!(n & 1)) - qp = rcu_dereference(ibp->rvp.qp[0]); - else - qp = rcu_dereference(ibp->rvp.qp[1]); - } else { - qp = rcu_dereference( - dev->rdi.qp_dev->qp_table[ - (n - iter->specials)]); - } - } - pqp = qp; - if (qp) { - iter->qp = qp; - iter->n = n; - return 0; - } - } - return ret; -} - static const char * const qp_type_str[] = { "SMI", "GSI", "RC", "UC", "UD", }; @@ -595,19 +543,27 @@ static int qp_idle(struct rvt_qp *qp) qp->s_tail == qp->s_head; } -void qp_iter_print(struct seq_file *s, struct qp_iter *iter) +/** + * qp_iter_print - print the qp information to seq_file + * @s: the seq_file to emit the qp information on + * @iter: the iterator for the qp hash list + */ +void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) { struct rvt_swqe *wqe; struct rvt_qp *qp = iter->qp; struct hfi1_qp_priv *priv = qp->priv; struct sdma_engine *sde; struct send_context *send_context; + struct rvt_ack_entry *e = NULL; sde = qp_to_sdma_engine(qp, priv->s_sc); wqe = rvt_get_swqe_ptr(qp, qp->s_last); send_context = qp_to_send_context(qp, priv->s_sc); + if (qp->s_ack_queue) + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; seq_printf(s, - "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n", + "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x\n", iter->n, qp_idle(qp) ? "I" : "B", qp->ibqp.qp_num, @@ -630,6 +586,10 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter) qp->s_last, qp->s_acked, qp->s_cur, qp->s_tail, qp->s_head, qp->s_size, qp->s_avail, + /* ack_queue ring pointers, size */ + qp->s_tail_ack_queue, qp->r_head_ack_queue, + rvt_max_atomic(&to_idev(qp->ibqp.device)->rdi), + /* remote QP info */ qp->remote_qpn, rdma_ah_get_dlid(&qp->remote_ah_attr), rdma_ah_get_sl(&qp->remote_ah_attr), @@ -644,7 +604,13 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter) send_context ? send_context->sw_index : 0, ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head, ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail, - qp->pid); + qp->pid, + qp->s_state, + qp->s_ack_state, + /* ack queue information */ + e ? e->opcode : 0, + e ? e->psn : 0, + e ? e->lpsn : 0); } void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) @@ -750,6 +716,7 @@ void hfi1_migrate_qp(struct rvt_qp *qp) qp->s_flags |= RVT_S_AHG_CLEAR; priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + qp_set_16b(qp); ev.device = qp->ibqp.device; ev.element.qp = &qp->ibqp; @@ -832,6 +799,45 @@ void notify_error_qp(struct rvt_qp *qp) } /** + * hfi1_qp_iter_cb - callback for iterator + * @qp - the qp + * @v - the sl in low bits of v + * + * This is called from the iterator callback to work + * on an individual qp. + */ +static void hfi1_qp_iter_cb(struct rvt_qp *qp, u64 v) +{ + int lastwqe; + struct ib_event ev; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u8 sl = (u8)v; + + if (qp->port_num != ppd->port || + (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) || + rdma_ah_get_sl(&qp->remote_ah_attr) != sl || + !(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK)) + return; + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +/** * hfi1_error_port_qps - put a port's RC/UC qps into error state * @ibp: the ibport. * @sl: the service level. @@ -842,44 +848,8 @@ void notify_error_qp(struct rvt_qp *qp) */ void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl) { - struct rvt_qp *qp = NULL; struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_ibdev *dev = &ppd->dd->verbs_dev; - int n; - int lastwqe; - struct ib_event ev; - - rcu_read_lock(); - - /* Deal only with RC/UC qps that use the given SL. */ - for (n = 0; n < dev->rdi.qp_dev->qp_table_size; n++) { - for (qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); qp; - qp = rcu_dereference(qp->next)) { - if (qp->port_num == ppd->port && - (qp->ibqp.qp_type == IB_QPT_UC || - qp->ibqp.qp_type == IB_QPT_RC) && - rdma_ah_get_sl(&qp->remote_ah_attr) == sl && - (ib_rvt_state_ops[qp->state] & - RVT_POST_SEND_OK)) { - spin_lock_irq(&qp->r_lock); - spin_lock(&qp->s_hlock); - spin_lock(&qp->s_lock); - lastwqe = rvt_error_qp(qp, - IB_WC_WR_FLUSH_ERR); - spin_unlock(&qp->s_lock); - spin_unlock(&qp->s_hlock); - spin_unlock_irq(&qp->r_lock); - if (lastwqe) { - ev.device = qp->ibqp.device; - ev.element.qp = &qp->ibqp; - ev.event = - IB_EVENT_QP_LAST_WQE_REACHED; - qp->ibqp.event_handler(&ev, - qp->ibqp.qp_context); - } - } - } - } - rcu_read_unlock(); + rvt_qp_iter(&dev->rdi, sl, hfi1_qp_iter_cb); } diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index 6fe542b6a927..c06d2f8348e0 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -1,7 +1,7 @@ #ifndef _QP_H #define _QP_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -94,26 +94,7 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag); struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5); struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); -struct qp_iter; - -/** - * qp_iter_init - initialize the iterator for the qp hash list - * @dev: the hfi1_ibdev - */ -struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev); - -/** - * qp_iter_next - Find the next qp in the hash list - * @iter: the iterator for the qp hash list - */ -int qp_iter_next(struct qp_iter *iter); - -/** - * qp_iter_print - print the qp information to seq_file - * @s: the seq_file to emit the qp information on - * @iter: the iterator for the qp hash list - */ -void qp_iter_print(struct seq_file *s, struct qp_iter *iter); +void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); void _hfi1_schedule_send(struct rvt_qp *qp); void hfi1_schedule_send(struct rvt_qp *qp); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 1080778a1f7c..e1cf0c08ca6f 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -100,8 +100,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; - /* header size in 32-bit words LRH+BTH = (8+12)/4. */ - hwords = 5; + if (priv->hdr_type == HFI1_PKT_TYPE_9B) + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + else + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; switch (qp->s_ack_state) { case OP(RDMA_READ_RESPONSE_LAST): @@ -258,8 +262,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct ib_other_headers *ohdr; struct rvt_sge_state *ss; struct rvt_swqe *wqe; - /* header size in 32-bit words LRH+BTH = (8+12)/4. */ - u32 hwords = 5; + u32 hwords; u32 len; u32 bth0 = 0; u32 bth2; @@ -273,9 +276,23 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (IS_ERR(ps->s_txreq)) goto bail_no_tx; - ohdr = &ps->s_txreq->phdr.hdr.u.oth; - if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) - ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; + ps->s_txreq->phdr.hdr.hdr_type = priv->hdr_type; + if (priv->hdr_type == HFI1_PKT_TYPE_9B) { + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } else { + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr)))) + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + } /* Sending responses has higher priority over sending requests. */ if ((qp->s_flags & RVT_S_RESP_PENDING) && @@ -425,7 +442,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) case IB_WR_RDMA_WRITE: if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; - /* FALLTHROUGH */ + goto no_flow_control; case IB_WR_RDMA_WRITE_WITH_IMM: /* If no credit, return. */ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && @@ -433,6 +450,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; goto bail; } +no_flow_control: put_ib_reth_vaddr( wqe->rdma_wr.remote_addr, &ohdr->u.rc.reth); @@ -703,6 +721,154 @@ bail_no_tx: return 0; } +static inline void hfi1_make_bth_aeth(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1) +{ + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qp->r_nak_state << + IB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = rvt_compute_aeth(qp); + + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); +} + +static inline void hfi1_queue_rc_ack(struct rvt_qp *qp, bool is_fecn) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto unlock; + this_cpu_inc(*ibp->rvp.rc_qacks); + qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + if (is_fecn) + qp->s_flags |= RVT_S_ECN; + + /* Schedule the send tasklet. */ + hfi1_schedule_send(qp); +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +static inline void hfi1_make_rc_ack_9B(struct rvt_qp *qp, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_header *hdr = &opa_hdr->ibh; + struct ib_other_headers *ohdr; + u16 lrh0 = HFI1_LRH_BTH; + u16 pkey; + u32 bth0, bth1; + + opa_hdr->hdr_type = HFI1_PKT_TYPE_9B; + ohdr = &hdr->u.oth; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ + *hwords = 6; + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { + *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + *hwords - 2, SIZE_OF_CRC); + ohdr = &hdr->u.l.oth; + lrh0 = HFI1_LRH_GRH; + } + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); + + /* read pkey_index w/o lock (its atomic) */ + pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + + lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT | + (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) << + IB_SL_SHIFT; + + hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B), + ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr)); + + bth0 = pkey | (OP(ACKNOWLEDGE) << 24); + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + bth1 = (!!is_fecn) << IB_BECN_SHIFT; + hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); +} + +static inline void hfi1_make_rc_ack_16B(struct rvt_qp *qp, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_16b_header *hdr = &opa_hdr->opah; + struct ib_other_headers *ohdr; + u32 bth0, bth1; + u16 len, pkey; + u8 becn = !!is_fecn; + u8 l4 = OPA_16B_L4_IB_LOCAL; + u8 extra_bytes; + + opa_hdr->hdr_type = HFI1_PKT_TYPE_16B; + ohdr = &hdr->u.oth; + /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */ + *hwords = 8; + extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0); + *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2); + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) { + *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + *hwords - 4, *nwords); + ohdr = &hdr->u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + } + *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + + /* read pkey_index w/o lock (its atomic) */ + pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + + /* Convert dwords to flits */ + len = (*hwords + *nwords) >> 1; + + hfi1_make_16b_hdr(hdr, + ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr), + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), + 16B), + len, pkey, becn, 0, l4, sc5); + + bth0 = pkey | (OP(ACKNOWLEDGE) << 24); + bth0 |= extra_bytes << 20; + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth1 = OPA_BTH_MIG_REQ; + hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); +} + +typedef void (*hfi1_make_rc_ack)(struct rvt_qp *qp, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords); + +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B +}; + /** * hfi1_send_rc_ack - Construct an ACK packet and send it * @qp: a pointer to the QP @@ -711,83 +877,48 @@ bail_no_tx: * Note that RDMA reads and atomics are handled in the * send side QP state and send engine. */ -void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp, - int is_fecn) +void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp, bool is_fecn) { struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct hfi1_qp_priv *priv = qp->priv; struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; u64 pbc, pbc_flags = 0; - u16 lrh0; - u16 sc5; - u32 bth0; - u32 hwords; - u32 vl, plen; - struct send_context *sc; + u32 hwords = 0; + u32 nwords = 0; + u32 plen; struct pio_buf *pbuf; - struct ib_header hdr; - struct ib_other_headers *ohdr; - unsigned long flags; + struct hfi1_opa_header opa_hdr; /* clear the defer count */ qp->r_adefered = 0; /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ - if (qp->s_flags & RVT_S_RESP_PENDING) - goto queue_ack; + if (qp->s_flags & RVT_S_RESP_PENDING) { + hfi1_queue_rc_ack(qp, is_fecn); + return; + } /* Ensure s_rdma_ack_cnt changes are committed */ smp_read_barrier_depends(); - if (qp->s_rdma_ack_cnt) - goto queue_ack; - - /* Construct the header */ - /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ - hwords = 6; - if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { - hwords += hfi1_make_grh(ibp, &hdr.u.l.grh, - rdma_ah_read_grh(&qp->remote_ah_attr), - hwords, 0); - ohdr = &hdr.u.l.oth; - lrh0 = HFI1_LRH_GRH; - } else { - ohdr = &hdr.u.oth; - lrh0 = HFI1_LRH_BTH; + if (qp->s_rdma_ack_cnt) { + hfi1_queue_rc_ack(qp, is_fecn); + return; } - /* read pkey_index w/o lock (its atomic) */ - bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24); - if (qp->s_mig_state == IB_MIG_MIGRATED) - bth0 |= IB_BTH_MIG_REQ; - if (qp->r_nak_state) - ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | - (qp->r_nak_state << - IB_AETH_CREDIT_SHIFT)); - else - ohdr->u.aeth = rvt_compute_aeth(qp); - sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; - /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ - pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); - lrh0 |= (sc5 & 0xf) << 12 | (rdma_ah_get_sl(&qp->remote_ah_attr) - & 0xf) << 4; - hdr.lrh[0] = cpu_to_be16(lrh0); - hdr.lrh[1] = cpu_to_be16(rdma_ah_get_dlid(&qp->remote_ah_attr)); - hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); - hdr.lrh[3] = cpu_to_be16(ppd->lid | - rdma_ah_get_path_bits(&qp->remote_ah_attr)); - ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); - ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << IB_BECN_SHIFT); - ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); /* Don't try to send ACKs if the link isn't ACTIVE */ if (driver_lstate(ppd) != IB_PORT_ACTIVE) return; - sc = rcd->sc; - plen = 2 /* PBC */ + hwords; - vl = sc_to_vlt(ppd->dd, sc5); - pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + /* Make the appropriate header */ + hfi1_make_rc_ack_tbl[priv->hdr_type](qp, &opa_hdr, sc5, is_fecn, + &pbc_flags, &hwords, &nwords); - pbuf = sc_buffer_alloc(sc, plen, NULL, NULL); + plen = 2 /* PBC */ + hwords + nwords; + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, + sc_to_vlt(ppd->dd, sc5), plen); + pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL); if (!pbuf) { /* * We have no room to send at the moment. Pass @@ -795,31 +926,18 @@ void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp, * so that when enough buffer space becomes available, * the ACK is sent ahead of other outgoing packets. */ - goto queue_ack; + hfi1_queue_rc_ack(qp, is_fecn); + return; } - - trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr); + trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &opa_hdr, ib_is_sc5(sc5)); /* write the pbc and data */ - ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords); - + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + (priv->hdr_type == HFI1_PKT_TYPE_9B ? + (void *)&opa_hdr.ibh : + (void *)&opa_hdr.opah), hwords); return; - -queue_ack: - spin_lock_irqsave(&qp->s_lock, flags); - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) - goto unlock; - this_cpu_inc(*ibp->rvp.rc_qacks); - qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; - qp->s_nak_state = qp->r_nak_state; - qp->s_ack_psn = qp->r_ack_psn; - if (is_fecn) - qp->s_flags |= RVT_S_ECN; - - /* Schedule the send engine. */ - hfi1_schedule_send(qp); -unlock: - spin_unlock_irqrestore(&qp->s_lock, flags); } /** @@ -984,10 +1102,13 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) /* * This should be called with the QP s_lock held and interrupts disabled. */ -void hfi1_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr) +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) { struct ib_other_headers *ohdr; + struct hfi1_qp_priv *priv = qp->priv; struct rvt_swqe *wqe; + struct ib_header *hdr = NULL; + struct hfi1_16b_header *hdr_16b = NULL; u32 opcode; u32 psn; @@ -996,10 +1117,22 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr) return; /* Find out where the BTH is */ - if (ib_get_lnh(hdr) == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else - ohdr = &hdr->u.l.oth; + if (priv->hdr_type == HFI1_PKT_TYPE_9B) { + hdr = &opah->ibh; + if (ib_get_lnh(hdr) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + } else { + u8 l4; + + hdr_16b = &opah->opah; + l4 = hfi1_16B_get_l4(hdr_16b); + if (l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &hdr_16b->u.oth; + else + ohdr = &hdr_16b->u.l.oth; + } opcode = ib_bth_get_opcode(ohdr); if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && @@ -1009,7 +1142,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr) return; } - psn = be32_to_cpu(ohdr->bth[2]); + psn = ib_bth_get_psn(ohdr); reset_sending_psn(qp, psn); /* @@ -1399,36 +1532,34 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, /** * rc_rcv_resp - process an incoming RC response packet - * @ibp: the port this packet came in on - * @ohdr: the other headers for this packet - * @data: the packet data - * @tlen: the packet length - * @qp: the QP for this packet - * @opcode: the opcode for this packet - * @psn: the packet sequence number for this packet - * @hdrsize: the header length - * @pmtu: the path MTU + * @packet: data packet information * * This is called from hfi1_rc_rcv() to process an incoming RC response * packet for the given QP. * Called at interrupt level. */ -static void rc_rcv_resp(struct hfi1_ibport *ibp, - struct ib_other_headers *ohdr, - void *data, u32 tlen, struct rvt_qp *qp, - u32 opcode, u32 psn, u32 hdrsize, u32 pmtu, - struct hfi1_ctxtdata *rcd) +static void rc_rcv_resp(struct hfi1_packet *packet) { + struct hfi1_ctxtdata *rcd = packet->rcd; + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct ib_other_headers *ohdr = packet->ohdr; struct rvt_swqe *wqe; enum ib_wc_status status; unsigned long flags; int diff; - u32 pad; - u32 aeth; u64 val; + u32 aeth; + u32 psn = ib_bth_get_psn(packet->ohdr); + u32 pmtu = qp->pmtu; + u16 hdrsize = packet->hlen; + u8 opcode = packet->opcode; + u8 pad = packet->pad; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); spin_lock_irqsave(&qp->s_lock, flags); - trace_hfi1_ack(qp, psn); /* Ignore invalid responses. */ @@ -1494,7 +1625,7 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp, if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) goto ack_op_err; read_middle: - if (unlikely(tlen != (hdrsize + pmtu + 4))) + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) goto ack_len_err; if (unlikely(pmtu >= qp->s_rdma_read_len)) goto ack_len_err; @@ -1526,13 +1657,11 @@ read_middle: aeth = be32_to_cpu(ohdr->u.aeth); if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) goto ack_done; - /* Get the number of bytes the message was padded by. */ - pad = ib_bth_get_pad(ohdr); /* * Check that the data size is >= 0 && <= pmtu. * Remember to account for ICRC (4). */ - if (unlikely(tlen < (hdrsize + pad + 4))) + if (unlikely(tlen < (hdrsize + extra_bytes))) goto ack_len_err; /* * If this is a response to a resent RDMA read, we @@ -1550,16 +1679,14 @@ read_middle: goto ack_seq_err; if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) goto ack_op_err; - /* Get the number of bytes the message was padded by. */ - pad = ib_bth_get_pad(ohdr); /* * Check that the data size is >= 1 && <= pmtu. * Remember to account for ICRC (4). */ - if (unlikely(tlen <= (hdrsize + pad + 4))) + if (unlikely(tlen <= (hdrsize + extra_bytes))) goto ack_len_err; read_last: - tlen -= hdrsize + pad + 4; + tlen -= hdrsize + extra_bytes; if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); @@ -1844,7 +1971,7 @@ static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, spin_unlock_irqrestore(&ppd->cc_log_lock, flags); } -void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, u32 rqpn, u8 svc_type) { struct cca_timer *cca_timer; @@ -1901,12 +2028,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, /** * hfi1_rc_rcv - process an incoming RC packet - * @rcd: the context pointer - * @hdr: the header of this packet - * @rcv_flags: flags relevant to rcv processing - * @data: the packet data - * @tlen: the packet length - * @qp: the QP for this packet + * @packet: data packet information * * This is called from qp_rcv() to process an incoming RC packet * for the given QP. @@ -1915,17 +2037,16 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, void hfi1_rc_rcv(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; - struct ib_header *hdr = packet->hdr; - u32 rcv_flags = packet->rcv_flags; - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct ib_other_headers *ohdr = packet->ohdr; - u32 bth0, opcode; + u32 bth0 = be32_to_cpu(ohdr->bth[0]); + u32 opcode = packet->opcode; u32 hdrsize = packet->hlen; - u32 psn; - u32 pad; + u32 psn = ib_bth_get_psn(packet->ohdr); + u32 pad = packet->pad; struct ib_wc wc; u32 pmtu = qp->pmtu; int diff; @@ -1935,17 +2056,15 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) bool is_fecn = false; bool copy_last = false; u32 rkey; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); lockdep_assert_held(&qp->r_lock); - bth0 = be32_to_cpu(ohdr->bth[0]); - if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0)) + + if (hfi1_ruc_check_hdr(ibp, packet)) return; is_fecn = process_ecn(qp, packet, false); - psn = be32_to_cpu(ohdr->bth[2]); - opcode = ib_bth_get_opcode(ohdr); - /* * Process responses (ACKs) before anything else. Note that the * packet sequence number will be for something in the send work @@ -1954,8 +2073,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) */ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && opcode <= OP(ATOMIC_ACKNOWLEDGE)) { - rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn, - hdrsize, pmtu, rcd); + rc_rcv_resp(packet); if (is_fecn) goto send_ack; return; @@ -2022,7 +2140,12 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) case OP(RDMA_WRITE_MIDDLE): send_middle: /* Check for invalid length PMTU or posted rwqe len. */ - if (unlikely(tlen != (hdrsize + pmtu + 4))) + /* + * There will be no padding for 9B packet but 16B packets + * will come in with some padding since we always add + * CRC and LT bytes which will need to be flit aligned + */ + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) goto nack_inv; qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) @@ -2074,14 +2197,12 @@ no_immediate_data: wc.wc_flags = 0; wc.ex.imm_data = 0; send_last: - /* Get the number of bytes the message was padded by. */ - pad = ib_bth_get_pad(ohdr); /* Check for invalid length. */ /* LAST len should be >= 1 */ - if (unlikely(tlen < (hdrsize + pad + 4))) + if (unlikely(tlen < (hdrsize + extra_bytes))) goto nack_inv; - /* Don't count the CRC. */ - tlen -= (hdrsize + pad + 4); + /* Don't count the CRC(and padding and LT byte for 16B). */ + tlen -= (hdrsize + extra_bytes); wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; @@ -2368,28 +2489,19 @@ send_ack: void hfi1_rc_hdrerr( struct hfi1_ctxtdata *rcd, - struct ib_header *hdr, - u32 rcv_flags, + struct hfi1_packet *packet, struct rvt_qp *qp) { - int has_grh = rcv_flags & HFI1_HAS_GRH; - struct ib_other_headers *ohdr; struct hfi1_ibport *ibp = rcd_to_iport(rcd); int diff; u32 opcode; - u32 psn, bth0; - - /* Check for GRH */ - ohdr = &hdr->u.oth; - if (has_grh) - ohdr = &hdr->u.l.oth; + u32 psn; - bth0 = be32_to_cpu(ohdr->bth[0]); - if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0)) + if (hfi1_ruc_check_hdr(ibp, packet)) return; - psn = be32_to_cpu(ohdr->bth[2]); - opcode = ib_bth_get_opcode(ohdr); + psn = ib_bth_get_psn(packet->ohdr); + opcode = ib_bth_get_opcode(packet->ohdr); /* Only deal with RDMA Writes for now */ if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 3a17daba28a9..b3291f0fde9a 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -74,8 +74,10 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) if (wqe->sg_list[i].length == 0) continue; /* Check LKEY */ - if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + NULL, &wqe->sg_list[i], + IB_ACCESS_LOCAL_WRITE); + if (unlikely(ret <= 0)) goto bad_lkey; qp->r_len += wqe->sg_list[i].length; j++; @@ -214,100 +216,104 @@ static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) * * The s_lock will be acquired around the hfi1_migrate_qp() call. */ -int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct ib_header *hdr, - int has_grh, struct rvt_qp *qp, u32 bth0) +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet) { __be64 guid; unsigned long flags; + struct rvt_qp *qp = packet->qp; u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; + u32 dlid = packet->dlid; + u32 slid = packet->slid; + u32 sl = packet->sl; + int migrated; + u32 bth0, bth1; + u16 pkey; + + bth0 = be32_to_cpu(packet->ohdr->bth[0]); + bth1 = be32_to_cpu(packet->ohdr->bth[1]); + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + pkey = hfi1_16B_get_pkey(packet->hdr); + migrated = bth1 & OPA_BTH_MIG_REQ; + } else { + pkey = ib_bth_get_pkey(packet->ohdr); + migrated = bth0 & IB_BTH_MIG_REQ; + } - if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) { - if (!has_grh) { - if (rdma_ah_get_ah_flags(&qp->alt_ah_attr) & - IB_AH_GRH) - goto err; + if (qp->s_mig_state == IB_MIG_ARMED && migrated) { + if (!packet->grh) { + if ((rdma_ah_get_ah_flags(&qp->alt_ah_attr) & + IB_AH_GRH) && + (packet->etype != RHF_RCV_TYPE_BYPASS)) + return 1; } else { const struct ib_global_route *grh; if (!(rdma_ah_get_ah_flags(&qp->alt_ah_attr) & IB_AH_GRH)) - goto err; + return 1; grh = rdma_ah_read_grh(&qp->alt_ah_attr); guid = get_sguid(ibp, grh->sgid_index); - if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix, + if (!gid_ok(&packet->grh->dgid, ibp->rvp.gid_prefix, guid)) - goto err; + return 1; if (!gid_ok( - &hdr->u.l.grh.sgid, + &packet->grh->sgid, grh->dgid.global.subnet_prefix, grh->dgid.global.interface_id)) - goto err; + return 1; } - if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, sc5, - ib_get_slid(hdr)))) { - hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, - (u16)bth0, - ib_get_sl(hdr), - 0, qp->ibqp.qp_num, - ib_get_slid(hdr), - ib_get_dlid(hdr)); - goto err; + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), pkey, + sc5, slid))) { + hfi1_bad_pkey(ibp, pkey, sl, 0, qp->ibqp.qp_num, + slid, dlid); + return 1; } /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */ - if (ib_get_slid(hdr) != - rdma_ah_get_dlid(&qp->alt_ah_attr) || + if (slid != rdma_ah_get_dlid(&qp->alt_ah_attr) || ppd_from_ibp(ibp)->port != rdma_ah_get_port_num(&qp->alt_ah_attr)) - goto err; + return 1; spin_lock_irqsave(&qp->s_lock, flags); hfi1_migrate_qp(qp); spin_unlock_irqrestore(&qp->s_lock, flags); } else { - if (!has_grh) { - if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & - IB_AH_GRH) - goto err; + if (!packet->grh) { + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & + IB_AH_GRH) && + (packet->etype != RHF_RCV_TYPE_BYPASS)) + return 1; } else { const struct ib_global_route *grh; if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) - goto err; + return 1; grh = rdma_ah_read_grh(&qp->remote_ah_attr); guid = get_sguid(ibp, grh->sgid_index); - if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix, + if (!gid_ok(&packet->grh->dgid, ibp->rvp.gid_prefix, guid)) - goto err; + return 1; if (!gid_ok( - &hdr->u.l.grh.sgid, + &packet->grh->sgid, grh->dgid.global.subnet_prefix, grh->dgid.global.interface_id)) - goto err; + return 1; } - if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, sc5, - ib_get_slid(hdr)))) { - hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, - (u16)bth0, - ib_get_sl(hdr), - 0, qp->ibqp.qp_num, - ib_get_slid(hdr), - ib_get_dlid(hdr)); - goto err; + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), pkey, + sc5, slid))) { + hfi1_bad_pkey(ibp, pkey, sl, 0, qp->ibqp.qp_num, + slid, dlid); + return 1; } /* Validate the SLID. See Ch. 9.6.1.5 */ - if (ib_get_slid(hdr) != - rdma_ah_get_dlid(&qp->remote_ah_attr) || + if ((slid != rdma_ah_get_dlid(&qp->remote_ah_attr)) || ppd_from_ibp(ibp)->port != qp->port_num) - goto err; - if (qp->s_mig_state == IB_MIG_REARM && - !(bth0 & IB_BTH_MIG_REQ)) + return 1; + if (qp->s_mig_state == IB_MIG_REARM && !migrated) qp->s_mig_state = IB_MIG_ARMED; } return 0; - -err: - return 1; } /** @@ -643,7 +649,7 @@ done: * @ibp: a pointer to the IB port * @hdr: a pointer to the GRH header being constructed * @grh: the global route address to send to - * @hwords: the number of 32 bit words of header being sent + * @hwords: size of header after grh being sent in dwords * @nwords: the number of 32 bit words of data being sent * * Return the size of the header in 32 bit words. @@ -655,7 +661,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | (grh->flow_label << IB_GRH_FLOW_SHIFT)); - hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + hdr->paylen = cpu_to_be16((hwords + nwords) << 2); /* next_hdr is defined by C8-7 in ch. 8.4.1 */ hdr->next_hdr = IB_GRH_NEXT_HDR; hdr->hop_limit = grh->hop_limit; @@ -671,7 +677,8 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, return sizeof(struct ib_grh) / sizeof(u32); } -#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, hdr.u.oth.bth[2]) / 4) +#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, \ + hdr.ibh.u.oth.bth[2]) / 4) /** * build_ahg - create ahg in s_ahg @@ -728,73 +735,186 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn) } } -void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, - struct hfi1_pkt_state *ps) +static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1, u32 bth2) +{ + bth1 |= qp->remote_qpn; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(bth1); + ohdr->bth[2] = cpu_to_be32(bth2); +} + +static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = ps->ibp; - u16 lrh0; - u32 nwords; - u32 extra_bytes; - u32 bth1; - - /* Construct the header. */ - extra_bytes = -ps->s_txreq->s_cur_size & 3; - nwords = (ps->s_txreq->s_cur_size + extra_bytes) >> 2; - lrh0 = HFI1_LRH_BTH; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 bth1 = 0; + u32 slid; + u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + u8 l4 = OPA_16B_L4_IB_LOCAL; + u8 extra_bytes = hfi1_get_16b_padding((qp->s_hdrwords << 2), + ps->s_txreq->s_cur_size); + u32 nwords = SIZE_OF_CRC + ((ps->s_txreq->s_cur_size + + extra_bytes + SIZE_OF_LT) >> 2); + u8 becn = 0; + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) { + struct ib_grh *grh; + struct ib_global_route *grd = + rdma_ah_retrieve_grh(&qp->remote_ah_attr); + int hdrwords; + + /* + * Ensure OPA GIDs are transformed to IB gids + * before creating the GRH. + */ + if (grd->sgid_index == OPA_GID_INDEX) + grd->sgid_index = 0; + grh = &ps->s_txreq->phdr.hdr.opah.u.l.grh; + l4 = OPA_16B_L4_IB_GLOBAL; + hdrwords = qp->s_hdrwords - 4; + qp->s_hdrwords += hfi1_make_grh(ibp, grh, grd, + hdrwords, nwords); + middle = 0; + } + + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth1 |= OPA_BTH_MIG_REQ; + else + middle = 0; + + if (middle) + build_ahg(qp, bth2); + else + qp->s_flags &= ~RVT_S_AHG_VALID; + + bth0 |= pkey; + bth0 |= extra_bytes << 20; + if (qp->s_flags & RVT_S_ECN) { + qp->s_flags &= ~RVT_S_ECN; + /* we recently received a FECN, so return a BECN */ + becn = 1; + } + hfi1_make_ruc_bth(qp, ohdr, bth0, bth1, bth2); + + if (!ppd->lid) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + else + slid = ppd->lid | + (rdma_ah_get_path_bits(&qp->remote_ah_attr) & + ((1 << ppd->lmc) - 1)); + + hfi1_make_16b_hdr(&ps->s_txreq->phdr.hdr.opah, + slid, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), + 16B), + (qp->s_hdrwords + nwords) >> 1, + pkey, becn, 0, l4, priv->s_sc); +} + +static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = ps->ibp; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 bth1 = 0; + u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + u16 lrh0 = HFI1_LRH_BTH; + u16 slid; + u8 extra_bytes = -ps->s_txreq->s_cur_size & 3; + u32 nwords = SIZE_OF_CRC + ((ps->s_txreq->s_cur_size + + extra_bytes) >> 2); + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { + struct ib_grh *grh = &ps->s_txreq->phdr.hdr.ibh.u.l.grh; + int hdrwords = qp->s_hdrwords - 2; + + lrh0 = HFI1_LRH_GRH; qp->s_hdrwords += - hfi1_make_grh(ibp, - &ps->s_txreq->phdr.hdr.u.l.grh, + hfi1_make_grh(ibp, grh, rdma_ah_read_grh(&qp->remote_ah_attr), - qp->s_hdrwords, nwords); - lrh0 = HFI1_LRH_GRH; + hdrwords, nwords); middle = 0; } lrh0 |= (priv->s_sc & 0xf) << 12 | (rdma_ah_get_sl(&qp->remote_ah_attr) & 0xf) << 4; - /* - * reset s_ahg/AHG fields - * - * This insures that the ahgentry/ahgcount - * are at a non-AHG default to protect - * build_verbs_tx_desc() from using - * an include ahgidx. - * - * build_ahg() will modify as appropriate - * to use the AHG feature. - */ - priv->s_ahg->tx_flags = 0; - priv->s_ahg->ahgcount = 0; - priv->s_ahg->ahgidx = 0; + if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; else middle = 0; + if (middle) build_ahg(qp, bth2); else qp->s_flags &= ~RVT_S_AHG_VALID; - ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0); - ps->s_txreq->phdr.hdr.lrh[1] = - cpu_to_be16(rdma_ah_get_dlid(&qp->remote_ah_attr)); - ps->s_txreq->phdr.hdr.lrh[2] = - cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); - ps->s_txreq->phdr.hdr.lrh[3] = - cpu_to_be16(ppd_from_ibp(ibp)->lid | - rdma_ah_get_path_bits(&qp->remote_ah_attr)); - bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); + + bth0 |= pkey; bth0 |= extra_bytes << 20; - ohdr->bth[0] = cpu_to_be32(bth0); - bth1 = qp->remote_qpn; if (qp->s_flags & RVT_S_ECN) { qp->s_flags &= ~RVT_S_ECN; /* we recently received a FECN, so return a BECN */ bth1 |= (IB_BECN_MASK << IB_BECN_SHIFT); } - ohdr->bth[1] = cpu_to_be32(bth1); - ohdr->bth[2] = cpu_to_be32(bth2); + hfi1_make_ruc_bth(qp, ohdr, bth0, bth1, bth2); + + if (!ppd->lid) + slid = be16_to_cpu(IB_LID_PERMISSIVE); + else + slid = ppd->lid | + (rdma_ah_get_path_bits(&qp->remote_ah_attr) & + ((1 << ppd->lmc) - 1)); + hfi1_make_ib_hdr(&ps->s_txreq->phdr.hdr.ibh, + lrh0, + qp->s_hdrwords + nwords, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B), + ppd_from_ibp(ibp)->lid | + rdma_ah_get_path_bits(&qp->remote_ah_attr)); +} + +typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps); + +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_ruc_header_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_ruc_header_16B +}; + +void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + + /* + * reset s_ahg/AHG fields + * + * This insures that the ahgentry/ahgcount + * are at a non-AHG default to protect + * build_verbs_tx_desc() from using + * an include ahgidx. + * + * build_ahg() will modify as appropriate + * to use the AHG feature. + */ + priv->s_ahg->tx_flags = 0; + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; + + /* Make the appropriate header */ + hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps); } /* when sending, force a reschedule every one of these periods */ @@ -816,6 +936,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, static bool schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { + ps->pkts_sent = true; + if (unlikely(time_after(jiffies, ps->timeout))) { if (!ps->in_thread || workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) { @@ -912,6 +1034,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.timeout = jiffies + ps.timeout_int; ps.cpu = priv->s_sde ? priv->s_sde->cpu : cpumask_first(cpumask_of_node(ps.ppd->dd->node)); + ps.pkts_sent = false; /* insure a pre-built packet is handled */ ps.s_txreq = get_waiting_verbs_txreq(qp); @@ -934,7 +1057,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) spin_lock_irqsave(&qp->s_lock, ps.flags); } } while (make_req(qp, &ps)); - + iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); spin_unlock_irqrestore(&qp->s_lock, ps.flags); } diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index bfd0d5187e9b..6781bcdb10b3 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -246,7 +246,7 @@ static void __sdma_process_event( enum sdma_events event); static void dump_sdma_state(struct sdma_engine *sde); static void sdma_make_progress(struct sdma_engine *sde, u64 status); -static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail); +static void sdma_desc_avail(struct sdma_engine *sde, uint avail); static void sdma_flush_descq(struct sdma_engine *sde); /** @@ -325,7 +325,7 @@ static void sdma_wait_for_packet_egress(struct sdma_engine *sde, /* timed out - bounce the link */ dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n", __func__, sde->this_idx, (u32)reg); - queue_work(dd->pport->hfi1_wq, + queue_work(dd->pport->link_wq, &dd->pport->link_bounce_work); break; } @@ -1340,10 +1340,8 @@ static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) * @dd: hfi1_devdata * @port: port number (currently only zero) * - * sdma_init initializes the specified number of engines. - * - * The code initializes each sde, its csrs. Interrupts - * are not required to be enabled. + * Initializes each sde and its csrs. + * Interrupts are not required to be enabled. * * Returns: * 0 - success, -errno on failure @@ -1764,13 +1762,14 @@ retry: * * This is called with head_lock held. */ -static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail) +static void sdma_desc_avail(struct sdma_engine *sde, uint avail) { struct iowait *wait, *nw; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; - unsigned i, n = 0, seq; + uint i, n = 0, seq, max_idx = 0; struct sdma_txreq *stx; struct hfi1_ibdev *dev = &sde->dd->verbs_dev; + u8 max_starved_cnt = 0; #ifdef CONFIG_SDMA_VERBOSITY dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, @@ -1805,6 +1804,9 @@ static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail) if (num_desc > avail) break; avail -= num_desc; + /* Find the most starved wait memeber */ + iowait_starve_find_max(wait, &max_starved_cnt, + n, &max_idx); list_del_init(&wait->list); waits[n++] = wait; } @@ -1813,8 +1815,13 @@ static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail) } } while (read_seqretry(&dev->iowait_lock, seq)); + /* Schedule the most starved one first */ + if (n) + waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON); + for (i = 0; i < n; i++) - waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); + if (i != max_idx) + waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); } /* head_lock must be held */ @@ -2351,7 +2358,8 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) static int sdma_check_progress( struct sdma_engine *sde, struct iowait *wait, - struct sdma_txreq *tx) + struct sdma_txreq *tx, + bool pkts_sent) { int ret; @@ -2364,7 +2372,7 @@ static int sdma_check_progress( seq = raw_seqcount_begin( (const seqcount_t *)&sde->head_lock.seqcount); - ret = wait->sleep(sde, wait, tx, seq); + ret = wait->sleep(sde, wait, tx, seq, pkts_sent); if (ret == -EAGAIN) sde->desc_avail = sdma_descq_freecnt(sde); } else { @@ -2378,6 +2386,7 @@ static int sdma_check_progress( * @sde: sdma engine to use * @wait: wait structure to use when full (may be NULL) * @tx: sdma_txreq to submit + * @pkts_sent: has any packet been sent yet? * * The call submits the tx into the ring. If a iowait structure is non-NULL * the packet will be queued to the list in wait. @@ -2389,7 +2398,8 @@ static int sdma_check_progress( */ int sdma_send_txreq(struct sdma_engine *sde, struct iowait *wait, - struct sdma_txreq *tx) + struct sdma_txreq *tx, + bool pkts_sent) { int ret = 0; u16 tail; @@ -2431,7 +2441,7 @@ unlock_noconn: ret = -ECOMM; goto unlock; nodesc: - ret = sdma_check_progress(sde, wait, tx); + ret = sdma_check_progress(sde, wait, tx, pkts_sent); if (ret == -EAGAIN) { ret = 0; goto retry; @@ -2500,8 +2510,10 @@ retry: } update_tail: total_count = submit_count + flush_count; - if (wait) + if (wait) { iowait_sdma_add(wait, total_count); + iowait_starve_clear(submit_count > 0, wait); + } if (tail != INVALID_TAIL) sdma_update_tail(sde, tail); spin_unlock_irqrestore(&sde->tail_lock, flags); @@ -2529,7 +2541,7 @@ unlock_noconn: ret = -ECOMM; goto update_tail; nodesc: - ret = sdma_check_progress(sde, wait, tx); + ret = sdma_check_progress(sde, wait, tx, submit_count > 0); if (ret == -EAGAIN) { ret = 0; goto retry; diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 64f10b8b5db8..107011d8613b 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -852,7 +852,8 @@ struct iowait; int sdma_send_txreq(struct sdma_engine *sde, struct iowait *wait, - struct sdma_txreq *tx); + struct sdma_txreq *tx, + bool pkts_sent); int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, struct list_head *tx_list, diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index 2f3bbcac1e34..6d2702ef34ac 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -95,7 +95,7 @@ static void port_release(struct kobject *kobj) /* nothing to do since memory is freed by hfi1_free_devdata() */ } -static struct bin_attribute cc_table_bin_attr = { +static const struct bin_attribute cc_table_bin_attr = { .attr = {.name = "cc_table_bin", .mode = 0444}, .read = read_cc_table_bin, .size = PAGE_SIZE, @@ -137,7 +137,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, return count; } -static struct bin_attribute cc_setting_bin_attr = { +static const struct bin_attribute cc_setting_bin_attr = { .attr = {.name = "cc_settings_bin", .mode = 0444}, .read = read_cc_setting_bin, .size = PAGE_SIZE, diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index eafae487face..9938bb983ce6 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -47,7 +47,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" -u8 ibhdr_exhdr_len(struct ib_header *hdr) +static u8 __get_ib_hdr_len(struct ib_header *hdr) { struct ib_other_headers *ohdr; u8 opcode; @@ -61,13 +61,69 @@ u8 ibhdr_exhdr_len(struct ib_header *hdr) 0 : hdr_len_by_opcode[opcode] - (12 + 8); } -#define IMM_PRN "imm %d" -#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x" -#define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x" -#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x" -#define IETH_PRN "ieth rkey 0x%.8x" -#define ATOMICACKETH_PRN "origdata %llx" -#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %llx cdata %llx" +static u8 __get_16b_hdr_len(struct hfi1_16b_header *hdr) +{ + struct ib_other_headers *ohdr; + u8 opcode; + + if (hfi1_16B_get_l4(hdr) == OPA_16B_L4_IB_LOCAL) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + opcode = ib_bth_get_opcode(ohdr); + return hdr_len_by_opcode[opcode] == 0 ? + 0 : hdr_len_by_opcode[opcode] - (12 + 8 + 8); +} + +u8 hfi1_trace_packet_hdr_len(struct hfi1_packet *packet) +{ + if (packet->etype != RHF_RCV_TYPE_BYPASS) + return __get_ib_hdr_len(packet->hdr); + else + return __get_16b_hdr_len(packet->hdr); +} + +u8 hfi1_trace_opa_hdr_len(struct hfi1_opa_header *opa_hdr) +{ + if (!opa_hdr->hdr_type) + return __get_ib_hdr_len(&opa_hdr->ibh); + else + return __get_16b_hdr_len(&opa_hdr->opah); +} + +const char *hfi1_trace_get_packet_str(struct hfi1_packet *packet) +{ + if (packet->etype != RHF_RCV_TYPE_BYPASS) + return "IB"; + + switch (hfi1_16B_get_l2(packet->hdr)) { + case 0: + return "0"; + case 1: + return "1"; + case 2: + return "16B"; + case 3: + return "9B"; + } + return ""; +} + +const char *hfi1_trace_get_packet_type_str(u8 l4) +{ + if (l4) + return "16B"; + else + return "9B"; +} + +#define IMM_PRN "imm:%d" +#define RETH_PRN "reth vaddr:0x%.16llx rkey:0x%.8x dlen:0x%.8x" +#define AETH_PRN "aeth syn:0x%.2x %s msn:0x%.8x" +#define DETH_PRN "deth qkey:0x%.8x sqpn:0x%.6x" +#define IETH_PRN "ieth rkey:0x%.8x" +#define ATOMICACKETH_PRN "origdata:%llx" +#define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx" #define OP(transport, op) IB_OPCODE_## transport ## _ ## op @@ -84,6 +140,125 @@ static const char *parse_syndrome(u8 syndrome) return ""; } +void hfi1_trace_parse_9b_bth(struct ib_other_headers *ohdr, + u8 *ack, u8 *becn, u8 *fecn, u8 *mig, + u8 *se, u8 *pad, u8 *opcode, u8 *tver, + u16 *pkey, u32 *psn, u32 *qpn) +{ + *ack = ib_bth_get_ackreq(ohdr); + *becn = ib_bth_get_becn(ohdr); + *fecn = ib_bth_get_fecn(ohdr); + *mig = ib_bth_get_migreq(ohdr); + *se = ib_bth_get_se(ohdr); + *pad = ib_bth_get_pad(ohdr); + *opcode = ib_bth_get_opcode(ohdr); + *tver = ib_bth_get_tver(ohdr); + *pkey = ib_bth_get_pkey(ohdr); + *psn = ib_bth_get_psn(ohdr); + *qpn = ib_bth_get_qpn(ohdr); +} + +void hfi1_trace_parse_16b_bth(struct ib_other_headers *ohdr, + u8 *ack, u8 *mig, u8 *opcode, + u8 *pad, u8 *se, u8 *tver, + u32 *psn, u32 *qpn) +{ + *ack = ib_bth_get_ackreq(ohdr); + *mig = ib_bth_get_migreq(ohdr); + *opcode = ib_bth_get_opcode(ohdr); + *pad = ib_bth_get_pad(ohdr); + *se = ib_bth_get_se(ohdr); + *tver = ib_bth_get_tver(ohdr); + *psn = ib_bth_get_psn(ohdr); + *qpn = ib_bth_get_qpn(ohdr); +} + +void hfi1_trace_parse_9b_hdr(struct ib_header *hdr, bool sc5, + u8 *lnh, u8 *lver, u8 *sl, u8 *sc, + u16 *len, u32 *dlid, u32 *slid) +{ + *lnh = ib_get_lnh(hdr); + *lver = ib_get_lver(hdr); + *sl = ib_get_sl(hdr); + *sc = ib_get_sc(hdr) | (sc5 << 4); + *len = ib_get_len(hdr); + *dlid = ib_get_dlid(hdr); + *slid = ib_get_slid(hdr); +} + +void hfi1_trace_parse_16b_hdr(struct hfi1_16b_header *hdr, + u8 *age, u8 *becn, u8 *fecn, + u8 *l4, u8 *rc, u8 *sc, + u16 *entropy, u16 *len, u16 *pkey, + u32 *dlid, u32 *slid) +{ + *age = hfi1_16B_get_age(hdr); + *becn = hfi1_16B_get_becn(hdr); + *fecn = hfi1_16B_get_fecn(hdr); + *l4 = hfi1_16B_get_l4(hdr); + *rc = hfi1_16B_get_rc(hdr); + *sc = hfi1_16B_get_sc(hdr); + *entropy = hfi1_16B_get_entropy(hdr); + *len = hfi1_16B_get_len(hdr); + *pkey = hfi1_16B_get_pkey(hdr); + *dlid = hfi1_16B_get_dlid(hdr); + *slid = hfi1_16B_get_slid(hdr); +} + +#define LRH_PRN "len:%d sc:%d dlid:0x%.4x slid:0x%.4x " +#define LRH_9B_PRN "lnh:%d,%s lver:%d sl:%d" +#define LRH_16B_PRN "age:%d becn:%d fecn:%d l4:%d " \ + "rc:%d sc:%d pkey:0x%.4x entropy:0x%.4x" +const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, + u8 age, u8 becn, u8 fecn, u8 l4, + u8 lnh, const char *lnh_name, u8 lver, + u8 rc, u8 sc, u8 sl, u16 entropy, + u16 len, u16 pkey, u32 dlid, u32 slid) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, LRH_PRN, len, sc, dlid, slid); + + if (bypass) + trace_seq_printf(p, LRH_16B_PRN, + age, becn, fecn, l4, rc, sc, pkey, entropy); + + else + trace_seq_printf(p, LRH_9B_PRN, + lnh, lnh_name, lver, sl); + trace_seq_putc(p, 0); + + return ret; +} + +#define BTH_9B_PRN \ + "op:0x%.2x,%s se:%d m:%d pad:%d tver:%d pkey:0x%.4x " \ + "f:%d b:%d qpn:0x%.6x a:%d psn:0x%.8x" +#define BTH_16B_PRN \ + "op:0x%.2x,%s se:%d m:%d pad:%d tver:%d " \ + "qpn:0x%.6x a:%d psn:0x%.8x" +const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, + u8 ack, u8 becn, u8 fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (bypass) + trace_seq_printf(p, BTH_16B_PRN, + opcode, opname, + se, mig, pad, tver, qpn, ack, psn); + + else + trace_seq_printf(p, BTH_9B_PRN, + opcode, opname, + se, mig, pad, tver, pkey, fecn, becn, + qpn, ack, psn); + trace_seq_putc(p, 0); + + return ret; +} + const char *parse_everbs_hdrs( struct trace_seq *p, u8 opcode, diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h index 92dc88f013c9..af50c0793450 100644 --- a/drivers/infiniband/hw/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -51,3 +51,4 @@ #include "trace_rc.h" #include "trace_rx.h" #include "trace_tx.h" +#include "trace_mmu.h" diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index 090f6b506953..6721f84dafa5 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -55,8 +55,79 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM hfi1_ibhdrs +#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } +#define show_ib_opcode(opcode) \ +__print_symbolic(opcode, \ + ib_opcode_name(RC_SEND_FIRST), \ + ib_opcode_name(RC_SEND_MIDDLE), \ + ib_opcode_name(RC_SEND_LAST), \ + ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_SEND_ONLY), \ + ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_FIRST), \ + ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(RC_RDMA_WRITE_LAST), \ + ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_READ_REQUEST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ + ib_opcode_name(RC_ACKNOWLEDGE), \ + ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ + ib_opcode_name(RC_COMPARE_SWAP), \ + ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(UC_SEND_FIRST), \ + ib_opcode_name(UC_SEND_MIDDLE), \ + ib_opcode_name(UC_SEND_LAST), \ + ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_SEND_ONLY), \ + ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_FIRST), \ + ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(UC_RDMA_WRITE_LAST), \ + ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UD_SEND_ONLY), \ + ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(CNP)) + u8 ibhdr_exhdr_len(struct ib_header *hdr); const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); +u8 hfi1_trace_opa_hdr_len(struct hfi1_opa_header *opah); +u8 hfi1_trace_packet_hdr_len(struct hfi1_packet *packet); +const char *hfi1_trace_get_packet_type_str(u8 l4); +const char *hfi1_trace_get_packet_str(struct hfi1_packet *packet); +void hfi1_trace_parse_9b_bth(struct ib_other_headers *ohdr, + u8 *ack, u8 *becn, u8 *fecn, u8 *mig, + u8 *se, u8 *pad, u8 *opcode, u8 *tver, + u16 *pkey, u32 *psn, u32 *qpn); +void hfi1_trace_parse_9b_hdr(struct ib_header *hdr, bool sc5, + u8 *lnh, u8 *lver, u8 *sl, u8 *sc, + u16 *len, u32 *dlid, u32 *slid); +void hfi1_trace_parse_16b_bth(struct ib_other_headers *ohdr, + u8 *ack, u8 *mig, u8 *opcode, + u8 *pad, u8 *se, u8 *tver, + u32 *psn, u32 *qpn); +void hfi1_trace_parse_16b_hdr(struct hfi1_16b_header *hdr, + u8 *age, u8 *becn, u8 *fecn, + u8 *l4, u8 *rc, u8 *sc, + u16 *entropy, u16 *len, u16 *pkey, + u32 *dlid, u32 *slid); + +const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, + u8 age, u8 becn, u8 fecn, u8 l4, + u8 lnh, const char *lnh_name, u8 lver, + u8 rc, u8 sc, u8 sl, u16 entropy, + u16 len, u16 pkey, u32 dlid, u32 slid); + +const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, + u8 ack, u8 becn, u8 fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn); #define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) @@ -65,140 +136,303 @@ const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); __print_symbolic(lrh, \ lrh_name(LRH_BTH), \ lrh_name(LRH_GRH)) +#define PKT_ENTRY(pkt) __string(ptype, hfi1_trace_get_packet_str(packet)) +#define PKT_ASSIGN(pkt) __assign_str(ptype, hfi1_trace_get_packet_str(packet)) -#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x" -#define BTH_PRN \ - "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \ - "f %d b %d qpn 0x%.6x a %d psn 0x%.8x" -#define EHDR_PRN "%s" - -DECLARE_EVENT_CLASS(hfi1_ibhdr_template, +DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, TP_PROTO(struct hfi1_devdata *dd, - struct ib_header *hdr), - TP_ARGS(dd, hdr), + struct hfi1_packet *packet, + bool sc5), + TP_ARGS(dd, packet, sc5), TP_STRUCT__entry( DD_DEV_ENTRY(dd) - /* LRH */ - __field(u8, vl) + PKT_ENTRY(packet) + __field(bool, bypass) + __field(u8, ack) + __field(u8, age) + __field(u8, becn) + __field(u8, fecn) + __field(u8, l4) + __field(u8, lnh) __field(u8, lver) + __field(u8, mig) + __field(u8, opcode) + __field(u8, pad) + __field(u8, rc) + __field(u8, sc) + __field(u8, se) __field(u8, sl) - __field(u8, lnh) - __field(u16, dlid) + __field(u8, tver) + __field(u16, entropy) __field(u16, len) - __field(u16, slid) - /* BTH */ + __field(u16, pkey) + __field(u32, dlid) + __field(u32, psn) + __field(u32, qpn) + __field(u32, slid) + /* extended headers */ + __dynamic_array(u8, ehdrs, + hfi1_trace_packet_hdr_len(packet)) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + PKT_ASSIGN(packet); + + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + __entry->bypass = true; + hfi1_trace_parse_16b_hdr(packet->hdr, + &__entry->age, + &__entry->becn, + &__entry->fecn, + &__entry->l4, + &__entry->rc, + &__entry->sc, + &__entry->entropy, + &__entry->len, + &__entry->pkey, + &__entry->dlid, + &__entry->slid); + + hfi1_trace_parse_16b_bth(packet->ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } else { + __entry->bypass = false; + hfi1_trace_parse_9b_hdr(packet->hdr, sc5, + &__entry->lnh, + &__entry->lver, + &__entry->sl, + &__entry->sc, + &__entry->len, + &__entry->dlid, + &__entry->slid); + + hfi1_trace_parse_9b_bth(packet->ohdr, + &__entry->ack, + &__entry->becn, + &__entry->fecn, + &__entry->mig, + &__entry->se, + &__entry->pad, + &__entry->opcode, + &__entry->tver, + &__entry->pkey, + &__entry->psn, + &__entry->qpn); + } + /* extended headers */ + memcpy(__get_dynamic_array(ehdrs), + &packet->ohdr->u, + __get_dynamic_array_len(ehdrs)); + ), + TP_printk("[%s] (%s) %s %s hlen:%d %s", + __get_str(dev), + __get_str(ptype), + hfi1_trace_fmt_lrh(p, + __entry->bypass, + __entry->age, + __entry->becn, + __entry->fecn, + __entry->l4, + __entry->lnh, + show_lnh(__entry->lnh), + __entry->lver, + __entry->rc, + __entry->sc, + __entry->sl, + __entry->entropy, + __entry->len, + __entry->pkey, + __entry->dlid, + __entry->slid), + hfi1_trace_fmt_bth(p, + __entry->bypass, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn), + /* extended headers */ + __get_dynamic_array_len(ehdrs), + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_input_ibhdr_template, input_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_packet *packet, bool sc5), + TP_ARGS(dd, packet, sc5)); + +DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(bool, bypass) + __field(u8, ack) + __field(u8, age) + __field(u8, becn) + __field(u8, fecn) + __field(u8, l4) + __field(u8, lnh) + __field(u8, lver) + __field(u8, mig) __field(u8, opcode) - __field(u8, se) - __field(u8, m) __field(u8, pad) + __field(u8, rc) + __field(u8, sc) + __field(u8, se) + __field(u8, sl) __field(u8, tver) + __field(u16, entropy) + __field(u16, len) __field(u16, pkey) - __field(u8, f) - __field(u8, b) - __field(u32, qpn) - __field(u8, a) + __field(u32, dlid) __field(u32, psn) + __field(u32, qpn) + __field(u32, slid) /* extended headers */ - __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) + __dynamic_array(u8, ehdrs, + hfi1_trace_opa_hdr_len(opah)) ), - TP_fast_assign( + TP_fast_assign( struct ib_other_headers *ohdr; DD_DEV_ASSIGN(dd); - /* LRH */ - __entry->vl = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); - __entry->lver = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; - __entry->sl = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; - __entry->lnh = - (u8)(be16_to_cpu(hdr->lrh[0]) & 3); - __entry->dlid = - be16_to_cpu(hdr->lrh[1]); - /* allow for larger len */ - __entry->len = - be16_to_cpu(hdr->lrh[2]); - __entry->slid = - be16_to_cpu(hdr->lrh[3]); - /* BTH */ - if (__entry->lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else - ohdr = &hdr->u.l.oth; - __entry->opcode = - (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; - __entry->se = - (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; - __entry->m = - (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; - __entry->pad = - (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; - __entry->tver = - (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; - __entry->pkey = - be32_to_cpu(ohdr->bth[0]) & 0xffff; - __entry->f = - (be32_to_cpu(ohdr->bth[1]) >> IB_FECN_SHIFT) & - IB_FECN_MASK; - __entry->b = - (be32_to_cpu(ohdr->bth[1]) >> IB_BECN_SHIFT) & - IB_BECN_MASK; - __entry->qpn = - be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - __entry->a = - (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; - /* allow for larger PSN */ - __entry->psn = - be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; + + if (opah->hdr_type) { + __entry->bypass = true; + hfi1_trace_parse_16b_hdr(&opah->opah, + &__entry->age, + &__entry->becn, + &__entry->fecn, + &__entry->l4, + &__entry->rc, + &__entry->sc, + &__entry->entropy, + &__entry->len, + &__entry->pkey, + &__entry->dlid, + &__entry->slid); + + if (entry->l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &opah->opah.u.oth; + else + ohdr = &opah->opah.u.l.oth; + hfi1_trace_parse_16b_bth(ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } else { + __entry->bypass = false; + hfi1_trace_parse_9b_hdr(&opah->ibh, sc5, + &__entry->lnh, + &__entry->lver, + &__entry->sl, + &__entry->sc, + &__entry->len, + &__entry->dlid, + &__entry->slid); + if (entry->lnh == HFI1_LRH_BTH) + ohdr = &opah->ibh.u.oth; + else + ohdr = &opah->ibh.u.l.oth; + hfi1_trace_parse_9b_bth(ohdr, + &__entry->ack, + &__entry->becn, + &__entry->fecn, + &__entry->mig, + &__entry->se, + &__entry->pad, + &__entry->opcode, + &__entry->tver, + &__entry->pkey, + &__entry->psn, + &__entry->qpn); + } + /* extended headers */ - memcpy(__get_dynamic_array(ehdrs), &ohdr->u, - ibhdr_exhdr_len(hdr)); - ), - TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, - __get_str(dev), - /* LRH */ - __entry->vl, - __entry->lver, - __entry->sl, - __entry->lnh, show_lnh(__entry->lnh), - __entry->dlid, - __entry->len, - __entry->slid, - /* BTH */ - __entry->opcode, show_ib_opcode(__entry->opcode), - __entry->se, - __entry->m, - __entry->pad, - __entry->tver, - __entry->pkey, - __entry->f, - __entry->b, - __entry->qpn, - __entry->a, - __entry->psn, - /* extended headers */ - __parse_ib_ehdrs( - __entry->opcode, - (void *)__get_dynamic_array(ehdrs)) - ) + memcpy(__get_dynamic_array(ehdrs), + &ohdr->u, __get_dynamic_array_len(ehdrs)); + ), + TP_printk("[%s] (%s) %s %s hlen:%d %s", + __get_str(dev), + hfi1_trace_get_packet_type_str(__entry->l4), + hfi1_trace_fmt_lrh(p, + __entry->bypass, + __entry->age, + __entry->becn, + __entry->fecn, + __entry->l4, + __entry->lnh, + show_lnh(__entry->lnh), + __entry->lver, + __entry->rc, + __entry->sc, + __entry->sl, + __entry->entropy, + __entry->len, + __entry->pkey, + __entry->dlid, + __entry->slid), + hfi1_trace_fmt_bth(p, + __entry->bypass, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn), + /* extended headers */ + __get_dynamic_array_len(ehdrs), + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) ); -DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct ib_header *hdr), - TP_ARGS(dd, hdr)); +DEFINE_EVENT(hfi1_output_ibhdr_template, pio_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); -DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct ib_header *hdr), - TP_ARGS(dd, hdr)); +DEFINE_EVENT(hfi1_output_ibhdr_template, ack_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); -DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct ib_header *hdr), - TP_ARGS(dd, hdr)); +DEFINE_EVENT(hfi1_output_ibhdr_template, sdma_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); -DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct ib_header *hdr), - TP_ARGS(dd, hdr)); #endif /* __HFI1_TRACE_IBHDRS_H */ diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h b/drivers/infiniband/hw/hfi1/trace_misc.h index deac77ddaeab..8db2253523ff 100644 --- a/drivers/infiniband/hw/hfi1/trace_misc.h +++ b/drivers/infiniband/hw/hfi1/trace_misc.h @@ -72,6 +72,26 @@ TRACE_EVENT(hfi1_interrupt, __entry->src) ); +DECLARE_EVENT_CLASS( + hfi1_csr_template, + TP_PROTO(void __iomem *addr, u64 value), + TP_ARGS(addr, value), + TP_STRUCT__entry( + __field(void __iomem *, addr) + __field(u64, value) + ), + TP_fast_assign( + __entry->addr = addr; + __entry->value = value; + ), + TP_printk("addr %p value %llx", __entry->addr, __entry->value) +); + +DEFINE_EVENT( + hfi1_csr_template, hfi1_write_rcvarray, + TP_PROTO(void __iomem *addr, u64 value), + TP_ARGS(addr, value)); + #ifdef CONFIG_FAULT_INJECTION TRACE_EVENT(hfi1_fault_opcode, TP_PROTO(struct rvt_qp *qp, u8 opcode), diff --git a/drivers/infiniband/hw/hfi1/trace_mmu.h b/drivers/infiniband/hw/hfi1/trace_mmu.h new file mode 100644 index 000000000000..3b7abbc382c2 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_mmu.h @@ -0,0 +1,95 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#if !defined(__HFI1_TRACE_MMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_MMU_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_mmu + +DECLARE_EVENT_CLASS(hfi1_mmu_rb_template, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len), + TP_STRUCT__entry(__field(unsigned long, addr) + __field(unsigned long, len) + ), + TP_fast_assign(__entry->addr = addr; + __entry->len = len; + ), + TP_printk("MMU node addr 0x%lx, len %lu", + __entry->addr, + __entry->len + ) +); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_insert, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_search, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_remove, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_mem_invalidate, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len)); + +#endif /* __HFI1_TRACE_RC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_mmu +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h index f77e59fb43fe..f9909d240dcc 100644 --- a/drivers/infiniband/hw/hfi1/trace_rx.h +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -52,9 +52,25 @@ #include "hfi.h" +#define tidtype_name(type) { PT_##type, #type } +#define show_tidtype(type) \ +__print_symbolic(type, \ + tidtype_name(EXPECTED), \ + tidtype_name(EAGER), \ + tidtype_name(INVALID)) \ + #undef TRACE_SYSTEM #define TRACE_SYSTEM hfi1_rx +#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } +#define show_packettype(etype) \ +__print_symbolic(etype, \ + packettype_name(EXPECTED), \ + packettype_name(EAGER), \ + packettype_name(IB), \ + packettype_name(ERROR), \ + packettype_name(BYPASS)) + TRACE_EVENT(hfi1_rcvhdr, TP_PROTO(struct hfi1_devdata *dd, u32 ctxt, @@ -98,24 +114,24 @@ TRACE_EVENT(hfi1_rcvhdr, ); TRACE_EVENT(hfi1_receive_interrupt, - TP_PROTO(struct hfi1_devdata *dd, u32 ctxt), - TP_ARGS(dd, ctxt), + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd), + TP_ARGS(dd, rcd), TP_STRUCT__entry(DD_DEV_ENTRY(dd) __field(u32, ctxt) __field(u8, slow_path) __field(u8, dma_rtail) ), TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - if (dd->rcd[ctxt]->do_interrupt == + __entry->ctxt = rcd->ctxt; + if (rcd->do_interrupt == &handle_receive_interrupt) { __entry->slow_path = 1; __entry->dma_rtail = 0xFF; - } else if (dd->rcd[ctxt]->do_interrupt == + } else if (rcd->do_interrupt == &handle_receive_interrupt_dma_rtail){ __entry->dma_rtail = 1; __entry->slow_path = 0; - } else if (dd->rcd[ctxt]->do_interrupt == + } else if (rcd->do_interrupt == &handle_receive_interrupt_nodma_rtail) { __entry->dma_rtail = 0; __entry->slow_path = 0; @@ -129,7 +145,8 @@ TRACE_EVENT(hfi1_receive_interrupt, ) ); -TRACE_EVENT(hfi1_exp_tid_reg, +DECLARE_EVENT_CLASS( + hfi1_exp_tid_reg_unreg, TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, unsigned long va, unsigned long pa, dma_addr_t dma), @@ -163,38 +180,45 @@ TRACE_EVENT(hfi1_exp_tid_reg, ) ); -TRACE_EVENT(hfi1_exp_tid_unreg, - TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, - unsigned long va, unsigned long pa, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), - TP_STRUCT__entry( - __field(unsigned int, ctxt) - __field(u16, subctxt) - __field(u32, rarr) - __field(u32, npages) - __field(unsigned long, va) - __field(unsigned long, pa) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->va = va; - __entry->pa = pa; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->pa, - __entry->va, - __entry->dma - ) - ); +DEFINE_EVENT( + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)); + +DEFINE_EVENT( + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)); + +TRACE_EVENT( + hfi1_put_tid, + TP_PROTO(struct hfi1_devdata *dd, + u32 index, u32 type, unsigned long pa, u16 order), + TP_ARGS(dd, index, type, pa, order), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(unsigned long, pa); + __field(u32, index); + __field(u32, type); + __field(u16, order); + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->pa = pa; + __entry->index = index; + __entry->type = type; + __entry->order = order; + ), + TP_printk("[%s] type %s pa %lx index %u order %u", + __get_str(dev), + show_tidtype(__entry->type), + __entry->pa, + __entry->index, + __entry->order + ) +); TRACE_EVENT(hfi1_exp_tid_inval, TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr, diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index c59809a7f121..c57af3b31fe1 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -198,6 +198,140 @@ TRACE_EVENT(hfi1_sdma_engine_select, ) ); +TRACE_EVENT(hfi1_sdma_user_free_queues, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt), + TP_ARGS(dd, ctxt, subctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + ), + TP_printk("[%s] SDMA [%u:%u] Freeing user SDMA queues", + __get_str(dev), + __entry->ctxt, + __entry->subctxt + ) +); + +TRACE_EVENT(hfi1_sdma_user_process_request, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx), + TP_ARGS(dd, ctxt, subctxt, comp_idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + ), + TP_printk("[%s] SDMA [%u:%u] Using req/comp entry: %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx + ) +); + +DECLARE_EVENT_CLASS( + hfi1_sdma_value_template, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, u16 comp_idx, + u32 value), + TP_ARGS(dd, ctxt, subctxt, comp_idx, value), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + __field(u32, value) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + __entry->value = value; + ), + TP_printk("[%s] SDMA [%u:%u:%u] value: %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx, + __entry->value + ) +); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_initial_tidoffset, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 tidoffset), + TP_ARGS(dd, ctxt, subctxt, comp_idx, tidoffset)); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_data_length, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 data_len), + TP_ARGS(dd, ctxt, subctxt, comp_idx, data_len)); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_compute_length, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 data_len), + TP_ARGS(dd, ctxt, subctxt, comp_idx, data_len)); + +TRACE_EVENT(hfi1_sdma_user_tid_info, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 tidoffset, u32 units, u8 shift), + TP_ARGS(dd, ctxt, subctxt, comp_idx, tidoffset, units, shift), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + __field(u32, tidoffset) + __field(u32, units) + __field(u8, shift) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + __entry->tidoffset = tidoffset; + __entry->units = units; + __entry->shift = shift; + ), + TP_printk("[%s] SDMA [%u:%u:%u] TID offset %ubytes %uunits om %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx, + __entry->tidoffset, + __entry->units, + __entry->shift + ) +); + +TRACE_EVENT(hfi1_sdma_request, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + unsigned long dim), + TP_ARGS(dd, ctxt, subctxt, dim), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(unsigned long, dim) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->dim = dim; + ), + TP_printk("[%s] SDMA from %u:%u (%lu)", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->dim + ) +); + DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, TP_PROTO(struct sdma_engine *sde, u64 status), TP_ARGS(sde, status), diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 5da1e4546543..0b646173ca22 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -65,7 +65,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct hfi1_qp_priv *priv = qp->priv; struct ib_other_headers *ohdr; struct rvt_swqe *wqe; - u32 hwords = 5; + u32 hwords; u32 bth0 = 0; u32 len; u32 pmtu = qp->pmtu; @@ -93,9 +93,23 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto done_free_tx; } - ohdr = &ps->s_txreq->phdr.hdr.u.oth; - if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) - ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; + ps->s_txreq->phdr.hdr.hdr_type = priv->hdr_type; + if (priv->hdr_type == HFI1_PKT_TYPE_9B) { + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } else { + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr)))) + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + } /* Get the next send request. */ wqe = rvt_get_swqe_ptr(qp, qp->s_cur); @@ -297,31 +311,26 @@ bail_no_tx: void hfi1_uc_rcv(struct hfi1_packet *packet) { struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); - struct ib_header *hdr = packet->hdr; - u32 rcv_flags = packet->rcv_flags; - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct ib_other_headers *ohdr = packet->ohdr; - u32 bth0, opcode; + u32 opcode = packet->opcode; u32 hdrsize = packet->hlen; u32 psn; - u32 pad; + u32 pad = packet->pad; struct ib_wc wc; u32 pmtu = qp->pmtu; struct ib_reth *reth; - int has_grh = rcv_flags & HFI1_HAS_GRH; int ret; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); - bth0 = be32_to_cpu(ohdr->bth[0]); - if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0)) + if (hfi1_ruc_check_hdr(ibp, packet)) return; process_ecn(qp, packet, true); - psn = be32_to_cpu(ohdr->bth[2]); - opcode = ib_bth_get_opcode(ohdr); - + psn = ib_bth_get_psn(ohdr); /* Compare the PSN verses the expected PSN. */ if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) { /* @@ -414,7 +423,12 @@ send_first: /* FALLTHROUGH */ case OP(SEND_MIDDLE): /* Check for invalid length PMTU or posted rwqe len. */ - if (unlikely(tlen != (hdrsize + pmtu + 4))) + /* + * There will be no padding for 9B packet but 16B packets + * will come in with some padding since we always add + * CRC and LT bytes which will need to be flit aligned + */ + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) goto rewind; qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) @@ -432,14 +446,12 @@ no_immediate_data: wc.ex.imm_data = 0; wc.wc_flags = 0; send_last: - /* Get the number of bytes the message was padded by. */ - pad = ib_bth_get_pad(ohdr); /* Check for invalid length. */ /* LAST len should be >= 1 */ - if (unlikely(tlen < (hdrsize + pad + 4))) + if (unlikely(tlen < (hdrsize + extra_bytes))) goto rewind; /* Don't count the CRC. */ - tlen -= (hdrsize + pad + 4); + tlen -= (hdrsize + extra_bytes); wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto rewind; @@ -527,14 +539,12 @@ rdma_first: rdma_last_imm: wc.wc_flags = IB_WC_WITH_IMM; - /* Get the number of bytes the message was padded by. */ - pad = ib_bth_get_pad(ohdr); /* Check for invalid length. */ /* LAST len should be >= 1 */ if (unlikely(tlen < (hdrsize + pad + 4))) goto drop; /* Don't count the CRC. */ - tlen -= (hdrsize + pad + 4); + tlen -= (hdrsize + extra_bytes); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { @@ -554,14 +564,12 @@ rdma_last_imm: case OP(RDMA_WRITE_LAST): rdma_last: - /* Get the number of bytes the message was padded by. */ - pad = ib_bth_get_pad(ohdr); /* Check for invalid length. */ /* LAST len should be >= 1 */ if (unlikely(tlen < (hdrsize + pad + 4))) goto drop; /* Don't count the CRC. */ - tlen -= (hdrsize + pad + 4); + tlen -= (hdrsize + extra_bytes); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 6a4e95cefae5..2ba74fdd6f15 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -53,6 +53,12 @@ #include "verbs_txreq.h" #include "qp.h" +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_req hfi1_make_ud_req_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_ud_req_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_ud_req_16B +}; + /** * ud_loopback - handle send on loopback QPs * @sqp: the sending QP @@ -67,6 +73,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) { struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); struct hfi1_pportdata *ppd; + struct hfi1_qp_priv *priv = sqp->priv; struct rvt_qp *qp; struct rdma_ah_attr *ah_attr; unsigned long flags; @@ -102,18 +109,19 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num > 1) { u16 pkey; - u16 slid; + u32 slid; u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index); slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & ((1 << ppd->lmc) - 1)); if (unlikely(ingress_pkey_check(ppd, pkey, sc5, - qp->s_pkey_index, slid))) { - hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, pkey, - rdma_ah_get_sl(ah_attr), - sqp->ibqp.qp_num, qp->ibqp.qp_num, - slid, rdma_ah_get_dlid(ah_attr)); + qp->s_pkey_index, + slid, false))) { + hfi1_bad_pkey(ibp, pkey, + rdma_ah_get_sl(ah_attr), + sqp->ibqp.qp_num, qp->ibqp.qp_num, + slid, rdma_ah_get_dlid(ah_attr)); goto drop; } } @@ -128,18 +136,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) qkey = (int)swqe->ud_wr.remote_qkey < 0 ? sqp->qkey : swqe->ud_wr.remote_qkey; - if (unlikely(qkey != qp->qkey)) { - u16 lid; - - lid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & - ((1 << ppd->lmc) - 1)); - hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, - rdma_ah_get_sl(ah_attr), - sqp->ibqp.qp_num, qp->ibqp.qp_num, - lid, - rdma_ah_get_dlid(ah_attr)); - goto drop; - } + if (unlikely(qkey != qp->qkey)) + goto drop; /* silently drop per IBTA spec */ } /* @@ -185,9 +183,33 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { struct ib_grh grh; - const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); + struct ib_global_route grd = *(rdma_ah_read_grh(ah_attr)); + + /* + * For loopback packets with extended LIDs, the + * sgid_index in the GRH is 0 and the dgid is + * OPA GID of the sender. While creating a response + * to the loopback packet, IB core creates the new + * sgid_index from the DGID and that will be the + * OPA_GID_INDEX. The new dgid is from the sgid + * index and that will be in the IB GID format. + * + * We now have a case where the sent packet had a + * different sgid_index and dgid compared to the + * one that was received in response. + * + * Fix this inconsistency. + */ + if (priv->hdr_type == HFI1_PKT_TYPE_16B) { + if (grd.sgid_index == 0) + grd.sgid_index = OPA_GID_INDEX; + + if (ib_is_opa_gid(&grd.dgid)) + grd.dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); + } - hfi1_make_grh(ibp, &grh, grd, 0, 0); + hfi1_make_grh(ibp, &grh, &grd, 0, 0); hfi1_copy_sge(&qp->r_sge, &grh, sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; @@ -244,7 +266,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.pkey_index = 0; } wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & - ((1 << ppd->lmc) - 1)); + ((1 << ppd->lmc) - 1)); /* Check for loopback when the port lid is not set */ if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI) wc.slid = be16_to_cpu(IB_LID_PERMISSIVE); @@ -261,6 +283,183 @@ drop: rcu_read_unlock(); } +static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u16 *pkey, u32 extra_bytes, bool bypass) +{ + u32 bth0; + struct hfi1_ibport *ibp; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else { + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + } + + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) + *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + else + *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + if (!bypass) + bth0 |= *pkey; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? + qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); +} + +void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + u32 nwords, extra_bytes; + u16 len, slid, dlid, pkey; + u16 lrh0 = 0; + u8 sc5; + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct ib_grh *grh; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + + extra_bytes = -wqe->length & 3; + nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; + /* header size in dwords LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + qp->s_hdrwords++; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + grh = &ps->s_txreq->phdr.hdr.ibh.u.l.grh; + qp->s_hdrwords += hfi1_make_grh(ibp, grh, + rdma_ah_read_grh(ah_attr), + qp->s_hdrwords - 2, nwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + } else { + lrh0 = HFI1_LRH_BTH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) { + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + priv->s_sc = 0xf; + } else { + lrh0 |= (sc5 & 0xf) << 12; + priv->s_sc = sc5; + } + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 9B); + if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } else { + u16 lid = (u16)ppd->lid; + + if (lid) { + lid |= rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1); + slid = lid; + } else { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } + } + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, false); + len = qp->s_hdrwords + nwords; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_9B; + hfi1_make_ib_hdr(&ps->s_txreq->phdr.hdr.ibh, + lrh0, len, dlid, slid); +} + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid, slid, nwords, extra_bytes; + u16 len, pkey; + u8 l4, sc5; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ + qp->s_hdrwords = 9; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + qp->s_hdrwords++; + + /* SW provides space for CRC and LT for bypass packets. */ + extra_bytes = hfi1_get_16b_padding((qp->s_hdrwords << 2), + wqe->length); + nwords = ((wqe->length + extra_bytes + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + + if ((rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) { + struct ib_grh *grh; + struct ib_global_route *grd = rdma_ah_retrieve_grh(ah_attr); + /* + * Ensure OPA GIDs are transformed to IB gids + * before creating the GRH. + */ + if (grd->sgid_index == OPA_GID_INDEX) { + dd_dev_warn(ppd->dd, "Bad sgid_index. sgid_index: %d\n", + grd->sgid_index); + grd->sgid_index = 0; + } + grh = &ps->s_txreq->phdr.hdr.opah.u.l.grh; + qp->s_hdrwords += hfi1_make_grh(ibp, grh, grd, + qp->s_hdrwords - 4, nwords); + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + } else { + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + if (qp->ibqp.qp_type == IB_QPT_SMI) + priv->s_sc = 0xf; + else + priv->s_sc = sc5; + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 16B); + if (!ppd->lid) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + else + slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + /* Convert dwords to flits */ + len = (qp->s_hdrwords + nwords) >> 1; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_16B; + hfi1_make_16b_hdr(&ps->s_txreq->phdr.hdr.opah, + slid, dlid, len, pkey, 0, 0, l4, priv->s_sc); +} + /** * hfi1_make_ud_req - construct a UD request packet * @qp: the QP @@ -272,18 +471,12 @@ drop: int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; - struct ib_other_headers *ohdr; struct rdma_ah_attr *ah_attr; struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; struct rvt_swqe *wqe; - u32 nwords; - u32 extra_bytes; - u32 bth0; - u16 lrh0; - u16 lid; int next_cur; - u8 sc5; + u32 lid; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -320,13 +513,14 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; - if (rdma_ah_get_dlid(ah_attr) < be16_to_cpu(IB_MULTICAST_LID_BASE) || - rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) { + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); + if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || + (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { lid = rdma_ah_get_dlid(ah_attr) & ~((1 << ppd->lmc) - 1); if (unlikely(!loopback && - (lid == ppd->lid || - (lid == be16_to_cpu(IB_LID_PERMISSIVE) && - qp->ibqp.qp_type == IB_QPT_GSI)))) { + ((lid == ppd->lid) || + ((lid == be32_to_cpu(OPA_LID_PERMISSIVE)) && + (qp->ibqp.qp_type == IB_QPT_GSI))))) { unsigned long tflags = ps->flags; /* * If DMAs are in progress, we can't generate @@ -350,11 +544,6 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } qp->s_cur = next_cur; - extra_bytes = -wqe->length & 3; - nwords = (wqe->length + extra_bytes) >> 2; - - /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ - qp->s_hdrwords = 7; ps->s_txreq->s_cur_size = wqe->length; ps->s_txreq->ss = &qp->s_sge; qp->s_srate = rdma_ah_get_static_rate(ah_attr); @@ -365,77 +554,12 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_sge.num_sge = wqe->wr.num_sge; qp->s_sge.total_len = wqe->length; - if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { - /* Header size in 32-bit words. */ - qp->s_hdrwords += hfi1_make_grh(ibp, - &ps->s_txreq->phdr.hdr.u.l.grh, - rdma_ah_read_grh(ah_attr), - qp->s_hdrwords, nwords); - lrh0 = HFI1_LRH_GRH; - ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; - /* - * Don't worry about sending to locally attached multicast - * QPs. It is unspecified by the spec. what happens. - */ - } else { - /* Header size in 32-bit words. */ - lrh0 = HFI1_LRH_BTH; - ohdr = &ps->s_txreq->phdr.hdr.u.oth; - } - if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { - qp->s_hdrwords++; - ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; - bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; - } else { - bth0 = IB_OPCODE_UD_SEND_ONLY << 24; - } - sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; - lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; - if (qp->ibqp.qp_type == IB_QPT_SMI) { - lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ - priv->s_sc = 0xf; - } else { - lrh0 |= (sc5 & 0xf) << 12; - priv->s_sc = sc5; - } + /* Make the appropriate header */ + hfi1_make_ud_req_tbl[priv->hdr_type](qp, ps, qp->s_wqe); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); ps->s_txreq->sde = priv->s_sde; priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); ps->s_txreq->psc = priv->s_sendcontext; - ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0); - ps->s_txreq->phdr.hdr.lrh[1] = - cpu_to_be16(rdma_ah_get_dlid(ah_attr)); - ps->s_txreq->phdr.hdr.lrh[2] = - cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); - if (rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) { - ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE; - } else { - lid = ppd->lid; - if (lid) { - lid |= rdma_ah_get_path_bits(ah_attr) & - ((1 << ppd->lmc) - 1); - ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(lid); - } else { - ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE; - } - } - if (wqe->wr.send_flags & IB_SEND_SOLICITED) - bth0 |= IB_BTH_SOLICITED; - bth0 |= extra_bytes << 20; - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) - bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); - else - bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); - ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); - ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); - /* - * Qkeys with the high order bit set mean use the - * qkey from the QP context instead of the WR (see 10.2.5). - */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.remote_qkey); - ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); /* disarm any ahg */ priv->s_ahg->ahgcount = 0; priv->s_ahg->ahgidx = 0; @@ -505,6 +629,64 @@ int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey) return -1; } +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 7; + u16 len; + u8 l4; + struct hfi1_16b_header hdr; + struct ib_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nwords; + + /* Populate length */ + nwords = ((hfi1_get_16b_padding(hwords << 2, 0) + + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + if (old_grh) { + struct ib_grh *grh = &hdr.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16((hwords - 4 + nwords) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + /* BIT 16 to 19 is TVER. Bit 20 to 22 is pad cnt */ + bth0 = (IB_OPCODE_CNP << 24) | (1 << 16) | + (hfi1_get_16b_padding(hwords << 2, 0) << 20); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn); + ohdr->bth[2] = 0; /* PSN 0 */ + + /* Convert dwords to flits */ + len = (hwords + nwords) >> 1; + hfi1_make_16b_hdr(&hdr, slid, dlid, len, pkey, 1, 0, l4, sc5); + + plen = 2 /* PBC */ + hwords + nwords; + pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (pbuf) + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } +} + void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh) @@ -543,13 +725,9 @@ void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << IB_BECN_SHIFT)); ohdr->bth[2] = 0; /* PSN 0 */ - hdr.lrh[0] = cpu_to_be16(lrh0); - hdr.lrh[1] = cpu_to_be16(dlid); - hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); - hdr.lrh[3] = cpu_to_be16(slid); - + hfi1_make_ib_hdr(&hdr, lrh0, hwords + SIZE_OF_CRC, dlid, slid); plen = 2 /* PBC */ + hwords; - pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); vl = sc_to_vlt(ppd->dd, sc5); pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); if (ctxt) { @@ -668,37 +846,45 @@ static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, void hfi1_ud_rcv(struct hfi1_packet *packet) { struct ib_other_headers *ohdr = packet->ohdr; - int opcode; u32 hdrsize = packet->hlen; struct ib_wc wc; u32 qkey; u32 src_qp; - u16 dlid, pkey; + u16 pkey; int mgmt_pkey_idx = -1; struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct ib_header *hdr = packet->hdr; - u32 rcv_flags = packet->rcv_flags; - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; - bool has_grh = rcv_flags & HFI1_HAS_GRH; - u8 sc5 = hfi1_9B_get_sc5(hdr, packet->rhf); - u32 bth1; - u8 sl_from_sc, sl; - u16 slid; + u8 sc5 = packet->sc; + u8 sl_from_sc; + u8 opcode = packet->opcode; + u8 sl = packet->sl; + u32 dlid = packet->dlid; + u32 slid = packet->slid; u8 extra_bytes; + bool dlid_is_permissive; + bool slid_is_permissive; - qkey = be32_to_cpu(ohdr->u.ud.deth[0]); - src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; - dlid = ib_get_dlid(hdr); - bth1 = be32_to_cpu(ohdr->bth[1]); - slid = ib_get_slid(hdr); - pkey = ib_bth_get_pkey(ohdr); - opcode = ib_bth_get_opcode(ohdr); - sl = ib_get_sl(hdr); - extra_bytes = ib_bth_get_pad(ohdr); - extra_bytes += (SIZE_OF_CRC << 2); + extra_bytes = packet->pad + packet->extra_byte + (SIZE_OF_CRC << 2); + qkey = ib_get_qkey(ohdr); + src_qp = ib_get_sqpn(ohdr); + + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + u32 permissive_lid = + opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B); + + pkey = hfi1_16B_get_pkey(packet->hdr); + dlid_is_permissive = (dlid == permissive_lid); + slid_is_permissive = (slid == permissive_lid); + } else { + hdr = packet->hdr; + pkey = ib_bth_get_pkey(ohdr); + dlid_is_permissive = (dlid == be16_to_cpu(IB_LID_PERMISSIVE)); + slid_is_permissive = (slid == be16_to_cpu(IB_LID_PERMISSIVE)); + } sl_from_sc = ibp->sc_to_sl[sc5]; process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); @@ -716,8 +902,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). */ if (qp->ibqp.qp_num) { - if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || - hdr->lrh[3] == IB_LID_PERMISSIVE)) + if (unlikely(dlid_is_permissive || slid_is_permissive)) goto drop; if (qp->ibqp.qp_num > 1) { if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { @@ -727,10 +912,10 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * for invalid pkeys is optional according to * IB spec (release 1.3, section 10.9.4) */ - hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, - pkey, sl, - src_qp, qp->ibqp.qp_num, - slid, dlid); + hfi1_bad_pkey(ibp, + pkey, sl, + src_qp, qp->ibqp.qp_num, + slid, dlid); return; } } else { @@ -739,12 +924,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) if (mgmt_pkey_idx < 0) goto drop; } - if (unlikely(qkey != qp->qkey)) { - hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, sl, - src_qp, qp->ibqp.qp_num, - slid, dlid); + if (unlikely(qkey != qp->qkey)) /* Silent drop */ return; - } + /* Drop invalid MAD packets (see 13.5.3.1). */ if (unlikely(qp->ibqp.qp_num == 1 && (tlen > 2048 || (sc5 == 0xF)))) @@ -758,8 +940,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) if (tlen > 2048) goto drop; - if ((hdr->lrh[1] == IB_LID_PERMISSIVE || - hdr->lrh[3] == IB_LID_PERMISSIVE) && + if ((dlid_is_permissive || slid_is_permissive) && smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) goto drop; @@ -811,8 +992,19 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) qp->r_flags |= RVT_R_REUSE_SGE; goto drop; } - if (has_grh) { - hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh, + if (packet->grh) { + hfi1_copy_sge(&qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { + struct ib_grh grh; + /* + * Assuming we only created 16B on the send side + * if we want to use large LIDs, since GRH was stripped + * out when creating 16B, add back the GRH here. + */ + hfi1_make_ext_grh(packet, &grh, slid, dlid); + hfi1_copy_sge(&qp->r_sge, &grh, sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { @@ -845,14 +1037,15 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) } else { wc.pkey_index = 0; } - + if (slid_is_permissive) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); wc.slid = slid; wc.sl = sl_from_sc; /* * Save the LMC lower bits if the destination LID is a unicast LID. */ - wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 : + wc.dlid_path_bits = hfi1_check_mcast(dlid) ? 0 : dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index a8f0aa4722f6..6f6c14df383e 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -47,58 +47,28 @@ #include <asm/page.h> #include <linux/string.h> +#include "mmu_rb.h" #include "user_exp_rcv.h" #include "trace.h" -#include "mmu_rb.h" - -struct tid_group { - struct list_head list; - u32 base; - u8 size; - u8 used; - u8 map; -}; - -struct tid_rb_node { - struct mmu_rb_node mmu; - unsigned long phys; - struct tid_group *grp; - u32 rcventry; - dma_addr_t dma_addr; - bool freed; - unsigned npages; - struct page *pages[0]; -}; - -struct tid_pageset { - u16 idx; - u16 count; -}; - -#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) - -#define num_user_pages(vaddr, len) \ - (1 + (((((unsigned long)(vaddr) + \ - (unsigned long)(len) - 1) & PAGE_MASK) - \ - ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT)) static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, struct exp_tid_set *set, struct hfi1_filedata *fd); -static u32 find_phys_blocks(struct page **pages, unsigned npages, - struct tid_pageset *list); -static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr, +static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages); +static int set_rcvarray_entry(struct hfi1_filedata *fd, + struct tid_user_buf *tbuf, u32 rcventry, struct tid_group *grp, - struct page **pages, unsigned npages); + u16 pageidx, unsigned int npages); static int tid_rb_insert(void *arg, struct mmu_rb_node *node); static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, struct tid_rb_node *tnode); static void tid_rb_remove(void *arg, struct mmu_rb_node *node); static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode); -static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr, - struct tid_group *grp, struct tid_pageset *sets, - unsigned start, u16 count, struct page **pages, - u32 *tidlist, unsigned *tididx, unsigned *pmapped); +static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *, + struct tid_group *grp, + unsigned int start, u16 count, + u32 *tidlist, unsigned int *tididx, + unsigned int *pmapped); static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, struct tid_group **grp); static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); @@ -109,96 +79,14 @@ static struct mmu_rb_ops tid_rb_ops = { .invalidate = tid_rb_invalidate }; -static inline u32 rcventry2tidinfo(u32 rcventry) -{ - u32 pair = rcventry & ~0x1; - - return EXP_TID_SET(IDX, pair >> 1) | - EXP_TID_SET(CTRL, 1 << (rcventry - pair)); -} - -static inline void exp_tid_group_init(struct exp_tid_set *set) -{ - INIT_LIST_HEAD(&set->list); - set->count = 0; -} - -static inline void tid_group_remove(struct tid_group *grp, - struct exp_tid_set *set) -{ - list_del_init(&grp->list); - set->count--; -} - -static inline void tid_group_add_tail(struct tid_group *grp, - struct exp_tid_set *set) -{ - list_add_tail(&grp->list, &set->list); - set->count++; -} - -static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) -{ - struct tid_group *grp = - list_first_entry(&set->list, struct tid_group, list); - list_del_init(&grp->list); - set->count--; - return grp; -} - -static inline void tid_group_move(struct tid_group *group, - struct exp_tid_set *s1, - struct exp_tid_set *s2) -{ - tid_group_remove(group, s1); - tid_group_add_tail(group, s2); -} - -int hfi1_user_exp_rcv_grp_init(struct hfi1_filedata *fd) -{ - struct hfi1_ctxtdata *uctxt = fd->uctxt; - struct hfi1_devdata *dd = fd->dd; - u32 tidbase; - u32 i; - struct tid_group *grp, *gptr; - - exp_tid_group_init(&uctxt->tid_group_list); - exp_tid_group_init(&uctxt->tid_used_list); - exp_tid_group_init(&uctxt->tid_full_list); - - tidbase = uctxt->expected_base; - for (i = 0; i < uctxt->expected_count / - dd->rcv_entries.group_size; i++) { - grp = kzalloc(sizeof(*grp), GFP_KERNEL); - if (!grp) - goto grp_failed; - - grp->size = dd->rcv_entries.group_size; - grp->base = tidbase; - tid_group_add_tail(grp, &uctxt->tid_group_list); - tidbase += dd->rcv_entries.group_size; - } - - return 0; - -grp_failed: - list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list, - list) { - list_del_init(&grp->list); - kfree(grp); - } - - return -ENOMEM; -} - /* * Initialize context and file private data needed for Expected * receive caching. This needs to be done after the context has * been configured with the eager/expected RcvEntry counts. */ -int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd) +int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) { - struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; int ret = 0; @@ -266,18 +154,6 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd) return ret; } -void hfi1_user_exp_rcv_grp_free(struct hfi1_ctxtdata *uctxt) -{ - struct tid_group *grp, *gptr; - - list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list, - list) { - list_del_init(&grp->list); - kfree(grp); - } - hfi1_clear_tids(uctxt); -} - void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) { struct hfi1_ctxtdata *uctxt = fd->uctxt; @@ -302,21 +178,90 @@ void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) fd->entry_to_rb = NULL; } -/* - * Write an "empty" RcvArray entry. - * This function exists so the TID registaration code can use it - * to write to unused/unneeded entries and still take advantage - * of the WC performance improvements. The HFI will ignore this - * write to the RcvArray entry. +/** + * Release pinned receive buffer pages. + * + * @mapped - true if the pages have been DMA mapped. false otherwise. + * @idx - Index of the first page to unpin. + * @npages - No of pages to unpin. + * + * If the pages have been DMA mapped (indicated by mapped parameter), their + * info will be passed via a struct tid_rb_node. If they haven't been mapped, + * their info will be passed via a struct tid_user_buf. + */ +static void unpin_rcv_pages(struct hfi1_filedata *fd, + struct tid_user_buf *tidbuf, + struct tid_rb_node *node, + unsigned int idx, + unsigned int npages, + bool mapped) +{ + struct page **pages; + struct hfi1_devdata *dd = fd->uctxt->dd; + + if (mapped) { + pci_unmap_single(dd->pcidev, node->dma_addr, + node->mmu.len, PCI_DMA_FROMDEVICE); + pages = &node->pages[idx]; + } else { + pages = &tidbuf->pages[idx]; + } + hfi1_release_user_pages(fd->mm, pages, npages, mapped); + fd->tid_n_pinned -= npages; +} + +/** + * Pin receive buffer pages. */ -static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) +static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) { + int pinned; + unsigned int npages; + unsigned long vaddr = tidbuf->vaddr; + struct page **pages = NULL; + struct hfi1_devdata *dd = fd->uctxt->dd; + + /* Get the number of pages the user buffer spans */ + npages = num_user_pages(vaddr, tidbuf->length); + if (!npages) + return -EINVAL; + + if (npages > fd->uctxt->expected_count) { + dd_dev_err(dd, "Expected buffer too big\n"); + return -EINVAL; + } + + /* Verify that access is OK for the user buffer */ + if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, + npages * PAGE_SIZE)) { + dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, npages); + return -EFAULT; + } + /* Allocate the array of struct page pointers needed for pinning */ + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + /* - * Doing the WC fill writes only makes sense if the device is - * present and the RcvArray has been mapped as WC memory. + * Pin all the pages of the user buffer. If we can't pin all the + * pages, accept the amount pinned so far and program only that. + * User space knows how to deal with partially programmed buffers. */ - if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) - writeq(0, dd->rcvarray_wc + (index * 8)); + if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) { + kfree(pages); + return -ENOMEM; + } + + pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages); + if (pinned <= 0) { + kfree(pages); + return pinned; + } + tidbuf->pages = pages; + tidbuf->npages = npages; + fd->tid_n_pinned += pinned; + return pinned; } /* @@ -374,62 +319,33 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, int ret = 0, need_group = 0, pinned; struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; - unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets, + unsigned int ngroups, pageidx = 0, pageset_count, tididx = 0, mapped, mapped_pages = 0; - unsigned long vaddr = tinfo->vaddr; - struct page **pages = NULL; u32 *tidlist = NULL; - struct tid_pageset *pagesets = NULL; + struct tid_user_buf *tidbuf; - /* Get the number of pages the user buffer spans */ - npages = num_user_pages(vaddr, tinfo->length); - if (!npages) - return -EINVAL; - - if (npages > uctxt->expected_count) { - dd_dev_err(dd, "Expected buffer too big\n"); - return -EINVAL; - } - - /* Verify that access is OK for the user buffer */ - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, - npages * PAGE_SIZE)) { - dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", - (void *)vaddr, npages); - return -EFAULT; - } - - pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets), - GFP_KERNEL); - if (!pagesets) + tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL); + if (!tidbuf) return -ENOMEM; - /* Allocate the array of struct page pointers needed for pinning */ - pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto bail; - } - - /* - * Pin all the pages of the user buffer. If we can't pin all the - * pages, accept the amount pinned so far and program only that. - * User space knows how to deal with partially programmed buffers. - */ - if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) { - ret = -ENOMEM; - goto bail; + tidbuf->vaddr = tinfo->vaddr; + tidbuf->length = tinfo->length; + tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), + GFP_KERNEL); + if (!tidbuf->psets) { + kfree(tidbuf); + return -ENOMEM; } - pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages); + pinned = pin_rcv_pages(fd, tidbuf); if (pinned <= 0) { - ret = pinned; - goto bail; + kfree(tidbuf->psets); + kfree(tidbuf); + return pinned; } - fd->tid_n_pinned += npages; /* Find sets of physically contiguous pages */ - npagesets = find_phys_blocks(pages, pinned, pagesets); + tidbuf->n_psets = find_phys_blocks(tidbuf, pinned); /* * We don't need to access this under a lock since tid_used is per @@ -437,10 +353,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, * and hfi1_user_exp_rcv_setup() at the same time. */ spin_lock(&fd->tid_lock); - if (fd->tid_used + npagesets > fd->tid_limit) + if (fd->tid_used + tidbuf->n_psets > fd->tid_limit) pageset_count = fd->tid_limit - fd->tid_used; else - pageset_count = npagesets; + pageset_count = tidbuf->n_psets; spin_unlock(&fd->tid_lock); if (!pageset_count) @@ -468,9 +384,9 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, struct tid_group *grp = tid_group_pop(&uctxt->tid_group_list); - ret = program_rcvarray(fd, vaddr, grp, pagesets, + ret = program_rcvarray(fd, tidbuf, grp, pageidx, dd->rcv_entries.group_size, - pages, tidlist, &tididx, &mapped); + tidlist, &tididx, &mapped); /* * If there was a failure to program the RcvArray * entries for the entire group, reset the grp fields @@ -514,8 +430,8 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned use = min_t(unsigned, pageset_count - pageidx, grp->size - grp->used); - ret = program_rcvarray(fd, vaddr, grp, pagesets, - pageidx, use, pages, tidlist, + ret = program_rcvarray(fd, tidbuf, grp, + pageidx, use, tidlist, &tididx, &mapped); if (ret < 0) { hfi1_cdbg(TID, @@ -575,16 +491,14 @@ nomem: * If not everything was mapped (due to insufficient RcvArray entries, * for example), unpin all unmapped pages so we can pin them nex time. */ - if (mapped_pages != pinned) { - hfi1_release_user_pages(fd->mm, &pages[mapped_pages], - pinned - mapped_pages, - false); - fd->tid_n_pinned -= pinned - mapped_pages; - } + if (mapped_pages != pinned) + unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, + (pinned - mapped_pages), false); bail: - kfree(pagesets); - kfree(pages); + kfree(tidbuf->psets); kfree(tidlist); + kfree(tidbuf->pages); + kfree(tidbuf); return ret > 0 ? 0 : ret; } @@ -674,11 +588,12 @@ int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd, return ret; } -static u32 find_phys_blocks(struct page **pages, unsigned npages, - struct tid_pageset *list) +static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages) { unsigned pagecount, pageidx, setcount = 0, i; unsigned long pfn, this_pfn; + struct page **pages = tidbuf->pages; + struct tid_pageset *list = tidbuf->psets; if (!npages) return 0; @@ -741,13 +656,13 @@ static u32 find_phys_blocks(struct page **pages, unsigned npages, /** * program_rcvarray() - program an RcvArray group with receive buffers * @fd: filedata pointer - * @vaddr: starting user virtual address + * @tbuf: pointer to struct tid_user_buf that has the user buffer starting + * virtual address, buffer length, page pointers, pagesets (array of + * struct tid_pageset holding information on physically contiguous + * chunks from the user buffer), and other fields. * @grp: RcvArray group - * @sets: array of struct tid_pageset holding information on physically - * contiguous chunks from the user buffer * @start: starting index into sets array * @count: number of struct tid_pageset's to program - * @pages: an array of struct page * for the user buffer * @tidlist: the array of u32 elements when the information about the * programmed RcvArray entries is to be encoded. * @tididx: starting offset into tidlist @@ -765,11 +680,11 @@ static u32 find_phys_blocks(struct page **pages, unsigned npages, * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or * number of RcvArray entries programmed. */ -static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr, +static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf, struct tid_group *grp, - struct tid_pageset *sets, - unsigned start, u16 count, struct page **pages, - u32 *tidlist, unsigned *tididx, unsigned *pmapped) + unsigned int start, u16 count, + u32 *tidlist, unsigned int *tididx, + unsigned int *pmapped) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; @@ -808,11 +723,11 @@ static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr, } rcventry = grp->base + useidx; - npages = sets[setidx].count; - pageidx = sets[setidx].idx; + npages = tbuf->psets[setidx].count; + pageidx = tbuf->psets[setidx].idx; - ret = set_rcvarray_entry(fd, vaddr + (pageidx * PAGE_SIZE), - rcventry, grp, pages + pageidx, + ret = set_rcvarray_entry(fd, tbuf, + rcventry, grp, pageidx, npages); if (ret) return ret; @@ -833,15 +748,17 @@ static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr, return idx; } -static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr, +static int set_rcvarray_entry(struct hfi1_filedata *fd, + struct tid_user_buf *tbuf, u32 rcventry, struct tid_group *grp, - struct page **pages, unsigned npages) + u16 pageidx, unsigned int npages) { int ret; struct hfi1_ctxtdata *uctxt = fd->uctxt; struct tid_rb_node *node; struct hfi1_devdata *dd = uctxt->dd; dma_addr_t phys; + struct page **pages = tbuf->pages + pageidx; /* * Allocate the node first so we can handle a potential @@ -862,7 +779,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr, return -EFAULT; } - node->mmu.addr = vaddr; + node->mmu.addr = tbuf->vaddr + (pageidx * PAGE_SIZE); node->mmu.len = npages * PAGE_SIZE; node->phys = page_to_phys(pages[0]); node->npages = npages; @@ -935,17 +852,13 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) node->npages, node->mmu.addr, node->phys, node->dma_addr); - hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0); /* * Make sure device has seen the write before we unpin the * pages. */ - flush_wc(); + hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0); - pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len, - PCI_DMA_FROMDEVICE); - hfi1_release_user_pages(fd->mm, node->pages, node->npages, true); - fd->tid_n_pinned -= node->npages; + unpin_rcv_pages(fd, NULL, node, 0, node->npages, true); node->grp->used--; node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h index 5250c897298d..e383cc01a2bf 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -49,30 +49,44 @@ #include "hfi.h" -#define EXP_TID_TIDLEN_MASK 0x7FFULL -#define EXP_TID_TIDLEN_SHIFT 0 -#define EXP_TID_TIDCTRL_MASK 0x3ULL -#define EXP_TID_TIDCTRL_SHIFT 20 -#define EXP_TID_TIDIDX_MASK 0x3FFULL -#define EXP_TID_TIDIDX_SHIFT 22 -#define EXP_TID_GET(tid, field) \ - (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK) +#include "exp_rcv.h" -#define EXP_TID_SET(field, value) \ - (((value) & EXP_TID_TID##field##_MASK) << \ - EXP_TID_TID##field##_SHIFT) -#define EXP_TID_CLEAR(tid, field) ({ \ - (tid) &= ~(EXP_TID_TID##field##_MASK << \ - EXP_TID_TID##field##_SHIFT); \ - }) -#define EXP_TID_RESET(tid, field, value) do { \ - EXP_TID_CLEAR(tid, field); \ - (tid) |= EXP_TID_SET(field, (value)); \ - } while (0) +struct tid_pageset { + u16 idx; + u16 count; +}; -void hfi1_user_exp_rcv_grp_free(struct hfi1_ctxtdata *uctxt); -int hfi1_user_exp_rcv_grp_init(struct hfi1_filedata *fd); -int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd); +struct tid_user_buf { + unsigned long vaddr; + unsigned long length; + unsigned int npages; + struct page **pages; + struct tid_pageset *psets; + unsigned int n_psets; +}; + +struct tid_rb_node { + struct mmu_rb_node mmu; + unsigned long phys; + struct tid_group *grp; + u32 rcventry; + dma_addr_t dma_addr; + bool freed; + unsigned int npages; + struct page *pages[0]; +}; + +static inline int num_user_pages(unsigned long addr, + unsigned long len) +{ + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd); int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, struct hfi1_tid_info *tinfo); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index d55339f5d73b..c0c0e0445cbf 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -64,224 +64,20 @@ #include "hfi.h" #include "sdma.h" +#include "mmu_rb.h" #include "user_sdma.h" #include "verbs.h" /* for the headers */ #include "common.h" /* for struct hfi1_tid_info */ #include "trace.h" -#include "mmu_rb.h" static uint hfi1_sdma_comp_ring_size = 128; module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); -/* The maximum number of Data io vectors per message/request */ -#define MAX_VECTORS_PER_REQ 8 -/* - * Maximum number of packet to send from each message/request - * before moving to the next one. - */ -#define MAX_PKTS_PER_QUEUE 16 - -#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) - -#define req_opcode(x) \ - (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) -#define req_version(x) \ - (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) -#define req_iovcnt(x) \ - (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) - -/* Number of BTH.PSN bits used for sequence number in expected rcvs */ -#define BTH_SEQ_MASK 0x7ffull - -/* - * Define fields in the KDETH header so we can update the header - * template. - */ -#define KDETH_OFFSET_SHIFT 0 -#define KDETH_OFFSET_MASK 0x7fff -#define KDETH_OM_SHIFT 15 -#define KDETH_OM_MASK 0x1 -#define KDETH_TID_SHIFT 16 -#define KDETH_TID_MASK 0x3ff -#define KDETH_TIDCTRL_SHIFT 26 -#define KDETH_TIDCTRL_MASK 0x3 -#define KDETH_INTR_SHIFT 28 -#define KDETH_INTR_MASK 0x1 -#define KDETH_SH_SHIFT 29 -#define KDETH_SH_MASK 0x1 -#define KDETH_HCRC_UPPER_SHIFT 16 -#define KDETH_HCRC_UPPER_MASK 0xff -#define KDETH_HCRC_LOWER_SHIFT 24 -#define KDETH_HCRC_LOWER_MASK 0xff - -#define AHG_KDETH_INTR_SHIFT 12 -#define AHG_KDETH_SH_SHIFT 13 - -#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) -#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) - -#define KDETH_GET(val, field) \ - (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) -#define KDETH_SET(dw, field, val) do { \ - u32 dwval = le32_to_cpu(dw); \ - dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ - dwval |= (((val) & KDETH_##field##_MASK) << \ - KDETH_##field##_SHIFT); \ - dw = cpu_to_le32(dwval); \ - } while (0) - -#define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \ - do { \ - if ((idx) < ARRAY_SIZE((arr))) \ - (arr)[(idx++)] = sdma_build_ahg_descriptor( \ - (__force u16)(value), (dw), (bit), \ - (width)); \ - else \ - return -ERANGE; \ - } while (0) - -/* KDETH OM multipliers and switch over point */ -#define KDETH_OM_SMALL 4 -#define KDETH_OM_SMALL_SHIFT 2 -#define KDETH_OM_LARGE 64 -#define KDETH_OM_LARGE_SHIFT 6 -#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) - -/* Tx request flag bits */ -#define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */ -#define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */ - -/* SDMA request flag bits */ -#define SDMA_REQ_FOR_THREAD 1 -#define SDMA_REQ_SEND_DONE 2 -#define SDMA_REQ_HAS_ERROR 3 -#define SDMA_REQ_DONE_ERROR 4 - -#define SDMA_PKT_Q_INACTIVE BIT(0) -#define SDMA_PKT_Q_ACTIVE BIT(1) -#define SDMA_PKT_Q_DEFERRED BIT(2) - -/* - * Maximum retry attempts to submit a TX request - * before putting the process to sleep. - */ -#define MAX_DEFER_RETRY_COUNT 1 - static unsigned initial_pkt_count = 8; -#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ - -struct sdma_mmu_node; - -struct user_sdma_iovec { - struct list_head list; - struct iovec iov; - /* number of pages in this vector */ - unsigned npages; - /* array of pinned pages for this vector */ - struct page **pages; - /* - * offset into the virtual address space of the vector at - * which we last left off. - */ - u64 offset; - struct sdma_mmu_node *node; -}; - -struct sdma_mmu_node { - struct mmu_rb_node rb; - struct hfi1_user_sdma_pkt_q *pq; - atomic_t refcount; - struct page **pages; - unsigned npages; -}; - -/* evict operation argument */ -struct evict_data { - u32 cleared; /* count evicted so far */ - u32 target; /* target count to evict */ -}; - -struct user_sdma_request { - struct sdma_req_info info; - struct hfi1_user_sdma_pkt_q *pq; - struct hfi1_user_sdma_comp_q *cq; - /* This is the original header from user space */ - struct hfi1_pkt_header hdr; - /* - * Pointer to the SDMA engine for this request. - * Since different request could be on different VLs, - * each request will need it's own engine pointer. - */ - struct sdma_engine *sde; - s8 ahg_idx; - u32 ahg[9]; - /* - * KDETH.Offset (Eager) field - * We need to remember the initial value so the headers - * can be updated properly. - */ - u32 koffset; - /* - * KDETH.OFFSET (TID) field - * The offset can cover multiple packets, depending on the - * size of the TID entry. - */ - u32 tidoffset; - /* - * We copy the iovs for this request (based on - * info.iovcnt). These are only the data vectors - */ - unsigned data_iovs; - /* total length of the data in the request */ - u32 data_len; - /* progress index moving along the iovs array */ - unsigned iov_idx; - struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; - /* number of elements copied to the tids array */ - u16 n_tids; - /* TID array values copied from the tid_iov vector */ - u32 *tids; - u16 tididx; - u32 sent; - u64 seqnum; - u64 seqcomp; - u64 seqsubmitted; - struct list_head txps; - unsigned long flags; - /* status of the last txreq completed */ - int status; -}; - -/* - * A single txreq could span up to 3 physical pages when the MTU - * is sufficiently large (> 4K). Each of the IOV pointers also - * needs it's own set of flags so the vector has been handled - * independently of each other. - */ -struct user_sdma_txreq { - /* Packet header for the txreq */ - struct hfi1_pkt_header hdr; - struct sdma_txreq txreq; - struct list_head list; - struct user_sdma_request *req; - u16 flags; - unsigned busycount; - u64 seqnum; -}; - -#define SDMA_DBG(req, fmt, ...) \ - hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ - (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ - ##__VA_ARGS__) -#define SDMA_Q_DBG(pq, fmt, ...) \ - hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \ - (pq)->subctxt, ##__VA_ARGS__) - static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts); -static int num_user_pages(const struct iovec *iov); static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); @@ -307,7 +103,8 @@ static int defer_packet_queue( struct sdma_engine *sde, struct iowait *wait, struct sdma_txreq *txreq, - unsigned int seq); + uint seq, + bool pkts_sent); static void activate_packet_queue(struct iowait *wait, int reason); static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, unsigned long len); @@ -329,7 +126,8 @@ static int defer_packet_queue( struct sdma_engine *sde, struct iowait *wait, struct sdma_txreq *txreq, - unsigned seq) + uint seq, + bool pkts_sent) { struct hfi1_user_sdma_pkt_q *pq = container_of(wait, struct hfi1_user_sdma_pkt_q, busy); @@ -349,7 +147,7 @@ static int defer_packet_queue( xchg(&pq->state, SDMA_PKT_Q_DEFERRED); write_seqlock(&dev->iowait_lock); if (list_empty(&pq->busy.list)) - list_add_tail(&pq->busy.list, &sde->dmawait); + iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); write_sequnlock(&dev->iowait_lock); return -EBUSY; eagain: @@ -364,13 +162,6 @@ static void activate_packet_queue(struct iowait *wait, int reason) wake_up(&wait->wait_dma); }; -static void sdma_kmem_cache_ctor(void *obj) -{ - struct user_sdma_txreq *tx = obj; - - memset(tx, 0, sizeof(*tx)); -} - int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct hfi1_filedata *fd) { @@ -379,7 +170,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct hfi1_devdata *dd; struct hfi1_user_sdma_comp_q *cq; struct hfi1_user_sdma_pkt_q *pq; - unsigned long flags; if (!uctxt || !fd) return -EBADF; @@ -393,7 +183,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, if (!pq) return -ENOMEM; - INIT_LIST_HEAD(&pq->list); pq->dd = dd; pq->ctxt = uctxt->ctxt; pq->subctxt = fd->subctxt; @@ -426,7 +215,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, sizeof(struct user_sdma_txreq), L1_CACHE_BYTES, SLAB_HWCACHE_ALIGN, - sdma_kmem_cache_ctor); + NULL); if (!pq->txreq_cache) { dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", uctxt->ctxt); @@ -454,10 +243,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, fd->pq = pq; fd->cq = cq; - spin_lock_irqsave(&uctxt->sdma_qlock, flags); - list_add(&pq->list, &uctxt->sdma_queues); - spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); - return 0; pq_mmu_fail: @@ -476,22 +261,17 @@ pq_reqs_nomem: return ret; } -int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) +int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) { - struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_user_sdma_pkt_q *pq; - unsigned long flags; - hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, - uctxt->ctxt, fd->subctxt); + trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt); + pq = fd->pq; if (pq) { if (pq->handler) hfi1_mmu_rb_unregister(pq->handler); - spin_lock_irqsave(&uctxt->sdma_qlock, flags); - if (!list_empty(&pq->list)) - list_del_init(&pq->list); - spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); iowait_sdma_drain(&pq->busy); /* Wait until all requests have been freed. */ wait_event_interruptible( @@ -546,6 +326,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, struct sdma_req_info info; struct user_sdma_request *req; u8 opcode, sc, vl; + u16 pkey; + u32 slid; int req_queued = 0; u16 dlid; u32 selector; @@ -567,7 +349,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, (u16 *)&info); - if (info.comp_idx >= hfi1_sdma_comp_ring_size) { hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Invalid comp index", @@ -604,15 +385,23 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, /* * All safety checks have been done and this request has been claimed. */ - hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit, - uctxt->ctxt, fd->subctxt, info.comp_idx); + trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx); req = pq->reqs + info.comp_idx; - memset(req, 0, sizeof(*req)); req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ + req->data_len = 0; req->pq = pq; req->cq = cq; req->status = -1; req->ahg_idx = -1; + req->iov_idx = 0; + req->sent = 0; + req->seqnum = 0; + req->seqcomp = 0; + req->seqsubmitted = 0; + req->tids = NULL; + req->done = 0; + req->has_error = 0; INIT_LIST_HEAD(&req->txps); memcpy(&req->info, &info, sizeof(info)); @@ -671,8 +460,9 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, } /* Checking P_KEY for requests from user-space */ - if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc, - PKEY_CHECK_INVALID)) { + pkey = (u16)be32_to_cpu(req->hdr.bth[0]); + slid = be16_to_cpu(req->hdr.lrh[3]); + if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) { ret = -EINVAL; goto free_req; } @@ -696,24 +486,27 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? KDETH_OM_LARGE : KDETH_OM_SMALL); - SDMA_DBG(req, "Initial TID offset %u", req->tidoffset); + trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx, req->tidoffset); idx++; /* Save all the IO vector structures */ for (i = 0; i < req->data_iovs; i++) { + req->iovs[i].offset = 0; INIT_LIST_HEAD(&req->iovs[i].list); memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(req->iovs[i].iov)); ret = pin_vector_pages(req, &req->iovs[i]); if (ret) { + req->data_iovs = i; req->status = ret; goto free_req; } req->data_len += req->iovs[i].iov.iov_len; } - SDMA_DBG(req, "total data length %u", req->data_len); - + trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx, req->data_len); if (pcount > req->info.npkts) pcount = req->info.npkts; /* @@ -749,6 +542,7 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, } req->tids = tmp; req->n_tids = ntids; + req->tididx = 0; idx++; } @@ -791,12 +585,12 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, * request have been submitted to the SDMA engine. However, it * will not wait for send completions. */ - while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) { + while (req->seqsubmitted != req->info.npkts) { ret = user_sdma_send_pkts(req, pcount); if (ret < 0) { if (ret != -EBUSY) { req->status = ret; - set_bit(SDMA_REQ_DONE_ERROR, &req->flags); + WRITE_ONCE(req->has_error, 1); if (ACCESS_ONCE(req->seqcomp) == req->seqsubmitted - 1) goto free_req; @@ -867,7 +661,11 @@ static inline u32 compute_data_length(struct user_sdma_request *req, } else { len = min(req->data_len - req->sent, (u32)req->info.fragsize); } - SDMA_DBG(req, "Data Length = %u", len); + trace_hfi1_sdma_user_compute_length(req->pq->dd, + req->pq->ctxt, + req->pq->subctxt, + req->info.comp_idx, + len); return len; } @@ -884,6 +682,84 @@ static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); } +static int user_sdma_txadd_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + u32 datalen) +{ + int ret; + u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); + u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen)); + struct hfi1_user_sdma_pkt_q *pq = req->pq; + + /* + * Copy the request header into the tx header + * because the HW needs a cacheline-aligned + * address. + * This copy can be optimized out if the hdr + * member of user_sdma_request were also + * cacheline aligned. + */ + memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); + tx->hdr.pbc[0] = cpu_to_le16(pbclen); + } + ret = check_header_template(req, &tx->hdr, lrhlen, datalen); + if (ret) + return ret; + ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY, + sizeof(tx->hdr) + datalen, req->ahg_idx, + 0, NULL, 0, user_sdma_txreq_cb); + if (ret) + return ret; + ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr)); + if (ret) + sdma_txclean(pq->dd, &tx->txreq); + return ret; +} + +static int user_sdma_txadd(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + struct user_sdma_iovec *iovec, u32 datalen, + u32 *queued_ptr, u32 *data_sent_ptr, + u64 *iov_offset_ptr) +{ + int ret; + unsigned int pageidx, len; + unsigned long base, offset; + u64 iov_offset = *iov_offset_ptr; + u32 queued = *queued_ptr, data_sent = *data_sent_ptr; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + + base = (unsigned long)iovec->iov.iov_base; + offset = offset_in_page(base + iovec->offset + iov_offset); + pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >> + PAGE_SHIFT); + len = offset + req->info.fragsize > PAGE_SIZE ? + PAGE_SIZE - offset : req->info.fragsize; + len = min((datalen - queued), len); + ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx], + offset, len); + if (ret) { + SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret); + return ret; + } + iov_offset += len; + queued += len; + data_sent += len; + if (unlikely(queued < datalen && pageidx == iovec->npages && + req->iov_idx < req->data_iovs - 1)) { + iovec->offset += iov_offset; + iovec = &req->iovs[++req->iov_idx]; + iov_offset = 0; + } + + *queued_ptr = queued; + *data_sent_ptr = data_sent; + *iov_offset_ptr = iov_offset; + return ret; +} + static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) { int ret = 0, count; @@ -898,10 +774,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) pq = req->pq; /* If tx completion has reported an error, we are done. */ - if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { - set_bit(SDMA_REQ_DONE_ERROR, &req->flags); + if (READ_ONCE(req->has_error)) return -EFAULT; - } /* * Check if we might have sent the entire request already @@ -924,10 +798,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) * with errors. If so, we are not going to process any * more packets from this request. */ - if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { - set_bit(SDMA_REQ_DONE_ERROR, &req->flags); + if (READ_ONCE(req->has_error)) return -EFAULT; - } tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); if (!tx) @@ -984,39 +856,9 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) if (req->ahg_idx >= 0) { if (!req->seqnum) { - u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); - u32 lrhlen = get_lrh_len(req->hdr, - pad_len(datalen)); - /* - * Copy the request header into the tx header - * because the HW needs a cacheline-aligned - * address. - * This copy can be optimized out if the hdr - * member of user_sdma_request were also - * cacheline aligned. - */ - memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); - if (PBC2LRH(pbclen) != lrhlen) { - pbclen = (pbclen & 0xf000) | - LRH2PBC(lrhlen); - tx->hdr.pbc[0] = cpu_to_le16(pbclen); - } - ret = check_header_template(req, &tx->hdr, - lrhlen, datalen); + ret = user_sdma_txadd_ahg(req, tx, datalen); if (ret) goto free_tx; - ret = sdma_txinit_ahg(&tx->txreq, - SDMA_TXREQ_F_AHG_COPY, - sizeof(tx->hdr) + datalen, - req->ahg_idx, 0, NULL, 0, - user_sdma_txreq_cb); - if (ret) - goto free_tx; - ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, - &tx->hdr, - sizeof(tx->hdr)); - if (ret) - goto free_txreq; } else { int changes; @@ -1024,11 +866,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) datalen); if (changes < 0) goto free_tx; - sdma_txinit_ahg(&tx->txreq, - SDMA_TXREQ_F_USE_AHG, - datalen, req->ahg_idx, changes, - req->ahg, sizeof(req->hdr), - user_sdma_txreq_cb); } } else { ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + @@ -1052,35 +889,10 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) */ while (queued < datalen && (req->sent + data_sent) < req->data_len) { - unsigned long base, offset; - unsigned pageidx, len; - - base = (unsigned long)iovec->iov.iov_base; - offset = offset_in_page(base + iovec->offset + - iov_offset); - pageidx = (((iovec->offset + iov_offset + - base) - (base & PAGE_MASK)) >> PAGE_SHIFT); - len = offset + req->info.fragsize > PAGE_SIZE ? - PAGE_SIZE - offset : req->info.fragsize; - len = min((datalen - queued), len); - ret = sdma_txadd_page(pq->dd, &tx->txreq, - iovec->pages[pageidx], - offset, len); - if (ret) { - SDMA_DBG(req, "SDMA txreq add page failed %d\n", - ret); + ret = user_sdma_txadd(req, tx, iovec, datalen, + &queued, &data_sent, &iov_offset); + if (ret) goto free_txreq; - } - iov_offset += len; - queued += len; - data_sent += len; - if (unlikely(queued < datalen && - pageidx == iovec->npages && - req->iov_idx < req->data_iovs - 1)) { - iovec->offset += iov_offset; - iovec = &req->iovs[++req->iov_idx]; - iov_offset = 0; - } } /* * The txreq was submitted successfully so we can update @@ -1105,7 +917,7 @@ dosend: ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); req->seqsubmitted += count; if (req->seqsubmitted == req->info.npkts) { - set_bit(SDMA_REQ_SEND_DONE, &req->flags); + WRITE_ONCE(req->done, 1); /* * The txreq has already been submitted to the HW queue * so we can free the AHG entry now. Corruption will not @@ -1124,19 +936,6 @@ free_tx: return ret; } -/* - * How many pages in this iovec element? - */ -static inline int num_user_pages(const struct iovec *iov) -{ - const unsigned long addr = (unsigned long)iov->iov_base; - const unsigned long len = iov->iov_len; - const unsigned long spage = addr & PAGE_MASK; - const unsigned long epage = (addr + len - 1) & PAGE_MASK; - - return 1 + ((epage - spage) >> PAGE_SHIFT); -} - static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) { struct evict_data evict_data; @@ -1147,22 +946,82 @@ static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) return evict_data.cleared; } +static int pin_sdma_pages(struct user_sdma_request *req, + struct user_sdma_iovec *iovec, + struct sdma_mmu_node *node, + int npages) +{ + int pinned, cleared; + struct page **pages; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + SDMA_DBG(req, "Failed page array alloc"); + return -ENOMEM; + } + memcpy(pages, node->pages, node->npages * sizeof(*pages)); + + npages -= node->npages; +retry: + if (!hfi1_can_pin_pages(pq->dd, pq->mm, + atomic_read(&pq->n_locked), npages)) { + cleared = sdma_cache_evict(pq, npages); + if (cleared >= npages) + goto retry; + } + pinned = hfi1_acquire_user_pages(pq->mm, + ((unsigned long)iovec->iov.iov_base + + (node->npages * PAGE_SIZE)), npages, 0, + pages + node->npages); + if (pinned < 0) { + kfree(pages); + return pinned; + } + if (pinned != npages) { + unpin_vector_pages(pq->mm, pages, node->npages, pinned); + return -EFAULT; + } + kfree(node->pages); + node->rb.len = iovec->iov.iov_len; + node->pages = pages; + atomic_add(pinned, &pq->n_locked); + return pinned; +} + +static void unpin_sdma_pages(struct sdma_mmu_node *node) +{ + if (node->npages) { + unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); + atomic_sub(node->npages, &node->pq->n_locked); + } +} + static int pin_vector_pages(struct user_sdma_request *req, struct user_sdma_iovec *iovec) { - int ret = 0, pinned, npages, cleared; - struct page **pages; + int ret = 0, pinned, npages; struct hfi1_user_sdma_pkt_q *pq = req->pq; struct sdma_mmu_node *node = NULL; struct mmu_rb_node *rb_node; - - rb_node = hfi1_mmu_rb_extract(pq->handler, - (unsigned long)iovec->iov.iov_base, - iovec->iov.iov_len); - if (rb_node) + struct iovec *iov; + bool extracted; + + extracted = + hfi1_mmu_rb_remove_unless_exact(pq->handler, + (unsigned long) + iovec->iov.iov_base, + iovec->iov.iov_len, &rb_node); + if (rb_node) { node = container_of(rb_node, struct sdma_mmu_node, rb); - else - rb_node = NULL; + if (!extracted) { + atomic_inc(&node->refcount); + iovec->pages = node->pages; + iovec->npages = node->npages; + iovec->node = node; + return 0; + } + } if (!node) { node = kzalloc(sizeof(*node), GFP_KERNEL); @@ -1174,46 +1033,16 @@ static int pin_vector_pages(struct user_sdma_request *req, atomic_set(&node->refcount, 0); } - npages = num_user_pages(&iovec->iov); + iov = &iovec->iov; + npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len); if (node->npages < npages) { - pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - SDMA_DBG(req, "Failed page array alloc"); - ret = -ENOMEM; - goto bail; - } - memcpy(pages, node->pages, node->npages * sizeof(*pages)); - - npages -= node->npages; - -retry: - if (!hfi1_can_pin_pages(pq->dd, pq->mm, - atomic_read(&pq->n_locked), npages)) { - cleared = sdma_cache_evict(pq, npages); - if (cleared >= npages) - goto retry; - } - pinned = hfi1_acquire_user_pages(pq->mm, - ((unsigned long)iovec->iov.iov_base + - (node->npages * PAGE_SIZE)), npages, 0, - pages + node->npages); + pinned = pin_sdma_pages(req, iovec, node, npages); if (pinned < 0) { - kfree(pages); ret = pinned; goto bail; } - if (pinned != npages) { - unpin_vector_pages(pq->mm, pages, node->npages, - pinned); - ret = -EFAULT; - goto bail; - } - kfree(node->pages); - node->rb.len = iovec->iov.iov_len; - node->pages = pages; node->npages += pinned; npages = node->npages; - atomic_add(pinned, &pq->n_locked); } iovec->pages = node->pages; iovec->npages = npages; @@ -1221,14 +1050,12 @@ retry: ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); if (ret) { - atomic_sub(node->npages, &pq->n_locked); iovec->node = NULL; goto bail; } return 0; bail: - if (rb_node) - unpin_vector_pages(pq->mm, node->pages, 0, node->npages); + unpin_sdma_pages(node); kfree(node); return ret; } @@ -1408,9 +1235,10 @@ static int set_txreq_header(struct user_sdma_request *req, * Set the KDETH.OFFSET and KDETH.OM based on size of * transfer. */ - SDMA_DBG(req, "TID offset %ubytes %uunits om%u", - req->tidoffset, req->tidoffset >> omfactor, - omfactor != KDETH_OM_SMALL_SHIFT); + trace_hfi1_sdma_user_tid_info( + pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx, + req->tidoffset, req->tidoffset >> omfactor, + omfactor != KDETH_OM_SMALL_SHIFT); KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, req->tidoffset >> omfactor); KDETH_SET(hdr->kdeth.ver_tid_offset, OM, @@ -1423,21 +1251,22 @@ done: } static int set_txreq_header_ahg(struct user_sdma_request *req, - struct user_sdma_txreq *tx, u32 len) + struct user_sdma_txreq *tx, u32 datalen) { + u32 ahg[AHG_KDETH_ARRAY_SIZE]; int diff = 0; u8 omfactor; /* KDETH.OM */ struct hfi1_user_sdma_pkt_q *pq = req->pq; struct hfi1_pkt_header *hdr = &req->hdr; u16 pbclen = le16_to_cpu(hdr->pbc[0]); - u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len)); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); if (PBC2LRH(pbclen) != lrhlen) { /* PBC.PbcLengthDWs */ - AHG_HEADER_SET(req->ahg, diff, 0, 0, 12, + AHG_HEADER_SET(ahg, diff, 0, 0, 12, cpu_to_le16(LRH2PBC(lrhlen))); /* LRH.PktLen (we need the full 16 bits due to byte swap) */ - AHG_HEADER_SET(req->ahg, diff, 3, 0, 16, + AHG_HEADER_SET(ahg, diff, 3, 0, 16, cpu_to_be16(lrhlen >> 2)); } @@ -1449,13 +1278,12 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) val32 |= 1UL << 31; - AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16)); - AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff)); + AHG_HEADER_SET(ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16)); + AHG_HEADER_SET(ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff)); /* KDETH.Offset */ - AHG_HEADER_SET(req->ahg, diff, 15, 0, 16, + AHG_HEADER_SET(ahg, diff, 15, 0, 16, cpu_to_le16(req->koffset & 0xffff)); - AHG_HEADER_SET(req->ahg, diff, 15, 16, 16, - cpu_to_le16(req->koffset >> 16)); + AHG_HEADER_SET(ahg, diff, 15, 16, 16, cpu_to_le16(req->koffset >> 16)); if (req_opcode(req->info.ctrl) == EXPECTED) { __le16 val; @@ -1473,9 +1301,8 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, * we have to check again. */ if (++req->tididx > req->n_tids - 1 || - !req->tids[req->tididx]) { + !req->tids[req->tididx]) return -EINVAL; - } tidval = req->tids[req->tididx]; } omfactor = ((EXP_TID_GET(tidval, LEN) * @@ -1483,7 +1310,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT : KDETH_OM_SMALL_SHIFT; /* KDETH.OM and KDETH.OFFSET (TID) */ - AHG_HEADER_SET(req->ahg, diff, 7, 0, 16, + AHG_HEADER_SET(ahg, diff, 7, 0, 16, ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 | ((req->tidoffset >> omfactor) & 0x7fff))); @@ -1503,12 +1330,20 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, AHG_KDETH_INTR_SHIFT)); } - AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val); + AHG_HEADER_SET(ahg, diff, 7, 16, 14, val); } + if (diff < 0) + return diff; trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx, req->sde->this_idx, - req->ahg_idx, req->ahg, diff, tidval); + req->ahg_idx, ahg, diff, tidval); + sdma_txinit_ahg(&tx->txreq, + SDMA_TXREQ_F_USE_AHG, + datalen, req->ahg_idx, diff, + ahg, sizeof(req->hdr), + user_sdma_txreq_cb); + return diff; } @@ -1537,7 +1372,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) if (status != SDMA_TXREQ_S_OK) { SDMA_DBG(req, "SDMA completion with error %d", status); - set_bit(SDMA_REQ_HAS_ERROR, &req->flags); + WRITE_ONCE(req->has_error, 1); } req->seqcomp = tx->seqnum; @@ -1556,8 +1391,8 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) if (status != SDMA_TXREQ_S_OK) req->status = status; if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) && - (test_bit(SDMA_REQ_SEND_DONE, &req->flags) || - test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) { + (READ_ONCE(req->done) || + READ_ONCE(req->has_error))) { user_sdma_free_request(req, false); pq_update(pq); set_comp_state(pq, cq, idx, ERROR, req->status); @@ -1611,8 +1446,6 @@ static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, u16 idx, enum hfi1_sdma_comp_state state, int ret) { - hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d", - pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret); if (state == ERROR) cq->comps[idx].errcode = -ret; smp_wmb(); /* make sure errcode is visible first */ @@ -1667,10 +1500,7 @@ static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) struct sdma_mmu_node *node = container_of(mnode, struct sdma_mmu_node, rb); - atomic_sub(node->npages, &node->pq->n_locked); - - unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); - + unpin_sdma_pages(node); kfree(node); } diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index e5b10aefe212..9b8bb5634c0d 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -53,11 +53,68 @@ #include "iowait.h" #include "user_exp_rcv.h" +/* The maximum number of Data io vectors per message/request */ +#define MAX_VECTORS_PER_REQ 8 +/* + * Maximum number of packet to send from each message/request + * before moving to the next one. + */ +#define MAX_PKTS_PER_QUEUE 16 + +#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) + +#define req_opcode(x) \ + (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_version(x) \ + (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_iovcnt(x) \ + (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) + +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define BTH_SEQ_MASK 0x7ffull + +#define AHG_KDETH_INTR_SHIFT 12 +#define AHG_KDETH_SH_SHIFT 13 +#define AHG_KDETH_ARRAY_SIZE 9 + +#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) +#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) + +#define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \ + do { \ + if ((idx) < ARRAY_SIZE((arr))) \ + (arr)[(idx++)] = sdma_build_ahg_descriptor( \ + (__force u16)(value), (dw), (bit), \ + (width)); \ + else \ + return -ERANGE; \ + } while (0) + +/* Tx request flag bits */ +#define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */ +#define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */ + +#define SDMA_PKT_Q_INACTIVE BIT(0) +#define SDMA_PKT_Q_ACTIVE BIT(1) +#define SDMA_PKT_Q_DEFERRED BIT(2) + +/* + * Maximum retry attempts to submit a TX request + * before putting the process to sleep. + */ +#define MAX_DEFER_RETRY_COUNT 1 + +#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ + +#define SDMA_DBG(req, fmt, ...) \ + hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ + (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ + ##__VA_ARGS__) + extern uint extended_psn; struct hfi1_user_sdma_pkt_q { - struct list_head list; - unsigned ctxt; + u16 ctxt; u16 subctxt; u16 n_max_reqs; atomic_t n_reqs; @@ -80,9 +137,115 @@ struct hfi1_user_sdma_comp_q { struct hfi1_sdma_comp_entry *comps; }; +struct sdma_mmu_node { + struct mmu_rb_node rb; + struct hfi1_user_sdma_pkt_q *pq; + atomic_t refcount; + struct page **pages; + unsigned int npages; +}; + +struct user_sdma_iovec { + struct list_head list; + struct iovec iov; + /* number of pages in this vector */ + unsigned int npages; + /* array of pinned pages for this vector */ + struct page **pages; + /* + * offset into the virtual address space of the vector at + * which we last left off. + */ + u64 offset; + struct sdma_mmu_node *node; +}; + +/* evict operation argument */ +struct evict_data { + u32 cleared; /* count evicted so far */ + u32 target; /* target count to evict */ +}; + +struct user_sdma_request { + /* This is the original header from user space */ + struct hfi1_pkt_header hdr; + + /* Read mostly fields */ + struct hfi1_user_sdma_pkt_q *pq ____cacheline_aligned_in_smp; + struct hfi1_user_sdma_comp_q *cq; + /* + * Pointer to the SDMA engine for this request. + * Since different request could be on different VLs, + * each request will need it's own engine pointer. + */ + struct sdma_engine *sde; + struct sdma_req_info info; + /* TID array values copied from the tid_iov vector */ + u32 *tids; + /* total length of the data in the request */ + u32 data_len; + /* number of elements copied to the tids array */ + u16 n_tids; + /* + * We copy the iovs for this request (based on + * info.iovcnt). These are only the data vectors + */ + u8 data_iovs; + s8 ahg_idx; + + /* Writeable fields shared with interrupt */ + u64 seqcomp ____cacheline_aligned_in_smp; + u64 seqsubmitted; + /* status of the last txreq completed */ + int status; + + /* Send side fields */ + struct list_head txps ____cacheline_aligned_in_smp; + u64 seqnum; + /* + * KDETH.OFFSET (TID) field + * The offset can cover multiple packets, depending on the + * size of the TID entry. + */ + u32 tidoffset; + /* + * KDETH.Offset (Eager) field + * We need to remember the initial value so the headers + * can be updated properly. + */ + u32 koffset; + u32 sent; + /* TID index copied from the tid_iov vector */ + u16 tididx; + /* progress index moving along the iovs array */ + u8 iov_idx; + u8 done; + u8 has_error; + + struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; +} ____cacheline_aligned_in_smp; + +/* + * A single txreq could span up to 3 physical pages when the MTU + * is sufficiently large (> 4K). Each of the IOV pointers also + * needs it's own set of flags so the vector has been handled + * independently of each other. + */ +struct user_sdma_txreq { + /* Packet header for the txreq */ + struct hfi1_pkt_header hdr; + struct sdma_txreq txreq; + struct list_head list; + struct user_sdma_request *req; + u16 flags; + unsigned int busycount; + u64 seqnum; +}; + int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct hfi1_filedata *fd); -int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd); +int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, struct iovec *iovec, unsigned long dim, unsigned long *count); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 2d19f9bb434d..e232f3c608b4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -53,6 +53,7 @@ #include <linux/rculist.h> #include <linux/mm.h> #include <linux/vmalloc.h> +#include <rdma/opa_addr.h> #include "hfi.h" #include "common.h" @@ -508,13 +509,14 @@ again: /* * Make sure the QP is ready and able to accept the given opcode. */ -static inline opcode_handler qp_ok(int opcode, struct hfi1_packet *packet) +static inline opcode_handler qp_ok(struct hfi1_packet *packet) { if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) return NULL; - if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) || - (opcode == IB_OPCODE_CNP)) - return opcode_handler_tbl[opcode]; + if (((packet->opcode & RVT_OPCODE_QP_MASK) == + packet->qp->allowed_ops) || + (packet->opcode == IB_OPCODE_CNP)) + return opcode_handler_tbl[packet->opcode]; return NULL; } @@ -548,69 +550,54 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) return pbc; } -/** - * hfi1_ib_rcv - process an incoming packet - * @packet: data packet information - * - * This is called to process an incoming packet at interrupt level. - * - * Tlen is the length of the header + data + CRC in bytes. - */ -void hfi1_ib_rcv(struct hfi1_packet *packet) +static int hfi1_do_pkey_check(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; - struct ib_header *hdr = packet->hdr; - u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_16b_header *hdr = packet->hdr; + u16 pkey; + + /* Pkey check needed only for bypass packets */ + if (packet->etype != RHF_RCV_TYPE_BYPASS) + return 0; + + /* Perform pkey check */ + pkey = hfi1_16B_get_pkey(hdr); + return ingress_pkey_check(ppd, pkey, packet->sc, + packet->qp->s_pkey_index, + packet->slid, true); +} + +static inline void hfi1_handle_packet(struct hfi1_packet *packet, + bool is_mcast) +{ + u32 qp_num; + struct hfi1_ctxtdata *rcd = packet->rcd; struct hfi1_pportdata *ppd = rcd->ppd; struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; opcode_handler packet_handler; unsigned long flags; - u32 qp_num; - int lnh; - u8 opcode; - u16 lid; - - /* Check for GRH */ - lnh = ib_get_lnh(hdr); - if (lnh == HFI1_LRH_BTH) { - packet->ohdr = &hdr->u.oth; - } else if (lnh == HFI1_LRH_GRH) { - u32 vtf; - - packet->ohdr = &hdr->u.l.oth; - if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) - goto drop; - vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); - if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) - goto drop; - packet->rcv_flags |= HFI1_HAS_GRH; - } else { - goto drop; - } - - trace_input_ibhdr(rcd->dd, hdr); - opcode = ib_bth_get_opcode(packet->ohdr); - inc_opstats(tlen, &rcd->opstats->stats[opcode]); + inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]); - /* Get the destination QP number. */ - qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK; - lid = ib_get_dlid(hdr); - if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && - (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) { + if (unlikely(is_mcast)) { struct rvt_mcast *mcast; struct rvt_mcast_qp *p; - if (lnh != HFI1_LRH_GRH) + if (!packet->grh) goto drop; - mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid, lid); + mcast = rvt_mcast_find(&ibp->rvp, + &packet->grh->dgid, + opa_get_lid(packet->dlid, 9B)); if (!mcast) goto drop; list_for_each_entry_rcu(p, &mcast->qp_list, list) { packet->qp = p->qp; + if (hfi1_do_pkey_check(packet)) + goto drop; spin_lock_irqsave(&packet->qp->r_lock, flags); - packet_handler = qp_ok(opcode, packet); + packet_handler = qp_ok(packet); if (likely(packet_handler)) packet_handler(packet); else @@ -624,19 +611,22 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) if (atomic_dec_return(&mcast->refcount) <= 1) wake_up(&mcast->wait); } else { + /* Get the destination QP number. */ + qp_num = ib_bth_get_qpn(packet->ohdr); rcu_read_lock(); packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); - if (!packet->qp) { - rcu_read_unlock(); - goto drop; - } - if (unlikely(hfi1_dbg_fault_opcode(packet->qp, opcode, - true))) { - rcu_read_unlock(); - goto drop; - } + if (!packet->qp) + goto unlock_drop; + + if (hfi1_do_pkey_check(packet)) + goto unlock_drop; + + if (unlikely(hfi1_dbg_fault_opcode(packet->qp, packet->opcode, + true))) + goto unlock_drop; + spin_lock_irqsave(&packet->qp->r_lock, flags); - packet_handler = qp_ok(opcode, packet); + packet_handler = qp_ok(packet); if (likely(packet_handler)) packet_handler(packet); else @@ -645,11 +635,34 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) rcu_read_unlock(); } return; - +unlock_drop: + rcu_read_unlock(); drop: ibp->rvp.n_pkt_drops++; } +/** + * hfi1_ib_rcv - process an incoming packet + * @packet: data packet information + * + * This is called to process an incoming packet at interrupt level. + */ +void hfi1_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); +} + +void hfi1_16B_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + trace_input_ibhdr(rcd->dd, packet, false); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); +} + /* * This is called from a timer to check for QPs * which need kernel memory in order to send a packet. @@ -696,7 +709,7 @@ static void verbs_sdma_complete( if (tx->wqe) { hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); } else if (qp->ibqp.qp_type == IB_QPT_RC) { - struct ib_header *hdr; + struct hfi1_opa_header *hdr; hdr = &tx->phdr.hdr; hfi1_rc_send_complete(qp, hdr); @@ -799,12 +812,27 @@ static int build_verbs_tx_desc( int ret = 0; struct hfi1_sdma_header *phdr = &tx->phdr; u16 hdrbytes = tx->hdr_dwords << 2; + u32 *hdr; + u8 extra_bytes = 0; + static char trail_buf[12]; /* CRC = 4, LT = 1, Pad = 0 to 7 bytes */ + if (tx->phdr.hdr.hdr_type) { + /* + * hdrbytes accounts for PBC. Need to subtract 8 bytes + * before calculating padding. + */ + extra_bytes = hfi1_get_16b_padding(hdrbytes - 8, length) + + (SIZE_OF_CRC << 2) + SIZE_OF_LT; + hdr = (u32 *)&phdr->hdr.opah; + } else { + hdr = (u32 *)&phdr->hdr.ibh; + } if (!ahg_info->ahgcount) { ret = sdma_txinit_ahg( &tx->txreq, ahg_info->tx_flags, - hdrbytes + length, + hdrbytes + length + + extra_bytes, ahg_info->ahgidx, 0, NULL, @@ -834,8 +862,17 @@ static int build_verbs_tx_desc( goto bail_txadd; } /* add the ulp payload - if any. tx->ss can be NULL for acks */ - if (tx->ss) + if (tx->ss) { ret = build_verbs_ulp_payload(sde, length, tx); + if (ret) + goto bail_txadd; + } + + /* add icrc, lt byte, and padding to flit */ + if (extra_bytes != 0) + ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq, + trail_buf, extra_bytes); + bail_txadd: return ret; } @@ -847,26 +884,42 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct hfi1_ahg_info *ahg_info = priv->s_ahg; u32 hdrwords = qp->s_hdrwords; u32 len = ps->s_txreq->s_cur_size; - u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */ + u32 plen; struct hfi1_ibdev *dev = ps->dev; struct hfi1_pportdata *ppd = ps->ppd; struct verbs_txreq *tx; u8 sc5 = priv->s_sc; - int ret; + u32 dwords; + bool bypass = false; + + if (ps->s_txreq->phdr.hdr.hdr_type) { + u8 extra_bytes = hfi1_get_16b_padding((hdrwords << 2), len); + + dwords = (len + extra_bytes + (SIZE_OF_CRC << 2) + + SIZE_OF_LT) >> 2; + bypass = true; + } else { + dwords = (len + 3) >> 2; + } + plen = hdrwords + dwords + 2; tx = ps->s_txreq; if (!sdma_txreq_built(&tx->txreq)) { if (likely(pbc == 0)) { u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); - u8 opcode = get_opcode(&tx->phdr.hdr); /* No vl15 here */ - /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ - pbc |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + /* set PBC_DC_INFO bit (aka SC[4]) in pbc */ + if (ps->s_txreq->phdr.hdr.hdr_type) + pbc |= PBC_PACKET_BYPASS | + PBC_INSERT_BYPASS_ICRC; + else + pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); - if (unlikely(hfi1_dbg_fault_opcode(qp, opcode, false))) - pbc = hfi1_fault_tx(qp, opcode, pbc); + if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, + false))) + pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, qp->srate_mbps, @@ -878,14 +931,15 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (unlikely(ret)) goto bail_build; } - ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq); + ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq, + ps->pkts_sent); if (unlikely(ret < 0)) { if (ret == -ECOMM) goto bail_ecomm; return ret; } trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device), - &ps->s_txreq->phdr.hdr); + &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); return ret; bail_ecomm: @@ -935,7 +989,8 @@ static int pio_wait(struct rvt_qp *qp, dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN); qp->s_flags |= flag; was_empty = list_empty(&sc->piowait); - list_add_tail(&priv->s_iowait.list, &sc->piowait); + iowait_queue(ps->pkts_sent, &priv->s_iowait, + &sc->piowait); priv->s_iowait.lock = &dev->iowait_lock; trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO); rvt_get_qp(qp); @@ -967,10 +1022,10 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u32 hdrwords = qp->s_hdrwords; struct rvt_sge_state *ss = ps->s_txreq->ss; u32 len = ps->s_txreq->s_cur_size; - u32 dwords = (len + 3) >> 2; - u32 plen = hdrwords + dwords + 2; /* includes pbc */ + u32 dwords; + u32 plen; struct hfi1_pportdata *ppd = ps->ppd; - u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr; + u32 *hdr; u8 sc5; unsigned long flags = 0; struct send_context *sc; @@ -978,6 +1033,23 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, int wc_status = IB_WC_SUCCESS; int ret = 0; pio_release_cb cb = NULL; + u32 lrh0_16b; + bool bypass = false; + u8 extra_bytes = 0; + + if (ps->s_txreq->phdr.hdr.hdr_type) { + u8 pad_size = hfi1_get_16b_padding((hdrwords << 2), len); + + extra_bytes = pad_size + (SIZE_OF_CRC << 2) + SIZE_OF_LT; + dwords = (len + extra_bytes) >> 2; + hdr = (u32 *)&ps->s_txreq->phdr.hdr.opah; + lrh0_16b = ps->s_txreq->phdr.hdr.opah.lrh[0]; + bypass = true; + } else { + dwords = (len + 3) >> 2; + hdr = (u32 *)&ps->s_txreq->phdr.hdr.ibh; + } + plen = hdrwords + dwords + 2; /* only RC/UC use complete */ switch (qp->ibqp.qp_type) { @@ -995,13 +1067,14 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (likely(pbc == 0)) { u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); - struct verbs_txreq *tx = ps->s_txreq; - u8 opcode = get_opcode(&tx->phdr.hdr); - /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ - pbc |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; - if (unlikely(hfi1_dbg_fault_opcode(qp, opcode, false))) - pbc = hfi1_fault_tx(qp, opcode, pbc); + /* set PBC_DC_INFO bit (aka SC[4]) in pbc */ + if (ps->s_txreq->phdr.hdr.hdr_type) + pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + else + pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); + if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, false))) + pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); } if (cb) @@ -1038,11 +1111,12 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, } } - if (len == 0) { + if (dwords == 0) { pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords); } else { + seg_pio_copy_start(pbuf, pbc, + hdr, hdrwords * 4); if (ss) { - seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4); while (len) { void *addr = ss->sge.vaddr; u32 slen = ss->sge.length; @@ -1053,12 +1127,24 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, seg_pio_copy_mid(pbuf, addr, slen); len -= slen; } - seg_pio_copy_end(pbuf); } + /* + * Bypass packet will need to copy additional + * bytes to accommodate for CRC and LT bytes + */ + if (extra_bytes) { + u8 *empty_buf; + + empty_buf = kcalloc(extra_bytes, sizeof(u8), + GFP_KERNEL); + seg_pio_copy_mid(pbuf, empty_buf, extra_bytes); + kfree(empty_buf); + } + seg_pio_copy_end(pbuf); } trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device), - &ps->s_txreq->phdr.hdr); + &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); pio_bail: if (qp->s_wqe) { @@ -1104,10 +1190,10 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) /** * egress_pkey_check - check P_KEY of a packet - * @ppd: Physical IB port data - * @lrh: Local route header - * @bth: Base transport header - * @sc5: SC for packet + * @ppd: Physical IB port data + * @slid: SLID for packet + * @bkey: PKEY for header + * @sc5: SC for packet * @s_pkey_index: It will be used for look up optimization for kernel contexts * only. If it is negative value, then it means user contexts is calling this * function. @@ -1116,19 +1202,16 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) * * Return: 0 on success, otherwise, 1 */ -int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, +int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey, u8 sc5, int8_t s_pkey_index) { struct hfi1_devdata *dd; int i; - u16 pkey; int is_user_ctxt_mechanism = (s_pkey_index < 0); if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) return 0; - pkey = (u16)be32_to_cpu(bth[0]); - /* If SC15, pkey[0:14] must be 0x7fff */ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) goto bad; @@ -1161,8 +1244,6 @@ bad: dd = ppd->dd; if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) { - u16 slid = be16_to_cpu(lrh[3]); - dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK; dd->err_info_xmit_constraint.slid = slid; @@ -1179,11 +1260,11 @@ bad: * and size */ static inline send_routine get_send_routine(struct rvt_qp *qp, - struct verbs_txreq *tx) + struct hfi1_pkt_state *ps) { struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); struct hfi1_qp_priv *priv = qp->priv; - struct ib_header *h = &tx->phdr.hdr; + struct verbs_txreq *tx = ps->s_txreq; if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA))) return dd->process_pio_send; @@ -1195,11 +1276,9 @@ static inline send_routine get_send_routine(struct rvt_qp *qp, break; case IB_QPT_UC: case IB_QPT_RC: { - u8 op = get_opcode(h); - if (piothreshold && tx->s_cur_size <= min(piothreshold, qp->pmtu) && - (BIT(op & OPMASK) & pio_opmask[op >> 5]) && + (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) && iowait_sdma_pending(&priv->s_iowait) == 0 && !sdma_txreq_built(&tx->txreq)) return dd->process_pio_send; @@ -1224,25 +1303,38 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); struct hfi1_qp_priv *priv = qp->priv; struct ib_other_headers *ohdr; - struct ib_header *hdr; send_routine sr; int ret; - u8 lnh; + u16 pkey; + u32 slid; - hdr = &ps->s_txreq->phdr.hdr; /* locate the pkey within the headers */ - lnh = ib_get_lnh(hdr); - if (lnh == HFI1_LRH_GRH) - ohdr = &hdr->u.l.oth; - else - ohdr = &hdr->u.oth; - - sr = get_send_routine(qp, ps->s_txreq); - ret = egress_pkey_check(dd->pport, - hdr->lrh, - ohdr->bth, - priv->s_sc, - qp->s_pkey_index); + if (ps->s_txreq->phdr.hdr.hdr_type) { + struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah; + u8 l4 = hfi1_16B_get_l4(hdr); + + if (l4 == OPA_16B_L4_IB_GLOBAL) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; + slid = hfi1_16B_get_slid(hdr); + pkey = hfi1_16B_get_pkey(hdr); + } else { + struct ib_header *hdr = &ps->s_txreq->phdr.hdr.ibh; + u8 lnh = ib_get_lnh(hdr); + + if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; + slid = ib_get_slid(hdr); + pkey = ib_bth_get_pkey(ohdr); + } + + ps->opcode = ib_bth_get_opcode(ohdr); + sr = get_send_routine(qp, ps); + ret = egress_pkey_check(dd->pport, slid, pkey, + priv->s_sc, qp->s_pkey_index); if (unlikely(ret)) { /* * The value we are returning here does not get propagated to @@ -1361,14 +1453,14 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); struct hfi1_devdata *dd = dd_from_dev(verbs_dev); struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; - u16 lid = ppd->lid; + u32 lid = ppd->lid; /* props being zeroed by the caller, avoid zeroing it here */ props->lid = lid ? lid : 0; props->lmc = ppd->lmc; /* OPA logical states match IB logical states */ props->state = driver_lstate(ppd); - props->phys_state = hfi1_ibphys_portstate(ppd); + props->phys_state = driver_pstate(ppd); props->gid_tbl_len = HFI1_GUIDS_PER_PORT; props->active_width = (u8)opa_width_to_ib(ppd->link_width_active); /* see rate_show() in ib core/sysfs.c */ @@ -1388,6 +1480,15 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : mtu_to_enum(ppd->ibmtu, IB_MTU_2048); + /* + * sm_lid of 0xFFFF needs special handling so that it can + * be differentiated from a permissve LID of 0xFFFF. + * We set the grh_required flag here so the SA can program + * the DGID in the address handle appropriately + */ + if (props->sm_lid == be16_to_cpu(IB_LID_PERMISSIVE)) + props->grh_required = true; + return 0; } @@ -1473,6 +1574,10 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) struct hfi1_devdata *dd; u8 sc5; + if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + /* test the mapping for validity */ ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); ppd = ppd_from_ibp(ibp); @@ -1491,6 +1596,7 @@ static void hfi1_notify_new_ah(struct ib_device *ibdev, struct hfi1_pportdata *ppd; struct hfi1_devdata *dd; u8 sc5; + struct rdma_ah_attr *attr = &ah->attr; /* * Do not trust reading anything from rvt_ah at this point as it is not @@ -1500,33 +1606,14 @@ static void hfi1_notify_new_ah(struct ib_device *ibdev, ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); ppd = ppd_from_ibp(ibp); sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)]; + hfi1_update_ah_attr(ibdev, attr); + hfi1_make_opa_lid(attr); dd = dd_from_ppd(ppd); ah->vl = sc_to_vlt(dd, sc5); if (ah->vl < num_vls || ah->vl == 15) ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); } -struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid) -{ - struct rdma_ah_attr attr; - struct ib_ah *ah = ERR_PTR(-EINVAL); - struct rvt_qp *qp0; - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - struct hfi1_devdata *dd = dd_from_ppd(ppd); - u8 port_num = ppd->port; - - memset(&attr, 0, sizeof(attr)); - attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); - rdma_ah_set_dlid(&attr, dlid); - rdma_ah_set_port_num(&attr, ppd_from_ibp(ibp)->port); - rcu_read_lock(); - qp0 = rcu_dereference(ibp->rvp.qp[0]); - if (qp0) - ah = rdma_create_ah(qp0->ibqp.pd, &attr); - rcu_read_unlock(); - return ah; -} - /** * hfi1_get_npkeys - return the size of the PKEY table for context 0 * @dd: the hfi1_ib device @@ -1547,13 +1634,22 @@ static void init_ibport(struct hfi1_pportdata *ppd) ibp->sc_to_sl[i] = i; } + for (i = 0; i < RVT_MAX_TRAP_LISTS ; i++) + INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list); + setup_timer(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, + (unsigned long)ibp); + spin_lock_init(&ibp->rvp.lock); /* Set the prefix to the default value (see ch. 4.1.1) */ ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; ibp->rvp.sm_lid = 0; - /* Below should only set bits defined in OPA PortInfo.CapabilityMask */ + /* + * Below should only set bits defined in OPA PortInfo.CapabilityMask + * and PortInfo.CapabilityMask3 + */ ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP | IB_PORT_CAP_MASK_NOTICE_SUP; + ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported; ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; @@ -1564,14 +1660,13 @@ static void init_ibport(struct hfi1_pportdata *ppd) RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); } -static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str) { struct rvt_dev_info *rdi = ib_to_rvt(ibdev); struct hfi1_ibdev *dev = dev_from_rdi(rdi); u32 ver = dd_from_dev(dev)->dc8051_ver; - snprintf(str, str_len, "%u.%u.%u", dc8051_ver_maj(ver), + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver), dc8051_ver_min(ver), dc8051_ver_patch(ver)); } @@ -1816,7 +1911,8 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK; dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT; dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK; - dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA; + dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA | + RDMA_CORE_CAP_OPA_AH; dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE; dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index cd635d0c1d3b..87d1285a3340 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -95,6 +95,7 @@ struct hfi1_packet; #define HFI1_VENDOR_IPG cpu_to_be16(0xFFA0) #define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +#define OPA_BTH_MIG_REQ BIT(31) #define RC_OP(x) IB_OPCODE_RC_##x #define UC_OP(x) IB_OPCODE_UC_##x @@ -104,6 +105,25 @@ enum { HFI1_HAS_GRH = (1 << 0), }; +struct hfi1_16b_header { + u32 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ib_other_headers oth; + } l; + struct ib_other_headers oth; + } u; +} __packed; + +struct hfi1_opa_header { + union { + struct ib_header ibh; /* 9B header */ + struct hfi1_16b_header opah; /* 16B header */ + }; + u8 hdr_type; /* 9B or 16B */ +} __packed; + struct hfi1_ahg_info { u32 ahgdesc[2]; u16 tx_flags; @@ -113,7 +133,7 @@ struct hfi1_ahg_info { struct hfi1_sdma_header { __le64 pbc; - struct ib_header hdr; + struct hfi1_opa_header hdr; } __packed; /* @@ -127,6 +147,7 @@ struct hfi1_qp_priv { u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; struct rvt_qp *owner; + u8 hdr_type; /* 9B or 16B */ }; /* @@ -142,7 +163,9 @@ struct hfi1_pkt_state { unsigned long timeout; unsigned long timeout_int; int cpu; + u8 opcode; bool in_thread; + bool pkts_sent; }; #define HFI1_PSN_CREDIT 16 @@ -236,8 +259,8 @@ static inline int hfi1_send_ok(struct rvt_qp *qp) /* * This must be called with s_lock held. */ -void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, - u32 qp1, u32 qp2, u16 lid1, u16 lid2); +void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, u32 lid1, u32 lid2); void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); void hfi1_sys_guid_chg(struct hfi1_ibport *ibp); void hfi1_node_desc_chg(struct hfi1_ibport *ibp); @@ -257,13 +280,8 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, * necessarily be at least one bit less than * the container holding the PSN. */ -#ifndef CONFIG_HFI1_VERBS_31BIT_PSN -#define PSN_MASK 0xFFFFFF -#define PSN_SHIFT 8 -#else #define PSN_MASK 0x7FFFFFFF #define PSN_SHIFT 1 -#endif #define PSN_MODIFY_MASK 0xFFFFFF /* @@ -307,15 +325,12 @@ void hfi1_rc_rcv(struct hfi1_packet *packet); void hfi1_rc_hdrerr( struct hfi1_ctxtdata *rcd, - struct ib_header *hdr, - u32 rcv_flags, + struct hfi1_packet *packet, struct rvt_qp *qp); u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); -struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid); - -void hfi1_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr); +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah); void hfi1_ud_rcv(struct hfi1_packet *packet); @@ -336,18 +351,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); extern const u32 rc_only_opcode; extern const u32 uc_only_opcode; -static inline u8 get_opcode(struct ib_header *h) -{ - u16 lnh = be16_to_cpu(h->lrh[0]) & 3; - - if (lnh == IB_LNH_IBA_LOCAL) - return be32_to_cpu(h->u.oth.bth[0]) >> 24; - else - return be32_to_cpu(h->u.l.oth.bth[0]) >> 24; -} - -int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct ib_header *hdr, - int has_grh, struct rvt_qp *qp, u32 bth0); +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet); u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, const struct ib_global_route *grh, u32 hwords, u32 nwords); @@ -365,7 +369,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread); void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status); -void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct rvt_qp *qp, int is_fecn); +void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp, + bool is_fecn); int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); @@ -379,6 +384,8 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *); void hfi1_ib_rcv(struct hfi1_packet *packet); +void hfi1_16B_rcv(struct hfi1_packet *packet); + unsigned hfi1_get_npkeys(struct hfi1_devdata *); int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c index 5d23172c470f..873e48ea923f 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.c +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -119,13 +119,6 @@ out: return tx; } -static void verbs_txreq_kmem_cache_ctor(void *obj) -{ - struct verbs_txreq *tx = (struct verbs_txreq *)obj; - - memset(tx, 0, sizeof(*tx)); -} - int verbs_txreq_init(struct hfi1_ibdev *dev) { char buf[TXREQ_LEN]; @@ -135,7 +128,7 @@ int verbs_txreq_init(struct hfi1_ibdev *dev) dev->verbs_txreq_cache = kmem_cache_create(buf, sizeof(struct verbs_txreq), 0, SLAB_HWCACHE_ALIGN, - verbs_txreq_kmem_cache_ctor); + NULL); if (!dev->verbs_txreq_cache) return -ENOMEM; return 0; diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h index 4a621cde4abb..5ae781514e32 100644 --- a/drivers/infiniband/hw/hfi1/vnic.h +++ b/drivers/infiniband/hw/hfi1/vnic.h @@ -54,21 +54,6 @@ #define HFI1_VNIC_MAX_TXQ 16 #define HFI1_VNIC_MAX_PAD 12 -/* L2 header definitions */ -#define HFI1_L2_TYPE_OFFSET 0x7 -#define HFI1_L2_TYPE_SHFT 0x5 -#define HFI1_L2_TYPE_MASK 0x3 - -#define HFI1_GET_L2_TYPE(hdr) \ - ((*((u8 *)(hdr) + HFI1_L2_TYPE_OFFSET) >> HFI1_L2_TYPE_SHFT) & \ - HFI1_L2_TYPE_MASK) - -/* L4 type definitions */ -#define HFI1_L4_TYPE_OFFSET 8 - -#define HFI1_GET_L4_TYPE(data) \ - (*((u8 *)(data) + HFI1_L4_TYPE_OFFSET)) - /* L4 header definitions */ #define HFI1_VNIC_L4_HDR_OFFSET OPA_VNIC_L2_HDR_LEN @@ -103,6 +88,7 @@ struct hfi1_vnic_sdma { struct sdma_txreq stx; unsigned int state; u8 q_idx; + bool pkts_sent; }; /** diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index 339f0cdd56d6..f419cbb05928 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -95,7 +95,7 @@ static int setup_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt) if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; - hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt); + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt); uctxt->is_vnic = true; done: @@ -106,22 +106,13 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata **vnic_ctxt) { struct hfi1_ctxtdata *uctxt; - unsigned int ctxt; int ret; if (dd->flags & HFI1_FROZEN) return -EIO; - for (ctxt = dd->first_dyn_alloc_ctxt; - ctxt < dd->num_rcv_contexts; ctxt++) - if (!dd->rcd[ctxt]) - break; - - if (ctxt == dd->num_rcv_contexts) - return -EBUSY; - - uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, dd->node); - if (!uctxt) { + ret = hfi1_create_ctxtdata(dd->pport, dd->node, &uctxt); + if (ret < 0) { dd_dev_err(dd, "Unable to create ctxtdata, failing open\n"); return -ENOMEM; } @@ -155,12 +146,7 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd, return ret; bail: - /* - * hfi1_free_ctxtdata() also releases send_context - * structure if uctxt->sc is not null - */ - dd->rcd[uctxt->ctxt] = NULL; - hfi1_free_ctxtdata(dd, uctxt); + hfi1_free_ctxt(uctxt); dd_dev_dbg(dd, "vnic allocation failed. rc %d\n", ret); return ret; } @@ -168,15 +154,12 @@ bail: static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt) { - unsigned long flags; - dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt); flush_wc(); if (dd->num_msix_entries) hfi1_reset_vnic_msix_info(uctxt); - spin_lock_irqsave(&dd->uctxt_lock, flags); /* * Disable receive context and interrupt available, reset all * RcvCtxtCtrl bits to default values. @@ -186,7 +169,7 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, HFI1_RCVCTRL_INTRAVAIL_DIS | HFI1_RCVCTRL_ONE_PKT_EGR_DIS | HFI1_RCVCTRL_NO_RHQ_DROP_DIS | - HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt); + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); /* * VNIC contexts are allocated from user context pool. * Release them back to user context pool. @@ -199,16 +182,15 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, sc_disable(uctxt->sc); dd->send_contexts[uctxt->sc->sw_index].type = SC_USER; - spin_unlock_irqrestore(&dd->uctxt_lock, flags); - dd->rcd[uctxt->ctxt] = NULL; uctxt->event_flags = 0; hfi1_clear_tids(uctxt); hfi1_clear_ctxt_pkey(dd, uctxt); hfi1_stats.sps_ctxts--; - hfi1_free_ctxtdata(dd, uctxt); + + hfi1_free_ctxt(uctxt); } void hfi1_vnic_setup(struct hfi1_devdata *dd) @@ -582,8 +564,8 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) int l4_type, vesw_id = -1; u8 q_idx; - l4_type = HFI1_GET_L4_TYPE(packet->ebuf); - if (likely(l4_type == OPA_VNIC_L4_ETHR)) { + l4_type = hfi1_16B_get_l4(packet->ebuf); + if (likely(l4_type == OPA_16B_L4_ETHR)) { vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf); vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id); @@ -751,6 +733,7 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]); if (rc) break; + hfi1_rcd_get(dd->vnic.ctxt[i]); dd->vnic.ctxt[i]->vnic_q_idx = i; } @@ -762,6 +745,7 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) */ while (i-- > dd->vnic.num_ctxt) { deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]); + hfi1_rcd_put(dd->vnic.ctxt[i]); dd->vnic.ctxt[i] = NULL; } goto alloc_fail; @@ -791,6 +775,7 @@ static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo) if (--dd->vnic.num_vports == 0) { for (i = 0; i < dd->vnic.num_ctxt; i++) { deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]); + hfi1_rcd_put(dd->vnic.ctxt[i]); dd->vnic.ctxt[i] = NULL; } hfi1_deinit_vnic_rsm(dd); diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index 51a817d3aa14..c3c96c5869ed 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -198,11 +198,16 @@ int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, goto free_desc; tx->retry_count = 0; - ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq); + ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq, + vnic_sdma->pkts_sent); /* When -ECOMM, sdma callback will be called with ABORT status */ if (unlikely(ret && unlikely(ret != -ECOMM))) goto free_desc; + if (!ret) { + vnic_sdma->pkts_sent = true; + iowait_starve_clear(vnic_sdma->pkts_sent, &vnic_sdma->wait); + } return ret; free_desc: @@ -211,6 +216,8 @@ free_desc: tx_err: if (ret != -EBUSY) dev_kfree_skb_any(skb); + else + vnic_sdma->pkts_sent = false; return ret; } @@ -225,7 +232,8 @@ tx_err: static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, struct iowait *wait, struct sdma_txreq *txreq, - unsigned int seq) + uint seq, + bool pkts_sent) { struct hfi1_vnic_sdma *vnic_sdma = container_of(wait, struct hfi1_vnic_sdma, wait); @@ -239,7 +247,7 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; write_seqlock(&dev->iowait_lock); if (list_empty(&vnic_sdma->wait.list)) - list_add_tail(&vnic_sdma->wait.list, &sde->dmawait); + iowait_queue(pkts_sent, wait, &sde->dmawait); write_sequnlock(&dev->iowait_lock); return -EBUSY; } @@ -295,22 +303,15 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) } } -static void hfi1_vnic_txreq_kmem_cache_ctor(void *obj) -{ - struct vnic_txreq *tx = (struct vnic_txreq *)obj; - - memset(tx, 0, sizeof(*tx)); -} - int hfi1_vnic_txreq_init(struct hfi1_devdata *dd) { char buf[HFI1_VNIC_TXREQ_NAME_LEN]; snprintf(buf, sizeof(buf), "hfi1_%u_vnic_txreq_cache", dd->unit); dd->vnic.txreq_cache = kmem_cache_create(buf, - sizeof(struct vnic_txreq), - 0, SLAB_HWCACHE_ALIGN, - hfi1_vnic_txreq_kmem_cache_ctor); + sizeof(struct vnic_txreq), + 0, SLAB_HWCACHE_ALIGN, + NULL); if (!dd->vnic.txreq_cache) return -ENOMEM; return 0; diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig index e1a6e055cd60..61c93bbd230d 100644 --- a/drivers/infiniband/hw/hns/Kconfig +++ b/drivers/infiniband/hw/hns/Kconfig @@ -1,7 +1,7 @@ config INFINIBAND_HNS tristate "HNS RoCE Driver" depends on NET_VENDOR_HISILICON - depends on ARM64 && HNS && HNS_DSAF && HNS_ENET + depends on (ARM64 || (COMPILE_TEST && 64BIT)) && HNS && HNS_DSAF && HNS_ENET ---help--- This is a RoCE/RDMA driver for the Hisilicon RoCE engine. The engine is used in Hisilicon Hi1610 and more further ICT SoC. diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 605962f2828c..e1b433cdd5e2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -32,6 +32,7 @@ */ #include <linux/platform_device.h> +#include <linux/vmalloc.h> #include "hns_roce_device.h" int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj) diff --git a/drivers/infiniband/hw/hns/hns_roce_eq.c b/drivers/infiniband/hw/hns/hns_roce_eq.c index 50f864935a0e..b0f43735de1a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_eq.c +++ b/drivers/infiniband/hw/hns/hns_roce_eq.c @@ -31,6 +31,7 @@ */ #include <linux/platform_device.h> +#include <linux/interrupt.h> #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_eq.h" @@ -292,7 +293,7 @@ static int hns_roce_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) dev_warn(dev, "Unhandled event %d on EQ %d at index %u\n", event_type, eq->eqn, eq->cons_index); break; - }; + } eq->cons_index++; aeqes_found = 1; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 2540b65e242c..747efd1ae5a6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -2023,7 +2023,6 @@ int hns_roce_v1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); u32 notification_flag; u32 doorbell[2]; - int ret = 0; notification_flag = (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? CQ_DB_REQ_NOT : CQ_DB_REQ_NOT_SOL; @@ -2043,7 +2042,7 @@ int hns_roce_v1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) hns_roce_write64_k(doorbell, hr_cq->cq_db_l); - return ret; + return 0; } static int hns_roce_v1_poll_one(struct hns_roce_cq *hr_cq, diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 80fc01ffd8bd..e387360e3780 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -32,6 +32,7 @@ */ #include <linux/platform_device.h> +#include <linux/vmalloc.h> #include <rdma/ib_umem.h> #include "hns_roce_device.h" #include "hns_roce_cmd.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 054c52699090..f5dd21c2d275 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -799,7 +799,7 @@ bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq, cur = hr_wq->head - hr_wq->tail; if (likely(cur + nreq < hr_wq->max_post)) - return 0; + return false; hr_cq = to_hr_cq(ib_cq); spin_lock(&hr_cq->lock); diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 5a2fa743676c..14f36ba4e5be 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1582,15 +1582,14 @@ static enum i40iw_status_code i40iw_del_multiple_qhash( } /** - * i40iw_netdev_vlan_ipv6 - Gets the netdev and mac + * i40iw_netdev_vlan_ipv6 - Gets the netdev and vlan * @addr: local IPv6 address * @vlan_id: vlan id for the given IPv6 address - * @mac: mac address for the given IPv6 address * * Returns the net_device of the IPv6 address and also sets the - * vlan id and mac for that address. + * vlan id for that address. */ -static struct net_device *i40iw_netdev_vlan_ipv6(u32 *addr, u16 *vlan_id, u8 *mac) +static struct net_device *i40iw_netdev_vlan_ipv6(u32 *addr, u16 *vlan_id) { struct net_device *ip_dev = NULL; struct in6_addr laddr6; @@ -1600,15 +1599,11 @@ static struct net_device *i40iw_netdev_vlan_ipv6(u32 *addr, u16 *vlan_id, u8 *ma i40iw_copy_ip_htonl(laddr6.in6_u.u6_addr32, addr); if (vlan_id) *vlan_id = I40IW_NO_VLAN; - if (mac) - eth_zero_addr(mac); rcu_read_lock(); for_each_netdev_rcu(&init_net, ip_dev) { if (ipv6_chk_addr(&init_net, &laddr6, ip_dev, 1)) { if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(ip_dev); - if (ip_dev->dev_addr && mac) - ether_addr_copy(mac, ip_dev->dev_addr); break; } } @@ -3588,7 +3583,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->vlan_id = i40iw_get_vlan_ipv4(cm_node->loc_addr); } else { cm_node->ipv4 = false; - i40iw_netdev_vlan_ipv6(cm_node->loc_addr, &cm_node->vlan_id, NULL); + i40iw_netdev_vlan_ipv6(cm_node->loc_addr, &cm_node->vlan_id); } i40iw_debug(cm_node->dev, I40IW_DEBUG_CM, @@ -3687,8 +3682,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->accelerated = 1; if (cm_node->accept_pend) { - if (!cm_node->listener) - i40iw_pr_err("cm_node->listener NULL for passive node\n"); atomic_dec(&cm_node->listener->pend_accepts_cnt); cm_node->accept_pend = 0; } @@ -3789,7 +3782,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) raddr6->sin6_addr.in6_u.u6_addr32); cm_info.loc_port = ntohs(laddr6->sin6_port); cm_info.rem_port = ntohs(raddr6->sin6_port); - i40iw_netdev_vlan_ipv6(cm_info.loc_addr, &cm_info.vlan_id, NULL); + i40iw_netdev_vlan_ipv6(cm_info.loc_addr, &cm_info.vlan_id); } cm_info.cm_id = cm_id; cm_info.tos = cm_id->tos; @@ -3931,8 +3924,7 @@ int i40iw_create_listen(struct iw_cm_id *cm_id, int backlog) cm_info.loc_port = ntohs(laddr6->sin6_port); if (ipv6_addr_type(&laddr6->sin6_addr) != IPV6_ADDR_ANY) i40iw_netdev_vlan_ipv6(cm_info.loc_addr, - &cm_info.vlan_id, - NULL); + &cm_info.vlan_id); else wildcard = true; } @@ -4056,12 +4048,7 @@ static void i40iw_cm_event_connected(struct i40iw_cm_event *event) i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); cm_node->accelerated = 1; - if (cm_node->accept_pend) { - if (!cm_node->listener) - i40iw_pr_err("listener is null for passive node\n"); - atomic_dec(&cm_node->listener->pend_accepts_cnt); - cm_node->accept_pend = 0; - } + return; error: diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index a49ff2eb6fb3..d1f5345f04f0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -54,6 +54,17 @@ static inline void i40iw_insert_wqe_hdr(u64 *wqe, u64 header) set_64bit_val(wqe, 24, header); } +void i40iw_check_cqp_progress(struct i40iw_cqp_timeout *cqp_timeout, struct i40iw_sc_dev *dev) +{ + if (cqp_timeout->compl_cqp_cmds != dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]) { + cqp_timeout->compl_cqp_cmds = dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]; + cqp_timeout->count = 0; + } else { + if (dev->cqp_cmd_stats[OP_REQUESTED_COMMANDS] != cqp_timeout->compl_cqp_cmds) + cqp_timeout->count++; + } +} + /** * i40iw_get_cqp_reg_info - get head and tail for cqp using registers * @cqp: struct for cqp hw diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index ae8463ff59a7..cc742c3132c6 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -77,7 +77,6 @@ MODULE_PARM_DESC(mpa_version, "MPA version to be used in MPA Req/Resp 1 or 2"); MODULE_AUTHOR("Intel Corporation, <e1000-rdma@lists.sourceforge.net>"); MODULE_DESCRIPTION("Intel(R) Ethernet Connection X722 iWARP RDMA Driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); static struct i40e_client i40iw_client; static char i40iw_client_name[I40E_CLIENT_STR_LENGTH] = "i40iw"; diff --git a/drivers/infiniband/hw/i40iw/i40iw_p.h b/drivers/infiniband/hw/i40iw/i40iw_p.h index 28a92fee0822..e217a1259f57 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_p.h +++ b/drivers/infiniband/hw/i40iw/i40iw_p.h @@ -35,11 +35,13 @@ #ifndef I40IW_P_H #define I40IW_P_H -#define PAUSE_TIMER_VALUE 0xFFFF -#define REFRESH_THRESHOLD 0x7FFF -#define HIGH_THRESHOLD 0x800 -#define LOW_THRESHOLD 0x200 -#define ALL_TC2PFC 0xFF +#define PAUSE_TIMER_VALUE 0xFFFF +#define REFRESH_THRESHOLD 0x7FFF +#define HIGH_THRESHOLD 0x800 +#define LOW_THRESHOLD 0x200 +#define ALL_TC2PFC 0xFF +#define CQP_COMPL_WAIT_TIME 0x3E8 +#define CQP_TIMEOUT_THRESHOLD 5 void i40iw_debug_buf(struct i40iw_sc_dev *dev, enum i40iw_debug_flag mask, char *desc, u64 *buf, u32 size); @@ -51,6 +53,8 @@ void i40iw_sc_cqp_post_sq(struct i40iw_sc_cqp *cqp); u64 *i40iw_sc_cqp_get_next_send_wqe(struct i40iw_sc_cqp *cqp, u64 scratch); +void i40iw_check_cqp_progress(struct i40iw_cqp_timeout *cqp_timeout, struct i40iw_sc_dev *dev); + enum i40iw_status_code i40iw_sc_mr_fast_register(struct i40iw_sc_qp *qp, struct i40iw_fast_reg_stag_info *info, bool post_sq); diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c b/drivers/infiniband/hw/i40iw/i40iw_pble.c index c87ba1617087..540aab5e502d 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_pble.c +++ b/drivers/infiniband/hw/i40iw/i40iw_pble.c @@ -269,10 +269,8 @@ static enum i40iw_status_code add_bp_pages(struct i40iw_sc_dev *dev, status = i40iw_add_sd_table_entry(dev->hw, hmc_info, info->idx.sd_idx, I40IW_SD_TYPE_PAGED, I40IW_HMC_DIRECT_BP_SIZE); - if (status) { - i40iw_free_vmalloc_mem(dev->hw, chunk); - return status; - } + if (status) + goto error; if (!dev->is_pf) { status = i40iw_vchnl_vf_add_hmc_objs(dev, I40IW_HMC_IW_PBLE, fpm_to_idx(pble_rsrc, @@ -280,8 +278,7 @@ static enum i40iw_status_code add_bp_pages(struct i40iw_sc_dev *dev, (info->pages << PBLE_512_SHIFT)); if (status) { i40iw_pr_err("allocate PBLEs in the PF. Error %i\n", status); - i40iw_free_vmalloc_mem(dev->hw, chunk); - return status; + goto error; } } addr = chunk->vaddr; diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index 7f5583d83622..c2cab20c4bc5 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -949,14 +949,16 @@ enum i40iw_status_code i40iw_puda_create_rsrc(struct i40iw_sc_vsi *vsi, ret = i40iw_puda_qp_create(rsrc); } if (ret) { - i40iw_debug(dev, I40IW_DEBUG_PUDA, "[%s] error qp_create\n", __func__); + i40iw_debug(dev, I40IW_DEBUG_PUDA, "[%s] error qp_create\n", + __func__); goto error; } rsrc->completion = PUDA_QP_CREATED; ret = i40iw_puda_allocbufs(rsrc, info->tx_buf_cnt + info->rq_size); if (ret) { - i40iw_debug(dev, I40IW_DEBUG_PUDA, "[%s] error allloc_buf\n", __func__); + i40iw_debug(dev, I40IW_DEBUG_PUDA, "[%s] error alloc_buf\n", + __func__); goto error; } diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h index 959ec81fba99..63118f6d5ab4 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_type.h +++ b/drivers/infiniband/hw/i40iw/i40iw_type.h @@ -1345,4 +1345,9 @@ struct i40iw_virtchnl_work_info { void *worker_vf_dev; }; +struct i40iw_cqp_timeout { + u64 compl_cqp_cmds; + u8 count; +}; + #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c index 1060725d18bc..0aadb7a0d1aa 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_uk.c +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -912,7 +912,7 @@ enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data return 0; } -static struct i40iw_qp_uk_ops iw_qp_uk_ops = { +static const struct i40iw_qp_uk_ops iw_qp_uk_ops = { .iw_qp_post_wr = i40iw_qp_post_wr, .iw_qp_ring_push_db = i40iw_qp_ring_push_db, .iw_rdma_write = i40iw_rdma_write, @@ -926,14 +926,14 @@ static struct i40iw_qp_uk_ops iw_qp_uk_ops = { .iw_post_nop = i40iw_nop }; -static struct i40iw_cq_ops iw_cq_ops = { +static const struct i40iw_cq_ops iw_cq_ops = { .iw_cq_request_notification = i40iw_cq_request_notification, .iw_cq_poll_completion = i40iw_cq_poll_completion, .iw_cq_post_entries = i40iw_cq_post_entries, .iw_cq_clean = i40iw_clean_cq }; -static struct i40iw_device_uk_ops iw_device_uk_ops = { +static const struct i40iw_device_uk_ops iw_device_uk_ops = { .iwarp_cq_uk_init = i40iw_cq_uk_init, .iwarp_qp_uk_init = i40iw_qp_uk_init, }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index e311ec559f4e..62f1f45b8737 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -445,23 +445,29 @@ static int i40iw_wait_event(struct i40iw_device *iwdev, { struct cqp_commands_info *info = &cqp_request->info; struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_timeout cqp_timeout; bool cqp_error = false; int err_code = 0; - int timeout_ret = 0; + memset(&cqp_timeout, 0, sizeof(cqp_timeout)); + cqp_timeout.compl_cqp_cmds = iwdev->sc_dev.cqp_cmd_stats[OP_COMPLETED_COMMANDS]; + do { + if (wait_event_timeout(cqp_request->waitq, + cqp_request->request_done, CQP_COMPL_WAIT_TIME)) + break; - timeout_ret = wait_event_timeout(cqp_request->waitq, - cqp_request->request_done, - I40IW_EVENT_TIMEOUT); - if (!timeout_ret) { - i40iw_pr_err("error cqp command 0x%x timed out ret = %d\n", - info->cqp_cmd, timeout_ret); + i40iw_check_cqp_progress(&cqp_timeout, &iwdev->sc_dev); + + if (cqp_timeout.count < CQP_TIMEOUT_THRESHOLD) + continue; + + i40iw_pr_err("error cqp command 0x%x timed out", info->cqp_cmd); err_code = -ETIME; if (!iwdev->reset) { iwdev->reset = true; i40iw_request_reset(iwdev); } goto done; - } + } while (1); cqp_error = cqp_request->compl_info.error; if (cqp_error) { i40iw_pr_err("error cqp command 0x%x completion maj = 0x%x min=0x%x\n", diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 02d871db7ca5..1aa411034a27 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2584,13 +2584,12 @@ static const char * const i40iw_hw_stat_names[] = { "iwRdmaInv" }; -static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str, - size_t str_len) +static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str) { u32 firmware_version = I40IW_FW_VERSION; - snprintf(str, str_len, "%u.%u", firmware_version, - (firmware_version & 0x000000ff)); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", firmware_version, + (firmware_version & 0x000000ff)); } /** diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index ea24230ea0d4..155b4dfc0ae8 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -528,7 +528,7 @@ static int set_guid_rec(struct ib_device *ibdev, memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); - guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.lid = ib_lid_be16(attr.lid); guid_info_rec.block_num = index; memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, @@ -781,7 +781,7 @@ void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port) spin_lock_irqsave(&dev->sriov.going_down_lock, flags); spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); if (!dev->sriov.is_going_down) { - /* If there is pending one should cancell then run, otherwise + /* If there is pending one should cancel then run, otherwise * won't run till previous one is ended as same work * struct is used. */ diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index ff931c580557..cab796341697 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -218,6 +218,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, goto err_mtt; uar = &to_mucontext(context)->uar; + cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS; } else { err = mlx4_db_alloc(dev->dev, &cq->db, 1); if (err) @@ -233,6 +234,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, goto err_db; uar = &dev->priv_uar; + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; } if (dev->eq_table) @@ -635,7 +637,7 @@ static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries, struct mlx4_ib_qp *qp; *npolled = 0; - /* Find uncompleted WQEs belonging to that cq and retrun + /* Find uncompleted WQEs belonging to that cq and return * simulated FLUSH_ERR completions */ list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) { diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 21d31cb1325f..0793a21d76f4 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -169,7 +169,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, op_modifier |= 0x4; - in_modifier |= in_wc->slid << 16; + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; } err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, @@ -625,7 +625,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); } else { tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + tun_mad->hdr.slid_mac_47_32 = ib_lid_be16(wc->slid); } ib_dma_sync_single_for_device(&dev->ib_dev, @@ -826,7 +826,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, } } - slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { forward_trap(to_mdev(ibdev), port_num, in_mad); @@ -860,7 +860,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = pattr.lid; + prev_lid = ib_lid_cpu16(pattr.lid); err = mlx4_MAD_IFC(to_mdev(ibdev), (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d1b43cbbfea7..c636842c5be0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -70,7 +70,6 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); int mlx4_ib_sm_guid_assign = 0; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); @@ -81,6 +80,8 @@ static const char mlx4_ib_version[] = DRV_VERSION "\n"; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); +static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, + u8 port_num); static struct workqueue_struct *wq; @@ -552,6 +553,16 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->timestamp_mask = 0xFFFFFFFFFFFFULL; props->max_ah = INT_MAX; + if ((dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && + (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET || + mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET)) { + props->rss_caps.max_rwq_indirection_tables = props->max_qp; + props->rss_caps.max_rwq_indirection_table_size = + dev->dev->caps.max_rss_tbl_sz; + props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; + props->max_wq_type_rq = props->max_qp; + } + if (!mlx4_is_slave(dev->dev)) err = mlx4_get_internal_clock_params(dev->dev, &clock_params); @@ -563,6 +574,13 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, } } + if (uhw->outlen >= resp.response_length + + sizeof(resp.max_inl_recv_sz)) { + resp.response_length += sizeof(resp.max_inl_recv_sz); + resp.max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + } + if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) @@ -1069,6 +1087,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); + INIT_LIST_HEAD(&context->wqn_ranges_list); + mutex_init(&context->wqn_ranges_mutex); + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else @@ -2566,12 +2587,11 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_fw_ver_str(struct ib_device *device, char *str, - size_t str_len) +static void get_fw_ver_str(struct ib_device *device, char *str) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); @@ -2713,6 +2733,26 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str; ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext; + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && + ((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) || + (mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) == + IB_LINK_LAYER_ETHERNET))) { + ibdev->ib_dev.create_wq = mlx4_ib_create_wq; + ibdev->ib_dev.modify_wq = mlx4_ib_modify_wq; + ibdev->ib_dev.destroy_wq = mlx4_ib_destroy_wq; + ibdev->ib_dev.create_rwq_ind_table = + mlx4_ib_create_rwq_ind_table; + ibdev->ib_dev.destroy_rwq_ind_table = + mlx4_ib_destroy_rwq_ind_table; + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); + } + if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; @@ -2772,7 +2812,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) allocated = 0; if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { - err = mlx4_counter_alloc(ibdev->dev, &counter_index); + err = mlx4_counter_alloc(ibdev->dev, &counter_index, + MLX4_RES_USAGE_DRIVER); /* if failed to allocate a new counter, use default */ if (err) counter_index = @@ -2827,7 +2868,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, - &ibdev->steer_qpn_base, 0); + &ibdev->steer_qpn_base, 0, + MLX4_RES_USAGE_DRIVER); if (err) goto err_counter; diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index b73f89700ef9..70eb9f917303 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -808,8 +808,7 @@ static ssize_t sysfs_show_group(struct device *dev, struct device_attribute *attr, char *buf); static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, - union ib_gid *mgid, int create, - gfp_t gfp_mask) + union ib_gid *mgid, int create) { struct mcast_group *group, *cur_group; int is_mgid0; @@ -825,7 +824,7 @@ static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, if (!create) return ERR_PTR(-ENOENT); - group = kzalloc(sizeof *group, gfp_mask); + group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) return ERR_PTR(-ENOMEM); @@ -892,7 +891,7 @@ int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, case IB_MGMT_METHOD_GET_RESP: case IB_SA_METHOD_DELETE_RESP: mutex_lock(&ctx->mcg_table_lock); - group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL); + group = acquire_group(ctx, &rec->mgid, 0); mutex_unlock(&ctx->mcg_table_lock); if (IS_ERR(group)) { if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { @@ -954,7 +953,7 @@ int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, req->sa_mad = *sa_mad; mutex_lock(&ctx->mcg_table_lock); - group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL); + group = acquire_group(ctx, &rec->mgid, may_create); mutex_unlock(&ctx->mcg_table_lock); if (IS_ERR(group)) { kfree(req); diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 9db82e67e959..1fa19820355a 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -46,6 +46,7 @@ #include <linux/mlx4/device.h> #include <linux/mlx4/doorbell.h> +#include <linux/mlx4/qp.h> #define MLX4_IB_DRV_NAME "mlx4_ib" @@ -88,6 +89,8 @@ struct mlx4_ib_ucontext { struct list_head db_page_list; struct mutex db_page_mutex; struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; + struct list_head wqn_ranges_list; + struct mutex wqn_ranges_mutex; /* protect wqn_ranges_list */ }; struct mlx4_ib_pd { @@ -289,8 +292,25 @@ struct mlx4_roce_smac_vlan_info { int update_vid; }; +struct mlx4_wqn_range { + int base_wqn; + int size; + int refcount; + bool dirty; + struct list_head list; +}; + +struct mlx4_ib_rss { + unsigned int base_qpn_tbl_sz; + u8 flags; + u8 rss_key[MLX4_EN_RSS_KEY_SIZE]; +}; + struct mlx4_ib_qp { - struct ib_qp ibqp; + union { + struct ib_qp ibqp; + struct ib_wq ibwq; + }; struct mlx4_qp mqp; struct mlx4_buf buf; @@ -318,6 +338,7 @@ struct mlx4_ib_qp { u8 sq_no_prefetch; u8 state; int mlx_type; + u32 inl_recv_sz; struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; @@ -328,6 +349,10 @@ struct mlx4_ib_qp { struct list_head cq_recv_list; struct list_head cq_send_list; struct counter_index *counter_index; + struct mlx4_wqn_range *wqn_range; + /* Number of RSS QP parents that uses this WQ */ + u32 rss_usecnt; + struct mlx4_ib_rss *rss_ctx; }; struct mlx4_ib_srq { @@ -623,6 +648,8 @@ struct mlx4_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; __u64 hca_core_clock_offset; + __u32 max_inl_recv_sz; + __u32 reserved; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) @@ -890,4 +917,17 @@ void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port); +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_wq(struct ib_wq *wq); +int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 75c0e6c5dd56..2747abde2ea8 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -36,7 +36,6 @@ #include <net/ip.h> #include <linux/slab.h> #include <linux/netdevice.h> -#include <linux/vmalloc.h> #include <rdma/ib_cache.h> #include <rdma/ib_pack.h> @@ -53,6 +52,7 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state); enum { MLX4_IB_ACK_REQ_FREQ = 8, @@ -116,6 +116,11 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), }; +enum mlx4_ib_source_type { + MLX4_IB_QP_SRC = 0, + MLX4_IB_RWQ_SRC = 1, +}; + static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) { return container_of(mqp, struct mlx4_ib_sqp, qp); @@ -330,6 +335,12 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) } } +static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n", + type, qp->qpn); +} + static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) { /* @@ -377,7 +388,8 @@ static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) } static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - int is_user, int has_rq, struct mlx4_ib_qp *qp) + int is_user, int has_rq, struct mlx4_ib_qp *qp, + u32 inl_recv_sz) { /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || @@ -385,18 +397,24 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, return -EINVAL; if (!has_rq) { - if (cap->max_recv_wr) + if (cap->max_recv_wr || inl_recv_sz) return -EINVAL; qp->rq.wqe_cnt = qp->rq.max_gs = 0; } else { + u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + u32 wqe_size; + /* HW requires >= 1 RQ entry with >= 1 gather entry */ - if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge || + inl_recv_sz > max_inl_recv_sz)) return -EINVAL; qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); - qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg); + qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz)); } /* leave userspace return values as they were, so as not to break ABI */ @@ -632,7 +650,300 @@ static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, qp->counter_index = NULL; } +static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd) +{ + rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num | + (init_attr->rwq_ind_tbl->log_ind_tbl_size << 24); + + if ((ucmd->rx_hash_function == MLX4_IB_RX_HASH_FUNC_TOEPLITZ) && + (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) { + memcpy(rss_ctx->rss_key, ucmd->rx_hash_key, + MLX4_EN_RSS_KEY_SIZE); + } else { + pr_debug("RX Hash function is not supported\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + rss_ctx->flags = MLX4_RSS_IPV4; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + rss_ctx->flags |= MLX4_RSS_IPV6; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) { + pr_debug("RX Hash fields_mask for UDP is not supported\n"); + return (-EOPNOTSUPP); + } + + if (rss_ctx->flags & MLX4_RSS_IPV4) { + rss_ctx->flags |= MLX4_RSS_UDP_IPV4; + } else if (rss_ctx->flags & MLX4_RSS_IPV6) { + rss_ctx->flags |= MLX4_RSS_UDP_IPV6; + } else { + pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + if (rss_ctx->flags & MLX4_RSS_IPV4) { + rss_ctx->flags |= MLX4_RSS_TCP_IPV4; + } else if (rss_ctx->flags & MLX4_RSS_IPV6) { + rss_ctx->flags |= MLX4_RSS_TCP_IPV6; + } else { + pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + return 0; +} + +static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd, + struct mlx4_ib_qp *qp) +{ + int qpn; + int err; + + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; + + err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage); + if (err) + return err; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + if (err) + goto err_qpn; + + mutex_init(&qp->mutex); + + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET; + qp->state = IB_QPS_RESET; + + /* Set dummy send resources to be compatible with HV and PRM */ + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE; + qp->mtt = (to_mqp( + (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt; + + qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL); + if (!qp->rss_ctx) { + err = -ENOMEM; + goto err_qp_alloc; + } + + err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd); + if (err) + goto err; + + return 0; + +err: + kfree(qp->rss_ctx); + +err_qp_alloc: + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + +err_qpn: + mlx4_qp_release_range(dev->dev, qpn, 1); + return err; +} + +static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + int err; + + if (!udata) { + pr_debug("RSS QP with NULL udata\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + required_cmd_sz = offsetof(typeof(ucmd), reserved1) + + sizeof(ucmd.reserved1); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + pr_debug("copy failed\n"); + return ERR_PTR(-EFAULT); + } + + if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved))) + return ERR_PTR(-EOPNOTSUPP); + + if (ucmd.comp_mask || ucmd.reserved1) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + pr_debug("RSS QP with unsupported QP type %d\n", + init_attr->qp_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags) { + pr_debug("RSS QP doesn't support create flags\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->send_cq || init_attr->cap.max_send_wr) { + pr_debug("RSS QP with unsupported send attributes\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + err = create_qp_rss(to_mdev(pd->device), pd, init_attr, &ucmd, qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibqp.qp_num = qp->mqp.qpn; + + return &qp->ibqp; +} + +/* + * This function allocates a WQN from a range which is consecutive and aligned + * to its size. In case the range is full, then it creates a new range and + * allocates WQN from it. The new range will be used for following allocations. + */ +static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, int range_size, int *wqn) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + int err = 0; + + mutex_lock(&context->wqn_ranges_mutex); + + range = list_first_entry_or_null(&context->wqn_ranges_list, + struct mlx4_wqn_range, list); + + if (!range || (range->refcount == range->size) || range->dirty) { + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + err = -ENOMEM; + goto out; + } + + err = mlx4_qp_reserve_range(dev->dev, range_size, + range_size, &range->base_wqn, 0, + qp->mqp.usage); + if (err) { + kfree(range); + goto out; + } + + range->size = range_size; + list_add(&range->list, &context->wqn_ranges_list); + } else if (range_size != 1) { + /* + * Requesting a new range (>1) when last range is still open, is + * not valid. + */ + err = -EINVAL; + goto out; + } + + qp->wqn_range = range; + + *wqn = range->base_wqn + range->refcount; + + range->refcount++; + +out: + mutex_unlock(&context->wqn_ranges_mutex); + + return err; +} + +static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, bool dirty_release) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + + mutex_lock(&context->wqn_ranges_mutex); + + range = qp->wqn_range; + + range->refcount--; + if (!range->refcount) { + mlx4_qp_release_range(dev->dev, range->base_wqn, + range->size); + list_del(&range->list); + kfree(range); + } else if (dirty_release) { + /* + * A range which one of its WQNs is destroyed, won't be able to be + * reused for further WQN allocations. + * The next created WQ will allocate a new range. + */ + range->dirty = 1; + } + + mutex_unlock(&context->wqn_ranges_mutex); +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + enum mlx4_ib_source_type src, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) @@ -645,6 +956,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; struct mlx4_ib_cq *mcq; unsigned long flags; + int range_size = 0; /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { @@ -719,26 +1031,70 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); - if (err) - goto err; if (pd->uobject) { - struct mlx4_ib_create_qp ucmd; + union { + struct mlx4_ib_create_qp qp; + struct mlx4_ib_create_wq wq; + } ucmd; + size_t copy_len; - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + copy_len = (src == MLX4_IB_QP_SRC) ? + sizeof(struct mlx4_ib_create_qp) : + min(sizeof(struct mlx4_ib_create_wq), udata->inlen); + + if (ib_copy_from_udata(&ucmd, udata, copy_len)) { err = -EFAULT; goto err; } - qp->sq_no_prefetch = ucmd.sq_no_prefetch; + if (src == MLX4_IB_RWQ_SRC) { + if (ucmd.wq.comp_mask || ucmd.wq.reserved[0] || + ucmd.wq.reserved[1] || ucmd.wq.reserved[2]) { + pr_debug("user command isn't supported\n"); + err = -EOPNOTSUPP; + goto err; + } + + if (ucmd.wq.log_range_size > + ilog2(dev->dev->caps.max_rss_tbl_sz)) { + pr_debug("WQN range size must be equal or smaller than %d\n", + dev->dev->caps.max_rss_tbl_sz); + err = -EOPNOTSUPP; + goto err; + } + range_size = 1 << ucmd.wq.log_range_size; + } else { + qp->inl_recv_sz = ucmd.qp.inl_recv_sz; + } - err = set_user_sq_size(dev, qp, &ucmd); + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, qp->inl_recv_sz); if (err) goto err; - qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - qp->buf_size, 0, 0); + if (src == MLX4_IB_QP_SRC) { + qp->sq_no_prefetch = ucmd.qp.sq_no_prefetch; + + err = set_user_sq_size(dev, qp, + (struct mlx4_ib_create_qp *) + &ucmd); + if (err) + goto err; + } else { + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + /* Allocated buffer expects to have at least that SQ + * size. + */ + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + } + + qp->umem = ib_umem_get(pd->uobject->context, + (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr : + ucmd.wq.buf_addr, qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; @@ -755,11 +1111,18 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), - ucmd.db_addr, &qp->db); + (src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr : + ucmd.wq.db_addr, &qp->db); if (err) goto err_mtt; } + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; } else { + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, 0); + if (err) + goto err; + qp->sq_no_prefetch = 0; if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) @@ -812,20 +1175,15 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc_array(qp->sq.wqe_cnt, sizeof(u64), - GFP_KERNEL | __GFP_NOWARN); - if (!qp->sq.wrid) - qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), - GFP_KERNEL, PAGE_KERNEL); - qp->rq.wrid = kmalloc_array(qp->rq.wqe_cnt, sizeof(u64), - GFP_KERNEL | __GFP_NOWARN); - if (!qp->rq.wrid) - qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), - GFP_KERNEL, PAGE_KERNEL); + qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(u64), GFP_KERNEL); + qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt, + sizeof(u64), GFP_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; } + qp->mqp.usage = MLX4_RES_USAGE_DRIVER; } if (sqpn) { @@ -836,6 +1194,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_wrid; } } + } else if (src == MLX4_IB_RWQ_SRC) { + err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp, + range_size, &qpn); + if (err) + goto err_wrid; } else { /* Raw packet QPNs may not have bits 6,7 set in their qp_num; * otherwise, the WQE BlueFlame setup flow wrongly causes @@ -845,13 +1208,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, (init_attr->cap.max_send_wr ? MLX4_RESERVE_ETH_BF_QP : 0) | (init_attr->cap.max_recv_wr ? - MLX4_RESERVE_A0_QP : 0)); + MLX4_RESERVE_A0_QP : 0), + qp->mqp.usage); else if (qp->flags & MLX4_IB_QP_NETIF) err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); else err = mlx4_qp_reserve_range(dev->dev, 1, 1, - &qpn, 0); + &qpn, 0, qp->mqp.usage); if (err) goto err_proxy; } @@ -873,7 +1237,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, */ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); - qp->mqp.event = mlx4_ib_qp_event; + qp->mqp.event = (src == MLX4_IB_QP_SRC) ? mlx4_ib_qp_event : + mlx4_ib_wq_event; + if (!*caller_qp) *caller_qp = qp; @@ -900,6 +1266,9 @@ err_qpn: if (!sqpn) { if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext(pd->uobject->context), + qp, 0); else mlx4_qp_release_range(dev->dev, qpn, 1); } @@ -998,7 +1367,7 @@ static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) return to_mpd(qp->ibqp.pd); } -static void get_cqs(struct mlx4_ib_qp *qp, +static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) { switch (qp->ibqp.qp_type) { @@ -1011,14 +1380,46 @@ static void get_cqs(struct mlx4_ib_qp *qp, *recv_cq = *send_cq; break; default: - *send_cq = to_mcq(qp->ibqp.send_cq); - *recv_cq = to_mcq(qp->ibqp.recv_cq); + *recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) : + to_mcq(qp->ibwq.cq); + *send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) : + *recv_cq; break; } } +static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (qp->state != IB_QPS_RESET) { + int i; + + for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size); + i++) { + struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + del_gid_entries(qp); + kfree(qp->rss_ctx); +} + static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, - int is_user) + enum mlx4_ib_source_type src, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; unsigned long flags; @@ -1051,7 +1452,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, } } - get_cqs(qp, &send_cq, &recv_cq); + get_cqs(qp, src, &send_cq, &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx4_ib_lock_cqs(send_cq, recv_cq); @@ -1077,6 +1478,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext( + qp->ibwq.uobject->context), qp, 1); else mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); } @@ -1084,9 +1488,12 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_mtt_cleanup(dev->dev, &qp->mtt); if (is_user) { - if (qp->rq.wqe_cnt) - mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), - &qp->db); + if (qp->rq.wqe_cnt) { + struct mlx4_ib_ucontext *mcontext = !src ? + to_mucontext(qp->ibqp.uobject->context) : + to_mucontext(qp->ibwq.uobject->context); + mlx4_ib_db_unmap_user(mcontext, &qp->db); + } ib_umem_release(qp->umem); } else { kvfree(qp->sq.wrid); @@ -1128,6 +1535,9 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; u16 xrcdn = 0; + if (init_attr->rwq_ind_tbl) + return _mlx4_ib_create_qp_rss(pd, init_attr, udata); + /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. @@ -1182,8 +1592,8 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, /* fall through */ case IB_QPT_UD: { - err = create_qp_common(to_mdev(pd->device), pd, init_attr, - udata, 0, &qp); + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, 0, &qp); if (err) { kfree(qp); return ERR_PTR(err); @@ -1203,7 +1613,9 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, if (udata) return ERR_PTR(-EINVAL); if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) { - int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0); + int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, + 1, 1, &sqpn, 0, + MLX4_RES_USAGE_DRIVER); if (res) return ERR_PTR(res); @@ -1211,8 +1623,8 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, sqpn = get_sqp_num(to_mdev(pd->device), init_attr); } - err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, - sqpn, &qp); + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, sqpn, &qp); if (err) return ERR_PTR(err); @@ -1267,7 +1679,6 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); - struct mlx4_ib_pd *pd; if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); @@ -1282,8 +1693,14 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp) if (mqp->counter_index) mlx4_ib_free_qp_counter(dev, mqp); - pd = get_pd(mqp); - destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); + if (qp->rwq_ind_tbl) { + destroy_qp_rss(dev, mqp); + } else { + struct mlx4_ib_pd *pd; + + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); + } if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); @@ -1566,7 +1983,7 @@ static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK)) return 0; - err = mlx4_counter_alloc(dev->dev, &tmp_idx); + err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER); if (err) return err; @@ -1606,12 +2023,119 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type) } } -static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, +/* + * Go over all RSS QP's childes (WQs) and apply their HW state according to + * their logic state if the RSS QP is the first RSS QP associated for the WQ. + */ +static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num) +{ + int err = 0; + int i; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + /* Mlx4_ib restrictions: + * WQ's is associated to a port according to the RSS QP it is + * associates to. + * In case the WQ is associated to a different port by another + * RSS QP, return a failure. + */ + if ((wq->rss_usecnt > 0) && (wq->port != port_num)) { + err = -EINVAL; + mutex_unlock(&wq->mutex); + break; + } + wq->port = port_num; + if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) { + err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY); + if (err) { + mutex_unlock(&wq->mutex); + break; + } + } + wq->rss_usecnt++; + + mutex_unlock(&wq->mutex); + } + + if (i && err) { + int j; + + for (j = (i - 1); j >= 0; j--) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[j]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && + (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=0x%06x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + } + + return err; +} + +static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl) +{ + int i; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=%x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } +} + +static void fill_qp_rss_context(struct mlx4_qp_context *context, + struct mlx4_ib_qp *qp) +{ + struct mlx4_rss_context *rss_context; + + rss_context = (void *)context + offsetof(struct mlx4_qp_context, + pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; + + rss_context->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz); + rss_context->default_qpn = + cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff); + if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6)) + rss_context->base_qpn_udp = rss_context->default_qpn; + rss_context->flags = qp->rss_ctx->flags; + /* Currently support just toeplitz */ + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + + memcpy(rss_context->rss_key, qp->rss_ctx->rss_key, + MLX4_EN_RSS_KEY_SIZE); +} + +static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { - struct mlx4_ib_dev *dev = to_mdev(ibqp->device); - struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct ib_uobject *ibuobject; + struct ib_srq *ibsrq; + struct ib_rwq_ind_table *rwq_ind_tbl; + enum ib_qp_type qp_type; + struct mlx4_ib_dev *dev; + struct mlx4_ib_qp *qp; struct mlx4_ib_pd *pd; struct mlx4_ib_cq *send_cq, *recv_cq; struct mlx4_qp_context *context; @@ -1621,6 +2145,30 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, int err = -EINVAL; int counter_index; + if (src_type == MLX4_IB_RWQ_SRC) { + struct ib_wq *ibwq; + + ibwq = (struct ib_wq *)src; + ibuobject = ibwq->uobject; + ibsrq = NULL; + rwq_ind_tbl = NULL; + qp_type = IB_QPT_RAW_PACKET; + qp = to_mqp((struct ib_qp *)ibwq); + dev = to_mdev(ibwq->device); + pd = to_mpd(ibwq->pd); + } else { + struct ib_qp *ibqp; + + ibqp = (struct ib_qp *)src; + ibuobject = ibqp->uobject; + ibsrq = ibqp->srq; + rwq_ind_tbl = ibqp->rwq_ind_tbl; + qp_type = ibqp->qp_type; + qp = to_mqp(ibqp); + dev = to_mdev(ibqp->device); + pd = get_pd(qp); + } + /* APM is not supported under RoCE */ if (attr_mask & IB_QP_ALT_PATH && rdma_port_get_link_layer(&dev->ib_dev, qp->port) == @@ -1634,6 +2182,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); + if (rwq_ind_tbl) { + fill_qp_rss_context(context, qp); + context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET); + } + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); else { @@ -1651,11 +2204,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) + if (qp->inl_recv_sz) + context->param3 |= cpu_to_be32(1 << 25); + + if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; - else if (ibqp->qp_type == IB_QPT_RAW_PACKET) + else if (qp_type == IB_QPT_RAW_PACKET) context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; - else if (ibqp->qp_type == IB_QPT_UD) { + else if (qp_type == IB_QPT_UD) { if (qp->flags & MLX4_IB_QP_LSO) context->mtu_msgmax = (IB_MTU_4096 << 5) | ilog2(dev->dev->caps.max_gso_sz); @@ -1671,9 +2227,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, ilog2(dev->dev->caps.max_msg_sz); } - if (qp->rq.wqe_cnt) - context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; - context->rq_size_stride |= qp->rq.wqe_shift - 4; + if (!rwq_ind_tbl) { /* PRM RSS receive side should be left zeros */ + if (qp->rq.wqe_cnt) + context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; + context->rq_size_stride |= qp->rq.wqe_shift - 4; + } if (qp->sq.wqe_cnt) context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; @@ -1685,14 +2243,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; context->xrcd = cpu_to_be32((u32) qp->xrcdn); - if (ibqp->qp_type == IB_QPT_RAW_PACKET) + if (qp_type == IB_QPT_RAW_PACKET) context->param3 |= cpu_to_be32(1 << 30); } - if (qp->ibqp.uobject) + if (ibuobject) context->usr_page = cpu_to_be32( mlx4_to_hw_uar_index(dev->dev, - to_mucontext(ibqp->uobject->context)->uar.index)); + to_mucontext(ibuobject->context) + ->uar.index)); else context->usr_page = cpu_to_be32( mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index)); @@ -1736,7 +2295,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, steer_qp = 1; } - if (ibqp->qp_type == IB_QPT_GSI) { + if (qp_type == IB_QPT_GSI) { enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ? IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE; u8 qpc_roce_mode = gid_type_to_qpc(gid_type); @@ -1753,7 +2312,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_AV) { - u8 port_num = mlx4_is_bonded(to_mdev(ibqp->device)->dev) ? 1 : + u8 port_num = mlx4_is_bonded(dev->dev) ? 1 : attr_mask & IB_QP_PORT ? attr->port_num : qp->port; union ib_gid gid; struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB}; @@ -1768,7 +2327,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, int index = rdma_ah_read_grh(&attr->ah_attr)->sgid_index; - status = ib_get_cached_gid(ibqp->device, port_num, + status = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &gid_attr); if (!status && !memcmp(&gid, &zgid, sizeof(gid))) status = -ENOENT; @@ -1825,15 +2384,20 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } - pd = get_pd(qp); - get_cqs(qp, &send_cq, &recv_cq); - context->pd = cpu_to_be32(pd->pdn); + context->pd = cpu_to_be32(pd->pdn); + + if (!rwq_ind_tbl) { + get_cqs(qp, src_type, &send_cq, &recv_cq); + } else { /* Set dummy CQs to be compatible with HV and PRM */ + send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq); + recv_cq = send_cq; + } context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); /* Set "fast registration enabled" for all kernel QPs */ - if (!qp->ibqp.uobject) + if (!ibuobject) context->params1 |= cpu_to_be32(1 << 11); if (attr_mask & IB_QP_RNR_RETRY) { @@ -1868,7 +2432,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; } - if (ibqp->srq) + if (ibsrq) context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); if (attr_mask & IB_QP_MIN_RNR_TIMER) { @@ -1899,17 +2463,19 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_Q_KEY; } - if (ibqp->srq) - context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); + if (ibsrq) + context->srqn = cpu_to_be32(1 << 24 | + to_msrq(ibsrq)->msrq.srqn); - if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (qp->rq.wqe_cnt && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR && - (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || - ibqp->qp_type == IB_QPT_UD || - ibqp->qp_type == IB_QPT_RAW_PACKET)) { + (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI || + qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) { context->pri_path.sched_queue = (qp->port - 1) << 6; if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || qp->mlx4_ib_qp_type & @@ -1942,7 +2508,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + if (qp_type == IB_QPT_RAW_PACKET) { context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | MLX4_IB_LINK_TYPE_ETH; if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { @@ -1952,7 +2518,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { int is_eth = rdma_port_get_link_layer( &dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET; @@ -1962,14 +2528,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; else sqd_event = 0; - if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) context->rlkey_roce_mode |= (1 << 4); /* @@ -1978,7 +2545,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * headroom is stamped so that the hardware doesn't start * processing stale work requests. */ - if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) { struct mlx4_wqe_ctrl_seg *ctrl; int i; @@ -2035,9 +2604,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET) { - if (!ibqp->uobject) { + if (!ibuobject) { mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, - ibqp->srq ? to_msrq(ibqp->srq) : NULL); + ibsrq ? to_msrq(ibsrq) : NULL); if (send_cq != recv_cq) mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); @@ -2148,22 +2717,25 @@ out: return err; } +enum { + MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE | + IB_QP_PORT), +}; + static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; - int ll; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (cur_state == new_state && cur_state == IB_QPS_RESET) { - ll = IB_LINK_LAYER_UNSPECIFIED; - } else { + if (cur_state != new_state || cur_state != IB_QPS_RESET) { int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; ll = rdma_port_get_link_layer(&dev->ib_dev, port); } @@ -2178,6 +2750,27 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } + if (ibqp->rwq_ind_tbl) { + if (!(((cur_state == IB_QPS_RESET) && + (new_state == IB_QPS_INIT)) || + ((cur_state == IB_QPS_INIT) && + (new_state == IB_QPS_RTR)))) { + pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n", + ibqp->qp_num, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + + if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) { + pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n", + ibqp->qp_num, attr_mask, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + } + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) { if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { if ((ibqp->qp_type == IB_QPT_RC) || @@ -2242,7 +2835,17 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } - err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) { + err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num); + if (err) + goto out; + } + + err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask, + cur_state, new_state); + + if (ibqp->rwq_ind_tbl && err) + bring_down_rss_rwqs(ibqp->rwq_ind_tbl); if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) attr->port_num = 1; @@ -3432,6 +4035,9 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr int mlx4_state; int err = 0; + if (ibqp->rwq_ind_tbl) + return -EOPNOTSUPP; + mutex_lock(&qp->mutex); if (qp->state == IB_QPS_RESET) { @@ -3527,3 +4133,285 @@ out: return err; } +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev; + struct ib_qp_init_attr ib_qp_init_attr; + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_wq ucmd; + int err, required_cmd_sz; + + if (!(udata && pd->uobject)) + return ERR_PTR(-EINVAL); + + required_cmd_sz = offsetof(typeof(ucmd), comp_mask) + + sizeof(ucmd.comp_mask); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + dev = to_mdev(pd->device); + + if (init_attr->wq_type != IB_WQT_RQ) { + pr_debug("unsupported wq type %d\n", init_attr->wq_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags) { + pr_debug("unsupported create_flags %u\n", + init_attr->create_flags); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + memset(&ib_qp_init_attr, 0, sizeof(ib_qp_init_attr)); + ib_qp_init_attr.qp_context = init_attr->wq_context; + ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET; + ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr; + ib_qp_init_attr.cap.max_recv_sge = init_attr->max_sge; + ib_qp_init_attr.recv_cq = init_attr->cq; + ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */ + + err = create_qp_common(dev, pd, MLX4_IB_RWQ_SRC, &ib_qp_init_attr, + udata, 0, &qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibwq.event_handler = init_attr->event_handler; + qp->ibwq.wq_num = qp->mqp.qpn; + qp->ibwq.state = IB_WQS_RESET; + + return &qp->ibwq; +} + +static int ib_wq2qp_state(enum ib_wq_state state) +{ + switch (state) { + case IB_WQS_RESET: + return IB_QPS_RESET; + case IB_WQS_RDY: + return IB_QPS_RTR; + default: + return IB_QPS_ERR; + } +} + +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + enum ib_qp_state qp_cur_state; + enum ib_qp_state qp_new_state; + int attr_mask; + int err; + + /* ib_qp.state represents the WQ HW state while ib_wq.state represents + * the WQ logic state. + */ + qp_cur_state = qp->state; + qp_new_state = ib_wq2qp_state(new_state); + + if (ib_wq2qp_state(new_state) == qp_cur_state) + return 0; + + if (new_state == IB_WQS_RDY) { + struct ib_qp_attr attr = {}; + + attr.port_num = qp->port; + attr_mask = IB_QP_PORT; + + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr, + attr_mask, IB_QPS_RESET, IB_QPS_INIT); + if (err) { + pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n", + ibwq->wq_num); + return err; + } + + qp_cur_state = IB_QPS_INIT; + } + + attr_mask = 0; + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask, + qp_cur_state, qp_new_state); + + if (err && (qp_cur_state == IB_QPS_INIT)) { + qp_new_state = IB_QPS_RESET; + if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, + attr_mask, IB_QPS_INIT, IB_QPS_RESET)) { + pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n", + ibwq->wq_num); + qp_new_state = IB_QPS_INIT; + } + } + + qp->state = qp_new_state; + + return err; +} + +int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + struct mlx4_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + enum ib_wq_state cur_state, new_state; + int err = 0; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + if (wq_attr_mask & IB_WQ_FLAGS) + return -EOPNOTSUPP; + + cur_state = wq_attr_mask & IB_WQ_CUR_STATE ? wq_attr->curr_wq_state : + ibwq->state; + new_state = wq_attr_mask & IB_WQ_STATE ? wq_attr->wq_state : cur_state; + + if (cur_state < IB_WQS_RESET || cur_state > IB_WQS_ERR || + new_state < IB_WQS_RESET || new_state > IB_WQS_ERR) + return -EINVAL; + + if ((new_state == IB_WQS_RDY) && (cur_state == IB_WQS_ERR)) + return -EINVAL; + + if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET)) + return -EINVAL; + + /* Need to protect against the parent RSS which also may modify WQ + * state. + */ + mutex_lock(&qp->mutex); + + /* Can update HW state only if a RSS QP has already associated to this + * WQ, so we can apply its port on the WQ. + */ + if (qp->rss_usecnt) + err = _mlx4_ib_modify_wq(ibwq, new_state); + + if (!err) + ibwq->state = new_state; + + mutex_unlock(&qp->mutex); + + return err; +} + +int mlx4_ib_destroy_wq(struct ib_wq *ibwq) +{ + struct mlx4_ib_dev *dev = to_mdev(ibwq->device); + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + + if (qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + + destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, 1); + + kfree(qp); + + return 0; +} + +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_rwq_ind_table *rwq_ind_table; + struct mlx4_ib_create_rwq_ind_tbl_resp resp = {}; + unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size; + unsigned int base_wqn; + size_t min_resp_len; + int i; + int err; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + if (ind_tbl_size > + device->attrs.rss_caps.max_rwq_indirection_table_size) { + pr_debug("log_ind_tbl_size = %d is bigger than supported = %d\n", + ind_tbl_size, + device->attrs.rss_caps.max_rwq_indirection_table_size); + return ERR_PTR(-EINVAL); + } + + base_wqn = init_attr->ind_tbl[0]->wq_num; + + if (base_wqn % ind_tbl_size) { + pr_debug("WQN=0x%x isn't aligned with indirection table size\n", + base_wqn); + return ERR_PTR(-EINVAL); + } + + for (i = 1; i < ind_tbl_size; i++) { + if (++base_wqn != init_attr->ind_tbl[i]->wq_num) { + pr_debug("indirection table's WQNs aren't consecutive\n"); + return ERR_PTR(-EINVAL); + } + } + + rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL); + if (!rwq_ind_table) + return ERR_PTR(-ENOMEM); + + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err; + } + + return rwq_ind_table; + +err: + kfree(rwq_ind_table); + return ERR_PTR(err); +} + +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + kfree(ib_rwq_ind_tbl); + return 0; +} diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 0facaf5f6d23..ebee56cbc0e2 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -34,7 +34,6 @@ #include <linux/mlx4/qp.h> #include <linux/mlx4/srq.h> #include <linux/slab.h> -#include <linux/vmalloc.h> #include "mlx4_ib.h" #include <rdma/mlx4-abi.h> @@ -171,20 +170,16 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; - srq->wrid = kmalloc_array(srq->msrq.max, sizeof(u64), - GFP_KERNEL | __GFP_NOWARN); + srq->wrid = kvmalloc_array(srq->msrq.max, + sizeof(u64), GFP_KERNEL); if (!srq->wrid) { - srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), - GFP_KERNEL, PAGE_KERNEL); - if (!srq->wrid) { - err = -ENOMEM; - goto err_mtt; - } + err = -ENOMEM; + goto err_mtt; } } - cqn = (init_attr->srq_type == IB_SRQT_XRC) ? - to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0; + cqn = ib_srq_has_cq(init_attr->srq_type) ? + to_mcq(init_attr->ext.cq)->mcq.cqn : 0; xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : (u16) dev->dev->caps.reserved_xrcds; diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 90ad2adc752f..bc6299697dda 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o -mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 18d5e1db93ed..470995fa38d2 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -57,3 +57,23 @@ int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, MLX5_SET(query_cong_statistics_in, in, clear, reset); return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); } + +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = { }; + + MLX5_SET(query_cong_params_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_PARAMS); + MLX5_SET(query_cong_params_in, in, cong_protocol, cong_point); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} + +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev, + void *in, int in_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)] = { }; + + return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out)); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index fa09228193a6..af4c24596274 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -39,4 +39,8 @@ int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, bool reset, void *out, int out_size); +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size); +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, + void *in, int in_size); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c new file mode 100644 index 000000000000..2d32b519bb61 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2013-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/debugfs.h> + +#include "mlx5_ib.h" +#include "cmd.h" + +enum mlx5_ib_cong_node_type { + MLX5_IB_RROCE_ECN_RP = 1, + MLX5_IB_RROCE_ECN_NP = 2, +}; + +static const char * const mlx5_ib_dbg_cc_name[] = { + "rp_clamp_tgt_rate", + "rp_clamp_tgt_rate_ati", + "rp_time_reset", + "rp_byte_reset", + "rp_threshold", + "rp_ai_rate", + "rp_hai_rate", + "rp_min_dec_fac", + "rp_min_rate", + "rp_rate_to_set_on_first_cnp", + "rp_dce_tcp_g", + "rp_dce_tcp_rtt", + "rp_rate_reduce_monitor_period", + "rp_initial_alpha_value", + "rp_gd", + "np_cnp_dscp", + "np_cnp_prio_mode", + "np_cnp_prio", +}; + +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR BIT(1) +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR BIT(2) +#define MLX5_IB_RP_TIME_RESET_ATTR BIT(3) +#define MLX5_IB_RP_BYTE_RESET_ATTR BIT(4) +#define MLX5_IB_RP_THRESHOLD_ATTR BIT(5) +#define MLX5_IB_RP_AI_RATE_ATTR BIT(7) +#define MLX5_IB_RP_HAI_RATE_ATTR BIT(8) +#define MLX5_IB_RP_MIN_DEC_FAC_ATTR BIT(9) +#define MLX5_IB_RP_MIN_RATE_ATTR BIT(10) +#define MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR BIT(11) +#define MLX5_IB_RP_DCE_TCP_G_ATTR BIT(12) +#define MLX5_IB_RP_DCE_TCP_RTT_ATTR BIT(13) +#define MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR BIT(14) +#define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR BIT(15) +#define MLX5_IB_RP_GD_ATTR BIT(16) + +#define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) +#define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) + +static enum mlx5_ib_cong_node_type +mlx5_ib_param_to_node(enum mlx5_ib_dbg_cc_types param_offset) +{ + if (param_offset >= MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE && + param_offset <= MLX5_IB_DBG_CC_RP_GD) + return MLX5_IB_RROCE_ECN_RP; + else + return MLX5_IB_RROCE_ECN_NP; +} + +static u32 mlx5_get_cc_param_val(void *field, int offset) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate); + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc); + case MLX5_IB_DBG_CC_RP_TIME_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset); + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset); + case MLX5_IB_DBG_CC_RP_THRESHOLD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_threshold); + case MLX5_IB_DBG_CC_RP_AI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate); + case MLX5_IB_DBG_CC_RP_HAI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate); + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac); + case MLX5_IB_DBG_CC_RP_MIN_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate); + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp); + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g); + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt); + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period); + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value); + case MLX5_IB_DBG_CC_RP_GD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_gd); + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_dscp); + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_prio_mode); + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_802p_prio); + default: + return 0; + } +} + +static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, + u32 var, u32 *attr_mask) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate, var); + break; + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc, var); + break; + case MLX5_IB_DBG_CC_RP_TIME_RESET: + *attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset, var); + break; + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + *attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset, var); + break; + case MLX5_IB_DBG_CC_RP_THRESHOLD: + *attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_threshold, var); + break; + case MLX5_IB_DBG_CC_RP_AI_RATE: + *attr_mask |= MLX5_IB_RP_AI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_HAI_RATE: + *attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + *attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_RATE: + *attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + *attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + *attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + *attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + *attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period, var); + break; + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + *attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value, var); + break; + case MLX5_IB_DBG_CC_RP_GD: + *attr_mask |= MLX5_IB_RP_GD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_gd, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + *attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0); + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var); + break; + } +} + +static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); + void *out; + void *field; + int err; + enum mlx5_ib_cong_node_type node; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + node = mlx5_ib_param_to_node(offset); + + err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen); + if (err) + goto free; + + field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters); + *var = mlx5_get_cc_param_val(field, offset); + +free: + kvfree(out); + return err; +} + +static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); + void *in; + void *field; + enum mlx5_ib_cong_node_type node; + u32 attr_mask = 0; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + + node = mlx5_ib_param_to_node(offset); + MLX5_SET(modify_cong_params_in, in, cong_protocol, node); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters); + mlx5_ib_set_cc_param_mask_val(field, offset, var, &attr_mask); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, + attr_mask); + + err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen); + kvfree(in); + return err; +} + +static ssize_t set_param(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + char lbuf[11] = { }; + u32 var; + int ret; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &var)) + return -EINVAL; + + ret = mlx5_ib_set_cc_params(param->dev, offset, var); + return ret ? ret : count; +} + +static ssize_t get_param(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + u32 var = 0; + int ret; + char lbuf[11]; + + if (*pos) + return 0; + + ret = mlx5_ib_get_cc_params(param->dev, offset, &var); + if (ret) + return ret; + + ret = snprintf(lbuf, sizeof(lbuf), "%d\n", var); + if (ret < 0) + return ret; + + if (copy_to_user(buf, lbuf, ret)) + return -EFAULT; + + *pos += ret; + return ret; +} + +static const struct file_operations dbg_cc_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = set_param, + .read = get_param, +}; + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root || + !dev->dbg_cc_params || + !dev->dbg_cc_params->root) + return; + + debugfs_remove_recursive(dev->dbg_cc_params->root); + kfree(dev->dbg_cc_params); + dev->dbg_cc_params = NULL; +} + +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_dbg_cc_params *dbg_cc_params; + int i; + + if (!mlx5_debugfs_root) + goto out; + + if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed) || + !MLX5_CAP_GEN(dev->mdev, cc_modify_allowed)) + goto out; + + dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL); + if (!dbg_cc_params) + goto out; + + dev->dbg_cc_params = dbg_cc_params; + + dbg_cc_params->root = debugfs_create_dir("cc_params", + dev->mdev->priv.dbg_root); + if (!dbg_cc_params->root) + goto err; + + for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { + dbg_cc_params->params[i].offset = i; + dbg_cc_params->params[i].dev = dev; + dbg_cc_params->params[i].dentry = + debugfs_create_file(mlx5_ib_dbg_cc_name[i], + 0600, dbg_cc_params->root, + &dbg_cc_params->params[i], + &dbg_cc_fops); + if (!dbg_cc_params->params[i].dentry) + goto err; + } +out: return 0; + +err: + mlx5_ib_warn(dev, "cong debugfs failure\n"); + mlx5_ib_cleanup_cong_debugfs(dev); + /* + * We don't want to fail driver if debugfs failed to initialize, + * so we are not forwarding error to the user. + */ + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index a384d72ea3cd..2aa53f427685 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -499,7 +499,7 @@ static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries, struct mlx5_ib_qp *qp; *npolled = 0; - /* Find uncompleted WQEs belonging to that cq and retrun mmics ones */ + /* Find uncompleted WQEs belonging to that cq and return mmics ones */ list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { sw_send_comp(qp, num_entries, wc + *npolled, npolled); if (*npolled >= num_entries) @@ -751,10 +751,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, void *cqc; int err; - ucmdlen = - (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < - sizeof(ucmd)) ? (sizeof(ucmd) - - sizeof(ucmd.reserved)) : sizeof(ucmd); + ucmdlen = udata->inlen < sizeof(ucmd) ? + (sizeof(ucmd) - sizeof(ucmd.reserved)) : sizeof(ucmd); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) return -EFAULT; diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c index c1b9de800fe5..649a3364f838 100644 --- a/drivers/infiniband/hw/mlx5/ib_virt.c +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -96,6 +96,7 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -109,6 +110,8 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, } in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].policy = in->policy; out: kfree(in); @@ -151,6 +154,7 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -160,6 +164,8 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID; in->node_guid = guid; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].node_guid = guid; kfree(in); return err; } @@ -169,6 +175,7 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -178,6 +185,8 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID; in->port_guid = guid; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].port_guid = guid; kfree(in); return err; } diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 95db929bdc34..1003b0133a49 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -78,7 +78,7 @@ static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, u16 slid; int err; - slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; @@ -204,7 +204,7 @@ static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, int err; void *out_cnt; - /* Decalring support of extended counters */ + /* Declaring support of extended counters */ if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { struct ib_class_port_info cpi = {}; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index f7fcde1ff0aa..ab3c562d5ba7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include <linux/debugfs.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/init.h> @@ -58,6 +59,7 @@ #include <linux/mlx5/vport.h> #include "mlx5_ib.h" #include "cmd.h" +#include <linux/mlx5/vport.h> #define DRIVER_NAME "mlx5_ib" #define DRIVER_VERSION "5.0-0" @@ -65,7 +67,6 @@ MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>"); MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRIVER_VERSION); static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" @@ -97,6 +98,20 @@ mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) return mlx5_port_type_cap_to_rdma_ll(port_type_cap); } +static int get_port_state(struct ib_device *ibdev, + u8 port_num, + enum ib_port_state *state) +{ + struct ib_port_attr attr; + int ret; + + memset(&attr, 0, sizeof(attr)); + ret = mlx5_ib_query_port(ibdev, port_num, &attr); + if (!ret) + *state = attr.state; + return ret; +} + static int mlx5_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -114,6 +129,7 @@ static int mlx5_netdev_event(struct notifier_block *this, write_unlock(&ibdev->roce.netdev_lock); break; + case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev); @@ -127,10 +143,23 @@ static int mlx5_netdev_event(struct notifier_block *this, if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev)) && ibdev->ib_active) { struct ib_event ibev = { }; + enum ib_port_state port_state; + if (get_port_state(&ibdev->ib_dev, 1, &port_state)) + return NOTIFY_DONE; + + if (ibdev->roce.last_port_state == port_state) + return NOTIFY_DONE; + + ibdev->roce.last_port_state = port_state; ibev.device = &ibdev->ib_dev; - ibev.event = (event == NETDEV_UP) ? - IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + if (port_state == IB_PORT_DOWN) + ibev.event = IB_EVENT_PORT_ERR; + else if (port_state == IB_PORT_ACTIVE) + ibev.event = IB_EVENT_PORT_ACTIVE; + else + return NOTIFY_DONE; + ibev.element.port_num = 1; ib_dispatch_event(&ibev); } @@ -668,6 +697,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_TSO; } + if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && + MLX5_CAP_GEN(dev->mdev, general_notification_event)) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP; + + if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) && + MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap)) + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { /* Legacy bit to support old userspace libraries */ @@ -740,6 +777,16 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); } + if (MLX5_CAP_GEN(mdev, tag_matching)) { + props->xrq_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + props->xrq_caps.max_num_tags = + (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; + props->xrq_caps.flags = IB_TM_CAP_RC; + props->xrq_caps.max_ops = + 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + props->xrq_caps.max_sge = MLX5_TM_MAX_SGE; + } + if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { resp.cqe_comp_caps.max_num = MLX5_CAP_GEN(dev->mdev, cqe_compression) ? @@ -765,8 +812,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, uhw->outlen)) { - resp.mlx5_ib_support_multi_pkt_send_wqes = - MLX5_CAP_ETH(mdev, multi_pkt_send_wqe); + if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes = + MLX5_IB_ALLOW_MPW; + + if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes |= + MLX5_IB_SUPPORT_EMPW; + resp.response_length += sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } @@ -774,6 +827,27 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (field_avail(typeof(resp), reserved, uhw->outlen)) resp.response_length += sizeof(resp.reserved); + if (field_avail(typeof(resp), sw_parsing_caps, + uhw->outlen)) { + resp.response_length += sizeof(resp.sw_parsing_caps); + if (MLX5_CAP_ETH(mdev, swp)) { + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING; + + if (MLX5_CAP_ETH(mdev, swp_csum)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_CSUM; + + if (MLX5_CAP_ETH(mdev, swp_lso)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_LSO; + + if (resp.sw_parsing_caps.sw_parsing_offloads) + resp.sw_parsing_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); @@ -1144,7 +1218,7 @@ static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) return -EINVAL; - mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, alloated %d, using %d sys pages\n", + mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, using %d sys pages\n", MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", lib_uar_4k ? "yes" : "no", ref_bfregs, req->total_num_bfregs, *num_sys_pages); @@ -1193,6 +1267,45 @@ static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *con return 0; } +static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) +{ + int err; + + err = mlx5_core_alloc_transport_domain(dev->mdev, tdn); + if (err) + return err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || + !MLX5_CAP_GEN(dev->mdev, disable_local_lb)) + return err; + + mutex_lock(&dev->lb_mutex); + dev->user_td++; + + if (dev->user_td == 2) + err = mlx5_nic_vport_update_local_lb(dev->mdev, true); + + mutex_unlock(&dev->lb_mutex); + return err; +} + +static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) +{ + mlx5_core_dealloc_transport_domain(dev->mdev, tdn); + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || + !MLX5_CAP_GEN(dev->mdev, disable_local_lb)) + return; + + mutex_lock(&dev->lb_mutex); + dev->user_td--; + + if (dev->user_td < 2) + mlx5_nic_vport_update_local_lb(dev->mdev, false); + + mutex_unlock(&dev->lb_mutex); +} + static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { @@ -1203,7 +1316,6 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx5_bfreg_info *bfregi; int ver; int err; - size_t reqlen; size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, max_cqe_version); bool lib_uar_4k; @@ -1211,18 +1323,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (!dev->ib_active) return ERR_PTR(-EAGAIN); - if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr)) - return ERR_PTR(-EINVAL); - - reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); - if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; - else if (reqlen >= min_req_v2) + else if (udata->inlen >= min_req_v2) ver = 2; else return ERR_PTR(-EINVAL); - err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req))); + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); if (err) return ERR_PTR(err); @@ -1301,8 +1409,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, mutex_init(&context->upd_xlt_page_mutex); if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { - err = mlx5_core_alloc_transport_domain(dev->mdev, - &context->tdn); + err = mlx5_ib_alloc_transport_domain(dev, &context->tdn); if (err) goto out_page; } @@ -1368,7 +1475,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, out_td: if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) - mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + mlx5_ib_dealloc_transport_domain(dev, context->tdn); out_page: free_page(context->upd_xlt_page); @@ -1396,7 +1503,7 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) bfregi = &context->bfregi; if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) - mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + mlx5_ib_dealloc_transport_domain(dev, context->tdn); free_page(context->upd_xlt_page); deallocate_uars(dev, context); @@ -2034,23 +2141,34 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, * it won't fall into the multicast flow steering table and this rule * could steal other multicast packets. */ -static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) +static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr) { - struct ib_flow_spec_eth *eth_spec; + union ib_flow_spec *flow_spec; if (ib_attr->type != IB_FLOW_ATTR_NORMAL || - ib_attr->size < sizeof(struct ib_flow_attr) + - sizeof(struct ib_flow_spec_eth) || ib_attr->num_of_specs < 1) return false; - eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1); - if (eth_spec->type != IB_FLOW_SPEC_ETH || - eth_spec->size != sizeof(*eth_spec)) + flow_spec = (union ib_flow_spec *)(ib_attr + 1); + if (flow_spec->type == IB_FLOW_SPEC_IPV4) { + struct ib_flow_spec_ipv4 *ipv4_spec; + + ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec; + if (ipv4_is_multicast(ipv4_spec->val.dst_ip)) + return true; + return false; + } + + if (flow_spec->type == IB_FLOW_SPEC_ETH) { + struct ib_flow_spec_eth *eth_spec; - return is_multicast_ether_addr(eth_spec->mask.dst_mac) && - is_multicast_ether_addr(eth_spec->val.dst_mac); + eth_spec = (struct ib_flow_spec_eth *)flow_spec; + return is_multicast_ether_addr(eth_spec->mask.dst_mac) && + is_multicast_ether_addr(eth_spec->val.dst_mac); + } + + return false; } static bool is_valid_ethertype(struct mlx5_core_dev *mdev, @@ -2235,10 +2353,31 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, return err ? ERR_PTR(err) : prio; } -static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_prio, - const struct ib_flow_attr *flow_attr, - struct mlx5_flow_destination *dst) +static void set_underlay_qp(struct mlx5_ib_dev *dev, + struct mlx5_flow_spec *spec, + u32 underlay_qpn) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, + spec->match_criteria, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + if (underlay_qpn && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + ft_field_support.bth_dst_qp)) { + MLX5_SET(fte_match_set_misc, + misc_params_v, bth_dst_qp, underlay_qpn); + MLX5_SET(fte_match_set_misc, + misc_params_c, bth_dst_qp, 0xffffff); + } +} + +static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst, + u32 underlay_qpn) { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; @@ -2274,6 +2413,9 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, ib_flow += ((union ib_flow_spec *)ib_flow)->size; } + if (!flow_is_multicast_only(flow_attr)) + set_underlay_qp(dev, spec, underlay_qpn); + spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); if (is_drop) { flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; @@ -2313,6 +2455,14 @@ free: return err ? ERR_PTR(err) : handler; } +static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0); +} + static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, @@ -2449,6 +2599,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; int err; + int underlay_qpn; if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOMEM); @@ -2489,8 +2640,10 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, handler = create_dont_trap_rule(dev, ft_prio, flow_attr, dst); } else { - handler = create_flow_rule(dev, ft_prio, flow_attr, - dst); + underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? + mqp->underlay_qpn : 0; + handler = _create_flow_rule(dev, ft_prio, flow_attr, + dst, underlay_qpn); } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { @@ -2528,8 +2681,14 @@ unlock: static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *mqp = to_mqp(ibqp); int err; + if (mqp->flags & MLX5_IB_QP_UNDERLAY) { + mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n"); + return -EOPNOTSUPP; + } + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", @@ -2691,6 +2850,26 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); } +static void delay_drop_handler(struct work_struct *work) +{ + int err; + struct mlx5_ib_delay_drop *delay_drop = + container_of(work, struct mlx5_ib_delay_drop, + delay_drop_work); + + atomic_inc(&delay_drop->events_cnt); + + mutex_lock(&delay_drop->lock); + err = mlx5_core_set_delay_drop(delay_drop->dev->mdev, + delay_drop->timeout); + if (err) { + mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", + delay_drop->timeout); + delay_drop->activate = false; + } + mutex_unlock(&delay_drop->lock); +} + static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { @@ -2743,8 +2922,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, ibev.event = IB_EVENT_CLIENT_REREGISTER; port = (u8)param; break; + case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT: + schedule_work(&ibdev->delay_drop.delay_drop_work); + goto out; default: - return; + goto out; } ibev.device = &ibdev->ib_dev; @@ -2752,7 +2934,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (port < 1 || port > ibdev->num_ports) { mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); - return; + goto out; } if (ibdev->ib_active) @@ -2760,6 +2942,9 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (fatal) ibdev->ib_active = false; + +out: + return; } static int set_has_smi_cap(struct mlx5_ib_dev *dev) @@ -3042,7 +3227,7 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_XRC; - attr.ext.xrc.cq = devr->c0; + attr.ext.cq = devr->c0; attr.ext.xrc.xrcd = devr->x0; devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); @@ -3057,9 +3242,9 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) devr->s0->srq_context = NULL; devr->s0->srq_type = IB_SRQT_XRC; devr->s0->ext.xrc.xrcd = devr->x0; - devr->s0->ext.xrc.cq = devr->c0; + devr->s0->ext.cq = devr->c0; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); - atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->s0->ext.cq->usecnt); atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); @@ -3078,9 +3263,9 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) devr->s1->event_handler = NULL; devr->s1->srq_context = NULL; devr->s1->srq_type = IB_SRQT_BASIC; - devr->s1->ext.xrc.cq = devr->c0; + devr->s1->ext.cq = devr->c0; atomic_inc(&devr->p0->usecnt); - atomic_set(&devr->s0->usecnt, 0); + atomic_set(&devr->s1->usecnt, 0); for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { INIT_WORK(&devr->ports[port].pkey_change_work, @@ -3173,13 +3358,13 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *ibdev, char *str) { struct mlx5_ib_dev *dev = container_of(ibdev, struct mlx5_ib_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev), - fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d", + fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), + fw_rev_sub(dev->mdev)); } static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) @@ -3319,6 +3504,17 @@ static const struct mlx5_ib_counter cong_cnts[] = { INIT_CONG_COUNTER(np_cnp_sent), }; +static const struct mlx5_ib_counter extended_err_cnts[] = { + INIT_Q_COUNTER(resp_local_length_error), + INIT_Q_COUNTER(resp_cqe_error), + INIT_Q_COUNTER(req_cqe_error), + INIT_Q_COUNTER(req_remote_invalid_request), + INIT_Q_COUNTER(req_remote_access_errors), + INIT_Q_COUNTER(resp_remote_access_errors), + INIT_Q_COUNTER(resp_cqe_flush_error), + INIT_Q_COUNTER(req_cqe_flush_error), +}; + static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { unsigned int i; @@ -3343,6 +3539,10 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) num_counters += ARRAY_SIZE(retrans_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) + num_counters += ARRAY_SIZE(extended_err_cnts); + cnts->num_q_counters = num_counters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { @@ -3392,6 +3592,13 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, } } + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { + for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + names[j] = extended_err_cnts[i].name; + offsets[j] = extended_err_cnts[i].offset; + } + } + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) { names[j] = cong_cnts[i].name; @@ -3562,6 +3769,136 @@ mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, return netdev; } +static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!dev->delay_drop.dbg) + return; + debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs); + kfree(dev->delay_drop.dbg); + dev->delay_drop.dbg = NULL; +} + +static void cancel_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + cancel_work_sync(&dev->delay_drop.delay_drop_work); + delay_drop_debugfs_cleanup(dev); +} + +static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_delay_drop *delay_drop = filp->private_data; + char lbuf[20]; + int len; + + len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout); + return simple_read_from_buffer(buf, count, pos, lbuf, len); +} + +static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_delay_drop *delay_drop = filp->private_data; + u32 timeout; + u32 var; + + if (kstrtouint_from_user(buf, count, 0, &var)) + return -EFAULT; + + timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS * + 1000); + if (timeout != var) + mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n", + timeout); + + delay_drop->timeout = timeout; + + return count; +} + +static const struct file_operations fops_delay_drop_timeout = { + .owner = THIS_MODULE, + .open = simple_open, + .write = delay_drop_timeout_write, + .read = delay_drop_timeout_read, +}; + +static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_dbg_delay_drop *dbg; + + if (!mlx5_debugfs_root) + return 0; + + dbg = kzalloc(sizeof(*dbg), GFP_KERNEL); + if (!dbg) + return -ENOMEM; + + dbg->dir_debugfs = + debugfs_create_dir("delay_drop", + dev->mdev->priv.dbg_root); + if (!dbg->dir_debugfs) + return -ENOMEM; + + dbg->events_cnt_debugfs = + debugfs_create_atomic_t("num_timeout_events", 0400, + dbg->dir_debugfs, + &dev->delay_drop.events_cnt); + if (!dbg->events_cnt_debugfs) + goto out_debugfs; + + dbg->rqs_cnt_debugfs = + debugfs_create_atomic_t("num_rqs", 0400, + dbg->dir_debugfs, + &dev->delay_drop.rqs_cnt); + if (!dbg->rqs_cnt_debugfs) + goto out_debugfs; + + dbg->timeout_debugfs = + debugfs_create_file("timeout", 0600, + dbg->dir_debugfs, + &dev->delay_drop, + &fops_delay_drop_timeout); + if (!dbg->timeout_debugfs) + goto out_debugfs; + + dev->delay_drop.dbg = dbg; + + return 0; + +out_debugfs: + delay_drop_debugfs_cleanup(dev); + return -ENOMEM; +} + +static void init_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + mutex_init(&dev->delay_drop.lock); + dev->delay_drop.dev = dev; + dev->delay_drop.activate = false; + dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; + INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler); + atomic_set(&dev->delay_drop.rqs_cnt, 0); + atomic_set(&dev->delay_drop.events_cnt, 0); + + if (delay_drop_debugfs_init(dev)) + mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n"); +} + +static const struct cpumask * +mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + return mlx5_get_vector_affinity(dev->mdev, comp_vector); +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; @@ -3692,6 +4029,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; dev->ib_dev.get_dev_fw_str = get_dev_fw_str; + dev->ib_dev.get_vector_affinity = mlx5_ib_get_vector_affinity; if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) dev->ib_dev.alloc_rdma_netdev = mlx5_ib_alloc_rdma_netdev; @@ -3729,18 +4067,20 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } + dev->ib_dev.create_flow = mlx5_ib_create_flow; + dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; + dev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { - dev->ib_dev.create_flow = mlx5_ib_create_flow; - dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; dev->ib_dev.create_wq = mlx5_ib_create_wq; dev->ib_dev.modify_wq = mlx5_ib_modify_wq; dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; dev->ib_dev.uverbs_ex_cmd_mask |= - (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | @@ -3760,6 +4100,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) err = mlx5_enable_eth(dev); if (err) goto err_free_port; + dev->roce.last_port_state = IB_PORT_DOWN; } err = create_dev_resources(&dev->devr); @@ -3776,9 +4117,13 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) goto err_odp; } + err = mlx5_ib_init_cong_debugfs(dev); + if (err) + goto err_cnt; + dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); if (!dev->mdev->priv.uar) - goto err_cnt; + goto err_cong; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); if (err) @@ -3796,18 +4141,25 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (err) goto err_dev; + init_delay_drop(dev); + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) - goto err_umrc; + goto err_delay_drop; } + if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + MLX5_CAP_GEN(mdev, disable_local_lb)) + mutex_init(&dev->lb_mutex); + dev->ib_active = true; return dev; -err_umrc: +err_delay_drop: + cancel_delay_drop(dev); destroy_umrc_res(dev); err_dev: @@ -3823,6 +4175,8 @@ err_uar_page: mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); err_cnt: + mlx5_ib_cleanup_cong_debugfs(dev); +err_cong: if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) mlx5_ib_dealloc_counters(dev); @@ -3852,11 +4206,13 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) struct mlx5_ib_dev *dev = context; enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); + cancel_delay_drop(dev); mlx5_remove_netdev_notifier(dev); ib_unregister_device(&dev->ib_dev); mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); mlx5_free_bfreg(dev->mdev, &dev->bfreg); mlx5_put_uars_page(dev->mdev, mdev->priv.uar); + mlx5_ib_cleanup_cong_debugfs(dev); if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) mlx5_ib_dealloc_counters(dev); destroy_umrc_res(dev); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index bdcf25410c99..189e80cd6b2f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -107,6 +107,11 @@ enum { MLX5_CQE_VERSION_V1, }; +enum { + MLX5_TM_MAX_RNDV_MSG_SIZE = 64, + MLX5_TM_MAX_SGE = 1, +}; + struct mlx5_ib_vma_private_data { struct list_head list; struct vm_area_struct *vma; @@ -247,6 +252,10 @@ struct mlx5_ib_wq { void *qend; }; +enum mlx5_ib_wq_flags { + MLX5_IB_WQ_FLAGS_DELAY_DROP = 0x1, +}; + struct mlx5_ib_rwq { struct ib_wq ibwq; struct mlx5_core_qp core_qp; @@ -264,6 +273,7 @@ struct mlx5_ib_rwq { u32 wqe_count; u32 wqe_shift; int wq_sig; + u32 create_flags; /* Use enum mlx5_ib_wq_flags */ }; enum { @@ -378,6 +388,7 @@ struct mlx5_ib_qp { struct list_head cq_recv_list; struct list_head cq_send_list; u32 rate_limit; + u32 underlay_qpn; }; struct mlx5_ib_cq_buf { @@ -399,6 +410,7 @@ enum mlx5_ib_qp_flags { MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, MLX5_IB_QP_RSS = 1 << 8, MLX5_IB_QP_CVLAN_STRIPPING = 1 << 9, + MLX5_IB_QP_UNDERLAY = 1 << 10, }; struct mlx5_umr_wr { @@ -496,7 +508,7 @@ struct mlx5_ib_mr { struct mlx5_shared_mr_info *smr_info; struct list_head list; int order; - int umred; + bool allocated_from_cache; int npages; struct mlx5_ib_dev *dev; u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; @@ -616,6 +628,63 @@ struct mlx5_roce { struct net_device *netdev; struct notifier_block nb; atomic_t next_port; + enum ib_port_state last_port_state; +}; + +struct mlx5_ib_dbg_param { + int offset; + struct mlx5_ib_dev *dev; + struct dentry *dentry; +}; + +enum mlx5_ib_dbg_cc_types { + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE, + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI, + MLX5_IB_DBG_CC_RP_TIME_RESET, + MLX5_IB_DBG_CC_RP_BYTE_RESET, + MLX5_IB_DBG_CC_RP_THRESHOLD, + MLX5_IB_DBG_CC_RP_AI_RATE, + MLX5_IB_DBG_CC_RP_HAI_RATE, + MLX5_IB_DBG_CC_RP_MIN_DEC_FAC, + MLX5_IB_DBG_CC_RP_MIN_RATE, + MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP, + MLX5_IB_DBG_CC_RP_DCE_TCP_G, + MLX5_IB_DBG_CC_RP_DCE_TCP_RTT, + MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD, + MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE, + MLX5_IB_DBG_CC_RP_GD, + MLX5_IB_DBG_CC_NP_CNP_DSCP, + MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE, + MLX5_IB_DBG_CC_NP_CNP_PRIO, + MLX5_IB_DBG_CC_MAX, +}; + +struct mlx5_ib_dbg_cc_params { + struct dentry *root; + struct mlx5_ib_dbg_param params[MLX5_IB_DBG_CC_MAX]; +}; + +enum { + MLX5_MAX_DELAY_DROP_TIMEOUT_MS = 100, +}; + +struct mlx5_ib_dbg_delay_drop { + struct dentry *dir_debugfs; + struct dentry *rqs_cnt_debugfs; + struct dentry *events_cnt_debugfs; + struct dentry *timeout_debugfs; +}; + +struct mlx5_ib_delay_drop { + struct mlx5_ib_dev *dev; + struct work_struct delay_drop_work; + /* serialize setting of delay drop */ + struct mutex lock; + u32 timeout; + bool activate; + atomic_t events_cnt; + atomic_t rqs_cnt; + struct mlx5_ib_dbg_delay_drop *dbg; }; struct mlx5_ib_dev { @@ -652,9 +721,15 @@ struct mlx5_ib_dev { struct list_head qp_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; - struct mlx5_sq_bfreg bfreg; - struct mlx5_sq_bfreg fp_bfreg; - u8 umr_fence; + struct mlx5_sq_bfreg bfreg; + struct mlx5_sq_bfreg fp_bfreg; + struct mlx5_ib_delay_drop delay_drop; + struct mlx5_ib_dbg_cc_params *dbg_cc_params; + + /* protect the user_td */ + struct mutex lb_mutex; + u32 user_td; + u8 umr_fence; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -904,6 +979,9 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type); +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev); +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev); + /* GSI QP helper functions */ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 2c40a2e989d2..0e2789d9bb4d 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -48,7 +48,7 @@ enum { #define MLX5_UMR_ALIGN 2048 static int clean_mr(struct mlx5_ib_mr *mr); -static int use_umr(struct mlx5_ib_dev *dev, int order); +static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) @@ -183,7 +183,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) break; } mr->order = ent->order; - mr->umred = 1; + mr->allocated_from_cache = 1; mr->dev = dev; MLX5_SET(mkc, mkc, free, 1); @@ -491,16 +491,18 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_ib_mr *mr = NULL; struct mlx5_cache_ent *ent; + int last_umr_cache_entry; int c; int i; c = order2idx(dev, order); - if (c < 0 || c > MAX_UMR_CACHE_ENTRY) { + last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev)); + if (c < 0 || c > last_umr_cache_entry) { mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); return NULL; } - for (i = c; i < MAX_UMR_CACHE_ENTRY; i++) { + for (i = c; i <= last_umr_cache_entry; i++) { ent = &cache->ent[i]; mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); @@ -674,12 +676,12 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); queue_work(cache->wq, &ent->work); - if (i > MAX_UMR_CACHE_ENTRY) { + if (i > MR_CACHE_LAST_STD_ENTRY) { mlx5_odp_init_mr_cache_entry(ent); continue; } - if (!use_umr(dev, ent->order)) + if (ent->order > mr_cache_max_order(dev)) continue; ent->page = PAGE_SHIFT; @@ -806,21 +808,22 @@ err_free: return ERR_PTR(err); } -static int get_octo_len(u64 addr, u64 len, int page_size) +static int get_octo_len(u64 addr, u64 len, int page_shift) { + u64 page_size = 1ULL << page_shift; u64 offset; int npages; offset = addr & (page_size - 1); - npages = ALIGN(len + offset, page_size) >> ilog2(page_size); + npages = ALIGN(len + offset, page_size) >> page_shift; return (npages + 1) / 2; } -static int use_umr(struct mlx5_ib_dev *dev, int order) +static int mr_cache_max_order(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) - return order <= MAX_UMR_CACHE_ENTRY + 2; - return order <= MLX5_MAX_UMR_SHIFT; + return MR_CACHE_LAST_STD_ENTRY + 2; + return MLX5_MAX_UMR_SHIFT; } static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length, @@ -896,7 +899,8 @@ static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, return err; } -static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, +static struct mlx5_ib_mr *alloc_mr_from_cache( + struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr, u64 len, int npages, int page_shift, int order, int access_flags) { @@ -928,16 +932,6 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, mr->mmkey.size = len; mr->mmkey.pd = to_mpd(pd)->pdn; - err = mlx5_ib_update_xlt(mr, 0, npages, page_shift, - MLX5_IB_UPD_XLT_ENABLE); - - if (err) { - mlx5_mr_cache_free(dev, mr); - return ERR_PTR(err); - } - - mr->live = 1; - return mr; } @@ -1103,7 +1097,8 @@ free_xlt: static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, u64 virt_addr, u64 length, struct ib_umem *umem, int npages, - int page_shift, int access_flags) + int page_shift, int access_flags, + bool populate) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1118,15 +1113,19 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, if (!mr) return ERR_PTR(-ENOMEM); - inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + - sizeof(*pas) * ((npages + 1) / 2) * 2; + mr->ibmr.pd = pd; + mr->access_flags = access_flags; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + if (populate) + inlen += sizeof(*pas) * roundup(npages, 2); in = kvzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_1; } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - if (!(access_flags & IB_ACCESS_ON_DEMAND)) + if (populate && !(access_flags & IB_ACCESS_ON_DEMAND)) mlx5_ib_populate_pas(dev, umem, page_shift, pas, pg_cap ? MLX5_IB_MTT_PRESENT : 0); @@ -1135,23 +1134,27 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, !populate); MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT); MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, start_addr, virt_addr); MLX5_SET64(mkc, mkc, len, length); MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); MLX5_SET(mkc, mkc, bsf_octword_size, 0); MLX5_SET(mkc, mkc, translations_octword_size, - get_octo_len(virt_addr, length, 1 << page_shift)); + get_octo_len(virt_addr, length, page_shift)); MLX5_SET(mkc, mkc, log_page_size, page_shift); MLX5_SET(mkc, mkc, qpn, 0xffffff); - MLX5_SET(create_mkey_in, in, translations_octword_actual_size, - get_octo_len(virt_addr, length, 1 << page_shift)); + if (populate) { + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + get_octo_len(virt_addr, length, page_shift)); + } err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); if (err) { @@ -1160,9 +1163,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, } mr->mmkey.type = MLX5_MKEY_MR; mr->desc_size = sizeof(struct mlx5_mtt); - mr->umem = umem; mr->dev = dev; - mr->live = 1; kvfree(in); mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); @@ -1202,6 +1203,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int ncont; int order; int err; + bool use_umr = true; mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); @@ -1220,27 +1222,29 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, err = mr_umem_get(pd, start, length, access_flags, &umem, &npages, &page_shift, &ncont, &order); - if (err < 0) + if (err < 0) return ERR_PTR(err); - if (use_umr(dev, order)) { - mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, - order, access_flags); + if (order <= mr_cache_max_order(dev)) { + mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont, + page_shift, order, access_flags); if (PTR_ERR(mr) == -EAGAIN) { mlx5_ib_dbg(dev, "cache empty for order %d", order); mr = NULL; } - } else if (access_flags & IB_ACCESS_ON_DEMAND && - !MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) { - err = -EINVAL; - pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); - goto error; + } else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) { + if (access_flags & IB_ACCESS_ON_DEMAND) { + err = -EINVAL; + pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); + goto error; + } + use_umr = false; } if (!mr) { mutex_lock(&dev->slow_path_mutex); mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, - page_shift, access_flags); + page_shift, access_flags, !use_umr); mutex_unlock(&dev->slow_path_mutex); } @@ -1258,8 +1262,22 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, update_odp_mr(mr); #endif - return &mr->ibmr; + if (use_umr) { + int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE; + if (access_flags & IB_ACCESS_ON_DEMAND) + update_xlt_flags |= MLX5_IB_UPD_XLT_ZAP; + + err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift, + update_xlt_flags); + if (err) { + mlx5_ib_dereg_mr(&mr->ibmr); + return ERR_PTR(err); + } + } + + mr->live = 1; + return &mr->ibmr; error: ib_umem_release(umem); return ERR_PTR(err); @@ -1347,7 +1365,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, /* * UMR can't be used - MKey needs to be replaced. */ - if (mr->umred) { + if (mr->allocated_from_cache) { err = unreg_umr(dev, mr); if (err) mlx5_ib_warn(dev, "Failed to unregister MR\n"); @@ -1360,12 +1378,13 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, return err; mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, - page_shift, access_flags); + page_shift, access_flags, true); if (IS_ERR(mr)) return PTR_ERR(mr); - mr->umred = 0; + mr->allocated_from_cache = 0; + mr->live = 1; } else { /* * Send a UMR WQE @@ -1453,7 +1472,7 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) static int clean_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); - int umred = mr->umred; + int allocated_from_cache = mr->allocated_from_cache; int err; if (mr->sig) { @@ -1471,20 +1490,20 @@ static int clean_mr(struct mlx5_ib_mr *mr) mlx5_free_priv_descs(mr); - if (!umred) { + if (!allocated_from_cache) { + u32 key = mr->mmkey.key; + err = destroy_mkey(dev, mr); + kfree(mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", - mr->mmkey.key, err); + key, err); return err; } } else { mlx5_mr_cache_free(dev, mr); } - if (!umred) - kfree(mr); - return 0; } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f58f8f5f3ebe..acb79d3a4f1d 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -34,6 +34,7 @@ #include <rdma/ib_umem.h> #include <rdma/ib_cache.h> #include <rdma/ib_user_verbs.h> +#include <linux/mlx5/fs.h> #include "mlx5_ib.h" /* not supported currently */ @@ -453,7 +454,8 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, return -EINVAL; } - if (attr->qp_type == IB_QPT_RAW_PACKET) { + if (attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; } else { @@ -675,10 +677,14 @@ err_umem: return err; } -static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq) +static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq) { struct mlx5_ib_ucontext *context; + if (rwq->create_flags & MLX5_IB_WQ_FLAGS_DELAY_DROP) + atomic_dec(&dev->delay_drop.rqs_cnt); + context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &rwq->db); if (rwq->umem) @@ -959,11 +965,16 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, goto err_free; } - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL); - qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL); - qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL); - qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL); + qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt, + sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wqe_head), GFP_KERNEL); if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || !qp->sq.w_list || !qp->sq.wqe_head) { @@ -975,11 +986,11 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, return 0; err_wrid: - kfree(qp->sq.wqe_head); - kfree(qp->sq.w_list); - kfree(qp->sq.wrid); - kfree(qp->sq.wr_data); - kfree(qp->rq.wrid); + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); mlx5_db_free(dev->mdev, &qp->db); err_free: @@ -992,11 +1003,11 @@ err_buf: static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { - kfree(qp->sq.wqe_head); - kfree(qp->sq.w_list); - kfree(qp->sq.wrid); - kfree(qp->sq.wr_data); - kfree(qp->rq.wrid); + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); mlx5_db_free(dev->mdev, &qp->db); mlx5_buf_free(dev->mdev, &qp->buf); } @@ -1021,12 +1032,16 @@ static int is_connected(enum ib_qp_type qp_type) } static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, struct mlx5_ib_sq *sq, u32 tdn) { u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); MLX5_SET(tisc, tisc, transport_domain, tdn); + if (qp->flags & MLX5_IB_QP_UNDERLAY) + MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); + return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); } @@ -1068,11 +1083,16 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); MLX5_SET(sqc, sqc, flush_in_error_en, 1); + if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe)) + MLX5_SET(sqc, sqc, allow_multi_pkt_send_wqe, 1); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); MLX5_SET(sqc, sqc, tis_lst_sz, 1); MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, swp)) + MLX5_SET(sqc, sqc, allow_swp, 1); wq = MLX5_ADDR_OF(sqc, sqc, wq); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); @@ -1229,7 +1249,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u32 tdn = mucontext->tdn; if (qp->sq.wqe_cnt) { - err = create_raw_packet_qp_tis(dev, sq, tdn); + err = create_raw_packet_qp_tis(dev, qp, sq, tdn); if (err) return err; @@ -1503,10 +1523,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, u32 *in; int err; - base = init_attr->qp_type == IB_QPT_RAW_PACKET ? - &qp->raw_packet_qp.rq.base : - &qp->trans_qp.base; - mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); @@ -1588,10 +1604,28 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); + + if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (init_attr->qp_type != IB_QPT_UD || + (MLX5_CAP_GEN(dev->mdev, port_type) != + MLX5_CAP_PORT_TYPE_IB) || + !mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) { + mlx5_ib_dbg(dev, "Source QP option isn't supported\n"); + return -EOPNOTSUPP; + } + + qp->flags |= MLX5_IB_QP_UNDERLAY; + qp->underlay_qpn = init_attr->source_qpn; + } } else { qp->wq_sig = !!wq_signature; } + base = (init_attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + qp->has_rq = qp_has_rq(init_attr); err = set_rq_size(dev, &init_attr->cap, qp->has_rq, qp, (pd && pd->uobject) ? &ucmd : NULL); @@ -1695,10 +1729,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr)); - if (qp->sq.wqe_cnt) + if (qp->sq.wqe_cnt) { MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); - else + } else { MLX5_SET(qpc, qpc, no_sq, 1); + if (init_attr->srq && + init_attr->srq->srq_type == IB_SRQT_TM) + MLX5_SET(qpc, qpc, offload_type, + MLX5_QPC_OFFLOAD_TYPE_RNDV); + } /* Set default resources */ switch (init_attr->qp_type) { @@ -1742,7 +1781,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->flags |= MLX5_IB_QP_LSO; } - if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (init_attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); err = create_raw_packet_qp(dev, qp, in, pd); @@ -1894,7 +1934,7 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { struct mlx5_ib_cq *send_cq, *recv_cq; - struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct mlx5_ib_qp_base *base; unsigned long flags; int err; @@ -1903,12 +1943,14 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) return; } - base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ? + base = (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) ? &qp->raw_packet_qp.rq.base : &qp->trans_qp.base; if (qp->state != IB_QPS_RESET) { - if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { + if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && + !(qp->flags & MLX5_IB_QP_UNDERLAY)) { err = mlx5_core_qp_modify(dev->mdev, MLX5_CMD_OP_2RST_QP, 0, NULL, &base->mqp); @@ -1947,7 +1989,8 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) mlx5_ib_unlock_cqs(send_cq, recv_cq); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { destroy_raw_packet_qp(dev, qp); } else { err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); @@ -2703,7 +2746,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; - } else if (ibqp->qp_type == IB_QPT_UD || + } else if ((ibqp->qp_type == IB_QPT_UD && + !(qp->flags & MLX5_IB_QP_UNDERLAY)) || ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; } else if (attr_mask & IB_QP_PATH_MTU) { @@ -2800,6 +2844,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : qp->port) - 1; + + /* Underlay port should be used - index 0 function per port */ + if (qp->flags & MLX5_IB_QP_UNDERLAY) + port_num = 0; + mibport = &dev->port[port_num]; context->qp_counter_set_usr_page |= cpu_to_be32((u32)(mibport->cnts.set_id) << 24); @@ -2825,7 +2874,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { struct mlx5_modify_raw_qp_param raw_qp_param = {}; raw_qp_param.operation = op; @@ -2914,7 +2964,13 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } - if (qp_type != MLX5_IB_QPT_REG_UMR && + if (qp->flags & MLX5_IB_QP_UNDERLAY) { + if (attr_mask & ~(IB_QP_STATE | IB_QP_CUR_STATE)) { + mlx5_ib_dbg(dev, "invalid attr_mask 0x%x when underlay QP is used\n", + attr_mask); + goto out; + } + } else if (qp_type != MLX5_IB_QPT_REG_UMR && !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); @@ -4478,9 +4534,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr); + /* Not all of output fields are applicable, make sure to zero them */ + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + memset(qp_attr, 0, sizeof(*qp_attr)); + mutex_lock(&qp->mutex); - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); if (err) goto out; @@ -4598,6 +4659,27 @@ static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type) } } +static int set_delay_drop(struct mlx5_ib_dev *dev) +{ + int err = 0; + + mutex_lock(&dev->delay_drop.lock); + if (dev->delay_drop.activate) + goto out; + + err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout); + if (err) + goto out; + + dev->delay_drop.activate = true; +out: + mutex_unlock(&dev->delay_drop.lock); + + if (!err) + atomic_inc(&dev->delay_drop.rqs_cnt); + return err; +} + static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, struct ib_wq_init_attr *init_attr) { @@ -4652,9 +4734,28 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, } MLX5_SET(rqc, rqc, scatter_fcs, 1); } + if (init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + if (!(dev->ib_dev.attrs.raw_packet_caps & + IB_RAW_PACKET_CAP_DELAY_DROP)) { + mlx5_ib_dbg(dev, "Delay drop is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, delay_drop_en, 1); + } rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); + if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + err = set_delay_drop(dev); + if (err) { + mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n", + err); + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + } else { + rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP; + } + } out: kvfree(in); return err; @@ -4788,7 +4889,7 @@ struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, err_copy: mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); err_user_rq: - destroy_user_rq(pd, rwq); + destroy_user_rq(dev, pd, rwq); err: kfree(rwq); return ERR_PTR(err); @@ -4800,7 +4901,7 @@ int mlx5_ib_destroy_wq(struct ib_wq *wq) struct mlx5_ib_rwq *rwq = to_mrwq(wq); mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); - destroy_user_rq(wq->pd, rwq); + destroy_user_rq(dev, wq->pd, rwq); kfree(rwq); return 0; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 43707b101f47..6d5fadad9090 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -101,7 +101,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, udata->inlen - sizeof(ucmd))) return -EINVAL; - if (in->type == IB_SRQT_XRC) { + if (in->type != IB_SRQT_BASIC) { err = get_srq_user_index(to_mucontext(pd->uobject->context), &ucmd, udata->inlen, &uidx); if (err) @@ -145,7 +145,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; in->page_offset = offset; if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && - in->type == IB_SRQT_XRC) + in->type != IB_SRQT_BASIC) in->user_index = uidx; return 0; @@ -196,7 +196,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } mlx5_fill_page_array(&srq->buf, in->pas); - srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); + srq->wrid = kvmalloc_array(srq->msrq.max, sizeof(u64), GFP_KERNEL); if (!srq->wrid) { err = -ENOMEM; goto err_in; @@ -205,7 +205,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, in->log_page_size = srq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && - in->type == IB_SRQT_XRC) + in->type != IB_SRQT_BASIC) in->user_index = MLX5_IB_DEFAULT_UIDX; return 0; @@ -230,7 +230,7 @@ static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) { - kfree(srq->wrid); + kvfree(srq->wrid); mlx5_buf_free(dev->mdev, &srq->buf); mlx5_db_free(dev->mdev, &srq->db); } @@ -292,14 +292,29 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, in.wqe_shift = srq->msrq.wqe_shift - 4; if (srq->wq_sig) in.flags |= MLX5_SRQ_FLAG_WQ_SIG; - if (init_attr->srq_type == IB_SRQT_XRC) { + + if (init_attr->srq_type == IB_SRQT_XRC) in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; - in.cqn = to_mcq(init_attr->ext.xrc.cq)->mcq.cqn; - } else if (init_attr->srq_type == IB_SRQT_BASIC) { + else in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn; - in.cqn = to_mcq(dev->devr.c0)->mcq.cqn; + + if (init_attr->srq_type == IB_SRQT_TM) { + in.tm_log_list_size = + ilog2(init_attr->ext.tag_matching.max_num_tags) + 1; + if (in.tm_log_list_size > + MLX5_CAP_GEN(dev->mdev, log_tag_matching_list_sz)) { + mlx5_ib_dbg(dev, "TM SRQ max_num_tags exceeding limit\n"); + err = -EINVAL; + goto err_usr_kern_srq; + } + in.flags |= MLX5_SRQ_FLAG_RNDV; } + if (ib_srq_has_cq(init_attr->srq_type)) + in.cqn = to_mcq(init_attr->ext.cq)->mcq.cqn; + else + in.cqn = to_mcq(dev->devr.c0)->mcq.cqn; + in.pd = to_mpd(pd)->pdn; in.db_record = srq->db.dma; err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in); diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c index 2aec9908c40a..e7f6223e9c60 100644 --- a/drivers/infiniband/hw/mthca/mthca_av.c +++ b/drivers/infiniband/hw/mthca/mthca_av.c @@ -186,7 +186,7 @@ int mthca_create_ah(struct mthca_dev *dev, on_hca_fail: if (ah->type == MTHCA_AH_PCI_POOL) { - ah->av = pci_pool_zalloc(dev->av_table.pool, + ah->av = dma_pool_zalloc(dev->av_table.pool, GFP_ATOMIC, &ah->avdma); if (!ah->av) return -ENOMEM; @@ -250,7 +250,7 @@ int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah) break; case MTHCA_AH_PCI_POOL: - pci_pool_free(dev->av_table.pool, ah->av, ah->avdma); + dma_pool_free(dev->av_table.pool, ah->av, ah->avdma); break; case MTHCA_AH_KMALLOC: @@ -340,7 +340,7 @@ int mthca_init_av_table(struct mthca_dev *dev) if (err) return err; - dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev, + dev->av_table.pool = dma_pool_create("mthca_av", &dev->pdev->dev, MTHCA_AV_SIZE, MTHCA_AV_SIZE, 0); if (!dev->av_table.pool) @@ -360,7 +360,7 @@ int mthca_init_av_table(struct mthca_dev *dev) return 0; out_free_pool: - pci_pool_destroy(dev->av_table.pool); + dma_pool_destroy(dev->av_table.pool); out_free_alloc: mthca_alloc_cleanup(&dev->av_table.alloc); @@ -374,6 +374,6 @@ void mthca_cleanup_av_table(struct mthca_dev *dev) if (dev->av_table.av_map) iounmap(dev->av_table.av_map); - pci_pool_destroy(dev->av_table.pool); + dma_pool_destroy(dev->av_table.pool); mthca_alloc_cleanup(&dev->av_table.alloc); } diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 9d83a53c0c67..419a2a20c047 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -538,7 +538,7 @@ int mthca_cmd_init(struct mthca_dev *dev) return -ENOMEM; } - dev->cmd.pool = pci_pool_create("mthca_cmd", dev->pdev, + dev->cmd.pool = dma_pool_create("mthca_cmd", &dev->pdev->dev, MTHCA_MAILBOX_SIZE, MTHCA_MAILBOX_SIZE, 0); if (!dev->cmd.pool) { @@ -551,7 +551,7 @@ int mthca_cmd_init(struct mthca_dev *dev) void mthca_cmd_cleanup(struct mthca_dev *dev) { - pci_pool_destroy(dev->cmd.pool); + dma_pool_destroy(dev->cmd.pool); iounmap(dev->hcr); if (dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS) iounmap(dev->cmd.dbell_map); @@ -621,7 +621,7 @@ struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, if (!mailbox) return ERR_PTR(-ENOMEM); - mailbox->buf = pci_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma); + mailbox->buf = dma_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma); if (!mailbox->buf) { kfree(mailbox); return ERR_PTR(-ENOMEM); @@ -635,7 +635,7 @@ void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox) if (!mailbox) return; - pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + dma_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); kfree(mailbox); } @@ -698,7 +698,7 @@ static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm, for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) { if (virt != -1) { pages[nent * 2] = cpu_to_be64(virt); - virt += 1 << lg; + virt += 1ULL << lg; } pages[nent * 2 + 1] = @@ -1921,7 +1921,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); - MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, ib_lid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); if (in_grh) @@ -1929,7 +1929,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, op_modifier |= 0x4; - in_modifier |= in_wc->slid << 16; + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; } err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index ec7da9a474cd..5508afbf1c67 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -118,7 +118,7 @@ enum { }; struct mthca_cmd { - struct pci_pool *pool; + struct dma_pool *pool; struct mutex hcr_mutex; struct semaphore poll_sem; struct semaphore event_sem; @@ -263,7 +263,7 @@ struct mthca_qp_table { }; struct mthca_av_table { - struct pci_pool *pool; + struct dma_pool *pool; int num_ddr_avs; u64 ddr_av_base; void __iomem *av_map; diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 7df3db71777a..093f7755c843 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -205,7 +205,7 @@ int mthca_process_mad(struct ib_device *ibdev, u16 *out_mad_pkey_index) { int err; - u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + u16 slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); u16 prev_lid = 0; struct ib_port_attr pattr; const struct ib_mad *in_mad = (const struct ib_mad *)in; @@ -256,7 +256,7 @@ int mthca_process_mad(struct ib_device *ibdev, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = pattr.lid; + prev_lid = ib_lid_cpu16(pattr.lid); err = mthca_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY, diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index c309e5c96383..e36a9bc52268 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -49,7 +49,6 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG @@ -1162,7 +1161,7 @@ static void mthca_remove_one(struct pci_dev *pdev) mutex_unlock(&mthca_device_mutex); } -static struct pci_device_id mthca_pci_table[] = { +static const struct pci_device_id mthca_pci_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), .driver_data = TAVOR }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR), diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index c197cd9b193f..6fee7795d1c8 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -914,7 +914,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int err = 0; int write_mtt_size; - if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) { + if (udata->inlen < sizeof ucmd) { if (!to_mucontext(pd->uobject->context)->reg_mr_warned) { mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", current->comm); @@ -1178,12 +1178,11 @@ static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *device, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *device, char *str) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", (int) (dev->fw_ver >> 32), (int) (dev->fw_ver >> 16) & 0xffff, (int) dev->fw_ver & 0xffff); diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index a30aa6527f7e..942ca84713c9 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -63,7 +63,6 @@ MODULE_AUTHOR("NetEffect"); MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); int interrupt_mod_interval = 0; @@ -102,7 +101,7 @@ static unsigned int ee_flsh_adapter; static unsigned int sysfs_nonidx_addr; static unsigned int sysfs_idx_addr; -static struct pci_device_id nes_pci_table[] = { +static const struct pci_device_id nes_pci_table[] = { { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), }, { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), }, {0} @@ -808,13 +807,6 @@ static void nes_remove(struct pci_dev *pcidev) } -static struct pci_driver nes_pci_driver = { - .name = DRV_NAME, - .id_table = nes_pci_table, - .probe = nes_probe, - .remove = nes_remove, -}; - static ssize_t adapter_show(struct device_driver *ddp, char *buf) { unsigned int devfn = 0xffffffff; @@ -1156,35 +1148,29 @@ static DRIVER_ATTR_RW(idx_addr); static DRIVER_ATTR_RW(idx_data); static DRIVER_ATTR_RW(wqm_quanta); -static int nes_create_driver_sysfs(struct pci_driver *drv) -{ - int error; - error = driver_create_file(&drv->driver, &driver_attr_adapter); - error |= driver_create_file(&drv->driver, &driver_attr_eeprom_cmd); - error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data); - error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd); - error |= driver_create_file(&drv->driver, &driver_attr_flash_data); - error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr); - error |= driver_create_file(&drv->driver, &driver_attr_nonidx_data); - error |= driver_create_file(&drv->driver, &driver_attr_idx_addr); - error |= driver_create_file(&drv->driver, &driver_attr_idx_data); - error |= driver_create_file(&drv->driver, &driver_attr_wqm_quanta); - return error; -} +static struct attribute *nes_attrs[] = { + &driver_attr_adapter.attr, + &driver_attr_eeprom_cmd.attr, + &driver_attr_eeprom_data.attr, + &driver_attr_flash_cmd.attr, + &driver_attr_flash_data.attr, + &driver_attr_nonidx_addr.attr, + &driver_attr_nonidx_data.attr, + &driver_attr_idx_addr.attr, + &driver_attr_idx_data.attr, + &driver_attr_wqm_quanta.attr, + NULL, +}; +ATTRIBUTE_GROUPS(nes); + +static struct pci_driver nes_pci_driver = { + .name = DRV_NAME, + .id_table = nes_pci_table, + .probe = nes_probe, + .remove = nes_remove, + .groups = nes_groups, +}; -static void nes_remove_driver_sysfs(struct pci_driver *drv) -{ - driver_remove_file(&drv->driver, &driver_attr_adapter); - driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd); - driver_remove_file(&drv->driver, &driver_attr_eeprom_data); - driver_remove_file(&drv->driver, &driver_attr_flash_cmd); - driver_remove_file(&drv->driver, &driver_attr_flash_data); - driver_remove_file(&drv->driver, &driver_attr_nonidx_addr); - driver_remove_file(&drv->driver, &driver_attr_nonidx_data); - driver_remove_file(&drv->driver, &driver_attr_idx_addr); - driver_remove_file(&drv->driver, &driver_attr_idx_data); - driver_remove_file(&drv->driver, &driver_attr_wqm_quanta); -} /** * nes_init_module - module initialization entry point @@ -1192,20 +1178,13 @@ static void nes_remove_driver_sysfs(struct pci_driver *drv) static int __init nes_init_module(void) { int retval; - int retval1; retval = nes_cm_start(); if (retval) { printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n"); return retval; } - retval = pci_register_driver(&nes_pci_driver); - if (retval >= 0) { - retval1 = nes_create_driver_sysfs(&nes_pci_driver); - if (retval1 < 0) - printk(KERN_ERR PFX "Unable to create NetEffect sys files.\n"); - } - return retval; + return pci_register_driver(&nes_pci_driver); } @@ -1215,7 +1194,6 @@ static int __init nes_init_module(void) static void __exit nes_exit_module(void) { nes_cm_stop(); - nes_remove_driver_sysfs(&nes_pci_driver); pci_unregister_driver(&nes_pci_driver); } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 25dcd7573df9..f0dc5f4aa177 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -481,21 +481,16 @@ static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr props->active_mtu = ib_mtu_int_to_enum(netdev->mtu); props->lid = 1; - props->lmc = 0; - props->sm_lid = 0; - props->sm_sl = 0; if (netif_queue_stopped(netdev)) props->state = IB_PORT_DOWN; else if (nesvnic->linkup) props->state = IB_PORT_ACTIVE; else props->state = IB_PORT_DOWN; - props->phys_state = 0; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; - props->qkey_viol_cntr = 0; props->active_width = IB_WIDTH_4X; props->active_speed = IB_SPEED_SDR; props->max_msg_sz = 0x80000000; @@ -3672,15 +3667,14 @@ static int nes_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *dev, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *dev, char *str) { struct nes_ib_device *nesibdev = container_of(dev, struct nes_ib_device, ibdev); struct nes_vnic *nesvnic = nesibdev->nesvnic; nes_debug(NES_DBG_INIT, "\n"); - snprintf(str, str_len, "%u.%u", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", (nesvnic->nesdev->nesadapter->firmware_version >> 16), (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 57c9a2ad0260..fbfbd9e96147 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -58,7 +58,6 @@ #include "ocrdma_stats.h" #include <rdma/ocrdma-abi.h> -MODULE_VERSION(OCRDMA_ROCE_DRV_VERSION); MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION); MODULE_AUTHOR("Emulex Corporation"); MODULE_LICENSE("Dual BSD/GPL"); @@ -108,12 +107,11 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *device, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *device, char *str) { struct ocrdma_dev *dev = get_ocrdma_dev(device); - snprintf(str, str_len, "%s", &dev->attr.fw_ver[0]); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", &dev->attr.fw_ver[0]); } static int ocrdma_register_device(struct ocrdma_dev *dev) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index b5851fd67d4f..97d033f51dc9 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -47,7 +47,6 @@ MODULE_DESCRIPTION("QLogic 40G/100G ROCE Driver"); MODULE_AUTHOR("QLogic Corporation"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(QEDR_MODULE_VERSION); #define QEDR_WQ_MULTIPLIER_DFT (3) @@ -69,13 +68,12 @@ static enum rdma_link_layer qedr_link_layer(struct ib_device *device, return IB_LINK_LAYER_ETHERNET; } -static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str) { struct qedr_dev *qedr = get_qedr_dev(ibdev); u32 fw_ver = (u32)qedr->attr.fw_ver; - snprintf(str, str_len, "%d. %d. %d. %d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d. %d. %d. %d", (fw_ver >> 24) & 0xFF, (fw_ver >> 16) & 0xFF, (fw_ver >> 8) & 0xFF, fw_ver & 0xFF); } @@ -778,6 +776,7 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, if (rc) goto init_err; + dev->user_dpm_enabled = dev_info.user_dpm_enabled; dev->num_hwfns = dev_info.common.num_hwfns; dev->rdma_ctx = dev->ops->rdma_get_rdma_ctx(cdev); diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index ab7784bfdac6..b2bb42e2805d 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -41,7 +41,6 @@ #include <linux/qed/roce_common.h> #include "qedr_hsi_rdma.h" -#define QEDR_MODULE_VERSION "8.10.10.0" #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA" #define DP_NAME(dev) ((dev)->ibdev.name) @@ -163,6 +162,8 @@ struct qedr_dev { struct qedr_qp *gsi_qp; unsigned long enet_state; + + u8 user_dpm_enabled; }; #define QEDR_MAX_SQ_PBL (0x8000) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 2ae71b8f1ba8..769ac07c3c8e 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -376,6 +376,9 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, memset(&uresp, 0, sizeof(uresp)); + uresp.dpm_enabled = dev->user_dpm_enabled; + uresp.wids_enabled = 1; + uresp.wid_count = oparams.wid_count; uresp.db_pa = ctx->dpi_phys_addr; uresp.db_size = ctx->dpi_size; uresp.max_send_wr = dev->attr.max_sqe; @@ -488,7 +491,7 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, (udata && context) ? "User Lib" : "Kernel"); if (!dev->rdma_ctx) { - DP_ERR(dev, "invlaid RDMA context\n"); + DP_ERR(dev, "invalid RDMA context\n"); return ERR_PTR(-EINVAL); } diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index a3e21a25cea5..f9e1c69603a5 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1,7 +1,7 @@ #ifndef _QIB_KERNEL_H #define _QIB_KERNEL_H /* - * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * @@ -443,7 +443,7 @@ struct qib_irq_notify; #endif struct qib_msix_entry { - struct msix_entry msix; + int irq; void *arg; #ifdef CONFIG_INFINIBAND_QIB_DCA int dca; @@ -1433,9 +1433,9 @@ int qib_pcie_init(struct pci_dev *, const struct pci_device_id *); int qib_pcie_ddinit(struct qib_devdata *, struct pci_dev *, const struct pci_device_id *); void qib_pcie_ddcleanup(struct qib_devdata *); -int qib_pcie_params(struct qib_devdata *, u32, u32 *, struct qib_msix_entry *); +int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent); int qib_reinit_intr(struct qib_devdata *); -void qib_enable_intx(struct pci_dev *); +void qib_enable_intx(struct qib_devdata *dd); void qib_nomsi(struct qib_devdata *); void qib_nomsix(struct qib_devdata *); void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *); diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c index 5bad8e3b40bb..5ed1ed93380f 100644 --- a/drivers/infiniband/hw/qib/qib_debugfs.c +++ b/drivers/infiniband/hw/qib/qib_debugfs.c @@ -1,6 +1,5 @@ -#ifdef CONFIG_DEBUG_FS /* - * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2013 - 2017 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -191,10 +190,10 @@ DEBUGFS_FILE(ctx_stats) static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) __acquires(RCU) { - struct qib_qp_iter *iter; + struct rvt_qp_iter *iter; loff_t n = *pos; - iter = qib_qp_iter_init(s->private); + iter = rvt_qp_iter_init(s->private, 0, NULL); /* stop calls rcu_read_unlock */ rcu_read_lock(); @@ -203,7 +202,7 @@ static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) return NULL; do { - if (qib_qp_iter_next(iter)) { + if (rvt_qp_iter_next(iter)) { kfree(iter); return NULL; } @@ -216,11 +215,11 @@ static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, loff_t *pos) __must_hold(RCU) { - struct qib_qp_iter *iter = iter_ptr; + struct rvt_qp_iter *iter = iter_ptr; (*pos)++; - if (qib_qp_iter_next(iter)) { + if (rvt_qp_iter_next(iter)) { kfree(iter); return NULL; } @@ -236,7 +235,7 @@ static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) { - struct qib_qp_iter *iter = iter_ptr; + struct rvt_qp_iter *iter = iter_ptr; if (!iter) return 0; @@ -284,6 +283,3 @@ void qib_dbg_exit(void) debugfs_remove_recursive(qib_dbg_root); qib_dbg_root = NULL; } - -#endif - diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 2b5982f743ef..719906a9fd51 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -66,7 +66,6 @@ MODULE_PARM_DESC(compat_ddr_negotiate, MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Intel <ibsupport@intel.com>"); MODULE_DESCRIPTION("Intel IB driver"); -MODULE_VERSION(QIB_DRIVER_VERSION); /* * QIB_PIO_MAXIBHDR is the max IB header size allowed for in our diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index e423b71e6ea0..3259a60e4f4f 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2013 - 2017 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. * All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. @@ -1742,38 +1742,32 @@ static void qib_setup_6120_interrupt(struct qib_devdata *dd) */ static void pe_boardname(struct qib_devdata *dd) { - char *n; - u32 boardid, namelen; + u32 boardid; boardid = SYM_FIELD(dd->revision, Revision, BoardID); switch (boardid) { case 2: - n = "InfiniPath_QLE7140"; + dd->boardname = "InfiniPath_QLE7140"; break; default: qib_dev_err(dd, "Unknown 6120 board with ID %u\n", boardid); - n = "Unknown_InfiniPath_6120"; + dd->boardname = "Unknown_InfiniPath_6120"; break; } - namelen = strlen(n) + 1; - dd->boardname = kmalloc(namelen, GFP_KERNEL); - if (dd->boardname) - snprintf(dd->boardname, namelen, "%s", n); if (dd->majrev != 4 || !dd->minrev || dd->minrev > 2) qib_dev_err(dd, - "Unsupported InfiniPath hardware revision %u.%u!\n", - dd->majrev, dd->minrev); + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->majrev, dd->minrev); snprintf(dd->boardversion, sizeof(dd->boardversion), "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, - (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + (unsigned int)SYM_FIELD(dd->revision, Revision_R, Arch), dd->majrev, dd->minrev, - (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); - + (unsigned int)SYM_FIELD(dd->revision, Revision_R, SW)); } /* @@ -1838,7 +1832,7 @@ static int qib_6120_setup_reset(struct qib_devdata *dd) bail: if (ret) { - if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL)) + if (qib_pcie_params(dd, dd->lbus_width, NULL)) qib_dev_err(dd, "Reset failed to setup PCIe or interrupts; continuing anyway\n"); /* clear the reset error, init error/hwerror mask */ @@ -3562,7 +3556,7 @@ struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *pdev, if (qib_mini_init) goto bail; - if (qib_pcie_params(dd, 8, NULL, NULL)) + if (qib_pcie_params(dd, 8, NULL)) qib_dev_err(dd, "Failed to setup PCIe or interrupts; continuing anyway\n"); dd->cspec->irq = pdev->irq; /* save IRQ */ diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index c3679c48e61c..04bdd3d487b1 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2011 - 2017 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. * All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. @@ -2049,41 +2050,35 @@ static void qib_setup_7220_interrupt(struct qib_devdata *dd) */ static void qib_7220_boardname(struct qib_devdata *dd) { - char *n; - u32 boardid, namelen; + u32 boardid; boardid = SYM_FIELD(dd->revision, Revision, BoardID); switch (boardid) { case 1: - n = "InfiniPath_QLE7240"; + dd->boardname = "InfiniPath_QLE7240"; break; case 2: - n = "InfiniPath_QLE7280"; + dd->boardname = "InfiniPath_QLE7280"; break; default: qib_dev_err(dd, "Unknown 7220 board with ID %u\n", boardid); - n = "Unknown_InfiniPath_7220"; + dd->boardname = "Unknown_InfiniPath_7220"; break; } - namelen = strlen(n) + 1; - dd->boardname = kmalloc(namelen, GFP_KERNEL); - if (dd->boardname) - snprintf(dd->boardname, namelen, "%s", n); - if (dd->majrev != 5 || !dd->minrev || dd->minrev > 2) qib_dev_err(dd, - "Unsupported InfiniPath hardware revision %u.%u!\n", - dd->majrev, dd->minrev); + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->majrev, dd->minrev); snprintf(dd->boardversion, sizeof(dd->boardversion), "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, - (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + (unsigned int)SYM_FIELD(dd->revision, Revision_R, Arch), dd->majrev, dd->minrev, - (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); + (unsigned int)SYM_FIELD(dd->revision, Revision_R, SW)); } /* @@ -2148,7 +2143,7 @@ static int qib_setup_7220_reset(struct qib_devdata *dd) bail: if (ret) { - if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL)) + if (qib_pcie_params(dd, dd->lbus_width, NULL)) qib_dev_err(dd, "Reset failed to setup PCIe or interrupts; continuing anyway\n"); @@ -3309,7 +3304,7 @@ static int qib_7220_intr_fallback(struct qib_devdata *dd) qib_devinfo(dd->pcidev, "MSI interrupt not detected, trying INTx interrupts\n"); qib_7220_free_irq(dd); - qib_enable_intx(dd->pcidev); + qib_enable_intx(dd); /* * Some newer kernels require free_irq before disable_msi, * and irq can be changed during disable and INTx enable @@ -4619,7 +4614,7 @@ struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *pdev, minwidth = 8; /* x8 capable boards */ break; } - if (qib_pcie_params(dd, minwidth, NULL, NULL)) + if (qib_pcie_params(dd, minwidth, NULL)) qib_dev_err(dd, "Failed to setup PCIe or interrupts; continuing anyway\n"); diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index bb2439fff8fa..14cadf6d6214 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. * Copyright (c) 2008 - 2012 QLogic Corporation. All rights reserved. * * This software is available to you under a choice of one of two @@ -2841,10 +2841,10 @@ static void qib_7322_nomsix(struct qib_devdata *dd) reset_dca_notifier(dd, &dd->cspec->msix_entries[i]); #endif irq_set_affinity_hint( - dd->cspec->msix_entries[i].msix.vector, NULL); + dd->cspec->msix_entries[i].irq, NULL); free_cpumask_var(dd->cspec->msix_entries[i].mask); - free_irq(dd->cspec->msix_entries[i].msix.vector, - dd->cspec->msix_entries[i].arg); + free_irq(dd->cspec->msix_entries[i].irq, + dd->cspec->msix_entries[i].arg); } qib_nomsix(dd); } @@ -3336,9 +3336,9 @@ static void reset_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m) qib_devinfo(dd->pcidev, "Disabling notifier on HCA %d irq %d\n", dd->unit, - m->msix.vector); + m->irq); irq_set_affinity_notifier( - m->msix.vector, + m->irq, NULL); m->notifier = NULL; } @@ -3354,7 +3354,7 @@ static void setup_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m) int ret; m->notifier = n; - n->notify.irq = m->msix.vector; + n->notify.irq = m->irq; n->notify.notify = qib_irq_notifier_notify; n->notify.release = qib_irq_notifier_release; n->arg = m->arg; @@ -3500,10 +3500,21 @@ try_intx: - 1, QIB_DRV_NAME "%d (kctx)", dd->unit); } - ret = request_irq( - dd->cspec->msix_entries[msixnum].msix.vector, - handler, 0, dd->cspec->msix_entries[msixnum].name, - arg); + + dd->cspec->msix_entries[msixnum].irq = pci_irq_vector( + dd->pcidev, msixnum); + if (dd->cspec->msix_entries[msixnum].irq < 0) { + qib_dev_err(dd, + "Couldn't get MSIx irq (vec=%d): %d\n", + msixnum, + dd->cspec->msix_entries[msixnum].irq); + qib_7322_nomsix(dd); + goto try_intx; + } + ret = request_irq(dd->cspec->msix_entries[msixnum].irq, + handler, 0, + dd->cspec->msix_entries[msixnum].name, + arg); if (ret) { /* * Shouldn't happen since the enable said we could @@ -3512,7 +3523,7 @@ try_intx: qib_dev_err(dd, "Couldn't setup MSIx interrupt (vec=%d, irq=%d): %d\n", msixnum, - dd->cspec->msix_entries[msixnum].msix.vector, + dd->cspec->msix_entries[msixnum].irq, ret); qib_7322_nomsix(dd); goto try_intx; @@ -3548,7 +3559,7 @@ try_intx: dd->cspec->msix_entries[msixnum].mask); } irq_set_affinity_hint( - dd->cspec->msix_entries[msixnum].msix.vector, + dd->cspec->msix_entries[msixnum].irq, dd->cspec->msix_entries[msixnum].mask); } msixnum++; @@ -3571,75 +3582,69 @@ bail:; static unsigned qib_7322_boardname(struct qib_devdata *dd) { /* Will need enumeration of board-types here */ - char *n; - u32 boardid, namelen; - unsigned features = DUAL_PORT_CAP; + u32 boardid; + unsigned int features = DUAL_PORT_CAP; boardid = SYM_FIELD(dd->revision, Revision, BoardID); switch (boardid) { case 0: - n = "InfiniPath_QLE7342_Emulation"; + dd->boardname = "InfiniPath_QLE7342_Emulation"; break; case 1: - n = "InfiniPath_QLE7340"; + dd->boardname = "InfiniPath_QLE7340"; dd->flags |= QIB_HAS_QSFP; features = PORT_SPD_CAP; break; case 2: - n = "InfiniPath_QLE7342"; + dd->boardname = "InfiniPath_QLE7342"; dd->flags |= QIB_HAS_QSFP; break; case 3: - n = "InfiniPath_QMI7342"; + dd->boardname = "InfiniPath_QMI7342"; break; case 4: - n = "InfiniPath_Unsupported7342"; + dd->boardname = "InfiniPath_Unsupported7342"; qib_dev_err(dd, "Unsupported version of QMH7342\n"); features = 0; break; case BOARD_QMH7342: - n = "InfiniPath_QMH7342"; + dd->boardname = "InfiniPath_QMH7342"; features = 0x24; break; case BOARD_QME7342: - n = "InfiniPath_QME7342"; + dd->boardname = "InfiniPath_QME7342"; break; case 8: - n = "InfiniPath_QME7362"; + dd->boardname = "InfiniPath_QME7362"; dd->flags |= QIB_HAS_QSFP; break; case BOARD_QMH7360: - n = "Intel IB QDR 1P FLR-QSFP Adptr"; + dd->boardname = "Intel IB QDR 1P FLR-QSFP Adptr"; dd->flags |= QIB_HAS_QSFP; break; case 15: - n = "InfiniPath_QLE7342_TEST"; + dd->boardname = "InfiniPath_QLE7342_TEST"; dd->flags |= QIB_HAS_QSFP; break; default: - n = "InfiniPath_QLE73xy_UNKNOWN"; + dd->boardname = "InfiniPath_QLE73xy_UNKNOWN"; qib_dev_err(dd, "Unknown 7322 board type %u\n", boardid); break; } dd->board_atten = 1; /* index into txdds_Xdr */ - namelen = strlen(n) + 1; - dd->boardname = kmalloc(namelen, GFP_KERNEL); - if (dd->boardname) - snprintf(dd->boardname, namelen, "%s", n); - snprintf(dd->boardversion, sizeof(dd->boardversion), "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, - (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + (unsigned int)SYM_FIELD(dd->revision, Revision_R, Arch), dd->majrev, dd->minrev, - (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); + (unsigned int)SYM_FIELD(dd->revision, Revision_R, SW)); if (qib_singleport && (features >> PORT_SPD_CAP_SHIFT) & PORT_SPD_CAP) { qib_devinfo(dd->pcidev, - "IB%u: Forced to single port mode by module parameter\n", - dd->unit); + "IB%u: Forced to single port mode by module parameter\n", + dd->unit); features &= PORT_SPD_CAP; } @@ -3744,7 +3749,6 @@ static int qib_do_7322_reset(struct qib_devdata *dd) if (msix_entries) { /* restore the MSIx vector address and data if saved above */ for (i = 0; i < msix_entries; i++) { - dd->cspec->msix_entries[i].msix.entry = i; if (!msix_vecsave || !msix_vecsave[2 * i]) continue; qib_write_kreg(dd, 2 * i + @@ -3762,8 +3766,7 @@ static int qib_do_7322_reset(struct qib_devdata *dd) write_7322_initregs(dd); if (qib_pcie_params(dd, dd->lbus_width, - &dd->cspec->num_msix_entries, - dd->cspec->msix_entries)) + &dd->cspec->num_msix_entries)) qib_dev_err(dd, "Reset failed to setup PCIe or interrupts; continuing anyway\n"); @@ -5195,7 +5198,7 @@ static int qib_7322_intr_fallback(struct qib_devdata *dd) qib_devinfo(dd->pcidev, "MSIx interrupt not detected, trying INTx interrupts\n"); qib_7322_nomsix(dd); - qib_enable_intx(dd->pcidev); + qib_enable_intx(dd); qib_setup_7322_interrupt(dd, 0); return 1; } @@ -6172,7 +6175,7 @@ static int setup_txselect(const char *str, struct kernel_param *kp) unsigned long val; char *n; - if (strlen(str) >= MAX_ATTEN_LEN) { + if (strlen(str) >= ARRAY_SIZE(txselect_list)) { pr_info("txselect_values string too long\n"); return -ENOSPC; } @@ -6183,7 +6186,7 @@ static int setup_txselect(const char *str, struct kernel_param *kp) TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); return -EINVAL; } - strcpy(txselect_list, str); + strncpy(txselect_list, str, ARRAY_SIZE(txselect_list) - 1); list_for_each_entry(dd, &qib_dev_list, list) if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322) @@ -7327,10 +7330,7 @@ struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev, if (!dd->cspec->msix_entries) tabsize = 0; - for (i = 0; i < tabsize; i++) - dd->cspec->msix_entries[i].msix.entry = i; - - if (qib_pcie_params(dd, 8, &tabsize, dd->cspec->msix_entries)) + if (qib_pcie_params(dd, 8, &tabsize)) qib_dev_err(dd, "Failed to setup PCIe or interrupts; continuing anyway\n"); /* may be less than we wanted, if not enough available */ diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 6c16ba1107ba..c5a4c65636d6 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -399,7 +399,7 @@ static int loadtime_init(struct qib_devdata *dd) if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) & QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) { qib_dev_err(dd, - "Driver only handles version %d, chip swversion is %d (%llx), failng\n", + "Driver only handles version %d, chip swversion is %d (%llx), failing\n", QIB_CHIP_SWVERSION, (int)(dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) & @@ -1398,7 +1398,6 @@ static void cleanup_device_data(struct qib_devdata *dd) qib_free_ctxtdata(dd, rcd); } kfree(tmp); - kfree(dd->boardname); } /* diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index da295e0392ed..82d9da9b6997 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -105,7 +105,7 @@ static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { struct ib_ah *ah; - ah = qib_create_qp0_ah(ibp, ibp->rvp.sm_lid); + ah = qib_create_qp0_ah(ibp, (u16)ibp->rvp.sm_lid); if (IS_ERR(ah)) ret = PTR_ERR(ah); else { @@ -134,24 +134,21 @@ static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) } /* - * Send a bad [PQ]_Key trap (ch. 14.3.8). + * Send a bad P_Key trap (ch. 14.3.8). */ -void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl, - u32 qp1, u32 qp2, __be16 lid1, __be16 lid2) +void qib_bad_pkey(struct qib_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2) { struct ib_mad_notice_attr data; - if (trap_num == IB_NOTICE_TRAP_BAD_PKEY) - ibp->rvp.pkey_violations++; - else - ibp->rvp.qkey_violations++; ibp->rvp.n_pkt_drops++; + ibp->rvp.pkey_violations++; /* Send violation trap */ data.generic_type = IB_NOTICE_TYPE_SECURITY; data.prod_type_msb = 0; data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = trap_num; + data.trap_num = IB_NOTICE_TRAP_BAD_PKEY; data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); data.toggle_count = 0; memset(&data.details, 0, sizeof(data.details)); @@ -499,7 +496,7 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, pip->mkey = ibp->rvp.mkey; pip->gid_prefix = ibp->rvp.gid_prefix; pip->lid = cpu_to_be16(ppd->lid); - pip->sm_lid = cpu_to_be16(ibp->rvp.sm_lid); + pip->sm_lid = cpu_to_be16((u16)ibp->rvp.sm_lid); pip->cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); /* pip->diag_code; */ pip->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period); @@ -874,8 +871,6 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, ib_dispatch_event(&event); } - ret = subn_get_portinfo(smp, ibdev, port); - /* restore re-reg bit per o14-12.2.1 */ pip->clientrereg_resv_subnetto |= clientrereg; @@ -1578,8 +1573,8 @@ static int pma_get_portcounters_cong(struct ib_pma_mad *pmp, cntrs.port_xmit_packets -= ibp->z_port_xmit_packets; cntrs.port_rcv_packets -= ibp->z_port_rcv_packets; - memset(pmp->reserved, 0, sizeof(pmp->reserved) + - sizeof(pmp->data)); + memset(pmp->reserved, 0, sizeof(pmp->reserved)); + memset(pmp->data, 0, sizeof(pmp->data)); /* * Set top 3 bits to indicate interval in picoseconds in diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index c379b8342a09..d90403e31a9d 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2010 - 2017 Intel Corporation. All rights reserved. * Copyright (c) 2008, 2009 QLogic Corporation. All rights reserved. * * This software is available to you under a choice of one of two @@ -187,112 +188,84 @@ void qib_pcie_ddcleanup(struct qib_devdata *dd) pci_set_drvdata(dd->pcidev, NULL); } -static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt, - struct qib_msix_entry *qib_msix_entry) -{ - int ret; - int nvec = *msixcnt; - struct msix_entry *msix_entry; - int i; - - ret = pci_msix_vec_count(dd->pcidev); - if (ret < 0) - goto do_intx; - - nvec = min(nvec, ret); - - /* We can't pass qib_msix_entry array to qib_msix_setup - * so use a dummy msix_entry array and copy the allocated - * irq back to the qib_msix_entry array. */ - msix_entry = kcalloc(nvec, sizeof(*msix_entry), GFP_KERNEL); - if (!msix_entry) - goto do_intx; - - for (i = 0; i < nvec; i++) - msix_entry[i] = qib_msix_entry[i].msix; - - ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); - if (ret < 0) - goto free_msix_entry; - else - nvec = ret; - - for (i = 0; i < nvec; i++) - qib_msix_entry[i].msix = msix_entry[i]; - - kfree(msix_entry); - *msixcnt = nvec; - return; - -free_msix_entry: - kfree(msix_entry); - -do_intx: - qib_dev_err( - dd, - "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n", - nvec, ret); - *msixcnt = 0; - qib_enable_intx(dd->pcidev); -} - /** * We save the msi lo and hi values, so we can restore them after * chip reset (the kernel PCI infrastructure doesn't yet handle that * correctly. */ -static int qib_msi_setup(struct qib_devdata *dd, int pos) +static void qib_msi_setup(struct qib_devdata *dd, int pos) { struct pci_dev *pdev = dd->pcidev; u16 control; - int ret; - ret = pci_enable_msi(pdev); - if (ret) - qib_dev_err(dd, - "pci_enable_msi failed: %d, interrupts may not work\n", - ret); - /* continue even if it fails, we may still be OK... */ - - pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_LO, - &dd->msi_lo); - pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_HI, - &dd->msi_hi); + pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_LO, &dd->msi_lo); + pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_HI, &dd->msi_hi); pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control); + /* now save the data (vector) info */ - pci_read_config_word(pdev, pos + ((control & PCI_MSI_FLAGS_64BIT) - ? 12 : 8), + pci_read_config_word(pdev, + pos + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), &dd->msi_data); - return ret; } -int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent, - struct qib_msix_entry *entry) +static int qib_allocate_irqs(struct qib_devdata *dd, u32 maxvec) +{ + unsigned int flags = PCI_IRQ_LEGACY; + + /* Check our capabilities */ + if (dd->pcidev->msix_cap) { + flags |= PCI_IRQ_MSIX; + } else { + if (dd->pcidev->msi_cap) { + flags |= PCI_IRQ_MSI; + /* Get msi_lo and msi_hi */ + qib_msi_setup(dd, dd->pcidev->msi_cap); + } + } + + if (!(flags & (PCI_IRQ_MSIX | PCI_IRQ_MSI))) + qib_dev_err(dd, "No PCI MSI or MSIx capability!\n"); + + return pci_alloc_irq_vectors(dd->pcidev, 1, maxvec, flags); +} + +int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent) { u16 linkstat, speed; - int pos = 0, ret = 1; + int nvec; + int maxvec; + int ret = 0; if (!pci_is_pcie(dd->pcidev)) { qib_dev_err(dd, "Can't find PCI Express capability!\n"); /* set up something... */ dd->lbus_width = 1; dd->lbus_speed = 2500; /* Gen1, 2.5GHz */ + ret = -1; goto bail; } - pos = dd->pcidev->msix_cap; - if (nent && *nent && pos) { - qib_msix_setup(dd, pos, nent, entry); - ret = 0; /* did it, either MSIx or INTx */ - } else { - pos = dd->pcidev->msi_cap; - if (pos) - ret = qib_msi_setup(dd, pos); - else - qib_dev_err(dd, "No PCI MSI or MSIx capability!\n"); + maxvec = (nent && *nent) ? *nent : 1; + nvec = qib_allocate_irqs(dd, maxvec); + if (nvec < 0) { + ret = nvec; + goto bail; + } + + /* + * If nent exists, make sure to record how many vectors were allocated + */ + if (nent) { + *nent = nvec; + + /* + * If we requested (nent) MSIX, but msix_enabled is not set, + * pci_alloc_irq_vectors() enabled INTx. + */ + if (!dd->pcidev->msix_enabled) + qib_dev_err(dd, + "no msix vectors allocated, using INTx\n"); } - if (!pos) - qib_enable_intx(dd->pcidev); pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); /* @@ -379,7 +352,7 @@ int qib_reinit_intr(struct qib_devdata *dd) ret = 1; bail: if (!ret && (dd->flags & QIB_HAS_INTX)) { - qib_enable_intx(dd->pcidev); + qib_enable_intx(dd); ret = 1; } @@ -397,7 +370,7 @@ bail: void qib_nomsi(struct qib_devdata *dd) { dd->msi_lo = 0; - pci_disable_msi(dd->pcidev); + pci_free_irq_vectors(dd->pcidev); } /* @@ -405,23 +378,21 @@ void qib_nomsi(struct qib_devdata *dd) */ void qib_nomsix(struct qib_devdata *dd) { - pci_disable_msix(dd->pcidev); + pci_free_irq_vectors(dd->pcidev); } /* * Similar to pci_intx(pdev, 1), except that we make sure * msi(x) is off. */ -void qib_enable_intx(struct pci_dev *pdev) +void qib_enable_intx(struct qib_devdata *dd) { u16 cw, new; int pos; + struct pci_dev *pdev = dd->pcidev; - /* first, turn on INTx */ - pci_read_config_word(pdev, PCI_COMMAND, &cw); - new = cw & ~PCI_COMMAND_INTX_DISABLE; - if (new != cw) - pci_write_config_word(pdev, PCI_COMMAND, new); + if (pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_LEGACY) < 0) + qib_dev_err(dd, "Failed to enable INTx\n"); pos = pdev->msi_cap; if (pos) { diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index a343e3b5d4cb..344e401915f7 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. * All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -415,53 +415,16 @@ int qib_check_send_wqe(struct rvt_qp *qp, #ifdef CONFIG_DEBUG_FS -struct qib_qp_iter { - struct qib_ibdev *dev; - struct rvt_qp *qp; - int n; -}; - -struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev) -{ - struct qib_qp_iter *iter; - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return NULL; - - iter->dev = dev; - - return iter; -} - -int qib_qp_iter_next(struct qib_qp_iter *iter) -{ - struct qib_ibdev *dev = iter->dev; - int n = iter->n; - int ret = 1; - struct rvt_qp *pqp = iter->qp; - struct rvt_qp *qp; - - for (; n < dev->rdi.qp_dev->qp_table_size; n++) { - if (pqp) - qp = rcu_dereference(pqp->next); - else - qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); - pqp = qp; - if (qp) { - iter->qp = qp; - iter->n = n; - return 0; - } - } - return ret; -} - static const char * const qp_type_str[] = { "SMI", "GSI", "RC", "UC", "UD", }; -void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter) +/** + * qib_qp_iter_print - print information to seq_file + * @s - the seq_file + * @iter - the iterator + */ +void qib_qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) { struct rvt_swqe *wqe; struct rvt_qp *qp = iter->qp; diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 4ddbcac5eabe..e9a91736b12d 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -348,7 +348,7 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) case IB_WR_RDMA_WRITE: if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; - /* FALLTHROUGH */ + goto no_flow_control; case IB_WR_RDMA_WRITE_WITH_IMM: /* If no credit, return. */ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && @@ -356,7 +356,7 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; goto bail; } - +no_flow_control: ohdr->u.rc.reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr); ohdr->u.rc.reth.rkey = diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index bd09de7c6e56..53efbb0b40c4 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -58,8 +58,10 @@ static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) if (wqe->sg_list[i].length == 0) continue; /* Check LKEY */ - if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + NULL, &wqe->sg_list[i], + IB_ACCESS_LOCAL_WRITE); + if (unlikely(ret <= 0)) goto bad_lkey; qp->r_len += wqe->sg_list[i].length; j++; @@ -256,11 +258,11 @@ int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, } if (!qib_pkey_ok((u16)bth0, qib_get_pkey(ibp, qp->s_alt_pkey_index))) { - qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, - (u16)bth0, - (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, - 0, qp->ibqp.qp_num, - hdr->lrh[3], hdr->lrh[1]); + qib_bad_pkey(ibp, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); goto err; } /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */ @@ -295,11 +297,11 @@ int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, } if (!qib_pkey_ok((u16)bth0, qib_get_pkey(ibp, qp->s_pkey_index))) { - qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, - (u16)bth0, - (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, - 0, qp->ibqp.qp_num, - hdr->lrh[3], hdr->lrh[1]); + qib_bad_pkey(ibp, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); goto err; } /* Validate the SLID. See Ch. 9.6.1.5 */ @@ -643,8 +645,10 @@ u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, hdr->hop_limit = grh->hop_limit; /* The SGID is 32-bit aligned. */ hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; - hdr->sgid.global.interface_id = grh->sgid_index ? - ibp->guids[grh->sgid_index - 1] : ppd_from_ibp(ibp)->guid; + if (!grh->sgid_index) + hdr->sgid.global.interface_id = ppd_from_ibp(ibp)->guid; + else if (grh->sgid_index < QIB_GUIDS_PER_PORT) + hdr->sgid.global.interface_id = ibp->guids[grh->sgid_index - 1]; hdr->dgid = grh->dgid; /* GRH header size in 32-bit words. */ diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index fe4cf5e4acec..ca2638d8f35e 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -247,7 +247,7 @@ static struct kobj_type qib_port_cc_ktype = { .release = qib_port_release, }; -static struct bin_attribute cc_table_bin_attr = { +static const struct bin_attribute cc_table_bin_attr = { .attr = {.name = "cc_table_bin", .mode = 0444}, .read = read_cc_table_bin, .size = PAGE_SIZE, @@ -286,7 +286,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, return count; } -static struct bin_attribute cc_setting_bin_attr = { +static const struct bin_attribute cc_setting_bin_attr = { .attr = {.name = "cc_settings_bin", .mode = 0444}, .read = read_cc_setting_bin, .size = PAGE_SIZE, diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 341a123ee95c..be4907453ac4 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -66,8 +66,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.remote_qpn); if (!qp) { ibp->rvp.n_pkt_drops++; - rcu_read_unlock(); - return; + goto drop; } sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? @@ -94,11 +93,11 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (unlikely(!qib_pkey_ok(pkey1, pkey2))) { lid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & ((1 << ppd->lmc) - 1)); - qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey1, - rdma_ah_get_sl(ah_attr), - sqp->ibqp.qp_num, qp->ibqp.qp_num, - cpu_to_be16(lid), - cpu_to_be16(rdma_ah_get_dlid(ah_attr))); + qib_bad_pkey(ibp, pkey1, + rdma_ah_get_sl(ah_attr), + sqp->ibqp.qp_num, qp->ibqp.qp_num, + cpu_to_be16(lid), + cpu_to_be16(rdma_ah_get_dlid(ah_attr))); goto drop; } } @@ -113,18 +112,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) qkey = (int)swqe->ud_wr.remote_qkey < 0 ? sqp->qkey : swqe->ud_wr.remote_qkey; - if (unlikely(qkey != qp->qkey)) { - u16 lid; - - lid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & - ((1 << ppd->lmc) - 1)); - qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey, - rdma_ah_get_sl(ah_attr), - sqp->ibqp.qp_num, qp->ibqp.qp_num, - cpu_to_be16(lid), - cpu_to_be16(rdma_ah_get_dlid(ah_attr))); + if (unlikely(qkey != qp->qkey)) goto drop; - } } /* @@ -487,22 +476,18 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, pkey1 = be32_to_cpu(ohdr->bth[0]); pkey2 = qib_get_pkey(ibp, qp->s_pkey_index); if (unlikely(!qib_pkey_ok(pkey1, pkey2))) { - qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, - pkey1, - (be16_to_cpu(hdr->lrh[0]) >> 4) & + qib_bad_pkey(ibp, + pkey1, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, - src_qp, qp->ibqp.qp_num, - hdr->lrh[3], hdr->lrh[1]); + src_qp, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); return; } } - if (unlikely(qkey != qp->qkey)) { - qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey, - (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, - src_qp, qp->ibqp.qp_num, - hdr->lrh[3], hdr->lrh[1]); + if (unlikely(qkey != qp->qkey)) return; - } + /* Drop invalid MAD packets (see 13.5.3.1). */ if (unlikely(qp->ibqp.qp_num == 1 && (tlen != 256 || diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ac42dce7e281..9d92aeb8d9a1 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1341,6 +1341,15 @@ int qib_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) if (rdma_ah_get_sl(ah_attr) > 15) return -EINVAL; + if (rdma_ah_get_dlid(ah_attr) == 0) + return -EINVAL; + if (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE) && + rdma_ah_get_dlid(ah_attr) != + be16_to_cpu(IB_LID_PERMISSIVE) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + return 0; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index a52fc67b40d7..f887737ac142 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -241,8 +241,8 @@ static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) return p1 && p1 == p2 && ((__s16)pkey1 < 0 || (__s16)pkey2 < 0); } -void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl, - u32 qp1, u32 qp2, __be16 lid1, __be16 lid2); +void qib_bad_pkey(struct qib_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2); void qib_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); void qib_sys_guid_chg(struct qib_ibport *ibp); void qib_node_desc_chg(struct qib_ibport *ibp); @@ -282,13 +282,7 @@ int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait); #ifdef CONFIG_DEBUG_FS -struct qib_qp_iter; - -struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev); - -int qib_qp_iter_next(struct qib_qp_iter *iter); - -void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter); +void qib_qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); #endif diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c index 3c37dd59c04e..995a26b65156 100644 --- a/drivers/infiniband/hw/usnic/usnic_fwd.c +++ b/drivers/infiniband/hw/usnic/usnic_fwd.c @@ -110,20 +110,12 @@ void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]) spin_unlock(&ufdev->lock); } -int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr) +void usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr) { - int status; - spin_lock(&ufdev->lock); - if (ufdev->inaddr == 0) { + if (!ufdev->inaddr) ufdev->inaddr = inaddr; - status = 0; - } else { - status = -EFAULT; - } spin_unlock(&ufdev->lock); - - return status; } void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev) diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h index b2ac22be0731..0b2cc4e79707 100644 --- a/drivers/infiniband/hw/usnic/usnic_fwd.h +++ b/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -75,7 +75,7 @@ struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev); void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev); void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]); -int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr); +void usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr); void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev); void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev); void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index c0c1e8b027b1..f45e99a938e0 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -333,9 +333,7 @@ static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void usnic_get_dev_fw_str(struct ib_device *device, - char *str, - size_t str_len) +static void usnic_get_dev_fw_str(struct ib_device *device, char *str) { struct usnic_ib_dev *us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev); @@ -345,7 +343,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); mutex_unlock(&us_ibdev->usdev_lock); - snprintf(str, str_len, "%s", info.fw_version); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version); } /* Start of PF discovery section */ @@ -353,7 +351,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) { struct usnic_ib_dev *us_ibdev; union ib_gid gid; - struct in_ifaddr *in; + struct in_device *ind; struct net_device *netdev; usnic_dbg("\n"); @@ -409,6 +407,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.query_port = usnic_ib_query_port; us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey; us_ibdev->ib_dev.query_gid = usnic_ib_query_gid; + us_ibdev->ib_dev.get_netdev = usnic_get_netdev; us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer; us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd; us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd; @@ -442,9 +441,11 @@ static void *usnic_ib_device_add(struct pci_dev *dev) if (netif_carrier_ok(us_ibdev->netdev)) usnic_fwd_carrier_up(us_ibdev->ufdev); - in = ((struct in_device *)(netdev->ip_ptr))->ifa_list; - if (in != NULL) - usnic_fwd_add_ipaddr(us_ibdev->ufdev, in->ifa_address); + ind = in_dev_get(netdev); + if (ind->ifa_list) + usnic_fwd_add_ipaddr(us_ibdev->ufdev, + ind->ifa_list->ifa_address); + in_dev_put(ind); usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr, us_ibdev->ufdev->inaddr, &gid.raw[0]); @@ -720,7 +721,6 @@ static void __exit usnic_ib_destroy(void) MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver"); MODULE_AUTHOR("Upinder Malhi <umalhi@cisco.com>"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR); module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3"); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 4996984885c2..e4113ef09315 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -164,6 +164,8 @@ find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, if (usnic_ib_share_vf) { /* Try to find resouces on a used vf which is in pd */ dev_list = usnic_uiom_get_dev_list(pd->umem_pd); + if (IS_ERR(dev_list)) + return ERR_CAST(dev_list); for (i = 0; dev_list[i]; i++) { dev = dev_list[i]; vf = pci_get_drvdata(to_pci_dev(dev)); @@ -226,27 +228,6 @@ static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) spin_unlock(&vf->lock); } -static void eth_speed_to_ib_speed(int speed, u8 *active_speed, - u8 *active_width) -{ - if (speed <= 10000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_FDR10; - } else if (speed <= 20000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_DDR; - } else if (speed <= 30000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_QDR; - } else if (speed <= 40000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_FDR10; - } else { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_EDR; - } -} - static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) { if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN || @@ -326,12 +307,16 @@ int usnic_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); - struct ethtool_link_ksettings cmd; usnic_dbg("\n"); mutex_lock(&us_ibdev->usdev_lock); - __ethtool_get_link_ksettings(us_ibdev->netdev, &cmd); + if (ib_get_eth_speed(ibdev, port, &props->active_speed, + &props->active_width)) { + mutex_unlock(&us_ibdev->usdev_lock); + return -EINVAL; + } + /* props being zeroed by the caller, avoid zeroing it here */ props->lid = 0; @@ -355,8 +340,6 @@ int usnic_ib_query_port(struct ib_device *ibdev, u8 port, props->pkey_tbl_len = 1; props->bad_pkey_cntr = 0; props->qkey_viol_cntr = 0; - eth_speed_to_ib_speed(cmd.base.speed, &props->active_speed, - &props->active_width); props->max_mtu = IB_MTU_4096; props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu); /* Userspace will adjust for hdrs */ @@ -424,6 +407,16 @@ int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, return 0; } +struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(device); + + if (us_ibdev->netdev) + dev_hold(us_ibdev->netdev); + + return us_ibdev->netdev; +} + int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 172e43b6fa95..1fda94425116 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -48,6 +48,7 @@ int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, struct ib_qp_init_attr *qp_init_attr); int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid); +struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num); int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey); struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index 8e2f0a11690f..663a0c301c43 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -194,6 +194,7 @@ struct pvrdma_dev { void *resp_slot; unsigned long flags; struct list_head device_link; + unsigned int dsr_version; /* Locking and interrupt information. */ spinlock_t cmd_lock; /* Command lock. */ @@ -444,6 +445,7 @@ void pvrdma_ah_attr_to_rdma(struct rdma_ah_attr *dst, const struct pvrdma_ah_attr *src); void rdma_ah_attr_to_pvrdma(struct pvrdma_ah_attr *dst, const struct rdma_ah_attr *src); +u8 ib_gid_type_to_pvrdma(enum ib_gid_type gid_type); int pvrdma_uar_table_init(struct pvrdma_dev *dev); void pvrdma_uar_table_cleanup(struct pvrdma_dev *dev); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 90aa326fd7c0..3562c0c30492 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -299,7 +299,7 @@ static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i) void _pvrdma_flush_cqe(struct pvrdma_qp *qp, struct pvrdma_cq *cq) { - int head; + unsigned int head; int has_data; if (!cq->is_kernel) @@ -389,6 +389,7 @@ retry: wc->dlid_path_bits = cqe->dlid_path_bits; wc->port_num = cqe->port_num; wc->vendor_err = cqe->vendor_err; + wc->network_hdr_type = cqe->network_hdr_type; /* Update shared ring state */ pvrdma_idx_ring_inc(&cq->ring_state->rx.cons_head, cq->ibcq.cqe); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h index 09078ccfaec7..df0a6b525021 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h @@ -50,7 +50,15 @@ #include "pvrdma_verbs.h" -#define PVRDMA_VERSION 17 +/* + * PVRDMA version macros. Some new features require updates to PVRDMA_VERSION. + * These macros allow us to check for different features if necessary. + */ + +#define PVRDMA_ROCEV1_VERSION 17 +#define PVRDMA_ROCEV2_VERSION 18 +#define PVRDMA_VERSION PVRDMA_ROCEV2_VERSION + #define PVRDMA_BOARD_ID 1 #define PVRDMA_REV_ID 1 @@ -123,6 +131,31 @@ #define PVRDMA_GID_TYPE_FLAG_ROCE_V1 BIT(0) #define PVRDMA_GID_TYPE_FLAG_ROCE_V2 BIT(1) +/* + * Version checks. This checks whether each version supports specific + * capabilities from the device. + */ + +#define PVRDMA_IS_VERSION17(_dev) \ + (_dev->dsr_version == PVRDMA_ROCEV1_VERSION && \ + _dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V1) + +#define PVRDMA_IS_VERSION18(_dev) \ + (_dev->dsr_version >= PVRDMA_ROCEV2_VERSION && \ + (_dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V1 || \ + _dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V2)) \ + +#define PVRDMA_SUPPORTED(_dev) \ + ((_dev->dsr->caps.mode == PVRDMA_DEVICE_MODE_ROCE) && \ + (PVRDMA_IS_VERSION17(_dev) || PVRDMA_IS_VERSION18(_dev))) + +/* + * Get capability values based on device version. + */ + +#define PVRDMA_GET_CAP(_dev, _old_val, _val) \ + ((PVRDMA_IS_VERSION18(_dev)) ? _val : _old_val) + enum pvrdma_pci_resource { PVRDMA_PCI_RESOURCE_MSIX, /* BAR0: MSI-X, MMIO. */ PVRDMA_PCI_RESOURCE_REG, /* BAR1: Registers, MMIO. */ @@ -225,7 +258,7 @@ struct pvrdma_device_caps { u8 atomic_ops; /* PVRDMA_ATOMIC_OP_* bits */ u8 bmme_flags; /* FRWR Mem Mgmt Extensions */ u8 gid_types; /* PVRDMA_GID_TYPE_FLAG_ */ - u8 reserved[4]; + u32 max_fast_reg_page_list_len; }; struct pvrdma_ring_page_info { diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 34ebc7615411..6ce709a67959 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -102,12 +102,11 @@ static struct device_attribute *pvrdma_class_attributes[] = { &dev_attr_board_id }; -static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str, - size_t str_len) +static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str) { struct pvrdma_dev *dev = container_of(device, struct pvrdma_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d\n", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d\n", (int) (dev->dsr->caps.fw_ver >> 32), (int) (dev->dsr->caps.fw_ver >> 16) & 0xffff, (int) dev->dsr->caps.fw_ver & 0xffff); @@ -129,10 +128,14 @@ static int pvrdma_init_device(struct pvrdma_dev *dev) static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { + struct pvrdma_dev *dev = to_vdev(ibdev); struct ib_port_attr attr; int err; - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V1) + immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE; + else if (dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V2) + immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; err = ib_query_port(ibdev, port_num, &attr); if (err) @@ -570,6 +573,7 @@ static void pvrdma_free_slots(struct pvrdma_dev *dev) static int pvrdma_add_gid_at_index(struct pvrdma_dev *dev, const union ib_gid *gid, + u8 gid_type, int index) { int ret; @@ -587,7 +591,7 @@ static int pvrdma_add_gid_at_index(struct pvrdma_dev *dev, cmd_bind->mtu = ib_mtu_enum_to_int(IB_MTU_1024); cmd_bind->vlan = 0xfff; cmd_bind->index = index; - cmd_bind->gid_type = PVRDMA_GID_TYPE_FLAG_ROCE_V1; + cmd_bind->gid_type = gid_type; ret = pvrdma_cmd_post(dev, &req, NULL, 0); if (ret < 0) { @@ -608,7 +612,9 @@ static int pvrdma_add_gid(struct ib_device *ibdev, { struct pvrdma_dev *dev = to_vdev(ibdev); - return pvrdma_add_gid_at_index(dev, gid, index); + return pvrdma_add_gid_at_index(dev, gid, + ib_gid_type_to_pvrdma(attr->gid_type), + index); } static int pvrdma_del_gid_at_index(struct pvrdma_dev *dev, int index) @@ -723,7 +729,6 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, int ret; unsigned long start; unsigned long len; - unsigned int version; dma_addr_t slot_dma = 0; dev_dbg(&pdev->dev, "initializing driver %s\n", pci_name(pdev)); @@ -820,13 +825,9 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, goto err_unmap_regs; } - version = pvrdma_read_reg(dev, PVRDMA_REG_VERSION); + dev->dsr_version = pvrdma_read_reg(dev, PVRDMA_REG_VERSION); dev_info(&pdev->dev, "device version %d, driver version %d\n", - version, PVRDMA_VERSION); - if (version < PVRDMA_VERSION) { - dev_err(&pdev->dev, "incompatible device version\n"); - goto err_uar_unmap; - } + dev->dsr_version, PVRDMA_VERSION); dev->dsr = dma_alloc_coherent(&pdev->dev, sizeof(*dev->dsr), &dev->dsrbase, GFP_KERNEL); @@ -897,17 +898,9 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, /* Make sure the write is complete before reading status. */ mb(); - /* Currently, the driver only supports RoCE mode. */ - if (dev->dsr->caps.mode != PVRDMA_DEVICE_MODE_ROCE) { - dev_err(&pdev->dev, "unsupported transport %d\n", - dev->dsr->caps.mode); - ret = -EFAULT; - goto err_free_cq_ring; - } - - /* Currently, the driver only supports RoCE V1. */ - if (!(dev->dsr->caps.gid_types & PVRDMA_GID_TYPE_FLAG_ROCE_V1)) { - dev_err(&pdev->dev, "driver needs RoCE v1 support\n"); + /* The driver supports RoCE V1 and V2. */ + if (!PVRDMA_SUPPORTED(dev)) { + dev_err(&pdev->dev, "driver needs RoCE v1 or v2 support\n"); ret = -EFAULT; goto err_free_cq_ring; } @@ -1078,7 +1071,7 @@ static void pvrdma_pci_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); } -static struct pci_device_id pvrdma_pci_table[] = { +static const struct pci_device_id pvrdma_pci_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_VMWARE, PCI_DEVICE_ID_VMWARE_PVRDMA), }, { 0 }, }; @@ -1119,5 +1112,4 @@ module_exit(pvrdma_cleanup); MODULE_AUTHOR("VMware, Inc"); MODULE_DESCRIPTION("VMware Paravirtual RDMA driver"); -MODULE_VERSION(DRV_VERSION); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c index ec6a4ca1eeb7..fb0c5c0976b3 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c @@ -303,3 +303,10 @@ void rdma_ah_attr_to_pvrdma(struct pvrdma_ah_attr *dst, dst->port_num = rdma_ah_get_port_num(src); memcpy(&dst->dmac, src->roce.dmac, sizeof(dst->dmac)); } + +u8 ib_gid_type_to_pvrdma(enum ib_gid_type gid_type) +{ + return (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? + PVRDMA_GID_TYPE_FLAG_ROCE_V2 : + PVRDMA_GID_TYPE_FLAG_ROCE_V1; +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h index ed9022a91a1d..8b558ae234c8 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h @@ -111,21 +111,4 @@ static inline __s32 pvrdma_idx_ring_has_data(const struct pvrdma_ring *r, return PVRDMA_INVALID_IDX; } -static inline bool pvrdma_idx_ring_is_valid_idx(const struct pvrdma_ring *r, - __u32 max_elems, __u32 *idx) -{ - const __u32 tail = atomic_read(&r->prod_tail); - const __u32 head = atomic_read(&r->cons_head); - - if (pvrdma_idx_valid(tail, max_elems) && - pvrdma_idx_valid(head, max_elems) && - pvrdma_idx_valid(*idx, max_elems)) { - if (tail > head && (*idx < tail && *idx >= head)) - return true; - else if (head > tail && (*idx >= head || *idx < tail)) - return true; - } - return false; -} - #endif /* __PVRDMA_RING_H__ */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 28517042011d..48776f5ffb0e 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -83,6 +83,8 @@ int pvrdma_query_device(struct ib_device *ibdev, props->max_qp_wr = dev->dsr->caps.max_qp_wr; props->device_cap_flags = dev->dsr->caps.device_cap_flags; props->max_sge = dev->dsr->caps.max_sge; + props->max_sge_rd = PVRDMA_GET_CAP(dev, dev->dsr->caps.max_sge, + dev->dsr->caps.max_sge_rd); props->max_cq = dev->dsr->caps.max_cq; props->max_cqe = dev->dsr->caps.max_cqe; props->max_mr = dev->dsr->caps.max_mr; @@ -101,8 +103,14 @@ int pvrdma_query_device(struct ib_device *ibdev, (dev->dsr->caps.bmme_flags & PVRDMA_BMME_FLAG_REMOTE_INV) && (dev->dsr->caps.bmme_flags & PVRDMA_BMME_FLAG_FAST_REG_WR)) { props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + props->max_fast_reg_page_list_len = PVRDMA_GET_CAP(dev, + PVRDMA_MAX_FAST_REG_PAGES, + dev->dsr->caps.max_fast_reg_page_list_len); } + props->device_cap_flags |= IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_RC_RNR_NAK_GEN; + return 0; } @@ -143,6 +151,7 @@ int pvrdma_query_port(struct ib_device *ibdev, u8 port, props->gid_tbl_len = resp->attrs.gid_tbl_len; props->port_cap_flags = pvrdma_port_cap_flags_to_ib(resp->attrs.port_cap_flags); + props->port_cap_flags |= IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; props->max_msg_sz = resp->attrs.max_msg_sz; props->bad_pkey_cntr = resp->attrs.bad_pkey_cntr; props->qkey_viol_cntr = resp->attrs.qkey_viol_cntr; diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c index a96d4aa80ae8..ba3639a0d77c 100644 --- a/drivers/infiniband/sw/rdmavt/ah.c +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -66,8 +66,6 @@ int rvt_check_ah(struct ib_device *ibdev, int port_num = rdma_ah_get_port_num(ah_attr); struct ib_port_attr port_attr; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num); - u32 dlid = rdma_ah_get_dlid(ah_attr); u8 ah_flags = rdma_ah_get_ah_flags(ah_attr); u8 static_rate = rdma_ah_get_static_rate(ah_attr); @@ -83,14 +81,6 @@ int rvt_check_ah(struct ib_device *ibdev, if ((ah_flags & IB_AH_GRH) && rdma_ah_read_grh(ah_attr)->sgid_index >= port_attr.gid_tbl_len) return -EINVAL; - if (link != IB_LINK_LAYER_ETHERNET) { - if (dlid == 0) - return -EINVAL; - if (dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) && - dlid != be16_to_cpu(IB_LID_PERMISSIVE) && - !(ah_flags & IB_AH_GRH)) - return -EINVAL; - } if (rdi->driver_f.check_ah) return rdi->driver_f.check_ah(ibdev, ah_attr); return 0; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 0ae2ff8cf81e..97d71e49c092 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -107,7 +107,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].slid = ib_lid_cpu16(entry->slid); wc->uqueue[head].sl = entry->sl; wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; wc->uqueue[head].port_num = entry->port_num; diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index aa5f9ea318e4..42713511b53b 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -441,6 +441,105 @@ bail_umem: } /** + * rvt_dereg_clean_qp_cb - callback from iterator + * @qp - the qp + * @v - the mregion (as u64) + * + * This routine fields the callback for all QPs and + * for QPs in the same PD as the MR will call the + * rvt_qp_mr_clean() to potentially cleanup references. + */ +static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v) +{ + struct rvt_mregion *mr = (struct rvt_mregion *)v; + + /* skip PDs that are not ours */ + if (mr->pd != qp->ibqp.pd) + return; + rvt_qp_mr_clean(qp, mr->lkey); +} + +/** + * rvt_dereg_clean_qps - find QPs for reference cleanup + * @mr - the MR that is being deregistered + * + * This routine iterates RC QPs looking for references + * to the lkey noted in mr. + */ +static void rvt_dereg_clean_qps(struct rvt_mregion *mr) +{ + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); + + rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb); +} + +/** + * rvt_check_refs - check references + * @mr - the megion + * @t - the caller identification + * + * This routine checks MRs holding a reference during + * when being de-registered. + * + * If the count is non-zero, the code calls a clean routine then + * waits for the timeout for the count to zero. + */ +static int rvt_check_refs(struct rvt_mregion *mr, const char *t) +{ + unsigned long timeout; + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); + + if (percpu_ref_is_zero(&mr->refcount)) + return 0; + /* avoid dma mr */ + if (mr->lkey) + rvt_dereg_clean_qps(mr); + timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ); + if (!timeout) { + rvt_pr_err(rdi, + "%s timeout mr %p pd %p lkey %x refcount %ld\n", + t, mr, mr->pd, mr->lkey, + atomic_long_read(&mr->refcount.count)); + rvt_get_mr(mr); + return -EBUSY; + } + return 0; +} + +/** + * rvt_mr_has_lkey - is MR + * @mr - the mregion + * @lkey - the lkey + */ +bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey) +{ + return mr && lkey == mr->lkey; +} + +/** + * rvt_ss_has_lkey - is mr in sge tests + * @ss - the sge state + * @lkey + * + * This code tests for an MR in the indicated + * sge state. + */ +bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey) +{ + int i; + bool rval = false; + + if (!ss->num_sge) + return rval; + /* first one */ + rval = rvt_mr_has_lkey(ss->sge.mr, lkey); + /* any others */ + for (i = 0; !rval && i < ss->num_sge - 1; i++) + rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey); + return rval; +} + +/** * rvt_dereg_mr - unregister and free a memory region * @ibmr: the memory region to free * @@ -453,22 +552,14 @@ bail_umem: int rvt_dereg_mr(struct ib_mr *ibmr) { struct rvt_mr *mr = to_imr(ibmr); - struct rvt_dev_info *rdi = ib_to_rvt(ibmr->pd->device); - int ret = 0; - unsigned long timeout; + int ret; rvt_free_lkey(&mr->mr); rvt_put_mr(&mr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&mr->mr.comp, 5 * HZ); - if (!timeout) { - rvt_pr_err(rdi, - "rvt_dereg_mr timeout mr %p pd %p\n", - mr, mr->mr.pd); - rvt_get_mr(&mr->mr); - ret = -EBUSY; + ret = rvt_check_refs(&mr->mr, __func__); + if (ret) goto out; - } rvt_deinit_mregion(&mr->mr); if (mr->umem) ib_umem_release(mr->umem); @@ -761,16 +852,12 @@ int rvt_dealloc_fmr(struct ib_fmr *ibfmr) { struct rvt_fmr *fmr = to_ifmr(ibfmr); int ret = 0; - unsigned long timeout; rvt_free_lkey(&fmr->mr); rvt_put_mr(&fmr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&fmr->mr.comp, 5 * HZ); - if (!timeout) { - rvt_get_mr(&fmr->mr); - ret = -EBUSY; + ret = rvt_check_refs(&fmr->mr, __func__); + if (ret) goto out; - } rvt_deinit_mregion(&fmr->mr); kfree(fmr); out: @@ -778,23 +865,52 @@ out: } /** + * rvt_sge_adjacent - is isge compressible + * @last_sge: last outgoing SGE written + * @sge: SGE to check + * + * If adjacent will update last_sge to add length. + * + * Return: true if isge is adjacent to last sge + */ +static inline bool rvt_sge_adjacent(struct rvt_sge *last_sge, + struct ib_sge *sge) +{ + if (last_sge && sge->lkey == last_sge->mr->lkey && + ((uint64_t)(last_sge->vaddr + last_sge->length) == sge->addr)) { + if (sge->lkey) { + if (unlikely((sge->addr - last_sge->mr->user_base + + sge->length > last_sge->mr->length))) + return false; /* overrun, caller will catch */ + } else { + last_sge->length += sge->length; + } + last_sge->sge_length += sge->length; + trace_rvt_sge_adjacent(last_sge, sge); + return true; + } + return false; +} + +/** * rvt_lkey_ok - check IB SGE for validity and initialize * @rkt: table containing lkey to check SGE against * @pd: protection domain * @isge: outgoing internal SGE + * @last_sge: last outgoing SGE written * @sge: SGE to check * @acc: access flags * * Check the IB SGE for validity and initialize our internal version * of it. * - * Return: 1 if valid and successful, otherwise returns 0. - * - * increments the reference count upon success + * Increments the reference count when a new sge is stored. * + * Return: 0 if compressed, 1 if added , otherwise returns -errno. */ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, - struct rvt_sge *isge, struct ib_sge *sge, int acc) + struct rvt_sge *isge, struct rvt_sge *last_sge, + struct ib_sge *sge, int acc) { struct rvt_mregion *mr; unsigned n, m; @@ -804,12 +920,14 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, * We use LKEY == zero for kernel virtual addresses * (see rvt_get_dma_mr() and dma_virt_ops). */ - rcu_read_lock(); if (sge->lkey == 0) { struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device); if (pd->user) - goto bail; + return -EINVAL; + if (rvt_sge_adjacent(last_sge, sge)) + return 0; + rcu_read_lock(); mr = rcu_dereference(dev->dma_mr); if (!mr) goto bail; @@ -824,6 +942,9 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, isge->n = 0; goto ok; } + if (rvt_sge_adjacent(last_sge, sge)) + return 0; + rcu_read_lock(); mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]); if (!mr) goto bail; @@ -874,12 +995,13 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, isge->m = m; isge->n = n; ok: + trace_rvt_sge_new(isge, sge); return 1; bail_unref: rvt_put_mr(mr); bail: rcu_read_unlock(); - return 0; + return -EINVAL; } EXPORT_SYMBOL(rvt_lkey_ok); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 8876ee7bc326..22df09ae809e 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -52,6 +52,7 @@ #include <linux/slab.h> #include <rdma/ib_verbs.h> #include <rdma/ib_hdrs.h> +#include <rdma/opa_addr.h> #include "qp.h" #include "vt.h" #include "trace.h" @@ -421,15 +422,6 @@ bail: return ret; } -static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn) -{ - struct rvt_qpn_map *map; - - map = qpt->map + qpn / RVT_BITS_PER_PAGE; - if (map->page) - clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); -} - /** * rvt_clear_mr_refs - Drop help mr refs * @qp: rvt qp data structure @@ -448,13 +440,9 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) if (clr_sends) { while (qp->s_last != qp->s_head) { struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last); - unsigned i; - for (i = 0; i < wqe->wr.num_sge; i++) { - struct rvt_sge *sge = &wqe->sg_list[i]; + rvt_put_swqe(wqe); - rvt_put_mr(sge->mr); - } if (qp->ibqp.qp_type == IB_QPT_UD || qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) @@ -470,10 +458,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) } } - if (qp->ibqp.qp_type != IB_QPT_RC) - return; - - for (n = 0; n < rvt_max_atomic(rdi); n++) { + for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) { struct rvt_ack_entry *e = &qp->s_ack_queue[n]; if (e->rdma_sge.mr) { @@ -484,6 +469,113 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) } /** + * rvt_swqe_has_lkey - return true if lkey is used by swqe + * @wqe - the send wqe + * @lkey - the lkey + * + * Test the swqe for using lkey + */ +static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey) +{ + int i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + if (rvt_mr_has_lkey(sge->mr, lkey)) + return true; + } + return false; +} + +/** + * rvt_qp_sends_has_lkey - return true is qp sends use lkey + * @qp - the rvt_qp + * @lkey - the lkey + */ +static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey) +{ + u32 s_last = qp->s_last; + + while (s_last != qp->s_head) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last); + + if (rvt_swqe_has_lkey(wqe, lkey)) + return true; + + if (++s_last >= qp->s_size) + s_last = 0; + } + if (qp->s_rdma_mr) + if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey)) + return true; + return false; +} + +/** + * rvt_qp_acks_has_lkey - return true if acks have lkey + * @qp - the qp + * @lkey - the lkey + */ +static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey) +{ + int i; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey)) + return true; + } + return false; +} + +/* + * rvt_qp_mr_clean - clean up remote ops for lkey + * @qp - the qp + * @lkey - the lkey that is being de-registered + * + * This routine checks if the lkey is being used by + * the qp. + * + * If so, the qp is put into an error state to elminate + * any references from the qp. + */ +void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey) +{ + bool lastwqe = false; + + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + /* avoid special QPs */ + return; + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto check_lwqe; + + if (rvt_ss_has_lkey(&qp->r_sge, lkey) || + rvt_qp_sends_has_lkey(qp, lkey) || + rvt_qp_acks_has_lkey(qp, lkey)) + lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR); +check_lwqe: + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +/** * rvt_remove_qp - remove qp form table * @rdi: rvt dev struct * @qp: qp to remove @@ -645,6 +737,19 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, lockdep_assert_held(&qp->s_lock); } +/** rvt_free_qpn - Free a qpn from the bit map + * @qpt: QP table + * @qpn: queue pair number to free + */ +static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn) +{ + struct rvt_qpn_map *map; + + map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); +} + /** * rvt_create_qp - create a queue pair for a device * @ibpd: the protection domain who's device we create the queue pair for @@ -914,7 +1019,7 @@ bail_ip: kref_put(&qp->ip->ref, rvt_release_mmap_info); bail_qpn: - free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); + rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); bail_rq_wq: if (!qp->ip) @@ -1062,6 +1167,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mig = 0; int pmtu = 0; /* for gcc warning only */ enum rdma_link_layer link; + int opa_ah; link = rdma_port_get_link_layer(ibqp->device, qp->port_num); @@ -1072,6 +1178,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, link)) @@ -1082,17 +1189,31 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto inval; if (attr_mask & IB_QP_AV) { - if (rdma_ah_get_dlid(&attr->ah_attr) >= - be16_to_cpu(IB_MULTICAST_LID_BASE)) - goto inval; + if (opa_ah) { + if (rdma_ah_get_dlid(&attr->ah_attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + goto inval; + } else { + if (rdma_ah_get_dlid(&attr->ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + } + if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr)) goto inval; } if (attr_mask & IB_QP_ALT_PATH) { - if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= - be16_to_cpu(IB_MULTICAST_LID_BASE)) - goto inval; + if (opa_ah) { + if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + goto inval; + } else { + if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + } + if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) goto inval; if (attr->alt_pkey_index >= rvt_get_npkeys(rdi)) @@ -1239,7 +1360,6 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_PATH_MTU) { qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu); - qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu); qp->log_pmtu = ilog2(qp->pmtu); } @@ -1301,19 +1421,6 @@ inval: return -EINVAL; } -/** rvt_free_qpn - Free a qpn from the bit map - * @qpt: QP table - * @qpn: queue pair number to free - */ -static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn) -{ - struct rvt_qpn_map *map; - - map = qpt->map + qpn / RVT_BITS_PER_PAGE; - if (map->page) - clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); -} - /** * rvt_destroy_qp - destroy a queue pair * @ibqp: the queue pair to destroy @@ -1375,7 +1482,7 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->qp_state = qp->state; attr->cur_qp_state = attr->qp_state; - attr->path_mtu = qp->path_mtu; + attr->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu); attr->path_mig_state = qp->s_mig_state; attr->qkey = qp->qkey; attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask; @@ -1695,22 +1802,23 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->length = 0; j = 0; if (wr->num_sge) { + struct rvt_sge *last_sge = NULL; + acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0; for (i = 0; i < wr->num_sge; i++) { u32 length = wr->sg_list[i].length; - int ok; if (length == 0) continue; - ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], - &wr->sg_list[i], acc); - if (!ok) { - ret = -EINVAL; + ret = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge, + &wr->sg_list[i], acc); + if (unlikely(ret < 0)) goto bail_inval_free; - } wqe->length += length; - j++; + if (ret) + last_sge = &wqe->sg_list[j]; + j += ret; } wqe->wr.num_sge = j; } @@ -1757,7 +1865,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; qp->s_avail--; } - trace_rvt_post_one_wr(qp, wqe); + trace_rvt_post_one_wr(qp, wqe, wr->num_sge); smp_wmb(); /* see request builders */ qp->s_head = next; @@ -2065,3 +2173,147 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t) return HRTIMER_NORESTART; } EXPORT_SYMBOL(rvt_rc_rnr_retry); + +/** + * rvt_qp_iter_init - initial for QP iteration + * @rdi - rvt devinfo + * @v - u64 value + * + * This returns an iterator suitable for iterating QPs + * in the system. + * + * The @cb is a user defined callback and @v is a 64 + * bit value passed to and relevant for processing in the + * @cb. An example use case would be to alter QP processing + * based on criteria not part of the rvt_qp. + * + * Use cases that require memory allocation to succeed + * must preallocate appropriately. + * + * Return: a pointer to an rvt_qp_iter or NULL + */ +struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)) +{ + struct rvt_qp_iter *i; + + i = kzalloc(sizeof(*i), GFP_KERNEL); + if (!i) + return NULL; + + i->rdi = rdi; + /* number of special QPs (SMI/GSI) for device */ + i->specials = rdi->ibdev.phys_port_cnt * 2; + i->v = v; + i->cb = cb; + + return i; +} +EXPORT_SYMBOL(rvt_qp_iter_init); + +/** + * rvt_qp_iter_next - return the next QP in iter + * @iter - the iterator + * + * Fine grained QP iterator suitable for use + * with debugfs seq_file mechanisms. + * + * Updates iter->qp with the current QP when the return + * value is 0. + * + * Return: 0 - iter->qp is valid 1 - no more QPs + */ +int rvt_qp_iter_next(struct rvt_qp_iter *iter) + __must_hold(RCU) +{ + int n = iter->n; + int ret = 1; + struct rvt_qp *pqp = iter->qp; + struct rvt_qp *qp; + struct rvt_dev_info *rdi = iter->rdi; + + /* + * The approach is to consider the special qps + * as additional table entries before the + * real hash table. Since the qp code sets + * the qp->next hash link to NULL, this works just fine. + * + * iter->specials is 2 * # ports + * + * n = 0..iter->specials is the special qp indices + * + * n = iter->specials..rdi->qp_dev->qp_table_size+iter->specials are + * the potential hash bucket entries + * + */ + for (; n < rdi->qp_dev->qp_table_size + iter->specials; n++) { + if (pqp) { + qp = rcu_dereference(pqp->next); + } else { + if (n < iter->specials) { + struct rvt_ibport *rvp; + int pidx; + + pidx = n % rdi->ibdev.phys_port_cnt; + rvp = rdi->ports[pidx]; + qp = rcu_dereference(rvp->qp[n & 1]); + } else { + qp = rcu_dereference( + rdi->qp_dev->qp_table[ + (n - iter->specials)]); + } + } + pqp = qp; + if (qp) { + iter->qp = qp; + iter->n = n; + return 0; + } + } + return ret; +} +EXPORT_SYMBOL(rvt_qp_iter_next); + +/** + * rvt_qp_iter - iterate all QPs + * @rdi - rvt devinfo + * @v - a 64 bit value + * @cb - a callback + * + * This provides a way for iterating all QPs. + * + * The @cb is a user defined callback and @v is a 64 + * bit value passed to and relevant for processing in the + * cb. An example use case would be to alter QP processing + * based on criteria not part of the rvt_qp. + * + * The code has an internal iterator to simplify + * non seq_file use cases. + */ +void rvt_qp_iter(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)) +{ + int ret; + struct rvt_qp_iter i = { + .rdi = rdi, + .specials = rdi->ibdev.phys_port_cnt * 2, + .v = v, + .cb = cb + }; + + rcu_read_lock(); + do { + ret = rvt_qp_iter_next(&i); + if (!ret) { + rvt_get_qp(i.qp); + rcu_read_unlock(); + i.cb(i.qp, i.v); + rcu_read_lock(); + rvt_put_qp(i.qp); + } + } while (!ret); + rcu_read_unlock(); +} +EXPORT_SYMBOL(rvt_qp_iter); diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index 3318a6c36373..976e482930a3 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -103,6 +103,68 @@ DEFINE_EVENT( TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len), TP_ARGS(mr, m, n, v, len)); +DECLARE_EVENT_CLASS( + rvt_sge_template, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(sge->mr->pd->device)) + __field(struct rvt_mregion *, mr) + __field(struct rvt_sge *, sge) + __field(struct ib_sge *, isge) + __field(void *, vaddr) + __field(u64, ivaddr) + __field(u32, lkey) + __field(u32, sge_length) + __field(u32, length) + __field(u32, ilength) + __field(int, user) + __field(u16, m) + __field(u16, n) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(sge->mr->pd->device)); + __entry->mr = sge->mr; + __entry->sge = sge; + __entry->isge = isge; + __entry->vaddr = sge->vaddr; + __entry->ivaddr = isge->addr; + __entry->lkey = sge->mr->lkey; + __entry->sge_length = sge->sge_length; + __entry->length = sge->length; + __entry->ilength = isge->length; + __entry->m = sge->m; + __entry->n = sge->m; + __entry->user = ibpd_to_rvtpd(sge->mr->pd)->user; + ), + TP_printk( + "[%s] mr %p sge %p isge %p vaddr %p ivaddr %llx lkey %x sge_length %u length %u ilength %u m %u n %u user %u", + __get_str(dev), + __entry->mr, + __entry->sge, + __entry->isge, + __entry->vaddr, + __entry->ivaddr, + __entry->lkey, + __entry->sge_length, + __entry->length, + __entry->ilength, + __entry->m, + __entry->n, + __entry->user + ) +); + +DEFINE_EVENT( + rvt_sge_template, rvt_sge_adjacent, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge)); + +DEFINE_EVENT( + rvt_sge_template, rvt_sge_new, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge)); + #endif /* __RVT_TRACE_MR_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h index a613a2223751..0ef25fc49f25 100644 --- a/drivers/infiniband/sw/rdmavt/trace_tx.h +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -84,12 +84,12 @@ __print_symbolic(opcode, \ wr_opcode_name(RESERVED10)) #define POS_PRN \ -"[%s] wqe %p wr_id %llx send_flags %x qpn %x qpt %u psn %x lpsn %x ssn %x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u pid %u num_sge %u" +"[%s] wqe %p wr_id %llx send_flags %x qpn %x qpt %u psn %x lpsn %x ssn %x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u pid %u num_sge %u wr_num_sge %u" TRACE_EVENT( rvt_post_one_wr, - TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe), - TP_ARGS(qp, wqe), + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, int wr_num_sge), + TP_ARGS(qp, wqe, wr_num_sge), TP_STRUCT__entry( RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) __field(u64, wr_id) @@ -108,6 +108,7 @@ TRACE_EVENT( __field(int, send_flags) __field(pid_t, pid) __field(int, num_sge) + __field(int, wr_num_sge) ), TP_fast_assign( RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) @@ -127,6 +128,7 @@ TRACE_EVENT( __entry->ssn = wqe->ssn; __entry->send_flags = wqe->wr.send_flags; __entry->num_sge = wqe->wr.num_sge; + __entry->wr_num_sge = wr_num_sge; ), TP_printk( POS_PRN, @@ -146,7 +148,8 @@ TRACE_EVENT( __entry->head, __entry->last, __entry->pid, - __entry->num_sge + __entry->num_sge, + __entry->wr_num_sge ) ); diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 0d7c6bb551d9..64bdd442078a 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -202,8 +202,13 @@ static int rvt_modify_port(struct ib_device *ibdev, u8 port_num, return -EINVAL; rvp = rdi->ports[port_index]; - rvp->port_cap_flags |= props->set_port_cap_mask; - rvp->port_cap_flags &= ~props->clr_port_cap_mask; + if (port_modify_mask & IB_PORT_OPA_MASK_CHG) { + rvp->port_cap3_flags |= props->set_port_cap_mask; + rvp->port_cap3_flags &= ~props->clr_port_cap_mask; + } else { + rvp->port_cap_flags |= props->set_port_cap_mask; + rvp->port_cap_flags &= ~props->clr_port_cap_mask; + } if (props->set_port_cap_mask || props->clr_port_cap_mask) rdi->driver_f.cap_mask_chg(rdi, port_num); diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index c21c913f911a..8c3d30b3092d 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -38,7 +38,6 @@ MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); MODULE_DESCRIPTION("Soft RDMA transport"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION("0.2"); /* free resources for all ports on a device */ static void rxe_cleanup_ports(struct rxe_dev *rxe) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 1ac5b8551a4d..6447d736d5a4 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -97,7 +97,7 @@ int rxe_rcv(struct sk_buff *skb); void rxe_dev_put(struct rxe_dev *rxe); struct rxe_dev *net_to_rxe(struct net_device *ndev); -struct rxe_dev *get_rxe_by_name(const char* name); +struct rxe_dev *get_rxe_by_name(const char *name); void rxe_port_up(struct rxe_dev *rxe); void rxe_port_down(struct rxe_dev *rxe); diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 5bddf469361b..1cc9e2e1365d 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -38,18 +38,13 @@ int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) { struct rxe_port *port; - if (rdma_ah_get_port_num(attr) != 1) { - pr_info("invalid port_num = %d\n", rdma_ah_get_port_num(attr)); - return -EINVAL; - } - port = &rxe->port; if (rdma_ah_get_ah_flags(attr) & IB_AH_GRH) { u8 sgid_index = rdma_ah_read_grh(attr)->sgid_index; if (sgid_index > port->attr.gid_tbl_len) { - pr_info("invalid sgid index = %d\n", sgid_index); + pr_warn("invalid sgid index = %d\n", sgid_index); return -EINVAL; } } diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 49fe42c23f4d..c4aabf78dc90 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -69,6 +69,14 @@ err1: static void rxe_send_complete(unsigned long data) { struct rxe_cq *cq = (struct rxe_cq *)data; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + if (cq->is_dying) { + spin_unlock_irqrestore(&cq->cq_lock, flags); + return; + } + spin_unlock_irqrestore(&cq->cq_lock, flags); cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } @@ -97,6 +105,8 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, if (udata) cq->is_user = 1; + cq->is_dying = false; + tasklet_init(&cq->comp_task, rxe_send_complete, (unsigned long)cq); spin_lock_init(&cq->cq_lock); @@ -156,6 +166,15 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) return 0; } +void rxe_cq_disable(struct rxe_cq *cq) +{ + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + cq->is_dying = true; + spin_unlock_irqrestore(&cq->cq_lock, flags); +} + void rxe_cq_cleanup(struct rxe_pool_entry *arg) { struct rxe_cq *cq = container_of(arg, typeof(*cq), pelem); diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c index 7ef90aad7dfd..6aeb7a165e46 100644 --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c @@ -33,7 +33,7 @@ #include "rxe.h" #include "rxe_hw_counters.h" -const char * const rxe_counter_name[] = { +static const char * const rxe_counter_name[] = { [RXE_CNT_SENT_PKTS] = "sent_pkts", [RXE_CNT_RCVD_PKTS] = "rcvd_pkts", [RXE_CNT_DUP_REQ] = "duplicate_request", diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index d6299edf9a5b..77b3ed0df936 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -64,6 +64,8 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, struct ib_udata *udata); int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); +void rxe_cq_disable(struct rxe_cq *cq); + void rxe_cq_cleanup(struct rxe_pool_entry *arg); /* rxe_mcast.c */ @@ -219,8 +221,6 @@ static inline void rxe_advance_resp_resource(struct rxe_qp *qp) void retransmit_timer(unsigned long data); void rnr_nak_timer(unsigned long data); -void dump_qp(struct rxe_qp *qp); - /* rxe_srq.c */ #define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT) @@ -250,7 +250,7 @@ void rxe_resp_queue_pkt(struct rxe_dev *rxe, void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp, struct sk_buff *skb); -static inline unsigned wr_opcode_mask(int opcode, struct rxe_qp *qp) +static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) { return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; } diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c index bd812e00988e..d22431e3a908 100644 --- a/drivers/infiniband/sw/rxe/rxe_mmap.c +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -76,7 +76,7 @@ static void rxe_vma_close(struct vm_area_struct *vma) kref_put(&ip->ref, rxe_mmap_release); } -static struct vm_operations_struct rxe_vm_ops = { +static const struct vm_operations_struct rxe_vm_ops = { .open = rxe_vma_open, .close = rxe_vma_close, }; diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index e37cc89987e1..5c2684bf430f 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -367,11 +367,11 @@ int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length, dest = (dir == to_mem_obj) ? ((void *)(uintptr_t)iova) : addr; + memcpy(dest, src, length); + if (crcp) *crcp = rxe_crc32(to_rdev(mem->pd->ibpd.device), - *crcp, src, length); - - memcpy(dest, src, length); + *crcp, dest, length); return 0; } @@ -401,11 +401,11 @@ int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length, if (bytes > length) bytes = length; + memcpy(dest, src, bytes); + if (crcp) crc = rxe_crc32(to_rdev(mem->pd->ibpd.device), - crc, src, bytes); - - memcpy(dest, src, bytes); + crc, dest, bytes); length -= bytes; addr += bytes; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 08f3f90d2912..59dee10bebcb 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -191,7 +191,7 @@ static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); - if (!dst || !(dst->obsolete && dst->ops->check(dst, 0))) { + if (!dst || !dst_check(dst, qp->dst_cookie)) { if (dst) dst_release(dst); @@ -209,6 +209,11 @@ static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; dst = rxe_find_route6(rxe->ndev, saddr6, daddr6); +#if IS_ENABLED(CONFIG_IPV6) + if (dst) + qp->dst_cookie = + rt6_get_cookie((struct rt6_info *)dst); +#endif } } @@ -337,7 +342,7 @@ static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - skb_dst_set(skb, dst); + skb_dst_set(skb, dst_clone(dst)); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); @@ -388,7 +393,7 @@ static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct sk_buff *skb, struct rxe_av *av) { struct rxe_qp *qp = pkt->qp; - struct dst_entry *dst = NULL; + struct dst_entry *dst; struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; @@ -460,12 +465,17 @@ int rxe_send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct sk_buff *skb) nskb->destructor = rxe_skb_tx_dtor; nskb->sk = pkt->qp->sk->sk; + rxe_add_ref(pkt->qp); + atomic_inc(&pkt->qp->skb_out); + if (av->network_type == RDMA_NETWORK_IPV4) { err = ip_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); } else if (av->network_type == RDMA_NETWORK_IPV6) { err = ip6_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); } else { pr_err("Unknown layer 3 protocol: %d\n", av->network_type); + atomic_dec(&pkt->qp->skb_out); + rxe_drop_ref(pkt->qp); kfree_skb(nskb); return -EINVAL; } @@ -475,10 +485,7 @@ int rxe_send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct sk_buff *skb) return -EAGAIN; } - rxe_add_ref(pkt->qp); - atomic_inc(&pkt->qp->skb_out); kfree_skb(skb); - return 0; } @@ -644,8 +651,13 @@ static int rxe_notify(struct notifier_block *not_blk, pr_info("%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; - case NETDEV_REBOOT: case NETDEV_CHANGE: + if (netif_running(ndev) && netif_carrier_ok(ndev)) + rxe_port_up(rxe); + else + rxe_port_down(rxe); + break; + case NETDEV_REBOOT: case NETDEV_GOING_DOWN: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 75d11ee635ec..c1b5f38f31a5 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -188,7 +188,7 @@ int rxe_pool_init( struct rxe_dev *rxe, struct rxe_pool *pool, enum rxe_elem_type type, - unsigned max_elem) + unsigned int max_elem) { int err = 0; size_t size = rxe_type_info[type].size; diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 80ccc7c7c341..00bda9380a2e 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -851,13 +851,8 @@ void rxe_qp_cleanup(struct rxe_pool_entry *arg) qp->resp.mr = NULL; } - if (qp_type(qp) == IB_QPT_RC) { - struct dst_entry *dst = NULL; - - dst = sk_dst_get(qp->sk->sk); - if (dst) - dst_release(dst); - } + if (qp_type(qp) == IB_QPT_RC) + sk_dst_reset(qp->sk->sk); free_rd_atomic_resources(qp); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 7ee465d1a1e1..d84222f9d5d2 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -43,7 +43,7 @@ static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, static inline void retry_first_write_send(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - unsigned mask, int npsn) + unsigned int mask, int npsn) { int i; @@ -594,8 +594,10 @@ int rxe_requester(void *arg) rxe_add_ref(qp); next_wqe: - if (unlikely(!qp->valid)) + if (unlikely(!qp->valid)) { + rxe_drain_req_pkts(qp, true); goto exit; + } if (unlikely(qp->req.state == QP_STATE_ERROR)) { rxe_drain_req_pkts(qp, true); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index a958ee918a49..4240866a5331 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1055,7 +1055,7 @@ static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn) { int i; - for (i = 0; i < qp->attr.max_rd_atomic; i++) { + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { struct resp_res *res = &qp->resp.resources[i]; if (res->type == 0) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index d2a14a1bdc7f..ea3810b29273 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -78,7 +78,7 @@ void rxe_do_task(unsigned long data) default: spin_unlock_irqrestore(&task->state_lock, flags); - pr_warn("bad state = %d in rxe_do_task\n", task->state); + pr_warn("%s failed with bad state %d\n", __func__, task->state); return; } @@ -105,7 +105,7 @@ void rxe_do_task(unsigned long data) break; default: - pr_warn("bad state = %d in rxe_do_task\n", + pr_warn("%s failed with bad state %d\n", __func__, task->state); } spin_unlock_irqrestore(&task->state_lock, flags); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index af90a7d42b96..0b362f49a10a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -51,40 +51,16 @@ static int rxe_query_device(struct ib_device *dev, return 0; } -static void rxe_eth_speed_to_ib_speed(int speed, u8 *active_speed, - u8 *active_width) -{ - if (speed <= 1000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_SDR; - } else if (speed <= 10000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_FDR10; - } else if (speed <= 20000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_DDR; - } else if (speed <= 30000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_QDR; - } else if (speed <= 40000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_FDR10; - } else { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_EDR; - } -} - static int rxe_query_port(struct ib_device *dev, u8 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(dev); struct rxe_port *port; - u32 speed; + int rc = -EINVAL; if (unlikely(port_num != 1)) { pr_warn("invalid port_number %d\n", port_num); - goto err1; + goto out; } port = &rxe->port; @@ -93,29 +69,12 @@ static int rxe_query_port(struct ib_device *dev, *attr = port->attr; mutex_lock(&rxe->usdev_lock); - if (rxe->ndev->ethtool_ops->get_link_ksettings) { - struct ethtool_link_ksettings ks; - - rxe->ndev->ethtool_ops->get_link_ksettings(rxe->ndev, &ks); - speed = ks.base.speed; - } else if (rxe->ndev->ethtool_ops->get_settings) { - struct ethtool_cmd cmd; - - rxe->ndev->ethtool_ops->get_settings(rxe->ndev, &cmd); - speed = cmd.speed; - } else { - pr_warn("%s speed is unknown, defaulting to 1000\n", - rxe->ndev->name); - speed = 1000; - } - rxe_eth_speed_to_ib_speed(speed, &attr->active_speed, - &attr->active_width); + rc = ib_get_eth_speed(dev, port_num, &attr->active_speed, + &attr->active_width); mutex_unlock(&rxe->usdev_lock); - return 0; - -err1: - return -EINVAL; +out: + return rc; } static int rxe_query_gid(struct ib_device *device, @@ -960,6 +919,8 @@ static int rxe_destroy_cq(struct ib_cq *ibcq) { struct rxe_cq *cq = to_rcq(ibcq); + rxe_cq_disable(cq); + rxe_drop_ref(cq); return 0; } @@ -1210,8 +1171,8 @@ static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) return rxe_mcast_drop_grp_elem(rxe, qp, mgid); } -static ssize_t rxe_show_parent(struct device *device, - struct device_attribute *attr, char *buf) +static ssize_t parent_show(struct device *device, + struct device_attribute *attr, char *buf) { struct rxe_dev *rxe = container_of(device, struct rxe_dev, ib_dev.dev); @@ -1219,7 +1180,7 @@ static ssize_t rxe_show_parent(struct device *device, return snprintf(buf, 16, "%s\n", rxe_parent_name(rxe, 1)); } -static DEVICE_ATTR(parent, S_IRUGO, rxe_show_parent, NULL); +static DEVICE_ATTR_RO(parent); static struct device_attribute *rxe_dev_attributes[] = { &dev_attr_parent, @@ -1336,15 +1297,15 @@ int rxe_register_device(struct rxe_dev *rxe) err = ib_register_device(dev, NULL); if (err) { - pr_warn("rxe_register_device failed, err = %d\n", err); + pr_warn("%s failed with error %d\n", __func__, err); goto err1; } for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) { err = device_create_file(&dev->dev, rxe_dev_attributes[i]); if (err) { - pr_warn("device_create_file failed, i = %d, err = %d\n", - i, err); + pr_warn("%s failed with error %d for attr number %d\n", + __func__, err, i); goto err2; } } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 5a180fbe40d9..0c2dbe45c729 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -89,6 +89,7 @@ struct rxe_cq { struct rxe_queue *queue; spinlock_t cq_lock; u8 notify; + bool is_dying; int is_user; struct tasklet_struct comp_task; }; @@ -247,6 +248,7 @@ struct rxe_qp { struct rxe_rq rq; struct socket *sk; + u32 dst_cookie; struct rxe_av pri_av; struct rxe_av alt_av; diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 7ac25059c40f..4a5c7a07a631 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -337,6 +337,7 @@ struct ipoib_dev_priv { struct rw_semaphore vlan_rwsem; struct mutex mcast_mutex; + struct mutex sysfs_mutex; struct rb_root path_tree; struct list_head path_list; @@ -367,7 +368,7 @@ struct ipoib_dev_priv { u32 qkey; union ib_gid local_gid; - u16 local_lid; + u32 local_lid; unsigned int admin_mtu; unsigned int mcast_mtu; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index d69410c2ed97..14b62f7472b4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1506,9 +1506,14 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, if (test_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags)) return -EPERM; - if (!rtnl_trylock()) + if (!mutex_trylock(&priv->sysfs_mutex)) return restart_syscall(); + if (!rtnl_trylock()) { + mutex_unlock(&priv->sysfs_mutex); + return restart_syscall(); + } + ret = ipoib_set_mode(dev, buf); /* The assumption is that the function ipoib_set_mode returned @@ -1517,6 +1522,7 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, */ if (ret != -EBUSY) rtnl_unlock(); + mutex_unlock(&priv->sysfs_mutex); return (!ret || ret == -EBUSY) ? count : ret; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 184a22f48027..8dc1e6225cc8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -63,8 +63,7 @@ static void ipoib_get_drvinfo(struct net_device *netdev, { struct ipoib_dev_priv *priv = ipoib_priv(netdev); - ib_get_device_fw_str(priv->ca, drvinfo->fw_version, - sizeof(drvinfo->fw_version)); + ib_get_device_fw_str(priv->ca, drvinfo->fw_version); strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent), sizeof(drvinfo->bus_info)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 6c77df34869d..bac95b509a9b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -60,7 +60,6 @@ const char ipoib_driver_version[] = DRV_VERSION; MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; @@ -100,6 +99,8 @@ static struct net_device *ipoib_get_net_dev_by_params( const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static int ipoib_set_mac(struct net_device *dev, void *addr); +static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, + int cmd); static struct ib_client ipoib_client = { .name = "ipoib", @@ -1681,6 +1682,17 @@ out: return -ENOMEM; } +static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, + int cmd) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (!priv->rn_ops->ndo_do_ioctl) + return -EOPNOTSUPP; + + return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd); +} + int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -1835,6 +1847,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = { .ndo_set_vf_guid = ipoib_set_vf_guid, .ndo_set_mac_address = ipoib_set_mac, .ndo_get_stats64 = ipoib_get_stats, + .ndo_do_ioctl = ipoib_ioctl, }; static const struct net_device_ops ipoib_netdev_ops_vf = { @@ -1848,6 +1861,7 @@ static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, .ndo_get_stats64 = ipoib_get_stats, + .ndo_do_ioctl = ipoib_ioctl, }; void ipoib_setup_common(struct net_device *dev) @@ -1879,6 +1893,7 @@ static void ipoib_build_priv(struct net_device *dev) spin_lock_init(&priv->lock); init_rwsem(&priv->vlan_rwsem); mutex_init(&priv->mcast_mutex); + mutex_init(&priv->sysfs_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); @@ -2228,13 +2243,7 @@ static struct net_device *ipoib_add_port(const char *format, INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); - result = ib_register_event_handler(&priv->event_handler); - if (result < 0) { - printk(KERN_WARNING "%s: ib_register_event_handler failed for " - "port %d (ret = %d)\n", - hca->name, port, result); - goto event_failed; - } + ib_register_event_handler(&priv->event_handler); result = register_netdev(priv->dev); if (result) { @@ -2267,8 +2276,6 @@ register_failed: set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); flush_workqueue(priv->wq); - -event_failed: ipoib_dev_cleanup(priv->dev); device_init_failed: @@ -2338,7 +2345,11 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) cancel_delayed_work(&priv->neigh_reap_task); flush_workqueue(priv->wq); + /* Wrap rtnl_lock/unlock with mutex to protect sysfs calls */ + mutex_lock(&priv->sysfs_mutex); unregister_netdev(priv->dev); + mutex_unlock(&priv->sysfs_mutex); + rn->free_rdma_netdev(priv->dev); list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 081b33deff1b..9927cd6b7082 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -133,12 +133,20 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) snprintf(intf_name, sizeof intf_name, "%s.%04x", ppriv->dev->name, pkey); - if (!rtnl_trylock()) + if (!mutex_trylock(&ppriv->sysfs_mutex)) return restart_syscall(); + if (!rtnl_trylock()) { + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); - if (!priv) + if (!priv) { + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); return -ENOMEM; + } down_write(&ppriv->vlan_rwsem); @@ -164,8 +172,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) out: up_write(&ppriv->vlan_rwsem); - rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); if (result) { free_netdev(priv->dev); @@ -188,8 +196,13 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) return -EPERM; - if (!rtnl_trylock()) + if (!mutex_trylock(&ppriv->sysfs_mutex)) + return restart_syscall(); + + if (!rtnl_trylock()) { + mutex_unlock(&ppriv->sysfs_mutex); return restart_syscall(); + } down_write(&ppriv->vlan_rwsem); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { @@ -208,6 +221,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) } rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); if (dev) { free_netdev(dev); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 37b33d708c2d..19624e023ebd 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -77,7 +77,6 @@ MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); -MODULE_VERSION(DRV_VER); static struct scsi_host_template iscsi_iser_sht; static struct iscsi_transport iscsi_iser_transport; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 26a004e97ae0..55a73b0ed4c6 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -106,9 +106,7 @@ static int iser_create_device_ib_res(struct iser_device *device) INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev, iser_event_handler); - if (ib_register_event_handler(&device->event_handler)) - goto cq_err; - + ib_register_event_handler(&device->event_handler); return 0; cq_err: @@ -141,7 +139,7 @@ static void iser_free_device_ib_res(struct iser_device *device) comp->cq = NULL; } - (void)ib_unregister_event_handler(&device->event_handler); + ib_unregister_event_handler(&device->event_handler); ib_dealloc_pd(device->pd); kfree(device->comps); diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 0e662656ef42..ceabdb85df8b 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2710,7 +2710,6 @@ static void __exit isert_exit(void) } MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure"); -MODULE_VERSION("1.0"); MODULE_AUTHOR("nab@Linux-iSCSI.org"); MODULE_LICENSE("GPL"); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index cf768dd78d1b..21f0b481edcc 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -52,7 +52,9 @@ #include <linux/module.h> #include <rdma/ib_addr.h> -#include <rdma/ib_smi.h> +#include <rdma/ib_verbs.h> +#include <rdma/opa_smi.h> +#include <rdma/opa_port_info.h> #include "opa_vnic_internal.h" @@ -952,12 +954,7 @@ static int vema_register(struct opa_vnic_ctrl_port *cport) INIT_IB_EVENT_HANDLER(&port->event_handler, cport->ibdev, opa_vnic_event); - ret = ib_register_event_handler(&port->event_handler); - if (ret) { - c_err("port %d: event handler register failed\n", i); - vema_unregister(cport); - return ret; - } + ib_register_event_handler(&port->event_handler); idr_init(&port->vport_idr); mutex_init(&port->lock); @@ -980,6 +977,27 @@ static int vema_register(struct opa_vnic_ctrl_port *cport) } /** + * opa_vnic_ctrl_config_dev -- This function sends a trap to the EM + * by way of ib_modify_port to indicate support for ethernet on the + * fabric. + * @cport: pointer to control port + * @en: enable or disable ethernet on fabric support + */ +static void opa_vnic_ctrl_config_dev(struct opa_vnic_ctrl_port *cport, bool en) +{ + struct ib_port_modify pm = { 0 }; + int i; + + if (en) + pm.set_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; + else + pm.clr_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; + + for (i = 1; i <= cport->num_ports; i++) + ib_modify_port(cport->ibdev, i, IB_PORT_OPA_MASK_CHG, &pm); +} + +/** * opa_vnic_vema_add_one -- Handle new ib device * @device: ib device pointer * @@ -1007,6 +1025,7 @@ static void opa_vnic_vema_add_one(struct ib_device *device) c_info("VNIC client initialized\n"); ib_set_client_data(device, &opa_vnic_client, cport); + opa_vnic_ctrl_config_dev(cport, true); } /** @@ -1025,6 +1044,7 @@ static void opa_vnic_vema_rem_one(struct ib_device *device, return; c_info("removing VNIC client\n"); + opa_vnic_ctrl_config_dev(cport, false); vema_unregister(cport); kfree(cport); } @@ -1053,4 +1073,3 @@ module_exit(opa_vnic_deinit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("Intel OPA Virtual Network driver"); -MODULE_VERSION(DRV_VERSION); diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 2354c742caa1..fa5ccdb3bb2a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -62,7 +62,6 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); MODULE_INFO(release_date, DRV_RELDATE); #if !defined(CONFIG_DYNAMIC_DEBUG) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 402275be0931..9e8e9220f816 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2238,7 +2238,7 @@ static int srpt_write_pending(struct se_cmd *se_cmd) cqe, first_wr); cqe = NULL; } - + ret = ib_post_send(ch->qp, first_wr, &bad_wr); if (ret) { pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n", @@ -2530,8 +2530,7 @@ static void srpt_add_one(struct ib_device *device) INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, srpt_event_handler); - if (ib_register_event_handler(&sdev->event_handler)) - goto err_cm; + ib_register_event_handler(&sdev->event_handler); sdev->ioctx_ring = (struct srpt_recv_ioctx **) srpt_alloc_ioctx_ring(sdev, sdev->srq_size, diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index cc1183851af5..1b817e51b84b 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -328,8 +328,8 @@ struct srpt_port { u8 port_guid[24]; u8 port_gid[64]; u8 port; - u16 sm_lid; - u16 lid; + u32 sm_lid; + u32 lid; union ib_gid gid; struct work_struct work; struct se_portal_group port_guid_tpg; diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 674773b28b2e..78b89ceb4f46 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -2535,8 +2535,8 @@ int mlx4_cmd_init(struct mlx4_dev *dev) } if (!priv->cmd.pool) { - priv->cmd.pool = pci_pool_create("mlx4_cmd", - dev->persist->pdev, + priv->cmd.pool = dma_pool_create("mlx4_cmd", + &dev->persist->pdev->dev, MLX4_MAILBOX_SIZE, MLX4_MAILBOX_SIZE, 0); if (!priv->cmd.pool) @@ -2607,7 +2607,7 @@ void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask) struct mlx4_priv *priv = mlx4_priv(dev); if (priv->cmd.pool && (cleanup_mask & MLX4_CMD_CLEANUP_POOL)) { - pci_pool_destroy(priv->cmd.pool); + dma_pool_destroy(priv->cmd.pool); priv->cmd.pool = NULL; } @@ -2699,7 +2699,7 @@ struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev) if (!mailbox) return ERR_PTR(-ENOMEM); - mailbox->buf = pci_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL, + mailbox->buf = dma_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL, &mailbox->dma); if (!mailbox->buf) { kfree(mailbox); @@ -2716,7 +2716,7 @@ void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, if (!mailbox) return; - pci_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma); + dma_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma); kfree(mailbox); } EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox); diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index c56a511b918e..72eb50cd5ecd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -241,13 +241,14 @@ err_out: return err; } -static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn, u8 usage) { + u32 in_modifier = RES_CQ | (((u32)usage & 3) << 30); u64 out_param; int err; if (mlx4_is_mfunc(dev)) { - err = mlx4_cmd_imm(dev, 0, &out_param, RES_CQ, + err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier, RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) @@ -303,7 +304,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, cq->vector = vector; - err = mlx4_cq_alloc_icm(dev, &cq->cqn); + err = mlx4_cq_alloc_icm(dev, &cq->cqn, cq->usage); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 85fe17e4dcfb..f849eec21824 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -140,6 +140,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, (cq->type == RX && priv->hwtstamp_config.rx_filter)) timestamp_en = 1; + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq, cq->vector, 0, timestamp_en); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 3a291fc1780a..e3e6d9fa69fd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -651,7 +651,8 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv) return 0; } - err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP); + err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP, + MLX4_RES_USAGE_DRIVER); en_dbg(DRV, priv, "Reserved qp %d\n", *qpn); if (err) { en_err(priv, "Failed to reserve qp for mac registration\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index bf1638044a7a..ec24c4057be6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -1088,7 +1088,8 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) u32 qpn; err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, - MLX4_RESERVE_A0_QP); + MLX4_RESERVE_A0_QP, + MLX4_RES_USAGE_DRIVER); if (err) { en_err(priv, "Failed reserving drop qpn\n"); return err; @@ -1134,7 +1135,8 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) flags = priv->rx_ring_num == 1 ? MLX4_RESERVE_A0_QP : 0; err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, priv->rx_ring_num, - &rss_map->base_qpn, flags); + &rss_map->base_qpn, flags, + MLX4_RES_USAGE_DRIVER); if (err) { en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 73faa3d77921..a81db2582555 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -105,7 +105,8 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, (unsigned long long) ring->sp_wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, - MLX4_RESERVE_ETH_BF_QP); + MLX4_RESERVE_ETH_BF_QP, + MLX4_RES_USAGE_DRIVER); if (err) { en_err(priv, "failed reserving qp for TX ring\n"); goto err_hwq_res; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 5fe5cdc51357..a594bfd9e095 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2477,7 +2477,7 @@ static int mlx4_allocate_default_counters(struct mlx4_dev *dev) priv->def_counter[port] = -1; for (port = 0; port < dev->caps.num_ports; port++) { - err = mlx4_counter_alloc(dev, &idx); + err = mlx4_counter_alloc(dev, &idx, MLX4_RES_USAGE_DRIVER); if (!err || err == -ENOSPC) { priv->def_counter[port] = idx; @@ -2519,13 +2519,14 @@ int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) return 0; } -int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage) { + u32 in_modifier = RES_COUNTER | (((u32)usage & 3) << 30); u64 out_param; int err; if (mlx4_is_mfunc(dev)) { - err = mlx4_cmd_imm(dev, 0, &out_param, RES_COUNTER, + err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (!err) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 706d7f21ac5c..852d00a5b016 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -626,7 +626,7 @@ struct mlx4_mgm { }; struct mlx4_cmd { - struct pci_pool *pool; + struct dma_pool *pool; void __iomem *hcr; struct mutex slave_cmd_mutex; struct semaphore poll_sem; diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 26747212526b..5e5b4475b85e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -245,8 +245,9 @@ int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, } int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, - int *base, u8 flags) + int *base, u8 flags, u8 usage) { + u32 in_modifier = RES_QP | (((u32)usage & 3) << 30); u64 in_param = 0; u64 out_param; int err; @@ -258,7 +259,7 @@ int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, set_param_l(&in_param, (((u32)flags) << 24) | (u32)cnt); set_param_h(&in_param, align); err = mlx4_cmd_imm(dev, in_param, &out_param, - RES_QP, RES_OP_RESERVE, + in_modifier, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 31cbe5e86a01..1acbb721f38d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1110,7 +1110,7 @@ static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, if (!mailbox) return ERR_PTR(-ENOMEM); - mailbox->buf = pci_pool_zalloc(dev->cmd.pool, flags, + mailbox->buf = dma_pool_zalloc(dev->cmd.pool, flags, &mailbox->dma); if (!mailbox->buf) { mlx5_core_dbg(dev, "failed allocation\n"); @@ -1125,7 +1125,7 @@ static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, static void free_cmd_box(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *mailbox) { - pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + dma_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); kfree(mailbox); } @@ -1776,7 +1776,8 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) return -EINVAL; } - cmd->pool = pci_pool_create("mlx5_cmd", dev->pdev, size, align, 0); + cmd->pool = dma_pool_create("mlx5_cmd", &dev->pdev->dev, size, align, + 0); if (!cmd->pool) return -ENOMEM; @@ -1866,7 +1867,7 @@ err_free_page: free_cmd_page(dev, cmd); err_free_pool: - pci_pool_destroy(cmd->pool); + dma_pool_destroy(cmd->pool); return err; } @@ -1880,6 +1881,6 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) destroy_workqueue(cmd->wq); destroy_msg_cache(dev); free_cmd_page(dev, cmd); - pci_pool_destroy(cmd->pool); + dma_pool_destroy(cmd->pool); } EXPORT_SYMBOL(mlx5_cmd_cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 2f26fb34d741..040d1af310b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -596,7 +596,6 @@ struct mlx5e_channel { struct mlx5_core_dev *mdev; struct mlx5e_tstamp *tstamp; int ix; - int cpu; }; struct mlx5e_channels { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 6ad7f07e7861..d75f3099d164 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -71,6 +71,11 @@ struct mlx5e_channel_param { struct mlx5e_cq_param icosq_cq; }; +static int mlx5e_get_node(struct mlx5e_priv *priv, int ix) +{ + return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix); +} + static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { return MLX5_CAP_GEN(mdev, striding_rq) && @@ -396,7 +401,7 @@ static void mlx5e_enable_async_events(struct mlx5e_priv *priv) static void mlx5e_disable_async_events(struct mlx5e_priv *priv) { clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state); - synchronize_irq(mlx5_get_msix_vec(priv->mdev, MLX5_EQ_VEC_ASYNC)); + synchronize_irq(pci_irq_vector(priv->mdev->pdev, MLX5_EQ_VEC_ASYNC)); } static inline int mlx5e_get_wqe_mtt_sz(void) @@ -443,16 +448,17 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int wq_sz = mlx5_wq_ll_get_size(&rq->wq); int mtt_sz = mlx5e_get_wqe_mtt_sz(); int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1; + int node = mlx5e_get_node(c->priv, c->ix); int i; rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), - GFP_KERNEL, cpu_to_node(c->cpu)); + GFP_KERNEL, node); if (!rq->mpwqe.info) goto err_out; /* We allocate more than mtt_sz as we will align the pointer */ - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL, - cpu_to_node(c->cpu)); + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, + GFP_KERNEL, node); if (unlikely(!rq->mpwqe.mtt_no_align)) goto err_free_wqe_info; @@ -560,7 +566,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, int err; int i; - rqp->wq.db_numa_node = cpu_to_node(c->cpu); + rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); @@ -627,7 +633,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, default: /* MLX5_WQ_TYPE_LINKED_LIST */ rq->wqe.frag_info = kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info), - GFP_KERNEL, cpu_to_node(c->cpu)); + GFP_KERNEL, + mlx5e_get_node(c->priv, c->ix)); if (!rq->wqe.frag_info) { err = -ENOMEM; goto err_rq_wq_destroy; @@ -992,13 +999,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1046,13 +1053,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, sq->channel = c; sq->uar_map = mdev->mlx5e_res.bfreg.map; - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1118,13 +1125,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1496,8 +1503,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c, struct mlx5_core_dev *mdev = c->priv->mdev; int err; - param->wq.buf_numa_node = cpu_to_node(c->cpu); - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); param->eq_ix = c->ix; err = mlx5e_alloc_cq_common(mdev, param, cq); @@ -1596,11 +1603,6 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq) mlx5e_free_cq(cq); } -static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix) -{ - return cpumask_first(priv->mdev->priv.irq_info[ix].mask); -} - static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_channel_param *cparam) @@ -1749,11 +1751,10 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, { struct mlx5e_cq_moder icocq_moder = {0, 0}; struct net_device *netdev = priv->netdev; - int cpu = mlx5e_get_cpu(priv, ix); struct mlx5e_channel *c; int err; - c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); + c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix)); if (!c) return -ENOMEM; @@ -1761,7 +1762,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->mdev = priv->mdev; c->tstamp = &priv->tstamp; c->ix = ix; - c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); @@ -1847,7 +1847,8 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_rq(&c->rq); - netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix); + netif_set_xps_queue(c->netdev, + mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix); } static void mlx5e_deactivate_channel(struct mlx5e_channel *c) @@ -3793,18 +3794,8 @@ void mlx5e_build_default_indir_rqt(struct mlx5_core_dev *mdev, u32 *indirection_rqt, int len, int num_channels) { - int node = mdev->priv.numa_node; - int node_num_of_cores; int i; - if (node == -1) - node = first_online_node; - - node_num_of_cores = cpumask_weight(cpumask_of_node(node)); - - if (node_num_of_cores) - num_channels = min_t(int, num_channels, node_num_of_cores); - for (i = 0; i < len; i++) indirection_rqt[i] = i % num_channels; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 898759fcf9ec..1f1f8af87d4d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -189,6 +189,7 @@ struct mlx5e_lbt_priv { struct packet_type pt; struct completion comp; bool loopback_ok; + bool local_lb; }; static int @@ -236,6 +237,13 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, { int err = 0; + /* Temporarily enable local_lb */ + if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { + mlx5_nic_vport_query_local_lb(priv->mdev, &lbtp->local_lb); + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, true); + } + err = mlx5e_refresh_tirs(priv, true); if (err) return err; @@ -254,6 +262,11 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, static void mlx5e_test_loopback_cleanup(struct mlx5e_priv *priv, struct mlx5e_lbt_priv *lbtp) { + if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, false); + } + dev_remove_pack(&lbtp->pt); mlx5e_refresh_tirs(priv, false); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 52b9a64cd3a2..edd11ebd392e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -161,6 +161,8 @@ static const char *eqe_type_str(u8 type) return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE"; case MLX5_EVENT_TYPE_FPGA_ERROR: return "MLX5_EVENT_TYPE_FPGA_ERROR"; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + return "MLX5_EVENT_TYPE_GENERAL_EVENT"; default: return "Unrecognized event"; } @@ -378,6 +380,20 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token, EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); #endif +static void general_event_handler(struct mlx5_core_dev *dev, + struct mlx5_eqe *eqe) +{ + switch (eqe->sub_type) { + case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT: + if (dev->event) + dev->event(dev, MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, 0); + break; + default: + mlx5_core_dbg(dev, "General event with unrecognized subtype: sub_type %d\n", + eqe->sub_type); + } +} + static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) { struct mlx5_eq *eq = eq_ptr; @@ -486,6 +502,9 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) mlx5_fpga_event(dev, eqe->type, &eqe->data.raw); break; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + general_event_handler(dev, eqe); + break; default: mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", eqe->type, eq->eqn); @@ -585,7 +604,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, name, pci_name(dev->pdev)); eq->eqn = MLX5_GET(create_eq_out, out, eq_number); - eq->irqn = priv->msix_arr[vecidx].vector; + eq->irqn = pci_irq_vector(dev->pdev, vecidx); eq->dev = dev; eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET; err = request_irq(eq->irqn, handler, 0, @@ -620,7 +639,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, return 0; err_irq: - free_irq(priv->msix_arr[vecidx].vector, eq); + free_irq(eq->irqn, eq); err_eq: mlx5_cmd_destroy_eq(dev, eq->eqn); @@ -661,11 +680,6 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) } EXPORT_SYMBOL_GPL(mlx5_destroy_unmap_eq); -u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx) -{ - return dev->priv.msix_arr[MLX5_EQ_VEC_ASYNC].vector; -} - int mlx5_eq_init(struct mlx5_core_dev *dev) { int err; @@ -693,6 +707,10 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) mlx5_core_is_pf(dev)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_NIC_VPORT_CHANGE); + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH && + MLX5_CAP_GEN(dev, general_notification_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_GENERAL_EVENT); + if (MLX5_CAP_GEN(dev, port_module_event)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_PORT_MODULE_EVENT); else diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 8b18cc9ec026..5b41f692acad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1585,7 +1585,7 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) /* Mark this vport as disabled to discard new events */ vport->enabled = false; - synchronize_irq(mlx5_get_msix_vec(esw->dev, MLX5_EQ_VEC_ASYNC)); + synchronize_irq(pci_irq_vector(esw->dev->pdev, MLX5_EQ_VEC_ASYNC)); /* Wait for current already scheduled events to complete */ flush_workqueue(esw->work_queue); /* Disable events from this vport */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index fa33d59ab485..2c71557d1cee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -120,6 +120,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_IPOIB_ENHANCED_OFFLOADS); + if (err) + return err; + } + if (MLX5_CAP_GEN(dev, pg)) { err = mlx5_core_get_caps(dev, MLX5_CAP_ODP); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 4b6b03d6297f..8aea0a065e56 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -81,7 +81,7 @@ static void trigger_cmd_completions(struct mlx5_core_dev *dev) u64 vector; /* wait for pending handlers to complete */ - synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); + synchronize_irq(pci_irq_vector(dev->pdev, MLX5_EQ_VEC_CMD)); spin_lock_irqsave(&dev->cmd.alloc_lock, flags); vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); if (!vector) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 16885827367b..8c4b45ef539c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -47,6 +47,7 @@ #include <linux/debugfs.h> #include <linux/kmod.h> #include <linux/mlx5/mlx5_ifc.h> +#include <linux/mlx5/vport.h> #ifdef CONFIG_RFS_ACCEL #include <linux/cpu_rmap.h> #endif @@ -312,13 +313,15 @@ static void release_bar(struct pci_dev *pdev) pci_release_regions(pdev); } -static int mlx5_enable_msix(struct mlx5_core_dev *dev) +static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; + struct irq_affinity irqdesc = { + .pre_vectors = MLX5_EQ_VEC_COMP_BASE, + }; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; - int i; nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; @@ -326,17 +329,14 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev) if (nvec <= MLX5_EQ_VEC_COMP_BASE) return -ENOMEM; - priv->msix_arr = kcalloc(nvec, sizeof(*priv->msix_arr), GFP_KERNEL); - priv->irq_info = kcalloc(nvec, sizeof(*priv->irq_info), GFP_KERNEL); - if (!priv->msix_arr || !priv->irq_info) + if (!priv->irq_info) goto err_free_msix; - for (i = 0; i < nvec; i++) - priv->msix_arr[i].entry = i; - - nvec = pci_enable_msix_range(dev->pdev, priv->msix_arr, - MLX5_EQ_VEC_COMP_BASE + 1, nvec); + nvec = pci_alloc_irq_vectors_affinity(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + 1, nvec, + PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, + &irqdesc); if (nvec < 0) return nvec; @@ -346,17 +346,15 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev) err_free_msix: kfree(priv->irq_info); - kfree(priv->msix_arr); return -ENOMEM; } -static void mlx5_disable_msix(struct mlx5_core_dev *dev) +static void mlx5_free_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; - pci_disable_msix(dev->pdev); + pci_free_irq_vectors(dev->pdev); kfree(priv->irq_info); - kfree(priv->msix_arr); } struct mlx5_reg_host_endianness { @@ -579,6 +577,18 @@ static int set_hca_ctrl(struct mlx5_core_dev *dev) return err; } +static int mlx5_core_set_hca_defaults(struct mlx5_core_dev *dev) +{ + int ret = 0; + + /* Disable local_lb by default */ + if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + MLX5_CAP_GEN(dev, disable_local_lb)) + ret = mlx5_nic_vport_update_local_lb(dev, false); + + return ret; +} + int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id) { u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {0}; @@ -612,65 +622,6 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) return (u64)timer_l | (u64)timer_h1 << 32; } -static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) -{ - struct mlx5_priv *priv = &mdev->priv; - struct msix_entry *msix = priv->msix_arr; - int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; - - if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { - mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); - return -ENOMEM; - } - - cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), - priv->irq_info[i].mask); - - if (IS_ENABLED(CONFIG_SMP) && - irq_set_affinity_hint(irq, priv->irq_info[i].mask)) - mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); - - return 0; -} - -static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) -{ - struct mlx5_priv *priv = &mdev->priv; - struct msix_entry *msix = priv->msix_arr; - int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; - - irq_set_affinity_hint(irq, NULL); - free_cpumask_var(priv->irq_info[i].mask); -} - -static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev) -{ - int err; - int i; - - for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) { - err = mlx5_irq_set_affinity_hint(mdev, i); - if (err) - goto err_out; - } - - return 0; - -err_out: - for (i--; i >= 0; i--) - mlx5_irq_clear_affinity_hint(mdev, i); - - return err; -} - -static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev) -{ - int i; - - for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) - mlx5_irq_clear_affinity_hint(mdev, i); -} - int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn) { @@ -760,8 +711,8 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev) } #ifdef CONFIG_RFS_ACCEL - irq_cpu_rmap_add(dev->rmap, - dev->priv.msix_arr[i + MLX5_EQ_VEC_COMP_BASE].vector); + irq_cpu_rmap_add(dev->rmap, pci_irq_vector(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + i)); #endif snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i); err = mlx5_create_map_eq(dev, eq, @@ -1119,9 +1070,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_poll; } - err = mlx5_enable_msix(dev); + err = mlx5_alloc_irq_vectors(dev); if (err) { - dev_err(&pdev->dev, "enable msix failed\n"); + dev_err(&pdev->dev, "alloc irq vectors failed\n"); goto err_cleanup_once; } @@ -1143,15 +1094,15 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_eqs; } - err = mlx5_irq_set_affinity_hints(dev); + err = mlx5_init_fs(dev); if (err) { - dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n"); - goto err_affinity_hints; + dev_err(&pdev->dev, "Failed to init flow steering\n"); + goto err_fs; } - err = mlx5_init_fs(dev); + err = mlx5_core_set_hca_defaults(dev); if (err) { - dev_err(&pdev->dev, "Failed to init flow steering\n"); + dev_err(&pdev->dev, "Failed to set hca defaults\n"); goto err_fs; } @@ -1207,9 +1158,6 @@ err_sriov: mlx5_cleanup_fs(dev); err_fs: - mlx5_irq_clear_affinity_hints(dev); - -err_affinity_hints: free_comp_eqs(dev); err_stop_eqs: @@ -1219,7 +1167,7 @@ err_put_uars: mlx5_put_uars_page(dev, priv->uar); err_disable_msix: - mlx5_disable_msix(dev); + mlx5_free_irq_vectors(dev); err_cleanup_once: if (boot) @@ -1281,11 +1229,10 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_eswitch_detach(dev->priv.eswitch); #endif mlx5_cleanup_fs(dev); - mlx5_irq_clear_affinity_hints(dev); free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_put_uars_page(dev, priv->uar); - mlx5_disable_msix(dev); + mlx5_free_irq_vectors(dev); if (cleanup) mlx5_cleanup_once(dev); mlx5_stop_health_poll(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 6a263e8d883a..01d637dac533 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -110,7 +110,6 @@ int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, u32 element_id); int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev); u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev); -u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx); struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn); void mlx5_cq_tasklet_cb(unsigned long data); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 340f281c9801..db9e665ab104 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -242,6 +242,20 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); +int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec) +{ + u32 out[MLX5_ST_SZ_DW(set_delay_drop_params_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {0}; + + MLX5_SET(set_delay_drop_params_in, in, opcode, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); + MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, + timeout_usec / 100); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_set_delay_drop); + struct mbox_info { u32 *in; u32 *out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index bf99d40e30b4..28d8472b36f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -32,6 +32,7 @@ #include <linux/pci.h> #include <linux/mlx5/driver.h> +#include <linux/mlx5/vport.h> #include "mlx5_core.h" #ifdef CONFIG_MLX5_CORE_EN #include "eswitch.h" @@ -44,6 +45,38 @@ bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev) return !!sriov->num_vfs; } +static int sriov_restore_guids(struct mlx5_core_dev *dev, int vf) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_hca_vport_context *in; + int err = 0; + + /* Restore sriov guid and policy settings */ + if (sriov->vfs_ctx[vf].node_guid || + sriov->vfs_ctx[vf].port_guid || + sriov->vfs_ctx[vf].policy != MLX5_POLICY_INVALID) { + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->node_guid = sriov->vfs_ctx[vf].node_guid; + in->port_guid = sriov->vfs_ctx[vf].port_guid; + in->policy = sriov->vfs_ctx[vf].policy; + in->field_select = + !!(in->port_guid) * MLX5_HCA_VPORT_SEL_PORT_GUID | + !!(in->node_guid) * MLX5_HCA_VPORT_SEL_NODE_GUID | + !!(in->policy) * MLX5_HCA_VPORT_SEL_STATE_POLICY; + + err = mlx5_core_modify_hca_vport_context(dev, 1, 1, vf + 1, in); + if (err) + mlx5_core_warn(dev, "modify vport context failed, unable to restore VF %d settings\n", vf); + + kfree(in); + } + + return err; +} + static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; @@ -74,6 +107,15 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) } sriov->vfs_ctx[vf].enabled = 1; sriov->enabled_vfs++; + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) { + err = sriov_restore_guids(dev, vf); + if (err) { + mlx5_core_warn(dev, + "failed to restore VF %d settings, err %d\n", + vf, err); + continue; + } + } mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c index 520f6382dfde..23cc337a96c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -435,16 +435,128 @@ out: return err; } +static int create_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_xrq_out)] = {0}; + void *create_in; + void *xrqc; + void *wq; + int pas_size; + int inlen; + int err; + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context); + wq = MLX5_ADDR_OF(xrqc, xrqc, wq); + + set_wq(wq, in); + memcpy(MLX5_ADDR_OF(xrqc, xrqc, wq.pas), in->pas, pas_size); + + if (in->type == IB_SRQT_TM) { + MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING); + if (in->flags & MLX5_SRQ_FLAG_RNDV) + MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_RNDV); + MLX5_SET(xrqc, xrqc, + tag_matching_topology_context.log_matching_list_sz, + in->tm_log_list_size); + } + MLX5_SET(xrqc, xrqc, user_index, in->user_index); + MLX5_SET(xrqc, xrqc, cqn, in->cqn); + MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ); + err = mlx5_cmd_exec(dev, create_in, inlen, create_out, + sizeof(create_out)); + kvfree(create_in); + if (!err) + srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn); + + return err; +} + +static int destroy_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0}; + + MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ); + MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int arm_xrq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + u16 lwm) +{ + u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0}; + + MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ); + MLX5_SET(arm_rq_in, in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, in, lwm, lwm); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int query_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0}; + u32 *xrq_out; + int outlen = MLX5_ST_SZ_BYTES(query_xrq_out); + void *xrqc; + int err; + + xrq_out = kvzalloc(outlen, GFP_KERNEL); + if (!xrq_out) + return -ENOMEM; + + MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ); + MLX5_SET(query_xrq_in, in, xrqn, srq->srqn); + + err = mlx5_cmd_exec(dev, in, sizeof(in), xrq_out, outlen); + if (err) + goto out; + + xrqc = MLX5_ADDR_OF(query_xrq_out, xrq_out, xrq_context); + get_wq(MLX5_ADDR_OF(xrqc, xrqc, wq), out); + if (MLX5_GET(xrqc, xrqc, state) != MLX5_XRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; + out->tm_next_tag = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.append_next_index); + out->tm_hw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.hw_phase_cnt); + out->tm_sw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.sw_phase_cnt); + +out: + kvfree(xrq_out); + return err; +} + static int create_srq_split(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, struct mlx5_srq_attr *in) { if (!dev->issi) return create_srq_cmd(dev, srq, in); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return create_xrc_srq_cmd(dev, srq, in); - else + case MLX5_RES_XRQ: + return create_xrq_cmd(dev, srq, in); + default: return create_rmp_cmd(dev, srq, in); + } } static int destroy_srq_split(struct mlx5_core_dev *dev, @@ -452,10 +564,14 @@ static int destroy_srq_split(struct mlx5_core_dev *dev, { if (!dev->issi) return destroy_srq_cmd(dev, srq); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return destroy_xrc_srq_cmd(dev, srq); - else + case MLX5_RES_XRQ: + return destroy_xrq_cmd(dev, srq); + default: return destroy_rmp_cmd(dev, srq); + } } int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, @@ -464,10 +580,16 @@ int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, int err; struct mlx5_srq_table *table = &dev->priv.srq_table; - if (in->type == IB_SRQT_XRC) + switch (in->type) { + case IB_SRQT_XRC: srq->common.res = MLX5_RES_XSRQ; - else + break; + case IB_SRQT_TM: + srq->common.res = MLX5_RES_XRQ; + break; + default: srq->common.res = MLX5_RES_SRQ; + } err = create_srq_split(dev, srq, in); if (err) @@ -528,10 +650,14 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, { if (!dev->issi) return query_srq_cmd(dev, srq, out); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return query_xrc_srq_cmd(dev, srq, out); - else + case MLX5_RES_XRQ: + return query_xrq_cmd(dev, srq, out); + default: return query_rmp_cmd(dev, srq, out); + } } EXPORT_SYMBOL(mlx5_core_query_srq); @@ -540,10 +666,14 @@ int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, { if (!dev->issi) return arm_srq_cmd(dev, srq, lwm, is_srq); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return arm_xrc_srq_cmd(dev, srq, lwm); - else + case MLX5_RES_XRQ: + return arm_xrq_cmd(dev, srq, lwm); + default: return arm_rmp_cmd(dev, srq, lwm); + } } EXPORT_SYMBOL(mlx5_core_arm_srq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 5abfec1c3399..d653b0025b13 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -897,6 +897,68 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc); +enum { + UC_LOCAL_LB, + MC_LOCAL_LB +}; + +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + mlx5_core_dbg(mdev, "%s local_lb\n", enable ? "enable" : "disable"); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_mc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_mc_local_lb, !enable); + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_uc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_uc_local_lb, !enable); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_update_local_lb); + +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int value; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + if (err) + goto out; + + value = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_mc_local_lb) << MC_LOCAL_LB; + + value |= MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_uc_local_lb) << UC_LOCAL_LB; + + *status = !value; + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_query_local_lb); + enum mlx5_vport_roce_state { MLX5_VPORT_ROCE_DISABLED = 0, MLX5_VPORT_ROCE_ENABLED = 1, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a03299d77922..a7f7d0ae3331 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -19,6 +19,7 @@ #include <linux/string.h> #include <linux/atomic.h> #include <linux/blk-mq.h> +#include <linux/blk-mq-rdma.h> #include <linux/types.h> #include <linux/list.h> #include <linux/mutex.h> @@ -463,14 +464,10 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) ibdev = queue->device->dev; /* - * The admin queue is barely used once the controller is live, so don't - * bother to spread it out. + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. */ - if (idx == 0) - comp_vector = 0; - else - comp_vector = idx % ibdev->num_comp_vectors; - + comp_vector = idx == 0 ? idx : idx - 1; /* +1 for ib_stop_cq */ queue->ib_cq = ib_alloc_cq(ibdev, queue, @@ -611,10 +608,20 @@ out_free_queues: static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + struct ib_device *ibdev = ctrl->device->dev; unsigned int nr_io_queues; int i, ret; nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); + + /* + * we map queues according to the device irq vectors for + * optimal locality so we don't need more queues than + * completion vectors. + */ + nr_io_queues = min_t(unsigned int, nr_io_queues, + ibdev->num_comp_vectors); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) return ret; @@ -1502,6 +1509,13 @@ static void nvme_rdma_complete_rq(struct request *rq) nvme_complete_rq(rq); } +static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_rdma_ctrl *ctrl = set->driver_data; + + return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); +} + static const struct blk_mq_ops nvme_rdma_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, @@ -1511,6 +1525,7 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .init_hctx = nvme_rdma_init_hctx, .poll = nvme_rdma_poll, .timeout = nvme_rdma_timeout, + .map_queues = nvme_rdma_map_queues, }; static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { @@ -1950,10 +1965,6 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .create_ctrl = nvme_rdma_create_ctrl, }; -static void nvme_rdma_add_one(struct ib_device *ib_device) -{ -} - static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) { struct nvme_rdma_ctrl *ctrl; @@ -1975,7 +1986,6 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) static struct ib_client nvme_rdma_ib_client = { .name = "nvme_rdma", - .add = nvme_rdma_add_one, .remove = nvme_rdma_remove_one }; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 56a4cba690b5..76d2bb793afe 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1510,10 +1510,6 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = { .delete_ctrl = nvmet_rdma_delete_ctrl, }; -static void nvmet_rdma_add_one(struct ib_device *ib_device) -{ -} - static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) { struct nvmet_rdma_queue *queue; @@ -1534,7 +1530,6 @@ static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data static struct ib_client nvmet_rdma_ib_client = { .name = "nvmet_rdma", - .add = nvmet_rdma_add_one, .remove = nvmet_rdma_remove_one }; diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d51e8738f9c2..4450feaf5c00 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1307,6 +1307,7 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner, drv->driver.bus = &pci_bus_type; drv->driver.owner = owner; drv->driver.mod_name = mod_name; + drv->driver.groups = drv->groups; spin_lock_init(&drv->dynids.lock); INIT_LIST_HEAD(&drv->dynids.list); diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h new file mode 100644 index 000000000000..b4ade198007d --- /dev/null +++ b/include/linux/blk-mq-rdma.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_BLK_MQ_RDMA_H +#define _LINUX_BLK_MQ_RDMA_H + +struct blk_mq_tag_set; +struct ib_device; + +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec); + +#endif /* _LINUX_BLK_MQ_RDMA_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b54517c05e9a..c8a63e148a98 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -428,6 +428,12 @@ enum mlx4_steer_type { MLX4_NUM_STEERS }; +enum mlx4_resource_usage { + MLX4_RES_USAGE_NONE, + MLX4_RES_USAGE_DRIVER, + MLX4_RES_USAGE_USER_VERBS, +}; + enum { MLX4_NUM_FEXCH = 64 * 1024, }; @@ -749,6 +755,7 @@ struct mlx4_cq { } tasklet_ctx; int reset_notify_added; struct list_head reset_notify; + u8 usage; }; struct mlx4_qp { @@ -758,6 +765,7 @@ struct mlx4_qp { atomic_t refcount; struct completion free; + u8 usage; }; struct mlx4_srq { @@ -1121,7 +1129,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, unsigned vector, int collapsed, int timestamp_en); void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, - int *base, u8 flags); + int *base, u8 flags, u8 usage); void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); @@ -1418,7 +1426,7 @@ int mlx4_get_phys_port_id(struct mlx4_dev *dev); int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); -int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage); void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f31a0b5377e1..c13d71deaeca 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -290,6 +290,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_GPIO_EVENT = 0x15, MLX5_EVENT_TYPE_PORT_MODULE_EVENT = 0x16, MLX5_EVENT_TYPE_REMOTE_CONFIG = 0x19, + MLX5_EVENT_TYPE_GENERAL_EVENT = 0x22, MLX5_EVENT_TYPE_PPS_EVENT = 0x25, MLX5_EVENT_TYPE_DB_BF_CONGESTION = 0x1a, @@ -305,6 +306,10 @@ enum mlx5_event { }; enum { + MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, +}; + +enum { MLX5_PORT_CHANGE_SUBTYPE_DOWN = 1, MLX5_PORT_CHANGE_SUBTYPE_ACTIVE = 4, MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED = 5, @@ -968,7 +973,7 @@ enum mlx5_cap_type { MLX5_CAP_ATOMIC, MLX5_CAP_ROCE, MLX5_CAP_IPOIB_OFFLOADS, - MLX5_CAP_EOIB_OFFLOADS, + MLX5_CAP_IPOIB_ENHANCED_OFFLOADS, MLX5_CAP_FLOW_TABLE, MLX5_CAP_ESWITCH_FLOW_TABLE, MLX5_CAP_ESWITCH, @@ -1011,6 +1016,10 @@ enum mlx5_mcam_feature_groups { MLX5_GET(per_protocol_networking_offload_caps,\ mdev->caps.hca_max[MLX5_CAP_ETHERNET_OFFLOADS], cap) +#define MLX5_CAP_IPOIB_ENHANCED(mdev, cap) \ + MLX5_GET(per_protocol_networking_offload_caps,\ + mdev->caps.hca_cur[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS], cap) + #define MLX5_CAP_ROCE(mdev, cap) \ MLX5_GET(roce_cap, mdev->caps.hca_cur[MLX5_CAP_ROCE], cap) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 205d82d4c468..b3fc9d586a9f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -162,6 +162,13 @@ enum dbg_rsc_type { MLX5_DBG_RSC_CQ, }; +enum port_state_policy { + MLX5_POLICY_DOWN = 0, + MLX5_POLICY_UP = 1, + MLX5_POLICY_FOLLOW = 2, + MLX5_POLICY_INVALID = 0xffffffff +}; + struct mlx5_field_desc { struct dentry *dent; int i; @@ -185,6 +192,7 @@ enum mlx5_dev_event { MLX5_DEV_EVENT_GUID_CHANGE, MLX5_DEV_EVENT_CLIENT_REREG, MLX5_DEV_EVENT_PPS, + MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, }; enum mlx5_port_status { @@ -291,7 +299,7 @@ struct mlx5_cmd { struct semaphore pages_sem; int mode; struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS]; - struct pci_pool *pool; + struct dma_pool *pool; struct mlx5_cmd_debug dbg; struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES]; int checksum_disabled; @@ -410,6 +418,7 @@ enum mlx5_res_type { MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ, MLX5_RES_SRQ = 3, MLX5_RES_XSRQ = 4, + MLX5_RES_XRQ = 5, }; struct mlx5_core_rsc_common { @@ -525,6 +534,9 @@ struct mlx5_mkey_table { struct mlx5_vf_context { int enabled; + u64 port_guid; + u64 node_guid; + enum port_state_policy policy; }; struct mlx5_core_sriov { @@ -534,7 +546,6 @@ struct mlx5_core_sriov { }; struct mlx5_irq_info { - cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; }; @@ -597,7 +608,6 @@ struct mlx5_port_module_event_stats { struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; - struct msix_entry *msix_arr; struct mlx5_irq_info *irq_info; /* pages stuff */ @@ -840,13 +850,6 @@ struct mlx5_pas { u8 log_sz; }; -enum port_state_policy { - MLX5_POLICY_DOWN = 0, - MLX5_POLICY_UP = 1, - MLX5_POLICY_FOLLOW = 2, - MLX5_POLICY_INVALID = 0xffffffff -}; - enum phy_port_state { MLX5_AAA_111 }; @@ -1089,7 +1092,7 @@ enum { }; enum { - MAX_UMR_CACHE_ENTRY = 20, + MR_CACHE_LAST_STD_ENTRY = 20, MLX5_IMR_MTT_CACHE_ENTRY, MLX5_IMR_KSM_CACHE_ENTRY, MAX_MR_CACHE_ENTRIES @@ -1183,4 +1186,10 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; +static inline const struct cpumask * +mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector) +{ + return pci_irq_get_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + vector); +} + #endif /* MLX5_DRIVER_H */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3030121b4746..b7338a21c780 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -200,6 +200,7 @@ enum { MLX5_CMD_OP_QUERY_SQ = 0x907, MLX5_CMD_OP_CREATE_RQ = 0x908, MLX5_CMD_OP_MODIFY_RQ = 0x909, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS = 0x910, MLX5_CMD_OP_DESTROY_RQ = 0x90a, MLX5_CMD_OP_QUERY_RQ = 0x90b, MLX5_CMD_OP_CREATE_RMP = 0x90c, @@ -294,8 +295,10 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_tcp_dport[0x1]; u8 inner_tcp_flags[0x1]; u8 reserved_at_37[0x9]; + u8 reserved_at_40[0x1a]; + u8 bth_dst_qp[0x1]; - u8 reserved_at_40[0x40]; + u8 reserved_at_5b[0x25]; }; struct mlx5_ifc_flow_table_prop_layout_bits { @@ -431,7 +434,9 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_100[0xc]; u8 inner_ipv6_flow_label[0x14]; - u8 reserved_at_120[0xe0]; + u8 reserved_at_120[0x28]; + u8 bth_dst_qp[0x18]; + u8 reserved_at_160[0xa0]; }; struct mlx5_ifc_cmd_pas_bits { @@ -599,7 +604,7 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 rss_ind_tbl_cap[0x4]; u8 reg_umr_sq[0x1]; u8 scatter_fcs[0x1]; - u8 reserved_at_1a[0x1]; + u8 enhanced_multi_pkt_send_wqe[0x1]; u8 tunnel_lso_const_out_ip_id[0x1]; u8 reserved_at_1c[0x2]; u8 tunnel_statless_gre[0x1]; @@ -840,7 +845,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 retransmission_q_counters[0x1]; u8 reserved_at_183[0x1]; u8 modify_rq_counter_set_id[0x1]; - u8 reserved_at_185[0x1]; + u8 rq_delay_drop[0x1]; u8 max_qp_cnt[0xa]; u8 pkey_table_size[0x10]; @@ -857,7 +862,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 pcam_reg[0x1]; u8 local_ca_ack_delay[0x5]; u8 port_module_event[0x1]; - u8 reserved_at_1b1[0x1]; + u8 enhanced_error_q_counters[0x1]; u8 ports_check[0x1]; u8 reserved_at_1b3[0x1]; u8 disable_link_up[0x1]; @@ -873,7 +878,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_tc[0x4]; u8 reserved_at_1d0[0x1]; u8 dcbx[0x1]; - u8 reserved_at_1d2[0x3]; + u8 general_notification_event[0x1]; + u8 reserved_at_1d3[0x2]; u8 fpga[0x1]; u8 rol_s[0x1]; u8 rol_g[0x1]; @@ -1016,7 +1022,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_wq_sz[0x5]; u8 nic_vport_change_event[0x1]; - u8 reserved_at_3e1[0xa]; + u8 disable_local_lb[0x1]; + u8 reserved_at_3e2[0x9]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; @@ -1187,7 +1194,8 @@ struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { u8 reserved_at_c0[0x12]; u8 cnp_dscp[0x6]; - u8 reserved_at_d8[0x5]; + u8 reserved_at_d8[0x4]; + u8 cnp_prio_mode[0x1]; u8 cnp_802p_prio[0x3]; u8 reserved_at_e0[0x720]; @@ -2015,6 +2023,10 @@ enum { }; enum { + MLX5_QPC_OFFLOAD_TYPE_RNDV = 0x1, +}; + +enum { MLX5_QPC_END_PADDING_MODE_SCATTER_AS_IS = 0x0, MLX5_QPC_END_PADDING_MODE_PAD_TO_CACHE_LINE_ALIGNMENT = 0x1, }; @@ -2057,7 +2069,8 @@ struct mlx5_ifc_qpc_bits { u8 st[0x8]; u8 reserved_at_10[0x3]; u8 pm_state[0x2]; - u8 reserved_at_15[0x7]; + u8 reserved_at_15[0x3]; + u8 offload_type[0x4]; u8 end_padding_mode[0x2]; u8 reserved_at_1e[0x2]; @@ -2437,7 +2450,7 @@ struct mlx5_ifc_sqc_bits { u8 cd_master[0x1]; u8 fre[0x1]; u8 flush_in_error_en[0x1]; - u8 reserved_at_4[0x1]; + u8 allow_multi_pkt_send_wqe[0x1]; u8 min_wqe_inline_mode[0x3]; u8 state[0x4]; u8 reg_umr[0x1]; @@ -2515,7 +2528,7 @@ enum { struct mlx5_ifc_rqc_bits { u8 rlky[0x1]; - u8 reserved_at_1[0x1]; + u8 delay_drop_en[0x1]; u8 scatter_fcs[0x1]; u8 vsd[0x1]; u8 mem_rq_type[0x4]; @@ -2562,7 +2575,9 @@ struct mlx5_ifc_rmpc_bits { struct mlx5_ifc_nic_vport_context_bits { u8 reserved_at_0[0x5]; u8 min_wqe_inline_mode[0x3]; - u8 reserved_at_8[0x17]; + u8 reserved_at_8[0x15]; + u8 disable_mc_local_lb[0x1]; + u8 disable_uc_local_lb[0x1]; u8 roce_en[0x1]; u8 arm_change_event[0x1]; @@ -3000,7 +3015,7 @@ struct mlx5_ifc_xrqc_bits { struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context; - u8 reserved_at_180[0x880]; + u8 reserved_at_180[0x280]; struct mlx5_ifc_wq_bits wq; }; @@ -3947,7 +3962,47 @@ struct mlx5_ifc_query_q_counter_out_bits { u8 local_ack_timeout_err[0x20]; - u8 reserved_at_320[0x4e0]; + u8 reserved_at_320[0xa0]; + + u8 resp_local_length_error[0x20]; + + u8 req_local_length_error[0x20]; + + u8 resp_local_qp_error[0x20]; + + u8 local_operation_error[0x20]; + + u8 resp_local_protection[0x20]; + + u8 req_local_protection[0x20]; + + u8 resp_cqe_error[0x20]; + + u8 req_cqe_error[0x20]; + + u8 req_mw_binding[0x20]; + + u8 req_bad_response[0x20]; + + u8 req_remote_invalid_request[0x20]; + + u8 resp_remote_invalid_request[0x20]; + + u8 req_remote_access_errors[0x20]; + + u8 resp_remote_access_errors[0x20]; + + u8 req_remote_operation_errors[0x20]; + + u8 req_transport_retries_exceeded[0x20]; + + u8 cq_overflow[0x20]; + + u8 resp_cqe_flush_error[0x20]; + + u8 req_cqe_flush_error[0x20]; + + u8 reserved_at_620[0x1e0]; }; struct mlx5_ifc_query_q_counter_in_bits { @@ -5229,7 +5284,9 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_at_0[0x16]; + u8 reserved_at_0[0x14]; + u8 disable_uc_local_lb[0x1]; + u8 disable_mc_local_lb[0x1]; u8 node_guid[0x1]; u8 port_guid[0x1]; u8 min_inline[0x1]; @@ -5847,6 +5904,28 @@ struct mlx5_ifc_destroy_rq_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_set_delay_drop_params_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 delay_drop_timeout[0x10]; +}; + +struct mlx5_ifc_set_delay_drop_params_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + struct mlx5_ifc_destroy_rmp_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f378dc0e7eaf..66d19b611fe4 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -560,6 +560,9 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *out, int outlen); +int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec); + int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn); void mlx5_init_qp_table(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h index 1cde0fd53f90..24ff23e27c8a 100644 --- a/include/linux/mlx5/srq.h +++ b/include/linux/mlx5/srq.h @@ -38,6 +38,7 @@ enum { MLX5_SRQ_FLAG_ERR = (1 << 0), MLX5_SRQ_FLAG_WQ_SIG = (1 << 1), + MLX5_SRQ_FLAG_RNDV = (1 << 2), }; struct mlx5_srq_attr { @@ -56,6 +57,10 @@ struct mlx5_srq_attr { u32 user_index; u64 db_record; __be64 *pas; + u32 tm_log_list_size; + u32 tm_next_tag; + u32 tm_hw_phase_cnt; + u32 tm_sw_phase_cnt; }; struct mlx5_core_dev; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 656c70b65dd2..aaa0bb9e7655 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -114,5 +114,6 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, int vf, struct mlx5_hca_vport_context *req); - +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable); +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status); #endif /* __MLX5_VPORT_H__ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index f958d0732af6..da05e5db06ac 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -731,6 +731,7 @@ struct pci_driver { void (*shutdown) (struct pci_dev *dev); int (*sriov_configure) (struct pci_dev *dev, int num_vfs); /* PF pdev */ const struct pci_error_handlers *err_handler; + const struct attribute_group **groups; struct device_driver driver; struct pci_dynids dynids; }; diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index b73a14edc85e..ec5008cf5d51 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -172,7 +172,8 @@ static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) (struct in6_addr *)gid); break; case AF_INET6: - memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, 16); + *(struct in6_addr *)&gid->raw = + ((struct sockaddr_in6 *)addr)->sin6_addr; break; default: return -EINVAL; @@ -304,7 +305,13 @@ static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) static inline int rdma_is_multicast_addr(struct in6_addr *addr) { - return addr->s6_addr[0] == 0xff; + u32 ipv4_addr; + + if (addr->s6_addr[0] == 0xff) + return 1; + + memcpy(&ipv4_addr, addr->s6_addr + 12, 4); + return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 5519f31f043a..c124d515f7d5 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -193,8 +193,12 @@ static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth) #define IB_LNH_MASK 3 #define IB_SC_MASK 0xf #define IB_SC_SHIFT 12 +#define IB_SC5_MASK 0x10 #define IB_SL_MASK 0xf #define IB_SL_SHIFT 4 +#define IB_SL_SHIFT 4 +#define IB_LVER_MASK 0xf +#define IB_LVER_SHIFT 8 static inline u8 ib_get_lnh(struct ib_header *hdr) { @@ -206,6 +210,11 @@ static inline u8 ib_get_sc(struct ib_header *hdr) return ((be16_to_cpu(hdr->lrh[0]) >> IB_SC_SHIFT) & IB_SC_MASK); } +static inline bool ib_is_sc5(u16 sc5) +{ + return !!(sc5 & IB_SC5_MASK); +} + static inline u8 ib_get_sl(struct ib_header *hdr) { return ((be16_to_cpu(hdr->lrh[0]) >> IB_SL_SHIFT) & IB_SL_MASK); @@ -221,6 +230,27 @@ static inline u16 ib_get_slid(struct ib_header *hdr) return (be16_to_cpu(hdr->lrh[3])); } +static inline u8 ib_get_lver(struct ib_header *hdr) +{ + return (u8)((be16_to_cpu(hdr->lrh[0]) >> IB_LVER_SHIFT) & + IB_LVER_MASK); +} + +static inline u16 ib_get_len(struct ib_header *hdr) +{ + return (u16)(be16_to_cpu(hdr->lrh[2])); +} + +static inline u32 ib_get_qkey(struct ib_other_headers *ohdr) +{ + return be32_to_cpu(ohdr->u.ud.deth[0]); +} + +static inline u32 ib_get_sqpn(struct ib_other_headers *ohdr) +{ + return ((be32_to_cpu(ohdr->u.ud.deth[1])) & IB_QPN_MASK); +} + /* * BTH */ @@ -229,6 +259,14 @@ static inline u16 ib_get_slid(struct ib_header *hdr) #define IB_BTH_PAD_MASK 3 #define IB_BTH_PKEY_MASK 0xffff #define IB_BTH_PAD_SHIFT 20 +#define IB_BTH_A_MASK 1 +#define IB_BTH_A_SHIFT 31 +#define IB_BTH_M_MASK 1 +#define IB_BTH_M_SHIFT 22 +#define IB_BTH_SE_MASK 1 +#define IB_BTH_SE_SHIFT 23 +#define IB_BTH_TVER_MASK 0xf +#define IB_BTH_TVER_SHIFT 16 static inline u8 ib_bth_get_pad(struct ib_other_headers *ohdr) { @@ -247,4 +285,50 @@ static inline u8 ib_bth_get_opcode(struct ib_other_headers *ohdr) IB_BTH_OPCODE_MASK); } +static inline u8 ib_bth_get_ackreq(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[2]) >> IB_BTH_A_SHIFT) & + IB_BTH_A_MASK); +} + +static inline u8 ib_bth_get_migreq(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_M_SHIFT) & + IB_BTH_M_MASK); +} + +static inline u8 ib_bth_get_se(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_SE_SHIFT) & + IB_BTH_SE_MASK); +} + +static inline u32 ib_bth_get_psn(struct ib_other_headers *ohdr) +{ + return (u32)(be32_to_cpu(ohdr->bth[2])); +} + +static inline u32 ib_bth_get_qpn(struct ib_other_headers *ohdr) +{ + return (u32)((be32_to_cpu(ohdr->bth[1])) & IB_QPN_MASK); +} + +static inline u8 ib_bth_get_becn(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[1]) >> IB_BECN_SHIFT) & + IB_BECN_MASK); +} + +static inline u8 ib_bth_get_fecn(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[1]) >> IB_FECN_SHIFT) & + IB_FECN_MASK); +} + +static inline u8 ib_bth_get_tver(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_TVER_SHIFT) & + IB_BTH_TVER_MASK); +} + #endif /* IB_HDRS_H */ diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h index 68cef3bd50fb..8ebf84ae9ed1 100644 --- a/include/rdma/ib_marshall.h +++ b/include/rdma/ib_marshall.h @@ -38,10 +38,12 @@ #include <rdma/ib_user_verbs.h> #include <rdma/ib_user_sa.h> -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src); -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, struct rdma_ah_attr *src); void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 88c32aba32f7..e6df68048517 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -64,6 +64,8 @@ #include <linux/cgroup_rdma.h> #include <uapi/rdma/ib_user_verbs.h> +#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN + extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; @@ -168,7 +170,7 @@ enum ib_device_cap_flags { IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), IB_DEVICE_SHUTDOWN_PORT = (1 << 8), - IB_DEVICE_INIT_TYPE = (1 << 9), + /* Not in use, former INIT_TYPE = (1 << 9),*/ IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), @@ -183,7 +185,7 @@ enum ib_device_cap_flags { * which will always contain a usable lkey. */ IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), - IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), + /* Reserved, old SEND_W_INV = (1 << 16),*/ IB_DEVICE_MEM_WINDOW = (1 << 17), /* * Devices should set IB_DEVICE_UD_IP_SUM if they support @@ -218,7 +220,7 @@ enum ib_device_cap_flags { * of I/O operations with single completion queue managed * by hardware. */ - IB_DEVICE_CROSS_CHANNEL = (1 << 27), + IB_DEVICE_CROSS_CHANNEL = (1 << 27), IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), @@ -278,6 +280,24 @@ struct ib_rss_caps { u32 max_rwq_indirection_table_size; }; +enum ib_tm_cap_flags { + /* Support tag matching on RC transport */ + IB_TM_CAP_RC = 1 << 0, +}; + +struct ib_xrq_caps { + /* Max size of RNDV header */ + u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + u32 max_num_tags; + /* From enum ib_tm_cap_flags */ + u32 flags; + /* Max number of outstanding list operations */ + u32 max_ops; + /* Max number of SGE in tag matching entry */ + u32 max_sge; +}; + enum ib_cq_creation_flags { IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, @@ -338,6 +358,7 @@ struct ib_device_attr { struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ + struct ib_xrq_caps xrq_caps; }; enum ib_mtu { @@ -549,8 +570,8 @@ struct ib_port_attr { u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; - u16 lid; - u16 sm_lid; + u32 sm_lid; + u32 lid; u8 lmc; u8 max_vl_num; u8 sm_sl; @@ -577,7 +598,8 @@ struct ib_device_modify { enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), - IB_PORT_RESET_QKEY_CNTR = (1<<3) + IB_PORT_RESET_QKEY_CNTR = (1<<3), + IB_PORT_OPA_MASK_CHG = (1<<4) }; struct ib_port_modify { @@ -664,6 +686,8 @@ union rdma_network_hdr { }; }; +#define IB_QPN_MASK 0xFFFFFF + enum { IB_MULTICAST_QPN = 0xffffff }; @@ -859,6 +883,7 @@ struct roce_ah_attr { struct opa_ah_attr { u32 dlid; u8 src_path_bits; + bool make_grd; }; struct rdma_ah_attr { @@ -948,7 +973,7 @@ struct ib_wc { u32 src_qp; int wc_flags; u16 pkey_index; - u16 slid; + u32 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ @@ -966,9 +991,16 @@ enum ib_cq_notify_flags { enum ib_srq_type { IB_SRQT_BASIC, - IB_SRQT_XRC + IB_SRQT_XRC, + IB_SRQT_TM, }; +static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) +{ + return srq_type == IB_SRQT_XRC || + srq_type == IB_SRQT_TM; +} + enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, @@ -986,11 +1018,17 @@ struct ib_srq_init_attr { struct ib_srq_attr attr; enum ib_srq_type srq_type; - union { - struct { - struct ib_xrcd *xrcd; - struct ib_cq *cq; - } xrc; + struct { + struct ib_cq *cq; + union { + struct { + struct ib_xrcd *xrcd; + } xrc; + + struct { + u32 max_num_tags; + } tag_matching; + }; } ext; }; @@ -1059,6 +1097,7 @@ enum ib_qp_create_flags { /* FREE = 1 << 7, */ IB_QP_CREATE_SCATTER_FCS = 1 << 8, IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_QP_CREATE_SOURCE_QPN = 1 << 10, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, @@ -1086,6 +1125,7 @@ struct ib_qp_init_attr { */ u8 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; + u32 source_qpn; }; struct ib_qp_open_attr { @@ -1527,12 +1567,14 @@ struct ib_srq { enum ib_srq_type srq_type; atomic_t usecnt; - union { - struct { - struct ib_xrcd *xrcd; - struct ib_cq *cq; - u32 srq_num; - } xrc; + struct { + struct ib_cq *cq; + union { + struct { + struct ib_xrcd *xrcd; + u32 srq_num; + } xrc; + }; } ext; }; @@ -1546,6 +1588,10 @@ enum ib_raw_packet_caps { IB_RAW_PACKET_CAP_SCATTER_FCS = (1 << 1), /* Checksum offloads are supported (for both send and receive). */ IB_RAW_PACKET_CAP_IP_CSUM = (1 << 2), + /* When a packet is received for an RQ with no receive WQEs, the + * packet processing is delayed. + */ + IB_RAW_PACKET_CAP_DELAY_DROP = (1 << 3), }; enum ib_wq_type { @@ -1574,6 +1620,7 @@ struct ib_wq { enum ib_wq_flags { IB_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, IB_WQ_FLAGS_SCATTER_FCS = 1 << 1, + IB_WQ_FLAGS_DELAY_DROP = 1 << 2, }; struct ib_wq_init_attr { @@ -2289,6 +2336,8 @@ struct ib_device { struct rdmacg_device cg_device; #endif + u32 index; + /** * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this @@ -2296,7 +2345,11 @@ struct ib_device { * in fast paths. */ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); - void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); + void (*get_dev_fw_str)(struct ib_device *, char *str); + const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, + int comp_vector); + + struct uverbs_root_spec *specs_root; }; struct ib_client { @@ -2332,7 +2385,7 @@ struct ib_client { struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); -void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len); +void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, @@ -2396,8 +2449,8 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask, enum rdma_link_layer ll); -int ib_register_event_handler (struct ib_event_handler *event_handler); -int ib_unregister_event_handler(struct ib_event_handler *event_handler); +void ib_register_event_handler(struct ib_event_handler *event_handler); +void ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); int ib_query_port(struct ib_device *device, @@ -3556,6 +3609,7 @@ void ib_drain_qp(struct ib_qp *qp); int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr); +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width); static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) { @@ -3609,6 +3663,20 @@ static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) return 0; } +static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr, + bool make_grd) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + attr->opa.make_grd = make_grd; +} + +static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + return attr->opa.make_grd; + return false; +} + static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u8 port_num) { attr->port_num = port_num; @@ -3707,4 +3775,52 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, else return RDMA_AH_ATTR_TYPE_IB; } + +/** + * ib_lid_cpu16 - Return lid in 16bit CPU encoding. + * In the current implementation the only way to get + * get the 32bit lid is from other sources for OPA. + * For IB, lids will always be 16bits so cast the + * value accordingly. + * + * @lid: A 32bit LID + */ +static inline u16 ib_lid_cpu16(u32 lid) +{ + WARN_ON_ONCE(lid & 0xFFFF0000); + return (u16)lid; +} + +/** + * ib_lid_be16 - Return lid in 16bit BE encoding. + * + * @lid: A 32bit LID + */ +static inline __be16 ib_lid_be16(u32 lid) +{ + WARN_ON_ONCE(lid & 0xFFFF0000); + return cpu_to_be16((u16)lid); +} + +/** + * ib_get_vector_affinity - Get the affinity mappings of a given completion + * vector + * @device: the rdma device + * @comp_vector: index of completion vector + * + * Returns NULL on failure, otherwise a corresponding cpu map of the + * completion vector (returns all-cpus map if the device driver doesn't + * implement get_vector_affinity). + */ +static inline const struct cpumask * +ib_get_vector_affinity(struct ib_device *device, int comp_vector) +{ + if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || + !device->get_vector_affinity) + return NULL; + + return device->get_vector_affinity(device, comp_vector); + +} + #endif /* IB_VERBS_H */ diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index eace28f1555d..e6e90f18e6d5 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -48,8 +48,21 @@ #ifndef OPA_ADDR_H #define OPA_ADDR_H +#include <rdma/opa_smi.h> + #define OPA_SPECIAL_OUI (0x00066AULL) #define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) +#define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ + ? 0 : x) +#define OPA_GID_INDEX 0x1 +/** + * 0xF8 - 4 bits of multicast range and 1 bit for collective range + * Example: For 24 bit LID space, + * Multicast range: 0xF00000 to 0xF7FFFF + * Collective range: 0xF80000 to 0xFFFFFE + */ +#define OPA_MCAST_NR 0x4 /* Number of top bits set */ +#define OPA_COLLECTIVE_NR 0x1 /* Number of bits after MCAST_NR */ /** * ib_is_opa_gid: Returns true if the top 24 bits of the gid @@ -59,7 +72,7 @@ * * @gid: The Global identifier */ -static inline bool ib_is_opa_gid(union ib_gid *gid) +static inline bool ib_is_opa_gid(const union ib_gid *gid) { return ((be64_to_cpu(gid->global.interface_id) >> 40) == OPA_SPECIAL_OUI); @@ -72,8 +85,33 @@ static inline bool ib_is_opa_gid(union ib_gid *gid) * * @gid: The Global identifier */ -static inline u32 opa_get_lid_from_gid(union ib_gid *gid) +static inline u32 opa_get_lid_from_gid(const union ib_gid *gid) { return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF; } + +/** + * opa_is_extended_lid: Returns true if dlid or slid are + * extended. + * + * @dlid: The DLID + * @slid: The SLID + */ +static inline bool opa_is_extended_lid(u32 dlid, u32 slid) +{ + if ((be32_to_cpu(dlid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (be32_to_cpu(slid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE))) + return true; + else + return false; +} + +/* Get multicast lid base */ +static inline u32 opa_get_mcast_base(u32 nr_top_bits) +{ + return (be32_to_cpu(OPA_LID_PERMISSIVE) << (32 - nr_top_bits)); +} + #endif /* OPA_ADDR_H */ diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h index 39d6890616a6..0c07a70bd7f6 100644 --- a/include/rdma/opa_vnic.h +++ b/include/rdma/opa_vnic.h @@ -54,9 +54,6 @@ #include <rdma/ib_verbs.h> -/* VNIC uses 16B header format */ -#define OPA_VNIC_L2_TYPE 0x2 - /* 16 header bytes + 2 reserved bytes */ #define OPA_VNIC_L2_HDR_LEN (16 + 2) diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 348c102cb5f6..2d878596b1e0 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -5,29 +5,43 @@ #include <linux/netlink.h> #include <uapi/rdma/rdma_netlink.h> -struct ibnl_client_cbs { +struct rdma_nl_cbs { + int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); - struct module *module; + u8 flags; }; +enum rdma_nl_flags { + /* Require CAP_NET_ADMIN */ + RDMA_NL_ADMIN_PERM = 1 << 0, +}; + +/* Define this module as providing netlink services for NETLINK_RDMA, with + * index _index. Since the client indexes were setup in a uapi header as an + * enum and we do no want to change that, the user must supply the expanded + * constant as well and the compiler checks they are the same. + */ +#define MODULE_ALIAS_RDMA_NETLINK(_index, _val) \ + static inline void __chk_##_index(void) \ + { \ + BUILD_BUG_ON(_index != _val); \ + } \ + MODULE_ALIAS("rdma-netlink-subsys-" __stringify(_val)) + /** - * Add a a client to the list of IB netlink exporters. + * Register client in RDMA netlink. * @index: Index of the added client - * @nops: Number of supported ops by the added client. * @cb_table: A table for op->callback - * - * Returns 0 on success or a negative error code. */ -int ibnl_add_client(int index, int nops, - const struct ibnl_client_cbs cb_table[]); +void rdma_nl_register(unsigned int index, + const struct rdma_nl_cbs cb_table[]); /** * Remove a client from IB netlink. * @index: Index of the removed IB client. - * - * Returns 0 on success or a negative error code. */ -int ibnl_remove_client(int index); +void rdma_nl_unregister(unsigned int index); /** * Put a new message in a supplied skb. @@ -56,22 +70,32 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, /** * Send the supplied skb to a specific userspace PID. * @skb: The netlink skb - * @nlh: Header of the netlink message to send * @pid: Userspace netlink process ID * Returns 0 on success or a negative error code. */ -int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid); +int rdma_nl_unicast(struct sk_buff *skb, u32 pid); + +/** + * Send, with wait/1 retry, the supplied skb to a specific userspace PID. + * @skb: The netlink skb + * @pid: Userspace netlink process ID + * Returns 0 on success or a negative error code. + */ +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); /** * Send the supplied skb to a netlink group. * @skb: The netlink skb - * @nlh: Header of the netlink message to send * @group: Netlink group ID * @flags: allocation flags * Returns 0 on success or a negative error code. */ -int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, - unsigned int group, gfp_t flags); +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); +/** + * Check if there are any listeners to the netlink group + * @group: the netlink group ID + * Returns 0 on success or a negative for no listeners. + */ +int rdma_nl_chk_listeners(unsigned int group); #endif /* _RDMA_NETLINK_H */ diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 55af69271053..1ba84a78f1c5 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -57,11 +57,21 @@ #include <linux/list.h> #include <linux/hash.h> #include <rdma/ib_verbs.h> +#include <rdma/ib_mad.h> #include <rdma/rdmavt_mr.h> #include <rdma/rdmavt_qp.h> #define RVT_MAX_PKEY_VALUES 16 +#define RVT_MAX_TRAP_LEN 100 /* Limit pending trap list */ +#define RVT_MAX_TRAP_LISTS 5 /*((IB_NOTICE_TYPE_INFO & 0x0F) + 1)*/ +#define RVT_TRAP_TIMEOUT 4096 /* 4.096 usec */ + +struct trap_list { + u32 list_len; + struct list_head list; +}; + struct rvt_ibport { struct rvt_qp __rcu *qp[2]; struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ @@ -75,12 +85,13 @@ struct rvt_ibport { __be64 mkey; u64 tid; u32 port_cap_flags; + u16 port_cap3_flags; u32 pma_sample_start; u32 pma_sample_interval; __be16 pma_counter_select[5]; u16 pma_tag; u16 mkey_lease_period; - u16 sm_lid; + u32 sm_lid; u8 sm_sl; u8 mkeyprot; u8 subnet_timeout; @@ -127,6 +138,13 @@ struct rvt_ibport { u16 *pkey_table; struct rvt_ah *sm_ah; + + /* + * Keep a list of traps that have not been repressed. They will be + * resent based on trap_timer. + */ + struct trap_list trap_lists[RVT_MAX_TRAP_LISTS]; + struct timer_list trap_timer; }; #define RVT_CQN_MAX 16 /* maximum length of cq name */ @@ -514,7 +532,8 @@ int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, u32 len, u64 vaddr, u32 rkey, int acc); int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, - struct rvt_sge *isge, struct ib_sge *sge, int acc); + struct rvt_sge *isge, struct rvt_sge *last_sge, + struct ib_sge *sge, int acc); struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid, u16 lid); diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h index f418bd5571a5..72a3856d4057 100644 --- a/include/rdma/rdmavt_mr.h +++ b/include/rdma/rdmavt_mr.h @@ -191,4 +191,7 @@ static inline void rvt_skip_sge(struct rvt_sge_state *ss, u32 length, } } +bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey); +bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey); + #endif /* DEF_RDMAVT_INCMRH */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d664d2e76280..0eed3d8752fa 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -277,7 +277,6 @@ struct rvt_qp { unsigned long timeout_jiffies; /* computed from timeout */ - enum ib_mtu path_mtu; int srate_mbps; /* s_srate (below) converted to Mbit/s */ pid_t pid; /* pid for user mode QPs */ u32 remote_qpn; @@ -396,7 +395,7 @@ struct rvt_srq { #define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) #define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) #define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1) -#define RVT_QPN_MASK 0xFFFFFF +#define RVT_QPN_MASK IB_QPN_MASK /* * QPN-map pages start out as NULL, they get allocated upon @@ -674,4 +673,34 @@ void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); void rvt_add_retry_timer(struct rvt_qp *qp); +/** + * struct rvt_qp_iter - the iterator for QPs + * @qp - the current QP + * + * This structure defines the current iterator + * state for sequenced access to all QPs relative + * to an rvt_dev_info. + */ +struct rvt_qp_iter { + struct rvt_qp *qp; + /* private: backpointer */ + struct rvt_dev_info *rdi; + /* private: callback routine */ + void (*cb)(struct rvt_qp *qp, u64 v); + /* private: for arg to callback routine */ + u64 v; + /* private: number of SMI,GSI QPs for device */ + int specials; + /* private: current iterator index */ + int n; +}; + +struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)); +int rvt_qp_iter_next(struct rvt_qp_iter *iter); +void rvt_qp_iter(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)); +void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey); #endif /* DEF_RDMAVT_INCQP_H */ diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h new file mode 100644 index 000000000000..6da44079aa58 --- /dev/null +++ b/include/rdma/uverbs_ioctl.h @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_IOCTL_ +#define _UVERBS_IOCTL_ + +#include <rdma/uverbs_types.h> +#include <linux/uaccess.h> +#include <rdma/rdma_user_ioctl.h> +#include <rdma/ib_user_ioctl_verbs.h> + +/* + * ======================================= + * Verbs action specifications + * ======================================= + */ + +enum uverbs_attr_type { + UVERBS_ATTR_TYPE_NA, + UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_TYPE_PTR_OUT, + UVERBS_ATTR_TYPE_IDR, + UVERBS_ATTR_TYPE_FD, +}; + +enum uverbs_obj_access { + UVERBS_ACCESS_READ, + UVERBS_ACCESS_WRITE, + UVERBS_ACCESS_NEW, + UVERBS_ACCESS_DESTROY +}; + +enum { + UVERBS_ATTR_SPEC_F_MANDATORY = 1U << 0, + /* Support extending attributes by length */ + UVERBS_ATTR_SPEC_F_MIN_SZ = 1U << 1, +}; + +struct uverbs_attr_spec { + enum uverbs_attr_type type; + union { + u16 len; + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u8 access; + } obj; + }; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; +}; + +struct uverbs_attr_spec_hash { + size_t num_attrs; + unsigned long *mandatory_attrs_bitmask; + struct uverbs_attr_spec attrs[0]; +}; + +struct uverbs_attr_bundle; +struct ib_uverbs_file; + +enum { + /* + * Action marked with this flag creates a context (or root for all + * objects). + */ + UVERBS_ACTION_FLAG_CREATE_ROOT = 1U << 0, +}; + +struct uverbs_method_spec { + /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */ + u32 flags; + size_t num_buckets; + size_t num_child_attrs; + int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); + struct uverbs_attr_spec_hash *attr_buckets[0]; +}; + +struct uverbs_method_spec_hash { + size_t num_methods; + struct uverbs_method_spec *methods[0]; +}; + +struct uverbs_object_spec { + const struct uverbs_obj_type *type_attrs; + size_t num_buckets; + struct uverbs_method_spec_hash *method_buckets[0]; +}; + +struct uverbs_object_spec_hash { + size_t num_objects; + struct uverbs_object_spec *objects[0]; +}; + +struct uverbs_root_spec { + size_t num_buckets; + struct uverbs_object_spec_hash *object_buckets[0]; +}; + +/* + * ======================================= + * Verbs definitions + * ======================================= + */ + +struct uverbs_attr_def { + u16 id; + struct uverbs_attr_spec attr; +}; + +struct uverbs_method_def { + u16 id; + /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */ + u32 flags; + size_t num_attrs; + const struct uverbs_attr_def * const (*attrs)[]; + int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); +}; + +struct uverbs_object_def { + u16 id; + const struct uverbs_obj_type *type_attrs; + size_t num_methods; + const struct uverbs_method_def * const (*methods)[]; +}; + +struct uverbs_object_tree_def { + size_t num_objects; + const struct uverbs_object_def * const (*objects)[]; +}; + +#define UA_FLAGS(_flags) .flags = _flags +#define __UVERBS_ATTR0(_id, _len, _type, ...) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {.type = _type, {.len = _len}, .flags = 0, } }) +#define __UVERBS_ATTR1(_id, _len, _type, _flags) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {.type = _type, {.len = _len}, _flags, } }) +#define __UVERBS_ATTR(_id, _len, _type, _flags, _n, ...) \ + __UVERBS_ATTR##_n(_id, _len, _type, _flags) +/* + * In new compiler, UVERBS_ATTR could be simplified by declaring it as + * [_id] = {.type = _type, .len = _len, ##__VA_ARGS__} + * But since we support older compilers too, we need the more complex code. + */ +#define UVERBS_ATTR(_id, _len, _type, ...) \ + __UVERBS_ATTR(_id, _len, _type, ##__VA_ARGS__, 1, 0) +#define UVERBS_ATTR_PTR_IN_SZ(_id, _len, ...) \ + UVERBS_ATTR(_id, _len, UVERBS_ATTR_TYPE_PTR_IN, ##__VA_ARGS__) +/* If sizeof(_type) <= sizeof(u64), this will be inlined rather than a pointer */ +#define UVERBS_ATTR_PTR_IN(_id, _type, ...) \ + UVERBS_ATTR_PTR_IN_SZ(_id, sizeof(_type), ##__VA_ARGS__) +#define UVERBS_ATTR_PTR_OUT_SZ(_id, _len, ...) \ + UVERBS_ATTR(_id, _len, UVERBS_ATTR_TYPE_PTR_OUT, ##__VA_ARGS__) +#define UVERBS_ATTR_PTR_OUT(_id, _type, ...) \ + UVERBS_ATTR_PTR_OUT_SZ(_id, sizeof(_type), ##__VA_ARGS__) + +/* + * In new compiler, UVERBS_ATTR_IDR (and FD) could be simplified by declaring + * it as + * {.id = _id, \ + * .attr {.type = __obj_class, \ + * .obj = {.obj_type = _idr_type, \ + * .access = _access \ + * }, ##__VA_ARGS__ } } + * But since we support older compilers too, we need the more complex code. + */ +#define ___UVERBS_ATTR_OBJ0(_id, _obj_class, _obj_type, _access, ...)\ + ((const struct uverbs_attr_def) \ + {.id = _id, \ + .attr = {.type = _obj_class, \ + {.obj = {.obj_type = _obj_type, .access = _access } },\ + .flags = 0} }) +#define ___UVERBS_ATTR_OBJ1(_id, _obj_class, _obj_type, _access, _flags)\ + ((const struct uverbs_attr_def) \ + {.id = _id, \ + .attr = {.type = _obj_class, \ + {.obj = {.obj_type = _obj_type, .access = _access} }, \ + _flags} }) +#define ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, _flags, \ + _n, ...) \ + ___UVERBS_ATTR_OBJ##_n(_id, _obj_class, _obj_type, _access, _flags) +#define __UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, ...) \ + ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, \ + ##__VA_ARGS__, 1, 0) +#define UVERBS_ATTR_IDR(_id, _idr_type, _access, ...) \ + __UVERBS_ATTR_OBJ(_id, UVERBS_ATTR_TYPE_IDR, _idr_type, _access,\ + ##__VA_ARGS__) +#define UVERBS_ATTR_FD(_id, _fd_type, _access, ...) \ + __UVERBS_ATTR_OBJ(_id, UVERBS_ATTR_TYPE_FD, _fd_type, \ + (_access) + BUILD_BUG_ON_ZERO( \ + (_access) != UVERBS_ACCESS_NEW && \ + (_access) != UVERBS_ACCESS_READ), \ + ##__VA_ARGS__) +#define DECLARE_UVERBS_ATTR_SPEC(_name, ...) \ + const struct uverbs_attr_def _name = __VA_ARGS__ + +#define _UVERBS_METHOD_ATTRS_SZ(...) \ + (sizeof((const struct uverbs_attr_def * const []){__VA_ARGS__}) /\ + sizeof(const struct uverbs_attr_def *)) +#define _UVERBS_METHOD(_id, _handler, _flags, ...) \ + ((const struct uverbs_method_def) { \ + .id = _id, \ + .flags = _flags, \ + .handler = _handler, \ + .num_attrs = _UVERBS_METHOD_ATTRS_SZ(__VA_ARGS__), \ + .attrs = &(const struct uverbs_attr_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_METHOD(_name, _id, _handler, ...) \ + const struct uverbs_method_def _name = \ + _UVERBS_METHOD(_id, _handler, 0, ##__VA_ARGS__) +#define DECLARE_UVERBS_CTX_METHOD(_name, _id, _handler, _flags, ...) \ + const struct uverbs_method_def _name = \ + _UVERBS_METHOD(_id, _handler, \ + UVERBS_ACTION_FLAG_CREATE_ROOT, \ + ##__VA_ARGS__) +#define _UVERBS_OBJECT_METHODS_SZ(...) \ + (sizeof((const struct uverbs_method_def * const []){__VA_ARGS__}) / \ + sizeof(const struct uverbs_method_def *)) +#define _UVERBS_OBJECT(_id, _type_attrs, ...) \ + ((const struct uverbs_object_def) { \ + .id = _id, \ + .type_attrs = _type_attrs, \ + .num_methods = _UVERBS_OBJECT_METHODS_SZ(__VA_ARGS__), \ + .methods = &(const struct uverbs_method_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_OBJECT(_name, _id, _type_attrs, ...) \ + const struct uverbs_object_def _name = \ + _UVERBS_OBJECT(_id, _type_attrs, ##__VA_ARGS__) +#define _UVERBS_TREE_OBJECTS_SZ(...) \ + (sizeof((const struct uverbs_object_def * const []){__VA_ARGS__}) / \ + sizeof(const struct uverbs_object_def *)) +#define _UVERBS_OBJECT_TREE(...) \ + ((const struct uverbs_object_tree_def) { \ + .num_objects = _UVERBS_TREE_OBJECTS_SZ(__VA_ARGS__), \ + .objects = &(const struct uverbs_object_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_OBJECT_TREE(_name, ...) \ + const struct uverbs_object_tree_def _name = \ + _UVERBS_OBJECT_TREE(__VA_ARGS__) + +/* ================================================= + * Parsing infrastructure + * ================================================= + */ + +struct uverbs_ptr_attr { + union { + u64 data; + void __user *ptr; + }; + u16 len; + /* Combination of bits from enum UVERBS_ATTR_F_XXXX */ + u16 flags; +}; + +struct uverbs_obj_attr { + /* pointer to the kernel descriptor -> type, access, etc */ + const struct uverbs_obj_type *type; + struct ib_uobject *uobject; + /* fd or id in idr of this object */ + int id; +}; + +struct uverbs_attr { + /* + * pointer to the user-space given attribute, in order to write the + * new uobject's id or update flags. + */ + struct ib_uverbs_attr __user *uattr; + union { + struct uverbs_ptr_attr ptr_attr; + struct uverbs_obj_attr obj_attr; + }; +}; + +struct uverbs_attr_bundle_hash { + /* if bit i is set, it means attrs[i] contains valid information */ + unsigned long *valid_bitmap; + size_t num_attrs; + /* + * arrays of attributes, each element corresponds to the specification + * of the attribute in the same index. + */ + struct uverbs_attr *attrs; +}; + +struct uverbs_attr_bundle { + size_t num_buckets; + struct uverbs_attr_bundle_hash hash[]; +}; + +static inline bool uverbs_attr_is_valid_in_hash(const struct uverbs_attr_bundle_hash *attrs_hash, + unsigned int idx) +{ + return test_bit(idx, attrs_hash->valid_bitmap); +} + +static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_bundle, + unsigned int idx) +{ + u16 idx_bucket = idx >> UVERBS_ID_NS_SHIFT; + + if (attrs_bundle->num_buckets <= idx_bucket) + return false; + + return uverbs_attr_is_valid_in_hash(&attrs_bundle->hash[idx_bucket], + idx & ~UVERBS_ID_NS_MASK); +} + +static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + u16 idx_bucket = idx >> UVERBS_ID_NS_SHIFT; + + if (!uverbs_attr_is_valid(attrs_bundle, idx)) + return ERR_PTR(-ENOENT); + + return &attrs_bundle->hash[idx_bucket].attrs[idx & ~UVERBS_ID_NS_MASK]; +} + +static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, const void *from) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + u16 flags; + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + flags = attr->ptr_attr.flags | UVERBS_ATTR_F_VALID_OUTPUT; + return (!copy_to_user(attr->ptr_attr.ptr, from, attr->ptr_attr.len) && + !put_user(flags, &attr->uattr->flags)) ? 0 : -EFAULT; +} + +static inline int _uverbs_copy_from(void *to, size_t to_size, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + if (to_size <= sizeof(((struct ib_uverbs_attr *)0)->data)) + memcpy(to, &attr->ptr_attr.data, attr->ptr_attr.len); + else if (copy_from_user(to, attr->ptr_attr.ptr, attr->ptr_attr.len)) + return -EFAULT; + + return 0; +} + +#define uverbs_copy_from(to, attrs_bundle, idx) \ + _uverbs_copy_from(to, sizeof(*(to)), attrs_bundle, idx) + +/* ================================================= + * Definitions -> Specs infrastructure + * ================================================= + */ + +/* + * uverbs_alloc_spec_tree - Merges different common and driver specific feature + * into one parsing tree that every uverbs command will be parsed upon. + * + * @num_trees: Number of trees in the array @trees. + * @trees: Array of pointers to tree root definitions to merge. Each such tree + * possibly contains objects, methods and attributes definitions. + * + * Returns: + * uverbs_root_spec *: The root of the merged parsing tree. + * On error, we return an error code. Error is checked via IS_ERR. + * + * The following merges could take place: + * a. Two trees representing the same method with different handler + * -> We take the handler of the tree that its handler != NULL + * and its index in the trees array is greater. The incentive for that + * is that developers are expected to first merge common trees and then + * merge trees that gives specialized the behaviour. + * b. Two trees representing the same object with different + * type_attrs (struct uverbs_obj_type): + * -> We take the type_attrs of the tree that its type_attr != NULL + * and its index in the trees array is greater. This could be used + * in order to override the free function, allocation size, etc. + * c. Two trees representing the same method attribute (same id but possibly + * different attributes): + * -> ERROR (-ENOENT), we believe that's not the programmer's intent. + * + * An object without any methods is considered invalid and will abort the + * function with -ENOENT error. + */ +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees); +void uverbs_free_spec_tree(struct uverbs_root_spec *root); +#else +static inline struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + return NULL; +} + +static inline void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ +} +#endif + +#endif diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 7771ce966952..5f8e20bbd67c 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -34,19 +34,35 @@ #define _UVERBS_STD_TYPES__ #include <rdma/uverbs_types.h> - -extern const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_cq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_qp; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_wq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_srq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_ah; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_flow; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_mr; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_mw; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_pd; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd; +#include <rdma/uverbs_ioctl.h> +#include <rdma/ib_user_ioctl_verbs.h> + +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +extern const struct uverbs_object_def uverbs_object_comp_channel; +extern const struct uverbs_object_def uverbs_object_cq; +extern const struct uverbs_object_def uverbs_object_qp; +extern const struct uverbs_object_def uverbs_object_rwq_ind_table; +extern const struct uverbs_object_def uverbs_object_wq; +extern const struct uverbs_object_def uverbs_object_srq; +extern const struct uverbs_object_def uverbs_object_ah; +extern const struct uverbs_object_def uverbs_object_flow; +extern const struct uverbs_object_def uverbs_object_mr; +extern const struct uverbs_object_def uverbs_object_mw; +extern const struct uverbs_object_def uverbs_object_pd; +extern const struct uverbs_object_def uverbs_object_xrcd; +extern const struct uverbs_object_def uverbs_object_device; + +extern const struct uverbs_object_tree_def uverbs_default_objects; +static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return &uverbs_default_objects; +} +#else +static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return NULL; +} +#endif static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, bool write, @@ -56,22 +72,22 @@ static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, return rdma_lookup_get_uobject(type, ucontext, id, write); } -#define uobj_get_type(_type) uverbs_type_attrs_##_type.type +#define uobj_get_type(_object) uverbs_object_##_object.type_attrs #define uobj_get_read(_type, _id, _ucontext) \ - __uobj_get(&(_type), false, _ucontext, _id) + __uobj_get(_type, false, _ucontext, _id) -#define uobj_get_obj_read(_type, _id, _ucontext) \ +#define uobj_get_obj_read(_object, _id, _ucontext) \ ({ \ - struct ib_uobject *uobj = \ - __uobj_get(&uobj_get_type(_type), \ + struct ib_uobject *__uobj = \ + __uobj_get(uverbs_object_##_object.type_attrs, \ false, _ucontext, _id); \ \ - (struct ib_##_type *)(IS_ERR(uobj) ? NULL : uobj->object); \ + (struct ib_##_object *)(IS_ERR(__uobj) ? NULL : __uobj->object);\ }) #define uobj_get_write(_type, _id, _ucontext) \ - __uobj_get(&(_type), true, _ucontext, _id) + __uobj_get(_type, true, _ucontext, _id) static inline void uobj_put_read(struct ib_uobject *uobj) { @@ -108,7 +124,7 @@ static inline struct ib_uobject *__uobj_alloc(const struct uverbs_obj_type *type } #define uobj_alloc(_type, ucontext) \ - __uobj_alloc(&(_type), ucontext) + __uobj_alloc(_type, ucontext) #endif diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index 351ea185df44..cc04ec65588d 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -129,6 +129,7 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, void rdma_alloc_abort_uobject(struct ib_uobject *uobj); int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj); int rdma_alloc_commit_uobject(struct ib_uobject *uobj); +int rdma_explicit_destroy(struct ib_uobject *uobject); struct uverbs_obj_fd_type { /* @@ -151,22 +152,30 @@ extern const struct uverbs_obj_type_class uverbs_fd_class; #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) -#define UVERBS_TYPE_ALLOC_FD(_size, _order) \ - { \ - .destroy_order = _order, \ - .type_class = &uverbs_fd_class, \ - .obj_size = (_size) + \ - UVERBS_BUILD_BUG_ON((_size) < \ - sizeof(struct ib_uobject_file)),\ - } -#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order) \ - { \ +#define UVERBS_TYPE_ALLOC_FD(_order, _obj_size, _context_closed, _fops, _name, _flags)\ + ((&((const struct uverbs_obj_fd_type) \ + {.type = { \ + .destroy_order = _order, \ + .type_class = &uverbs_fd_class, \ + .obj_size = (_obj_size) + \ + UVERBS_BUILD_BUG_ON((_obj_size) < sizeof(struct ib_uobject_file)), \ + }, \ + .context_closed = _context_closed, \ + .fops = _fops, \ + .name = _name, \ + .flags = _flags}))->type) +#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order, _destroy_object) \ + ((&((const struct uverbs_obj_idr_type) \ + {.type = { \ .destroy_order = _order, \ .type_class = &uverbs_idr_class, \ .obj_size = (_size) + \ - UVERBS_BUILD_BUG_ON((_size) < \ - sizeof(struct ib_uobject)), \ - } -#define UVERBS_TYPE_ALLOC_IDR(_order) \ - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order) + UVERBS_BUILD_BUG_ON((_size) < \ + sizeof(struct ib_uobject)) \ + }, \ + .destroy_object = _destroy_object,}))->type) +#define UVERBS_TYPE_ALLOC_IDR(_order, _destroy_object) \ + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order, \ + _destroy_object) + #endif diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h new file mode 100644 index 000000000000..842792eae383 --- /dev/null +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_IOCTL_VERBS_H +#define IB_USER_IOCTL_VERBS_H + +#include <rdma/rdma_user_ioctl.h> + +#define UVERBS_UDATA_DRIVER_DATA_NS 1 +#define UVERBS_UDATA_DRIVER_DATA_FLAG (1UL << UVERBS_ID_NS_SHIFT) + +enum uverbs_default_objects { + UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ + UVERBS_OBJECT_PD, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_OBJECT_CQ, + UVERBS_OBJECT_QP, + UVERBS_OBJECT_SRQ, + UVERBS_OBJECT_AH, + UVERBS_OBJECT_MR, + UVERBS_OBJECT_MW, + UVERBS_OBJECT_FLOW, + UVERBS_OBJECT_XRCD, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_OBJECT_WQ, + UVERBS_OBJECT_LAST, +}; + +enum { + UVERBS_UHW_IN = UVERBS_UDATA_DRIVER_DATA_FLAG, + UVERBS_UHW_OUT, +}; + +enum uverbs_create_cq_cmd_attr_ids { + CREATE_CQ_HANDLE, + CREATE_CQ_CQE, + CREATE_CQ_USER_HANDLE, + CREATE_CQ_COMP_CHANNEL, + CREATE_CQ_COMP_VECTOR, + CREATE_CQ_FLAGS, + CREATE_CQ_RESP_CQE, +}; + +enum uverbs_destroy_cq_cmd_attr_ids { + DESTROY_CQ_HANDLE, + DESTROY_CQ_RESP, +}; + +enum uverbs_actions_cq_ops { + UVERBS_CQ_CREATE, + UVERBS_CQ_DESTROY, +}; + +#endif + diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 270c350bedc6..9a0b6479fe0c 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -236,6 +236,20 @@ struct ib_uverbs_rss_caps { __u32 reserved; }; +struct ib_uverbs_tm_caps { + /* Max size of rendezvous request message */ + __u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + __u32 max_num_tags; + /* TM flags */ + __u32 flags; + /* Max number of outstanding list operations */ + __u32 max_ops; + /* Max number of SGE in tag matching entry */ + __u32 max_sge; + __u32 reserved; +}; + struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_query_device_resp base; __u32 comp_mask; @@ -247,6 +261,7 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; __u32 raw_packet_caps; + struct ib_uverbs_tm_caps xrq_caps; }; struct ib_uverbs_query_port { @@ -578,7 +593,7 @@ struct ib_uverbs_ex_create_qp { __u32 comp_mask; __u32 create_flags; __u32 rwq_ind_tbl_handle; - __u32 reserved1; + __u32 source_qpn; }; struct ib_uverbs_open_qp { @@ -1024,7 +1039,7 @@ struct ib_uverbs_create_xsrq { __u32 max_wr; __u32 max_sge; __u32 srq_limit; - __u32 reserved; + __u32 max_num_tags; __u32 xrcd_handle; __u32 cq_handle; __u64 driver_data[0]; diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index af431752655c..c55f60e05f86 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -95,13 +95,63 @@ struct mlx4_ib_create_srq_resp { __u32 reserved; }; +struct mlx4_ib_create_qp_rss { + __u64 rx_hash_fields_mask; + __u8 rx_hash_function; + __u8 reserved[7]; + __u8 rx_hash_key[40]; + __u32 comp_mask; + __u32 reserved1; +}; + struct mlx4_ib_create_qp { __u64 buf_addr; __u64 db_addr; __u8 log_sq_bb_count; __u8 log_sq_stride; __u8 sq_no_prefetch; - __u8 reserved[5]; + __u8 reserved; + __u32 inl_recv_sz; +}; + +struct mlx4_ib_create_wq { + __u64 buf_addr; + __u64 db_addr; + __u8 log_range_size; + __u8 reserved[3]; + __u32 comp_mask; +}; + +struct mlx4_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; + +struct mlx4_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; + +/* RX Hash function flags */ +enum mlx4_ib_rx_hash_function_flags { + MLX4_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash flags, these flags allows to set which incoming packet's field should + * participates in RX Hash. Each flag represent certain packet's field, + * when the flag is set the field that is represented by the flag will + * participate in RX Hash calculation. + */ +enum mlx4_ib_rx_hash_fields { + MLX4_IB_RX_HASH_SRC_IPV4 = 1 << 0, + MLX4_IB_RX_HASH_DST_IPV4 = 1 << 1, + MLX4_IB_RX_HASH_SRC_IPV6 = 1 << 2, + MLX4_IB_RX_HASH_DST_IPV6 = 1 << 3, + MLX4_IB_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX4_IB_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX4_IB_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX4_IB_RX_HASH_DST_PORT_UDP = 1 << 7 }; #endif /* MLX4_ABI_USER_H */ diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 0b3d30837a9f..1791bf123ba9 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -168,6 +168,28 @@ struct mlx5_packet_pacing_caps { __u32 reserved; }; +enum mlx5_ib_mpw_caps { + MPW_RESERVED = 1 << 0, + MLX5_IB_ALLOW_MPW = 1 << 1, + MLX5_IB_SUPPORT_EMPW = 1 << 2, +}; + +enum mlx5_ib_sw_parsing_offloads { + MLX5_IB_SW_PARSING = 1 << 0, + MLX5_IB_SW_PARSING_CSUM = 1 << 1, + MLX5_IB_SW_PARSING_LSO = 1 << 2, +}; + +struct mlx5_ib_sw_parsing_caps { + __u32 sw_parsing_offloads; /* enum mlx5_ib_sw_parsing_offloads */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; +}; + struct mlx5_ib_query_device_resp { __u32 comp_mask; __u32 response_length; @@ -177,6 +199,7 @@ struct mlx5_ib_query_device_resp { struct mlx5_packet_pacing_caps packet_pacing_caps; __u32 mlx5_ib_support_multi_pkt_send_wqes; __u32 reserved; + struct mlx5_ib_sw_parsing_caps sw_parsing_caps; }; struct mlx5_ib_create_cq { diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index 75c270d839c8..54b64357ab24 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -49,6 +49,9 @@ struct qedr_alloc_ucontext_resp { __u32 sges_per_recv_wr; __u32 sges_per_srq_wr; __u32 max_cqes; + __u8 dpm_enabled; + __u8 wids_enabled; + __u16 wid_count; }; struct qedr_alloc_pd_ureq { diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 02fe8390c18f..861440a87e7c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -8,7 +8,7 @@ enum { RDMA_NL_IWCM, RDMA_NL_RSVD, RDMA_NL_LS, /* RDMA Local Services */ - RDMA_NL_I40IW, + RDMA_NL_NLDEV, /* RDMA device interface */ RDMA_NL_NUM_CLIENTS }; @@ -222,4 +222,86 @@ struct rdma_nla_ls_gid { __u8 gid[16]; }; +enum rdma_nldev_command { + RDMA_NLDEV_CMD_UNSPEC, + + RDMA_NLDEV_CMD_GET, /* can dump */ + RDMA_NLDEV_CMD_SET, + RDMA_NLDEV_CMD_NEW, + RDMA_NLDEV_CMD_DEL, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ + RDMA_NLDEV_CMD_PORT_SET, + RDMA_NLDEV_CMD_PORT_NEW, + RDMA_NLDEV_CMD_PORT_DEL, + + RDMA_NLDEV_NUM_OPS +}; + +enum rdma_nldev_attr { + /* don't change the order or add anything between, this is ABI! */ + RDMA_NLDEV_ATTR_UNSPEC, + + /* Identifier for ib_device */ + RDMA_NLDEV_ATTR_DEV_INDEX, /* u32 */ + + RDMA_NLDEV_ATTR_DEV_NAME, /* string */ + /* + * Device index together with port index are identifiers + * for port/link properties. + * + * For RDMA_NLDEV_CMD_GET commamnd, port index will return number + * of available ports in ib_device, while for port specific operations, + * it will be real port index as it appears in sysfs. Port index follows + * sysfs notation and starts from 1 for the first port. + */ + RDMA_NLDEV_ATTR_PORT_INDEX, /* u32 */ + + /* + * Device and port capabilities + */ + RDMA_NLDEV_ATTR_CAP_FLAGS, /* u64 */ + + /* + * FW version + */ + RDMA_NLDEV_ATTR_FW_VERSION, /* string */ + + /* + * Node GUID (in host byte order) associated with the RDMA device. + */ + RDMA_NLDEV_ATTR_NODE_GUID, /* u64 */ + + /* + * System image GUID (in host byte order) associated with + * this RDMA device and other devices which are part of a + * single system. + */ + RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, /* u64 */ + + /* + * Subnet prefix (in host byte order) + */ + RDMA_NLDEV_ATTR_SUBNET_PREFIX, /* u64 */ + + /* + * Local Identifier (LID), + * According to IB specification, It is 16-bit address assigned + * by the Subnet Manager. Extended to be 32-bit for OmniPath users. + */ + RDMA_NLDEV_ATTR_LID, /* u32 */ + RDMA_NLDEV_ATTR_SM_LID, /* u32 */ + + /* + * LID mask control (LMC) + */ + RDMA_NLDEV_ATTR_LMC, /* u8 */ + + RDMA_NLDEV_ATTR_PORT_STATE, /* u8 */ + RDMA_NLDEV_ATTR_PORT_PHYS_STATE, /* u8 */ + + RDMA_NLDEV_ATTR_DEV_NODE_TYPE, /* u8 */ + + RDMA_NLDEV_ATTR_MAX +}; #endif /* _UAPI_RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index 9388125ad51b..165a27e969d5 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -43,6 +43,39 @@ /* Legacy name, for user space application which already use it */ #define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC +#define RDMA_VERBS_IOCTL \ + _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) + +#define UVERBS_ID_NS_MASK 0xF000 +#define UVERBS_ID_NS_SHIFT 12 + +enum { + /* User input */ + UVERBS_ATTR_F_MANDATORY = 1U << 0, + /* + * Valid output bit should be ignored and considered set in + * mandatory fields. This bit is kernel output. + */ + UVERBS_ATTR_F_VALID_OUTPUT = 1U << 1, +}; + +struct ib_uverbs_attr { + __u16 attr_id; /* command specific type attribute */ + __u16 len; /* only for pointers */ + __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ + __u16 reserved; + __u64 data; /* ptr to command, inline data or idr/fd */ +}; + +struct ib_uverbs_ioctl_hdr { + __u16 length; + __u16 object_id; + __u16 method_id; + __u16 num_attrs; + __u64 reserved; + struct ib_uverbs_attr attrs[0]; +}; + /* * General blocks assignments * It is closed on purpose do not expose it it user space diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index c8c1d2d6df4d..c6569b0032ec 100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -125,7 +125,8 @@ enum pvrdma_wc_flags { PVRDMA_WC_IP_CSUM_OK = 1 << 3, PVRDMA_WC_WITH_SMAC = 1 << 4, PVRDMA_WC_WITH_VLAN = 1 << 5, - PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_VLAN, + PVRDMA_WC_WITH_NETWORK_HDR_TYPE = 1 << 6, + PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_NETWORK_HDR_TYPE, }; struct pvrdma_alloc_ucontext_resp { @@ -283,7 +284,8 @@ struct pvrdma_cqe { __u8 dlid_path_bits; __u8 port_num; __u8 smac[6]; - __u8 reserved2[7]; /* Pad to next power of 2 (64). */ + __u8 network_hdr_type; + __u8 reserved2[6]; /* Pad to next power of 2 (64). */ }; #endif /* __VMW_PVRDMA_ABI_H__ */ |