Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--   drivers/infiniband/core/addr.c        |  13
-rw-r--r--   drivers/infiniband/core/cm.c          | 126
-rw-r--r--   drivers/infiniband/core/cma.c         |  77
-rw-r--r--   drivers/infiniband/core/iwcm.c        |   2
-rw-r--r--   drivers/infiniband/core/mad.c         |   4
-rw-r--r--   drivers/infiniband/core/multicast.c   |   2
-rw-r--r--   drivers/infiniband/core/sa_query.c    |   2
-rw-r--r--   drivers/infiniband/core/sysfs.c       |   2
-rw-r--r--   drivers/infiniband/core/ucma.c        |   3
-rw-r--r--   drivers/infiniband/core/umem.c        |   8
-rw-r--r--   drivers/infiniband/core/umem_odp.c    |   7
-rw-r--r--   drivers/infiniband/core/uverbs_cmd.c  | 121
-rw-r--r--   drivers/infiniband/core/uverbs_main.c |   7
-rw-r--r--   drivers/infiniband/core/verbs.c       |  77
14 files changed, 323 insertions(+), 128 deletions(-)
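The changes group into a few recurring patterns: error-status propagation in the addr.c resolve callback, conversion of the core workqueues to alloc_workqueue()/alloc_ordered_workqueue() with WQ_MEM_RECLAIM, new locking in cm.c so a departing port cannot release its MAD agent under an active sender, typed sockaddr helpers in cma.c, the flag-based get_user_pages() interface in umem.c and umem_odp.c, stricter flow-spec size validation in uverbs_cmd.c, and a flags argument for protection-domain allocation in verbs.c. Short, hedged sketches of each pattern, using clearly illustrative names, follow the diff.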
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 1374541a4528..0f58f46dbad7 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -699,13 +699,16 @@ EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { struct rdma_dev_addr *addr; struct completion comp; + int status; }; static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { - memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct - rdma_dev_addr)); + if (!status) + memcpy(((struct resolve_cb_context *)context)->addr, + addr, sizeof(struct rdma_dev_addr)); + ((struct resolve_cb_context *)context)->status = status; complete(&((struct resolve_cb_context *)context)->comp); } @@ -743,6 +746,10 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, wait_for_completion(&ctx.comp); + ret = ctx.status; + if (ret) + return ret; + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); if (!dev) @@ -800,7 +807,7 @@ static struct notifier_block nb = { int addr_init(void) { - addr_wq = create_singlethread_workqueue("ib_addr"); + addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0); if (!addr_wq) return -ENOMEM; diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c99525512b34..71c7c4c328ef 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -80,6 +80,8 @@ static struct ib_cm { __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; + /* Sync on cm change port state */ + spinlock_t state_lock; } cm; /* Counter indexes ordered by attribute ID */ @@ -161,6 +163,8 @@ struct cm_port { struct ib_mad_agent *mad_agent; struct kobject port_obj; u8 port_num; + struct list_head cm_priv_prim_list; + struct list_head cm_priv_altr_list; struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; }; @@ -241,6 +245,12 @@ struct cm_id_private { u8 service_timeout; u8 target_ack_delay; + struct list_head prim_list; + struct list_head altr_list; + /* Indicates that the send port mad is registered and av is set */ + int prim_send_port_not_ready; + int altr_send_port_not_ready; + struct list_head work_list; atomic_t work_count; }; @@ -259,20 +269,47 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; + struct cm_av *av; + unsigned long flags, flags2; + int ret = 0; + /* don't let the port to be released till the agent is down */ + spin_lock_irqsave(&cm.state_lock, flags2); + spin_lock_irqsave(&cm.lock, flags); + if (!cm_id_priv->prim_send_port_not_ready) + av = &cm_id_priv->av; + else if (!cm_id_priv->altr_send_port_not_ready && + (cm_id_priv->alt_av.port)) + av = &cm_id_priv->alt_av; + else { + pr_info("%s: not valid CM id\n", __func__); + ret = -ENODEV; + spin_unlock_irqrestore(&cm.lock, flags); + goto out; + } + spin_unlock_irqrestore(&cm.lock, flags); + /* Make sure the port haven't released the mad yet */ mad_agent = cm_id_priv->av.port->mad_agent; - ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr); - if (IS_ERR(ah)) - return PTR_ERR(ah); + if (!mad_agent) { + pr_info("%s: not a valid MAD agent\n", __func__); + ret = -ENODEV; + goto out; + } + ah = ib_create_ah(mad_agent->qp->pd, &av->ah_attr); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto out; + } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, - cm_id_priv->av.pkey_index, + av->pkey_index, 0, IB_MGMT_MAD_HDR, 
IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); - return PTR_ERR(m); + ret = PTR_ERR(m); + goto out; } /* Timeout set by caller if response is expected. */ @@ -282,7 +319,10 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, atomic_inc(&cm_id_priv->refcount); m->context[0] = cm_id_priv; *msg = m; - return 0; + +out: + spin_unlock_irqrestore(&cm.state_lock, flags2); + return ret; } static int cm_alloc_response_msg(struct cm_port *port, @@ -352,7 +392,8 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, grh, &av->ah_attr); } -static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) +static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av, + struct cm_id_private *cm_id_priv) { struct cm_device *cm_dev; struct cm_port *port = NULL; @@ -387,7 +428,17 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) &av->ah_attr); av->timeout = path->packet_life_time + 1; - return 0; + spin_lock_irqsave(&cm.lock, flags); + if (&cm_id_priv->av == av) + list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); + else if (&cm_id_priv->alt_av == av) + list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); + else + ret = -EINVAL; + + spin_unlock_irqrestore(&cm.lock, flags); + + return ret; } static int cm_alloc_id(struct cm_id_private *cm_id_priv) @@ -677,6 +728,8 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device, spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->prim_list); + INIT_LIST_HEAD(&cm_id_priv->altr_list); atomic_set(&cm_id_priv->work_count, -1); atomic_set(&cm_id_priv->refcount, 1); return &cm_id_priv->id; @@ -892,6 +945,15 @@ retest: break; } + spin_lock_irq(&cm.lock); + if (!list_empty(&cm_id_priv->altr_list) && + (!cm_id_priv->altr_send_port_not_ready)) + list_del(&cm_id_priv->altr_list); + if (!list_empty(&cm_id_priv->prim_list) && + (!cm_id_priv->prim_send_port_not_ready)) + list_del(&cm_id_priv->prim_list); + spin_unlock_irq(&cm.lock); + cm_free_id(cm_id->local_id); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); @@ -1192,12 +1254,13 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, goto out; } - ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av); + ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av, + cm_id_priv); if (ret) goto error1; if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, - &cm_id_priv->alt_av); + &cm_id_priv->alt_av, cm_id_priv); if (ret) goto error1; } @@ -1653,7 +1716,8 @@ static int cm_req_handler(struct cm_work *work) dev_put(gid_attr.ndev); } work->path[0].gid_type = gid_attr.gid_type; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + cm_id_priv); } if (ret) { int err = ib_get_cached_gid(work->port->cm_dev->ib_device, @@ -1672,7 +1736,8 @@ static int cm_req_handler(struct cm_work *work) goto rejected; } if (req_msg->alt_local_lid) { - ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av); + ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, + cm_id_priv); if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, @@ -2727,7 +2792,8 @@ int ib_send_cm_lap(struct ib_cm_id *cm_id, goto out; } - ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av); + ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av, + cm_id_priv); if (ret) goto 
out; cm_id_priv->alt_av.timeout = @@ -2839,7 +2905,8 @@ static int cm_lap_handler(struct cm_work *work) cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); - cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av); + cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, + cm_id_priv); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); @@ -3031,7 +3098,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - ret = cm_init_av_by_path(param->path, &cm_id_priv->av); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv); if (ret) goto out; @@ -3468,7 +3535,9 @@ out: static int cm_migrate(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; + struct cm_av tmp_av; unsigned long flags; + int tmp_send_port_not_ready; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -3477,7 +3546,14 @@ static int cm_migrate(struct ib_cm_id *cm_id) (cm_id->lap_state == IB_CM_LAP_UNINIT || cm_id->lap_state == IB_CM_LAP_IDLE)) { cm_id->lap_state = IB_CM_LAP_IDLE; + /* Swap address vector */ + tmp_av = cm_id_priv->av; cm_id_priv->av = cm_id_priv->alt_av; + cm_id_priv->alt_av = tmp_av; + /* Swap port send ready state */ + tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready; + cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready; + cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready; } else ret = -EINVAL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -3888,6 +3964,9 @@ static void cm_add_one(struct ib_device *ib_device) port->cm_dev = cm_dev; port->port_num = i; + INIT_LIST_HEAD(&port->cm_priv_prim_list); + INIT_LIST_HEAD(&port->cm_priv_altr_list); + ret = cm_create_port_fs(port); if (ret) goto error1; @@ -3945,6 +4024,8 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) { struct cm_device *cm_dev = client_data; struct cm_port *port; + struct cm_id_private *cm_id_priv; + struct ib_mad_agent *cur_mad_agent; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; @@ -3968,15 +4049,27 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); + /* Mark all the cm_id's as not valid */ + spin_lock_irq(&cm.lock); + list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list) + cm_id_priv->altr_send_port_not_ready = 1; + list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list) + cm_id_priv->prim_send_port_not_ready = 1; + spin_unlock_irq(&cm.lock); /* * We flush the queue here after the going_down set, this * verify that no new works will be queued in the recv handler, * after that we can call the unregister_mad_agent */ flush_workqueue(cm.wq); - ib_unregister_mad_agent(port->mad_agent); + spin_lock_irq(&cm.state_lock); + cur_mad_agent = port->mad_agent; + port->mad_agent = NULL; + spin_unlock_irq(&cm.state_lock); + ib_unregister_mad_agent(cur_mad_agent); cm_remove_port_fs(port); } + device_unregister(cm_dev->device); kfree(cm_dev); } @@ -3989,6 +4082,7 @@ static int __init ib_cm_init(void) INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); + spin_lock_init(&cm.state_lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; diff --git 
a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 5f65a78b27c9..2a6fc47a1dfb 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1094,47 +1094,47 @@ static void cma_save_ib_info(struct sockaddr *src_addr, } } -static void cma_save_ip4_info(struct sockaddr *src_addr, - struct sockaddr *dst_addr, +static void cma_save_ip4_info(struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { - struct sockaddr_in *ip4; - if (src_addr) { - ip4 = (struct sockaddr_in *)src_addr; - ip4->sin_family = AF_INET; - ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; - ip4->sin_port = local_port; + *src_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->dst_addr.ip4.addr, + .sin_port = local_port, + }; } if (dst_addr) { - ip4 = (struct sockaddr_in *)dst_addr; - ip4->sin_family = AF_INET; - ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; - ip4->sin_port = hdr->port; + *dst_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->src_addr.ip4.addr, + .sin_port = hdr->port, + }; } } -static void cma_save_ip6_info(struct sockaddr *src_addr, - struct sockaddr *dst_addr, +static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, + struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { - struct sockaddr_in6 *ip6; - if (src_addr) { - ip6 = (struct sockaddr_in6 *)src_addr; - ip6->sin6_family = AF_INET6; - ip6->sin6_addr = hdr->dst_addr.ip6; - ip6->sin6_port = local_port; + *src_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->dst_addr.ip6, + .sin6_port = local_port, + }; } if (dst_addr) { - ip6 = (struct sockaddr_in6 *)dst_addr; - ip6->sin6_family = AF_INET6; - ip6->sin6_addr = hdr->src_addr.ip6; - ip6->sin6_port = hdr->port; + *dst_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->src_addr.ip6, + .sin6_port = hdr->port, + }; } } @@ -1159,10 +1159,12 @@ static int cma_save_ip_info(struct sockaddr *src_addr, switch (cma_get_ip_ver(hdr)) { case 4: - cma_save_ip4_info(src_addr, dst_addr, hdr, port); + cma_save_ip4_info((struct sockaddr_in *)src_addr, + (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: - cma_save_ip6_info(src_addr, dst_addr, hdr, port); + cma_save_ip6_info((struct sockaddr_in6 *)src_addr, + (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; @@ -2436,6 +2438,18 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) return 0; } +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; @@ -2461,6 +2475,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->num_paths = 1; if (addr->dev_addr.bound_dev_if) { + unsigned long supported_gids; + ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); if (!ndev) { ret = -ENODEV; @@ -2484,7 +2500,12 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->net = &init_net; route->path_rec->ifindex = ndev->ifindex; - route->path_rec->gid_type = id_priv->gid_type; + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + 
id_priv->id.port_num); + route->path_rec->gid_type = + cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); } if (!ndev) { ret = -ENODEV; @@ -4369,7 +4390,7 @@ static int __init cma_init(void) { int ret; - cma_wq = create_singlethread_workqueue("rdma_cm"); + cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 357624f8b9d3..5495e22839a7 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1160,7 +1160,7 @@ static int __init iw_cm_init(void) if (ret) pr_err("iw_cm: couldn't register netlink callbacks\n"); - iwcm_wq = create_singlethread_workqueue("iw_cm_wq"); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) return -ENOMEM; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 2d49228f28b2..40cbd6bdb73b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3160,7 +3160,7 @@ static int ib_mad_port_open(struct ib_device *device, goto error3; } - port_priv->pd = ib_alloc_pd(device); + port_priv->pd = ib_alloc_pd(device, 0); if (IS_ERR(port_priv->pd)) { dev_err(&device->dev, "Couldn't create ib_mad PD\n"); ret = PTR_ERR(port_priv->pd); @@ -3177,7 +3177,7 @@ static int ib_mad_port_open(struct ib_device *device, goto error7; snprintf(name, sizeof name, "ib_mad%d", port_num); - port_priv->wq = create_singlethread_workqueue(name); + port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); if (!port_priv->wq) { ret = -ENOMEM; goto error8; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 51c79b2fb0b8..e51b739f6ea3 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -873,7 +873,7 @@ int mcast_init(void) { int ret; - mcast_wq = create_singlethread_workqueue("ib_mcast"); + mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM); if (!mcast_wq) return -ENOMEM; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index b9bf7aa055e7..81b742ca1639 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -2015,7 +2015,7 @@ int ib_sa_init(void) goto err2; } - ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq"); + ib_nl_wq = alloc_ordered_workqueue("ib_nl_sa_wq", WQ_MEM_RECLAIM); if (!ib_nl_wq) { ret = -ENOMEM; goto err3; diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 15defefecb4f..c1fb545e8d78 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1193,7 +1193,7 @@ static ssize_t set_node_desc(struct device *device, if (!dev->modify_device) return -EIO; - memcpy(desc.node_desc, buf, min_t(int, count, 64)); + memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX)); ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); if (ret) return ret; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 2825ece91d3c..9520154f1d7c 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1638,7 +1638,8 @@ static int ucma_open(struct inode *inode, struct file *filp) if (!file) return -ENOMEM; - file->close_wq = create_singlethread_workqueue("ucma_close_id"); + file->close_wq = alloc_ordered_workqueue("ucma_close_id", + WQ_MEM_RECLAIM); if (!file->close_wq) { kfree(file); return -ENOMEM; diff --git a/drivers/infiniband/core/umem.c 
b/drivers/infiniband/core/umem.c index c68746ce6624..84b4eff90395 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -94,6 +94,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, unsigned long dma_attrs = 0; struct scatterlist *sg, *sg_list_start; int need_release = 0; + unsigned int gup_flags = FOLL_WRITE; if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; @@ -174,7 +175,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base = addr & PAGE_MASK; - if (npages == 0) { + if (npages == 0 || npages > UINT_MAX) { ret = -EINVAL; goto out; } @@ -183,6 +184,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (ret) goto out; + if (!umem->writable) + gup_flags |= FOLL_FORCE; + need_release = 1; sg_list_start = umem->sg_head.sgl; @@ -190,7 +194,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - 1, !umem->writable, page_list, vma_list); + gup_flags, page_list, vma_list); if (ret < 0) goto out; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 75077a018675..1f0fe3217f23 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -527,6 +527,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, u64 off; int j, k, ret = 0, start_idx, npages = 0; u64 base_virt_addr; + unsigned int flags = 0; if (access_mask == 0) return -EINVAL; @@ -556,6 +557,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, goto out_put_task; } + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; k = start_idx; @@ -574,8 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - access_mask & ODP_WRITE_ALLOWED_BIT, - 0, local_page_list, NULL); + flags, local_page_list, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index f6647318138d..cb3f515a2285 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -571,7 +571,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, pd->device = ib_dev; pd->uobject = uobj; - pd->local_mr = NULL; + pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); uobj->object = pd; @@ -3078,51 +3078,102 @@ out_put: return ret ? ret : in_len; } +static size_t kern_spec_filter_sz(struct ib_uverbs_flow_spec_hdr *spec) +{ + /* Returns user space filter size, includes padding */ + return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2; +} + +static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size, + u16 ib_real_filter_sz) +{ + /* + * User space filter structures must be 64 bit aligned, otherwise this + * may pass, but we won't handle additional new attributes. 
+ */ + + if (kern_filter_size > ib_real_filter_sz) { + if (memchr_inv(kern_spec_filter + + ib_real_filter_sz, 0, + kern_filter_size - ib_real_filter_sz)) + return -EINVAL; + return ib_real_filter_sz; + } + return kern_filter_size; +} + static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { + ssize_t actual_filter_sz; + ssize_t kern_filter_sz; + ssize_t ib_filter_sz; + void *kern_spec_mask; + void *kern_spec_val; + if (kern_spec->reserved) return -EINVAL; ib_spec->type = kern_spec->type; + kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); + /* User flow spec size must be aligned to 4 bytes */ + if (kern_filter_sz != ALIGN(kern_filter_sz, 4)) + return -EINVAL; + + kern_spec_val = (void *)kern_spec + + sizeof(struct ib_uverbs_flow_spec_hdr); + kern_spec_mask = kern_spec_val + kern_filter_sz; + switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: - ib_spec->eth.size = sizeof(struct ib_flow_spec_eth); - if (ib_spec->eth.size != kern_spec->eth.size) + ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) return -EINVAL; - memcpy(&ib_spec->eth.val, &kern_spec->eth.val, - sizeof(struct ib_flow_eth_filter)); - memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask, - sizeof(struct ib_flow_eth_filter)); + ib_spec->size = sizeof(struct ib_flow_spec_eth); + memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_IPV4: - ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4); - if (ib_spec->ipv4.size != kern_spec->ipv4.size) + ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) return -EINVAL; - memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val, - sizeof(struct ib_flow_ipv4_filter)); - memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, - sizeof(struct ib_flow_ipv4_filter)); + ib_spec->size = sizeof(struct ib_flow_spec_ipv4); + memcpy(&ib_spec->ipv4.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_IPV6: - ib_spec->ipv6.size = sizeof(struct ib_flow_spec_ipv6); - if (ib_spec->ipv6.size != kern_spec->ipv6.size) + ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv6); + memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz); + + if ((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) || + (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20)) return -EINVAL; - memcpy(&ib_spec->ipv6.val, &kern_spec->ipv6.val, - sizeof(struct ib_flow_ipv6_filter)); - memcpy(&ib_spec->ipv6.mask, &kern_spec->ipv6.mask, - sizeof(struct ib_flow_ipv6_filter)); break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: - ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); - if (ib_spec->tcp_udp.size != kern_spec->tcp_udp.size) + ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) return -EINVAL; - memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val, - sizeof(struct ib_flow_tcp_udp_filter)); 
- memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask, - sizeof(struct ib_flow_tcp_udp_filter)); + ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp); + memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz); break; default: return -EINVAL; @@ -3654,7 +3705,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, goto err_uobj; } - flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL); + flow_attr = kzalloc(sizeof(*flow_attr) + cmd.flow_attr.num_of_specs * + sizeof(union ib_flow_spec), GFP_KERNEL); if (!flow_attr) { err = -ENOMEM; goto err_put; @@ -4173,6 +4225,23 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.device_cap_flags_ex = attr.device_cap_flags; resp.response_length += sizeof(resp.device_cap_flags_ex); + + if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps)) + goto end; + + resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts; + resp.rss_caps.max_rwq_indirection_tables = + attr.rss_caps.max_rwq_indirection_tables; + resp.rss_caps.max_rwq_indirection_table_size = + attr.rss_caps.max_rwq_indirection_table_size; + + resp.response_length += sizeof(resp.rss_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq)) + goto end; + + resp.max_wq_type_rq = attr.max_wq_type_rq; + resp.response_length += sizeof(resp.max_wq_type_rq); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 0012fa58c105..44b1104eb168 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -262,12 +262,9 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, container_of(uobj, struct ib_uqp_object, uevent.uobject); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); - if (qp != qp->real_qp) { - ib_close_qp(qp); - } else { + if (qp == qp->real_qp) ib_uverbs_detach_umcast(qp, uqp); - ib_destroy_qp(qp); - } + ib_destroy_qp(qp); ib_uverbs_release_uevent(file, &uqp->uevent); kfree(uqp); } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index f2b776efab3a..83687646da68 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -227,9 +227,11 @@ EXPORT_SYMBOL(rdma_port_get_link_layer); * Every PD has a local_dma_lkey which can be used as the lkey value for local * memory operations. 
*/ -struct ib_pd *ib_alloc_pd(struct ib_device *device) +struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, + const char *caller) { struct ib_pd *pd; + int mr_access_flags = 0; pd = device->alloc_pd(device, NULL, NULL); if (IS_ERR(pd)) @@ -237,26 +239,46 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device) pd->device = device; pd->uobject = NULL; - pd->local_mr = NULL; + pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); + pd->flags = flags; if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; - else { + else + mr_access_flags |= IB_ACCESS_LOCAL_WRITE; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + pr_warn("%s: enabling unsafe global rkey\n", caller); + mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; + } + + if (mr_access_flags) { struct ib_mr *mr; - mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + mr = pd->device->get_dma_mr(pd, mr_access_flags); if (IS_ERR(mr)) { ib_dealloc_pd(pd); - return (struct ib_pd *)mr; + return ERR_CAST(mr); } - pd->local_mr = mr; - pd->local_dma_lkey = pd->local_mr->lkey; + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + mr->need_inval = false; + + pd->__internal_mr = mr; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) + pd->local_dma_lkey = pd->__internal_mr->lkey; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) + pd->unsafe_global_rkey = pd->__internal_mr->rkey; } + return pd; } -EXPORT_SYMBOL(ib_alloc_pd); +EXPORT_SYMBOL(__ib_alloc_pd); /** * ib_dealloc_pd - Deallocates a protection domain. @@ -270,10 +292,10 @@ void ib_dealloc_pd(struct ib_pd *pd) { int ret; - if (pd->local_mr) { - ret = ib_dereg_mr(pd->local_mr); + if (pd->__internal_mr) { + ret = pd->device->dereg_mr(pd->__internal_mr); WARN_ON(ret); - pd->local_mr = NULL; + pd->__internal_mr = NULL; } /* uverbs manipulates usecnt with proper locking, while the kabi @@ -821,7 +843,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, if (ret) { pr_err("failed to init MR pool ret= %d\n", ret); ib_destroy_qp(qp); - qp = ERR_PTR(ret); + return ERR_PTR(ret); } } @@ -1391,29 +1413,6 @@ EXPORT_SYMBOL(ib_resize_cq); /* Memory regions */ -struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) -{ - struct ib_mr *mr; - int err; - - err = ib_check_mr_access(mr_access_flags); - if (err) - return ERR_PTR(err); - - mr = pd->device->get_dma_mr(pd, mr_access_flags); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - mr->need_inval = false; - } - - return mr; -} -EXPORT_SYMBOL(ib_get_dma_mr); - int ib_dereg_mr(struct ib_mr *mr) { struct ib_pd *pd = mr->pd; @@ -1812,13 +1811,13 @@ EXPORT_SYMBOL(ib_set_vf_guid); * * Constraints: * - The first sg element is allowed to have an offset. - * - Each sg element must be aligned to page_size (or physically - * contiguous to the previous element). In case an sg element has a - * non contiguous offset, the mapping prefix will not include it. + * - Each sg element must either be aligned to page_size or virtually + * contiguous to the previous element. In case an sg element has a + * non-contiguous offset, the mapping prefix will not include it. * - The last sg element is allowed to have length less than page_size. * - If sg_nents total byte length exceeds the mr max_num_sge * page_size * then only max_num_sg entries will be mapped. 
- * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS_REG, non of these + * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these * constraints holds and the page_size argument is ignored. * * Returns the number of sg elements that were mapped to the memory region. |
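addr.c: rdma_addr_find_l2_eth_by_grh() previously copied the resolved rdma_dev_addr unconditionally, even when resolution failed; the callback now records the status and the caller returns it before touching the result. A minimal sketch of the completion-plus-status pattern, with illustrative names (some_result, my_resolve_cb, my_resolve_sync) standing in for the rdma types:

#include <linux/completion.h>
#include <linux/string.h>

/* Placeholder result type; stands in for struct rdma_dev_addr. */
struct some_result {
        unsigned char dst_dev_addr[6];
};

struct my_resolve_ctx {
        struct some_result *result;
        struct completion comp;
        int status;
};

static void my_resolve_cb(int status, struct some_result *res, void *context)
{
        struct my_resolve_ctx *ctx = context;

        /* Copy the result only when the lookup actually succeeded. */
        if (!status)
                memcpy(ctx->result, res, sizeof(*ctx->result));
        ctx->status = status;
        complete(&ctx->comp);
}

static int my_resolve_sync(struct some_result *out)
{
        struct my_resolve_ctx ctx = { .result = out };

        init_completion(&ctx.comp);
        /* ... start the asynchronous resolve here, passing &ctx ... */
        wait_for_completion(&ctx.comp);

        /* Propagate the callback's status instead of trusting *out. */
        return ctx.status;
}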
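Workqueues: addr.c, cma.c, iwcm.c, mad.c, multicast.c, sa_query.c and ucma.c drop create_singlethread_workqueue() in favour of alloc_workqueue() or alloc_ordered_workqueue() with WQ_MEM_RECLAIM, so each queue keeps a rescuer thread and can make forward progress under memory pressure. A sketch of the conversion pattern as a stand-alone module; the queue name "ib_example" and the init/exit functions are illustrative:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
        /*
         * WQ_MEM_RECLAIM guarantees a rescuer thread, so queued work can
         * still run while the system is reclaiming memory.  The ordered
         * variant preserves the strict FIFO, one-at-a-time execution the
         * old single-threaded queues relied on.
         */
        example_wq = alloc_ordered_workqueue("ib_example", WQ_MEM_RECLAIM);
        if (!example_wq)
                return -ENOMEM;
        return 0;
}

static void __exit example_exit(void)
{
        destroy_workqueue(example_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");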
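cm.c: cm_alloc_msg() used to dereference port->mad_agent with nothing preventing cm_remove_one() from unregistering the agent concurrently. The patch adds cm.state_lock, clears the pointer under it on removal, and has senders look the agent up under the same lock. A reduced sketch of that publish/teardown pattern; agent_lock, port_agent and the helpers are illustrative, not the cm.c names:

#include <linux/errno.h>
#include <linux/spinlock.h>
#include <rdma/ib_mad.h>

static DEFINE_SPINLOCK(agent_lock);
static struct ib_mad_agent *port_agent; /* hypothetical per-port pointer */

static int send_with_agent(void)
{
        unsigned long flags;
        int ret = 0;

        /*
         * Readers look up the agent under the same lock the teardown path
         * uses to clear it, so it cannot be unregistered while a send is
         * being built.
         */
        spin_lock_irqsave(&agent_lock, flags);
        if (!port_agent) {
                ret = -ENODEV;
                goto out;
        }
        /* ... build and post the MAD while still holding the lock ... */
out:
        spin_unlock_irqrestore(&agent_lock, flags);
        return ret;
}

static void remove_port(void)
{
        struct ib_mad_agent *agent;

        spin_lock_irq(&agent_lock);
        agent = port_agent;
        port_agent = NULL;
        spin_unlock_irq(&agent_lock);

        if (agent)
                ib_unregister_mad_agent(agent);
}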
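cma.c: cma_save_ip4_info() and cma_save_ip6_info() now take properly typed sockaddr pointers and fill them with compound literals, which also clears any fields they do not name (such as sin_zero). A sketch of the style with a hypothetical helper:

#include <linux/in.h>
#include <linux/socket.h>

/*
 * Assigning a (struct sockaddr_in){ ... } compound literal zeroes every
 * member that is not explicitly named, unlike field-by-field stores.
 */
static void fill_ip4_sockaddr(struct sockaddr_in *sa, __be32 addr, __be16 port)
{
        *sa = (struct sockaddr_in) {
                .sin_family = AF_INET,
                .sin_addr.s_addr = addr,
                .sin_port = port,
        };
}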
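umem.c and umem_odp.c: the pinning paths switch to the flag-based get_user_pages() interface. The old calls passed write=1 and force=!writable, so the equivalent is to keep FOLL_WRITE set and add FOLL_FORCE when the memory region is not writable (the ODP path builds its flags from ODP_WRITE_ALLOWED_BIT instead). A sketch assuming the get_user_pages() prototype of this kernel generation; later kernels changed the signature again:

#include <linux/mm.h>

/*
 * Assumed prototype:
 *   long get_user_pages(unsigned long start, unsigned long nr_pages,
 *                       unsigned int gup_flags, struct page **pages,
 *                       struct vm_area_struct **vmas);
 */
static long pin_umem_pages(unsigned long start, unsigned long npages,
                           bool writable, struct page **pages)
{
        unsigned int gup_flags = FOLL_WRITE;

        /*
         * A read-only MR still pins the pages; FOLL_FORCE replaces the old
         * "force = !umem->writable" argument so the pin can succeed on
         * mappings the caller is not allowed to write.
         */
        if (!writable)
                gup_flags |= FOLL_FORCE;

        return get_user_pages(start, npages, gup_flags, pages, NULL);
}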
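uverbs_cmd.c: kern_spec_to_ib_spec() no longer trusts the user-supplied spec size. It derives the filter size from the header, requires 4-byte alignment, and accepts a filter larger than the kernel's known structure only if every extra byte is zero, so attributes the kernel does not understand cannot be silently ignored. A sketch of that trailing-bytes rule in isolation; clamp_filter_size() is an illustrative name, not the uverbs helper:

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>

static ssize_t clamp_filter_size(const void *user_filter, u16 user_sz,
                                 u16 kernel_sz)
{
        if (user_sz > kernel_sz) {
                /* Extra user-space bytes must all be zero. */
                if (memchr_inv(user_filter + kernel_sz, 0,
                               user_sz - kernel_sz))
                        return -EINVAL;
                return kernel_sz;
        }
        return user_sz;
}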
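verbs.c: ib_get_dma_mr() is folded into PD allocation. __ib_alloc_pd() takes flags, creates the internal DMA MR itself when the device lacks a local DMA lkey or when IB_PD_UNSAFE_GLOBAL_RKEY is requested, and exposes the all-access rkey as pd->unsafe_global_rkey (with a warning naming the caller). A usage sketch, assuming the two-argument ib_alloc_pd() wrapper that pairs with the exported __ib_alloc_pd(); the helper and the device pointer are illustrative:

#include <rdma/ib_verbs.h>

static struct ib_pd *example_alloc_pd(struct ib_device *dev,
                                      bool need_global_rkey)
{
        struct ib_pd *pd;

        /* Most consumers should pass 0 and use per-MR keys instead. */
        pd = ib_alloc_pd(dev, need_global_rkey ? IB_PD_UNSAFE_GLOBAL_RKEY : 0);
        if (IS_ERR(pd))
                return pd;

        if (need_global_rkey)
                pr_info("global rkey 0x%x (remote read/write enabled)\n",
                        pd->unsafe_global_rkey);

        return pd;
}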