From a1a8e4a85cf7daff8b26c7b8698442ef677b4f97 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Jun 2019 15:02:01 -0300 Subject: rdma: Delete the ib_ucm module This has been marked CONFIG_BROKEN for over a year now with no complaints. Delete the whole thing for good. The module provided the /dev/infiniband/ucmX interface. Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_cm.h | 326 ----------------------------------------- 1 file changed, 326 deletions(-) delete mode 100644 include/uapi/rdma/ib_user_cm.h (limited to 'include/uapi') diff --git a/include/uapi/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h deleted file mode 100644 index e2709bb8cb18..000000000000 --- a/include/uapi/rdma/ib_user_cm.h +++ /dev/null @@ -1,326 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef IB_USER_CM_H -#define IB_USER_CM_H - -#include -#include - -#define IB_USER_CM_ABI_VERSION 5 - -enum { - IB_USER_CM_CMD_CREATE_ID, - IB_USER_CM_CMD_DESTROY_ID, - IB_USER_CM_CMD_ATTR_ID, - - IB_USER_CM_CMD_LISTEN, - IB_USER_CM_CMD_NOTIFY, - - IB_USER_CM_CMD_SEND_REQ, - IB_USER_CM_CMD_SEND_REP, - IB_USER_CM_CMD_SEND_RTU, - IB_USER_CM_CMD_SEND_DREQ, - IB_USER_CM_CMD_SEND_DREP, - IB_USER_CM_CMD_SEND_REJ, - IB_USER_CM_CMD_SEND_MRA, - IB_USER_CM_CMD_SEND_LAP, - IB_USER_CM_CMD_SEND_APR, - IB_USER_CM_CMD_SEND_SIDR_REQ, - IB_USER_CM_CMD_SEND_SIDR_REP, - - IB_USER_CM_CMD_EVENT, - IB_USER_CM_CMD_INIT_QP_ATTR, -}; -/* - * command ABI structures. 
- */ -struct ib_ucm_cmd_hdr { - __u32 cmd; - __u16 in; - __u16 out; -}; - -struct ib_ucm_create_id { - __aligned_u64 uid; - __aligned_u64 response; -}; - -struct ib_ucm_create_id_resp { - __u32 id; -}; - -struct ib_ucm_destroy_id { - __aligned_u64 response; - __u32 id; - __u32 reserved; -}; - -struct ib_ucm_destroy_id_resp { - __u32 events_reported; -}; - -struct ib_ucm_attr_id { - __aligned_u64 response; - __u32 id; - __u32 reserved; -}; - -struct ib_ucm_attr_id_resp { - __be64 service_id; - __be64 service_mask; - __be32 local_id; - __be32 remote_id; -}; - -struct ib_ucm_init_qp_attr { - __aligned_u64 response; - __u32 id; - __u32 qp_state; -}; - -struct ib_ucm_listen { - __be64 service_id; - __be64 service_mask; - __u32 id; - __u32 reserved; -}; - -struct ib_ucm_notify { - __u32 id; - __u32 event; -}; - -struct ib_ucm_private_data { - __aligned_u64 data; - __u32 id; - __u8 len; - __u8 reserved[3]; -}; - -struct ib_ucm_req { - __u32 id; - __u32 qpn; - __u32 qp_type; - __u32 psn; - __be64 sid; - __aligned_u64 data; - __aligned_u64 primary_path; - __aligned_u64 alternate_path; - __u8 len; - __u8 peer_to_peer; - __u8 responder_resources; - __u8 initiator_depth; - __u8 remote_cm_response_timeout; - __u8 flow_control; - __u8 local_cm_response_timeout; - __u8 retry_count; - __u8 rnr_retry_count; - __u8 max_cm_retries; - __u8 srq; - __u8 reserved[5]; -}; - -struct ib_ucm_rep { - __aligned_u64 uid; - __aligned_u64 data; - __u32 id; - __u32 qpn; - __u32 psn; - __u8 len; - __u8 responder_resources; - __u8 initiator_depth; - __u8 target_ack_delay; - __u8 failover_accepted; - __u8 flow_control; - __u8 rnr_retry_count; - __u8 srq; - __u8 reserved[4]; -}; - -struct ib_ucm_info { - __u32 id; - __u32 status; - __aligned_u64 info; - __aligned_u64 data; - __u8 info_len; - __u8 data_len; - __u8 reserved[6]; -}; - -struct ib_ucm_mra { - __aligned_u64 data; - __u32 id; - __u8 len; - __u8 timeout; - __u8 reserved[2]; -}; - -struct ib_ucm_lap { - __aligned_u64 path; - __aligned_u64 
data; - __u32 id; - __u8 len; - __u8 reserved[3]; -}; - -struct ib_ucm_sidr_req { - __u32 id; - __u32 timeout; - __be64 sid; - __aligned_u64 data; - __aligned_u64 path; - __u16 reserved_pkey; - __u8 len; - __u8 max_cm_retries; - __u8 reserved[4]; -}; - -struct ib_ucm_sidr_rep { - __u32 id; - __u32 qpn; - __u32 qkey; - __u32 status; - __aligned_u64 info; - __aligned_u64 data; - __u8 info_len; - __u8 data_len; - __u8 reserved[6]; -}; -/* - * event notification ABI structures. - */ -struct ib_ucm_event_get { - __aligned_u64 response; - __aligned_u64 data; - __aligned_u64 info; - __u8 data_len; - __u8 info_len; - __u8 reserved[6]; -}; - -struct ib_ucm_req_event_resp { - struct ib_user_path_rec primary_path; - struct ib_user_path_rec alternate_path; - __be64 remote_ca_guid; - __u32 remote_qkey; - __u32 remote_qpn; - __u32 qp_type; - __u32 starting_psn; - __u8 responder_resources; - __u8 initiator_depth; - __u8 local_cm_response_timeout; - __u8 flow_control; - __u8 remote_cm_response_timeout; - __u8 retry_count; - __u8 rnr_retry_count; - __u8 srq; - __u8 port; - __u8 reserved[7]; -}; - -struct ib_ucm_rep_event_resp { - __be64 remote_ca_guid; - __u32 remote_qkey; - __u32 remote_qpn; - __u32 starting_psn; - __u8 responder_resources; - __u8 initiator_depth; - __u8 target_ack_delay; - __u8 failover_accepted; - __u8 flow_control; - __u8 rnr_retry_count; - __u8 srq; - __u8 reserved[5]; -}; - -struct ib_ucm_rej_event_resp { - __u32 reason; - /* ari in ib_ucm_event_get info field. */ -}; - -struct ib_ucm_mra_event_resp { - __u8 timeout; - __u8 reserved[3]; -}; - -struct ib_ucm_lap_event_resp { - struct ib_user_path_rec path; -}; - -struct ib_ucm_apr_event_resp { - __u32 status; - /* apr info in ib_ucm_event_get info field. */ -}; - -struct ib_ucm_sidr_req_event_resp { - __u16 pkey; - __u8 port; - __u8 reserved; -}; - -struct ib_ucm_sidr_rep_event_resp { - __u32 status; - __u32 qkey; - __u32 qpn; - /* info in ib_ucm_event_get info field. 
*/ -}; - -#define IB_UCM_PRES_DATA 0x01 -#define IB_UCM_PRES_INFO 0x02 -#define IB_UCM_PRES_PRIMARY 0x04 -#define IB_UCM_PRES_ALTERNATE 0x08 - -struct ib_ucm_event_resp { - __aligned_u64 uid; - __u32 id; - __u32 event; - __u32 present; - __u32 reserved; - union { - struct ib_ucm_req_event_resp req_resp; - struct ib_ucm_rep_event_resp rep_resp; - struct ib_ucm_rej_event_resp rej_resp; - struct ib_ucm_mra_event_resp mra_resp; - struct ib_ucm_lap_event_resp lap_resp; - struct ib_ucm_apr_event_resp apr_resp; - - struct ib_ucm_sidr_req_event_resp sidr_req_resp; - struct ib_ucm_sidr_rep_event_resp sidr_rep_resp; - - __u32 send_status; - } u; -}; - -#endif /* IB_USER_CM_H */ -- cgit v1.2.3 From 5d60c11154116e2127374d4178e952649612b69b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 13 Jun 2019 21:38:17 -0300 Subject: RDMA: Move rdma_node_type to uapi/ This enum is exposed over the sysfs file 'node_type' and over netlink via RDMA_NLDEV_ATTR_DEV_NODE_TYPE, so declare it in the uapi headers. 
Signed-off-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 2 +- include/rdma/ib_verbs.h | 13 +------------ include/uapi/rdma/rdma_netlink.h | 12 ++++++++++++ 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 585e100706aa..588f1d195fd2 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -209,7 +209,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) +rdma_node_get_transport(unsigned int node_type) { if (node_type == RDMA_NODE_USNIC) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f357e03a85a6..973514ea17a7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -132,17 +132,6 @@ struct ib_gid_attr { u8 port_num; }; -enum rdma_node_type { - /* IB values map to NodeInfo:NodeType. */ - RDMA_NODE_IB_CA = 1, - RDMA_NODE_IB_SWITCH, - RDMA_NODE_IB_ROUTER, - RDMA_NODE_RNIC, - RDMA_NODE_USNIC, - RDMA_NODE_USNIC_UDP, - RDMA_NODE_UNSPECIFIED, -}; - enum { /* set the local administered indication */ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, @@ -164,7 +153,7 @@ enum rdma_protocol_type { }; __attribute_const__ enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type); +rdma_node_get_transport(unsigned int node_type); enum rdma_network_type { RDMA_NETWORK_IB, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 41db51367efa..f588e8551c6c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -147,6 +147,18 @@ enum { IWPM_NLA_HELLO_MAX }; +/* For RDMA_NLDEV_ATTR_DEV_NODE_TYPE */ +enum { + /* IB values map to NodeInfo:NodeType. 
*/ + RDMA_NODE_IB_CA = 1, + RDMA_NODE_IB_SWITCH, + RDMA_NODE_IB_ROUTER, + RDMA_NODE_RNIC, + RDMA_NODE_USNIC, + RDMA_NODE_USNIC_UDP, + RDMA_NODE_UNSPECIFIED, +}; + /* * Local service operations: * RESOLVE - The client requests the local service to resolve a path. -- cgit v1.2.3 From 0e2d00eb6fd45f2a645f4874286bdc5b4b53782b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 13 Jun 2019 21:38:18 -0300 Subject: RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload Allow userspace to issue a netlink query against the ib_device for something like "uverbs" and get back the char dev name, inode major/minor, and interface ABI information for "uverbs0". Since we are now in netlink this can also trigger a module autoload to make the uverbs device come into existence. Largely this will let us replace searching and reading inside sysfs to setup devices, and provides an alternative (using driver_id) to device name based provider binding for things like rxe. Signed-off-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 9 ++++ drivers/infiniband/core/device.c | 98 +++++++++++++++++++++++++++++++++++++ drivers/infiniband/core/nldev.c | 94 +++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 4 ++ include/rdma/rdma_netlink.h | 2 + include/uapi/rdma/rdma_netlink.h | 14 ++++++ 6 files changed, 221 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index ff40a450b5d2..a953c2fa2e78 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -88,6 +88,15 @@ typedef int (*nldev_callback)(struct ib_device *device, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb); +struct ib_client_nl_info { + struct sk_buff *nl_msg; + struct device *cdev; + unsigned int port; + u64 abi; +}; +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct 
ib_client_nl_info *res); + enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index abb169f31d0f..7db8566cdb89 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1726,6 +1726,104 @@ void ib_unregister_client(struct ib_client *client) } EXPORT_SYMBOL(ib_unregister_client); +static int __ib_get_global_client_nl_info(const char *client_name, + struct ib_client_nl_info *res) +{ + struct ib_client *client; + unsigned long index; + int ret = -ENOENT; + + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + if (strcmp(client->name, client_name) != 0) + continue; + if (!client->get_global_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_global_nl_info(res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&clients_rwsem); + return ret; +} + +static int __ib_get_client_nl_info(struct ib_device *ibdev, + const char *client_name, + struct ib_client_nl_info *res) +{ + unsigned long index; + void *client_data; + int ret = -ENOENT; + + down_read(&ibdev->client_data_rwsem); + xan_for_each_marked (&ibdev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); + + if (!client || strcmp(client->name, client_name) != 0) + continue; + if (!client->get_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_nl_info(ibdev, client_data, res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + + /* + * The cdev is guaranteed valid as long as we are inside the + * client_data_rwsem as remove_one can't be called. Keep it + * valid for the caller. 
+ */ + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&ibdev->client_data_rwsem); + + return ret; +} + +/** + * ib_get_client_nl_info - Fetch the nl_info from a client + * @ibdev: IB device + * @client_name: Name of the client + * @res: Result of the query + */ +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res) +{ + int ret; + + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); +#ifdef CONFIG_MODULES + if (ret == -ENOENT) { + request_module("rdma-client-%s", client_name); + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); + } +#endif + if (ret) { + if (ret == -ENOENT) + return -EOPNOTSUPP; + return ret; + } + + if (WARN_ON(!res->cdev)) + return -EINVAL; + return 0; +} + /** * ib_set_client_data - Set IB client context * @device:Device to set context for diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 69188cbbd99b..16b5d6d4dd1c 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -120,6 +120,12 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, + .len = 128 }, + [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -1347,6 +1353,91 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct
netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char client_name[IB_DEVICE_NAME_MAX]; + struct ib_client_nl_info data = {}; + struct ib_device *ibdev = NULL; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) + return -EINVAL; + + if (nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + sizeof(client_name)) >= sizeof(client_name)) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + ibdev = ib_device_get_by_index(sock_net(skb->sk), index); + if (!ibdev) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(ibdev, data.port)) { + err = -EINVAL; + goto out_put; + } + } else { + data.port = -1; + } + } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out_put; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_GET_CHARDEV), + 0, 0); + + data.nl_msg = msg; + err = ib_get_client_nl_info(ibdev, client_name, &data); + if (err) + goto out_nlmsg; + + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV, + huge_encode_dev(data.cdev->devt), + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi, + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME, + dev_name(data.cdev))) { + err = -EMSGSIZE; + goto out_data; + } + + nlmsg_end(msg, nlh); + put_device(data.cdev); + if (ibdev) + ib_device_put(ibdev); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +out_data: + put_device(data.cdev); +out_nlmsg: + nlmsg_free(msg); +out_put: + if (ibdev) + ib_device_put(ibdev); + return err; 
+} + static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1404,6 +1495,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, + [RDMA_NLDEV_CMD_GET_CHARDEV] = { + .doit = nldev_get_chardev, + }, [RDMA_NLDEV_CMD_SET] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 973514ea17a7..a1265e9ce2d1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2684,10 +2684,14 @@ struct ib_device { u32 iw_driver_flags; }; +struct ib_client_nl_info; struct ib_client { const char *name; void (*add) (struct ib_device *); void (*remove)(struct ib_device *, void *client_data); + int (*get_nl_info)(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res); + int (*get_global_nl_info)(struct ib_client_nl_info *res); /* Returns the net_dev belonging to this ib_client and matching the * given parameters. diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 10732ab31ba2..c7acbe083428 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -110,4 +110,6 @@ void rdma_link_register(struct rdma_link_ops *ops); void rdma_link_unregister(struct rdma_link_ops *ops); #define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type) +#define MODULE_ALIAS_RDMA_CLIENT(type) MODULE_ALIAS("rdma-client-" type) + #endif /* _RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index f588e8551c6c..9903db21a42c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -279,6 +279,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */ + RDMA_NLDEV_CMD_GET_CHARDEV, + RDMA_NLDEV_NUM_OPS }; @@ -491,6 +493,18 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_NET_NS_FD, /* u32 */ + /* + * Information about a chardev. 
+ * CHARDEV_TYPE is the name of the chardev ABI (ie uverbs, umad, etc) + * CHARDEV_ABI signals the ABI revision (historical) + * CHARDEV_NAME is the kernel name for the /dev/ file (no directory) + * CHARDEV is the 64 bit dev_t for the inode + */ + RDMA_NLDEV_ATTR_CHARDEV_TYPE, /* string */ + RDMA_NLDEV_ATTR_CHARDEV_NAME, /* string */ + RDMA_NLDEV_ATTR_CHARDEV_ABI, /* u64 */ + RDMA_NLDEV_ATTR_CHARDEV, /* u64 */ + /* * Always the end */ -- cgit v1.2.3 From 8f71bb0030b8816f57be142f95b3c7189c6eaf4c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 13 Jun 2019 21:38:19 -0300 Subject: RDMA: Report available cdevs through RDMA_NLDEV_CMD_GET_CHARDEV Update the struct ib_client for all modules exporting cdevs related to the ibdevice to also implement RDMA_NLDEV_CMD_GET_CHARDEV. All cdevs are now autoloadable and discoverable by userspace over netlink instead of relying on sysfs. uverbs also exposes the DRIVER_ID for drivers that are able to support driver id binding in rdma-core. Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/device.c | 3 ++ drivers/infiniband/core/nldev.c | 1 + drivers/infiniband/core/ucma.c | 23 +++++++++++++ drivers/infiniband/core/user_mad.c | 51 +++++++++++++++++++++++++--- drivers/infiniband/core/uverbs_main.c | 32 ++++++++++++++++- drivers/infiniband/hw/cxgb3/iwch_provider.c | 1 + drivers/infiniband/hw/hns/hns_roce_main.c | 1 + drivers/infiniband/hw/mthca/mthca_provider.c | 1 + include/rdma/ib_verbs.h | 1 + include/uapi/rdma/rdma_netlink.h | 1 + 10 files changed, 110 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7db8566cdb89..1de4ae5d5e0e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2428,6 +2428,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = 
ops->uverbs_abi_ver; + dev_ops->uverbs_no_driver_id_binding |= + ops->uverbs_no_driver_id_binding; + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 16b5d6d4dd1c..3cad72a609ff 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -126,6 +126,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = 128 }, [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 39823c842202..0274e9b704be 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -52,6 +52,8 @@ #include #include #include +#include +#include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); @@ -1788,6 +1790,19 @@ static struct miscdevice ucma_misc = { .fops = &ucma_fops, }; +static int ucma_get_global_nl_info(struct ib_client_nl_info *res) +{ + res->abi = RDMA_USER_CM_ABI_VERSION; + res->cdev = ucma_misc.this_device; + return 0; +} + +static struct ib_client rdma_cma_client = { + .name = "rdma_cm", + .get_global_nl_info = ucma_get_global_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); + static ssize_t show_abi_version(struct device *dev, struct device_attribute *attr, char *buf) @@ -1816,7 +1831,14 @@ static int __init ucma_init(void) ret = -ENOMEM; goto err2; } + + ret = ib_register_client(&rdma_cma_client); + if (ret) + goto err3; + return 0; +err3: + unregister_net_sysctl_table(ucma_ctl_table_hdr); err2: device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); err1: @@ -1826,6 +1848,7 @@ err1: static void __exit ucma_cleanup(void) { + 
ib_unregister_client(&rdma_cma_client); unregister_net_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 671f07ba1fad..547090b41cfb 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -54,6 +54,7 @@ #include #include +#include #include "core_priv.h" @@ -1124,11 +1125,48 @@ static const struct file_operations umad_sm_fops = { .llseek = no_llseek, }; +static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_device *umad_dev = client_data; + + if (!rdma_is_port_valid(ibdev, res->port)) + return -EINVAL; + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].dev; + + return 0; +} + static struct ib_client umad_client = { .name = "umad", .add = ib_umad_add_one, - .remove = ib_umad_remove_one + .remove = ib_umad_remove_one, + .get_nl_info = ib_umad_get_nl_info, }; +MODULE_ALIAS_RDMA_CLIENT("umad"); + +static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_device *umad_dev = + ib_get_client_data(ibdev, &umad_client); + + if (!rdma_is_port_valid(ibdev, res->port)) + return -EINVAL; + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].sm_dev; + + return 0; +} + +static struct ib_client issm_client = { + .name = "issm", + .get_nl_info = ib_issm_get_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("issm"); static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1387,13 +1425,17 @@ static int __init ib_umad_init(void) } ret = ib_register_client(&umad_client); - if (ret) { - pr_err("couldn't register ib_umad client\n"); + if (ret) goto out_class; - } + + ret = ib_register_client(&issm_client); + if 
(ret) + goto out_client; return 0; +out_client: + ib_unregister_client(&umad_client); out_class: class_unregister(&umad_class); @@ -1411,6 +1453,7 @@ out: static void __exit ib_umad_cleanup(void) { + ib_unregister_client(&issm_client); ib_unregister_client(&umad_client); class_unregister(&umad_class); unregister_chrdev_region(base_umad_dev, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 870b3dd35aac..11c13c1381cf 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -51,6 +51,7 @@ #include #include +#include #include "uverbs.h" #include "core_priv.h" @@ -1148,12 +1149,41 @@ static const struct file_operations uverbs_mmap_fops = { .compat_ioctl = ib_uverbs_ioctl, }; +static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int ret; + + if (res->port != -1) + return -EINVAL; + + res->abi = ibdev->ops.uverbs_abi_ver; + res->cdev = &uverbs_dev->dev; + + /* + * To support DRIVER_ID binding in userspace some of the drivers need + * upgrading to expose their PCI dependent revision information + * through get_context instead of relying on modalias matching. When + * the drivers are fixed they can drop this flag.
+ */ + if (!ibdev->ops.uverbs_no_driver_id_binding) { + ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, + ibdev->ops.driver_id); + if (ret) + return ret; + } + return 0; +} + static struct ib_client uverbs_client = { .name = "uverbs", .no_kverbs_req = true, .add = ib_uverbs_add_one, - .remove = ib_uverbs_remove_one + .remove = ib_uverbs_remove_one, + .get_nl_info = ib_uverbs_get_nl_info, }; +MODULE_ALIAS_RDMA_CLIENT("uverbs"); static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index acba96f289cc..810fa96af2e9 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1230,6 +1230,7 @@ static const struct ib_device_ops iwch_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_CXGB3, .uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION, + .uverbs_no_driver_id_binding = 1, .alloc_hw_stats = iwch_alloc_stats, .alloc_mr = iwch_alloc_mr, diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 3e45b119b0eb..c0e819ed8c9b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -417,6 +417,7 @@ static const struct ib_device_ops hns_roce_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_HNS, .uverbs_abi_ver = 1, + .uverbs_no_driver_id_binding = 1, .add_gid = hns_roce_add_gid, .alloc_pd = hns_roce_alloc_pd, diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index efd4e3d13ae2..d97124bee703 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1147,6 +1147,7 @@ static const struct ib_device_ops mthca_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MTHCA, .uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION, + .uverbs_no_driver_id_binding = 1, .alloc_pd = 
mthca_alloc_pd, .alloc_ucontext = mthca_alloc_ucontext, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a1265e9ce2d1..6f09fcc21d7a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2321,6 +2321,7 @@ struct ib_device_ops { struct module *owner; enum rdma_driver_id driver_id; u32 uverbs_abi_ver; + unsigned int uverbs_no_driver_id_binding:1; int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 9903db21a42c..b27c02185dcc 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -504,6 +504,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_CHARDEV_NAME, /* string */ RDMA_NLDEV_ATTR_CHARDEV_ABI, /* u64 */ RDMA_NLDEV_ATTR_CHARDEV, /* u64 */ + RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, /* u32 */ /* * Always the end -- cgit v1.2.3 From 34d65cd837d0c77fac0c0da632c616030b2927e3 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 21 Jun 2019 17:00:44 -0400 Subject: RDMA/netlink: Audit policy settings for netlink attributes For all string attributes for which we don't currently accept the element as input, we only use it as output, set the string length to RDMA_NLDEV_ATTR_EMPTY_STRING which is defined as 1. That way we will only accept a null string for that element. This will prevent someone from writing a new input routine that uses the element without also updating the policy to have a valid value. Also while there, make sure the existing entries that are valid have the correct policy, if not, correct the policy. Remove unnecessary checks for nla_strlcpy() overflow once the policy has been set correctly.
Signed-off-by: Doug Ledford Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 25 ++++++++++++------------- include/rdma/rdma_netlink.h | 6 ++++++ include/uapi/rdma/rdma_netlink.h | 4 ---- 3 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 6006d23d0410..5499f5629dc2 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -49,29 +49,29 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, - .len = 128 }, + .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, - .len = IB_DEVICE_NAME_MAX - 1}, + .len = IB_DEVICE_NAME_MAX }, [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, - .len = IB_FW_VERSION_NAME_MAX - 1}, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, 
[RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + .len = IFNAMSIZ }, [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, @@ -92,7 +92,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, - .len = TASK_COMM_LEN }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, @@ -120,7 +120,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, - .len = 16 }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, @@ -1361,7 +1361,7 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; - char client_name[IB_DEVICE_NAME_MAX]; + char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; struct ib_client_nl_info data = {}; struct ib_device *ibdev = NULL; struct sk_buff *msg; @@ -1373,9 +1373,8 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; - if (nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], - sizeof(client_name)) >= sizeof(client_name)) - return -EINVAL; + nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + sizeof(client_name)); if 
(tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c7acbe083428..6631624e4d7c 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -6,6 +6,12 @@ #include #include +enum { + RDMA_NLDEV_ATTR_EMPTY_STRING = 1, + RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16, + RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE = 32, +}; + struct rdma_nl_cbs { int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index b27c02185dcc..650cee8c4bf1 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -284,10 +284,6 @@ enum rdma_nldev_command { RDMA_NLDEV_NUM_OPS }; -enum { - RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16, -}; - enum rdma_nldev_print_type { RDMA_NLDEV_PRINT_TYPE_UNSPEC, RDMA_NLDEV_PRINT_TYPE_HEX, -- cgit v1.2.3 From 239b0e52d8aa64d2559c672fd8c29cf1fffc3ec7 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:04:17 -0400 Subject: IB/hfi1: Move rvt_cq_wc struct into uapi directory The rvt_cq_wc struct elements are shared between rdmavt and the providers but are not in the uapi directory. As per the comment in https://marc.info/?l=linux-rdma&m=152296522708522&w=2, the hfi1 driver and the rdma core driver are not using shared structures in the uapi directory. In that case, move rvt_cq_wc struct into the rvt-abi.h header file and create a rvt_k_cq_wc struct for the kernel completion queue.
Signed-off-by: Kamenee Arumugam Reviewed-by: Mike Marciniszyn Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 4 +- drivers/infiniband/sw/rdmavt/cq.c | 192 ++++++++++++++++++++++++-------------- include/rdma/rdmavt_cq.h | 22 +++-- include/rdma/rdmavt_qp.h | 32 +++++++ include/uapi/rdma/rvt-abi.h | 32 +++++++ 5 files changed, 205 insertions(+), 77 deletions(-) create mode 100644 include/uapi/rdma/rvt-abi.h (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 4e0e9fc0a777..41261e72c429 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -702,8 +702,8 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) sde ? sde->this_idx : 0, send_context, send_context ? send_context->sw_index : 0, - ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head, - ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail, + ib_cq_head(qp->ibqp.send_cq), + ib_cq_tail(qp->ibqp.send_cq), qp->pid, qp->s_state, qp->s_ack_state, diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index b46714a92b7a..2602ad8b8cb0 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -63,19 +63,33 @@ static struct workqueue_struct *comp_vector_wq; */ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) { - struct rvt_cq_wc *wc; + struct ib_uverbs_wc *uqueue = NULL; + struct ib_wc *kqueue = NULL; + struct rvt_cq_wc *u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; unsigned long flags; u32 head; u32 next; + u32 tail; spin_lock_irqsave(&cq->lock, flags); + if (cq->ip) { + u_wc = cq->queue; + uqueue = &u_wc->uqueue[0]; + head = RDMA_READ_UAPI_ATOMIC(u_wc->head); + tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail); + } else { + k_wc = cq->kqueue; + kqueue = &k_wc->kqueue[0]; + head = k_wc->head; + tail = k_wc->tail; + } + /* - * Note that the head pointer might be writable 
by user processes. - * Take care to verify it is a sane value. + * Note that the head pointer might be writable by + * user processes.Take care to verify it is a sane value. */ - wc = cq->queue; - head = wc->head; if (head >= (unsigned)cq->ibcq.cqe) { head = cq->ibcq.cqe; next = 0; @@ -83,7 +97,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) next = head + 1; } - if (unlikely(next == wc->tail)) { + if (unlikely(next == tail)) { spin_unlock_irqrestore(&cq->lock, flags); if (cq->ibcq.event_handler) { struct ib_event ev; @@ -96,27 +110,27 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) return; } trace_rvt_cq_enter(cq, entry, head); - if (cq->ip) { - wc->uqueue[head].wr_id = entry->wr_id; - wc->uqueue[head].status = entry->status; - wc->uqueue[head].opcode = entry->opcode; - wc->uqueue[head].vendor_err = entry->vendor_err; - wc->uqueue[head].byte_len = entry->byte_len; - wc->uqueue[head].ex.imm_data = entry->ex.imm_data; - wc->uqueue[head].qp_num = entry->qp->qp_num; - wc->uqueue[head].src_qp = entry->src_qp; - wc->uqueue[head].wc_flags = entry->wc_flags; - wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = ib_lid_cpu16(entry->slid); - wc->uqueue[head].sl = entry->sl; - wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; - wc->uqueue[head].port_num = entry->port_num; + if (uqueue) { + uqueue[head].wr_id = entry->wr_id; + uqueue[head].status = entry->status; + uqueue[head].opcode = entry->opcode; + uqueue[head].vendor_err = entry->vendor_err; + uqueue[head].byte_len = entry->byte_len; + uqueue[head].ex.imm_data = entry->ex.imm_data; + uqueue[head].qp_num = entry->qp->qp_num; + uqueue[head].src_qp = entry->src_qp; + uqueue[head].wc_flags = entry->wc_flags; + uqueue[head].pkey_index = entry->pkey_index; + uqueue[head].slid = ib_lid_cpu16(entry->slid); + uqueue[head].sl = entry->sl; + uqueue[head].dlid_path_bits = entry->dlid_path_bits; + uqueue[head].port_num = entry->port_num; /* Make 
sure entry is written before the head index. */ - smp_wmb(); + RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next); } else { - wc->kqueue[head] = *entry; + kqueue[head] = *entry; + k_wc->head = next; } - wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && @@ -179,8 +193,9 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, { struct ib_device *ibdev = ibcq->device; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - struct rvt_cq *cq = container_of(ibcq, struct rvt_cq, ibcq); - struct rvt_cq_wc *wc; + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_cq_wc *u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; u32 sz; unsigned int entries = attr->cqe; int comp_vector = attr->comp_vector; @@ -204,22 +219,28 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, * We need to use vmalloc() in order to support mmap and large * numbers of entries. */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (entries + 1); - else - sz += sizeof(struct ib_wc) * (entries + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) - return -ENOMEM; + if (udata && udata->outlen >= sizeof(__u64)) { + sz = sizeof(struct ib_uverbs_wc) * (entries + 1); + sz += sizeof(*u_wc); + u_wc = vmalloc_user(sz); + if (!u_wc) + return -ENOMEM; + } else { + sz = sizeof(struct ib_wc) * (entries + 1); + sz += sizeof(*k_wc); + k_wc = vzalloc_node(sz, rdi->dparms.node); + if (!k_wc) + return -ENOMEM; + } + /* * Return the address of the WC as the offset to mmap. * See rvt_mmap() for details. 
*/ if (udata && udata->outlen >= sizeof(__u64)) { - cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc); + int err; + + cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc); if (!cq->ip) { err = -ENOMEM; goto bail_wc; @@ -264,7 +285,10 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); INIT_WORK(&cq->comptask, send_complete); - cq->queue = wc; + if (u_wc) + cq->queue = u_wc; + else + cq->kqueue = k_wc; trace_rvt_create_cq(cq, attr); return 0; @@ -272,7 +296,8 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, bail_ip: kfree(cq->ip); bail_wc: - vfree(wc); + vfree(u_wc); + vfree(k_wc); return err; } @@ -322,9 +347,16 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) if (cq->notify != IB_CQ_NEXT_COMP) cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; - if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && - cq->queue->head != cq->queue->tail) - ret = 1; + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + if (cq->queue) { + if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) != + RDMA_READ_UAPI_ATOMIC(cq->queue->tail)) + ret = 1; + } else { + if (cq->kqueue->head != cq->kqueue->tail) + ret = 1; + } + } spin_unlock_irqrestore(&cq->lock, flags); @@ -340,12 +372,14 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *old_wc; - struct rvt_cq_wc *wc; u32 head, tail, n; int ret; u32 sz; struct rvt_dev_info *rdi = cq->rdi; + struct rvt_cq_wc *u_wc = NULL; + struct rvt_cq_wc *old_u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; + struct rvt_k_cq_wc *old_k_wc = NULL; if (cqe < 1 || cqe > rdi->dparms.props.max_cqe) return -EINVAL; @@ -353,17 +387,19 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) /* * Need to use vmalloc() if we want to support large #s of entries. 
*/ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); - else - sz += sizeof(struct ib_wc) * (cqe + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) - return -ENOMEM; - + if (udata && udata->outlen >= sizeof(__u64)) { + sz = sizeof(struct ib_uverbs_wc) * (cqe + 1); + sz += sizeof(*u_wc); + u_wc = vmalloc_user(sz); + if (!u_wc) + return -ENOMEM; + } else { + sz = sizeof(struct ib_wc) * (cqe + 1); + sz += sizeof(*k_wc); + k_wc = vzalloc_node(sz, rdi->dparms.node); + if (!k_wc) + return -ENOMEM; + } /* Check that we can write the offset to mmap. */ if (udata && udata->outlen >= sizeof(__u64)) { __u64 offset = 0; @@ -378,11 +414,18 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) * Make sure head and tail are sane since they * might be user writable. */ - old_wc = cq->queue; - head = old_wc->head; + if (u_wc) { + old_u_wc = cq->queue; + head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head); + tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail); + } else { + old_k_wc = cq->kqueue; + head = old_k_wc->head; + tail = old_k_wc->tail; + } + if (head > (u32)cq->ibcq.cqe) head = (u32)cq->ibcq.cqe; - tail = old_wc->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; if (head < tail) @@ -394,27 +437,36 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) goto bail_unlock; } for (n = 0; tail != head; n++) { - if (cq->ip) - wc->uqueue[n] = old_wc->uqueue[tail]; + if (u_wc) + u_wc->uqueue[n] = old_u_wc->uqueue[tail]; else - wc->kqueue[n] = old_wc->kqueue[tail]; + k_wc->kqueue[n] = old_k_wc->kqueue[tail]; if (tail == (u32)cq->ibcq.cqe) tail = 0; else tail++; } cq->ibcq.cqe = cqe; - wc->head = n; - wc->tail = 0; - cq->queue = wc; + if (u_wc) { + RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n); + RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0); + cq->queue = u_wc; + } else { + k_wc->head = n; + k_wc->tail = 0; + cq->kqueue = k_wc; + } spin_unlock_irq(&cq->lock); 
- vfree(old_wc); + if (u_wc) + vfree(old_u_wc); + else + vfree(old_k_wc); if (cq->ip) { struct rvt_mmap_info *ip = cq->ip; - rvt_update_mmap_info(rdi, ip, sz, wc); + rvt_update_mmap_info(rdi, ip, sz, u_wc); /* * Return the offset to mmap. @@ -438,7 +490,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) bail_unlock: spin_unlock_irq(&cq->lock); bail_free: - vfree(wc); + vfree(u_wc); + vfree(k_wc); + return ret; } @@ -456,7 +510,7 @@ bail_free: int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *wc; + struct rvt_k_cq_wc *wc; unsigned long flags; int npolled; u32 tail; @@ -467,7 +521,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) spin_lock_irqsave(&cq->lock, flags); - wc = cq->queue; + wc = cq->kqueue; tail = wc->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index 75dc65c0bfb8..ab22860a63e2 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -60,19 +60,28 @@ */ #define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1) +/* + * Define read macro that applies smp_load_acquire memory barrier + * when reading an index of a circular buffer that is mmap'ed to user space. + */ +#define RDMA_READ_UAPI_ATOMIC(member) smp_load_acquire(&(member).val) + +/* + * Define write macro that uses smp_store_release memory barrier + * when writing an index of a circular buffer that is mmap'ed to user space. + */ +#define RDMA_WRITE_UAPI_ATOMIC(member, x) smp_store_release(&(member).val, x) +#include + /* * This structure is used to contain the head pointer, tail pointer, * and completion queue entries as a single memory allocation so * it can be mmap'ed into user space.
*/ -struct rvt_cq_wc { +struct rvt_k_cq_wc { u32 head; /* index of next entry to fill */ u32 tail; /* index of next ib_poll_cq() entry */ - union { - /* these are actually size ibcq.cqe + 1 */ - struct ib_uverbs_wc uqueue[0]; - struct ib_wc kqueue[0]; - }; + struct ib_wc kqueue[]; }; /* @@ -88,6 +97,7 @@ struct rvt_cq { struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; struct rvt_mmap_info *ip; + struct rvt_k_cq_wc *kqueue; }; static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 84d0f36afc2f..7fcd687af278 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -820,6 +820,38 @@ struct rvt_qp_iter { int n; }; +/** + * ib_cq_tail - Return tail index of cq buffer + * @send_cq - The cq for send + * + * This is called in qp_iter_print to get tail + * of cq buffer. + */ +static inline u32 ib_cq_tail(struct ib_cq *send_cq) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(send_cq); + + return ibcq_to_rvtcq(send_cq)->ip ? + RDMA_READ_UAPI_ATOMIC(cq->queue->tail) : + ibcq_to_rvtcq(send_cq)->kqueue->tail; +} + +/** + * ib_cq_head - Return head index of cq buffer + * @send_cq - The cq for send + * + * This is called in qp_iter_print to get head + * of cq buffer. + */ +static inline u32 ib_cq_head(struct ib_cq *send_cq) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(send_cq); + + return ibcq_to_rvtcq(send_cq)->ip ? + RDMA_READ_UAPI_ATOMIC(cq->queue->head) : + ibcq_to_rvtcq(send_cq)->kqueue->head; +} + struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, u64 v, void (*cb)(struct rvt_qp *qp, u64 v)); diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h new file mode 100644 index 000000000000..8e5f7e0c15fe --- /dev/null +++ b/include/uapi/rdma/rvt-abi.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. 
+ */ + +#ifndef RVT_ABI_USER_H +#define RVT_ABI_USER_H + +#include +#include +#ifndef RDMA_ATOMIC_UAPI +#define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name +#endif + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct rvt_cq_wc { + /* index of next entry to fill */ + RDMA_ATOMIC_UAPI(__u32, head); + /* index of next ib_poll_cq() entry */ + RDMA_ATOMIC_UAPI(__u32, tail); + + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[]; +}; + +#endif /* RVT_ABI_USER_H */ -- cgit v1.2.3 From dabac6e460ce8473f1e685432a8ab7818d81a1f1 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:04:24 -0400 Subject: IB/hfi1: Move receive work queue struct into uapi directory The rvt_rwqe and rvt_rwq struct elements are shared between rdmavt and the providers but are not in the uapi directory. As per the comment in https://marc.info/?l=linux-rdma&m=152296522708522&w=2, the hfi1 driver and the rdma core driver are not using shared structures in the uapi directory. Move the rvt_rwqe and rvt_rwq structs into the rvt-abi.h header in the uapi directory. Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J.
Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 152 +++++++++++++++++++++++++++---------- drivers/infiniband/sw/rdmavt/qp.h | 2 + drivers/infiniband/sw/rdmavt/rc.c | 10 ++- drivers/infiniband/sw/rdmavt/srq.c | 59 +++++++------- include/rdma/rdmavt_qp.h | 52 ++++++++----- include/uapi/rdma/rvt-abi.h | 29 +++++++ 6 files changed, 212 insertions(+), 92 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 0d804a58f954..1384060f175d 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -802,6 +802,46 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) } } +/** + * rvt_alloc_rq - allocate memory for user or kernel buffer + * @rq: receive queue data structure + * @size: size in bytes of the request queue entries + * @node: The NUMA node + * @udata: user data; when non-NULL the queue is allocated with vmalloc_user() + * + * Return: 0 on success, -ENOMEM if memory allocation failed + * This function is used by both shared receive + * queues and non-shared receive queues to allocate + * memory.
+ */ +int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, + struct ib_udata *udata) +{ + if (udata) { + rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size); + if (!rq->wq) + goto bail; + /* need kwq with no buffers */ + rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node); + if (!rq->kwq) + goto bail; + rq->kwq->curr_wq = rq->wq->wq; + } else { + /* need kwq with buffers */ + rq->kwq = + vzalloc_node(sizeof(struct rvt_krwq) + size, node); + if (!rq->kwq) + goto bail; + rq->kwq->curr_wq = rq->kwq->wq; + } + + spin_lock_init(&rq->lock); + return 0; +bail: + rvt_free_rq(rq); + return -ENOMEM; +} + /** * rvt_init_qp - initialize the QP state to the reset state * @qp: the QP to init or reinit @@ -852,10 +892,6 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_tail_ack_queue = 0; qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; - if (qp->r_rq.wq) { - qp->r_rq.wq->head = 0; - qp->r_rq.wq->tail = 0; - } qp->r_sge.num_sge = 0; atomic_set(&qp->s_reserved_used, 0); } @@ -1046,17 +1082,12 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->r_rq.max_sge = init_attr->cap.max_recv_sge; sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + sizeof(struct rvt_rwqe); - if (udata) - qp->r_rq.wq = vmalloc_user( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz); - else - qp->r_rq.wq = vzalloc_node( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz, - rdi->dparms.node); - if (!qp->r_rq.wq) + err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz, + rdi->dparms.node, udata); + if (err) { + ret = ERR_PTR(err); goto bail_driver_priv; + } } /* @@ -1202,8 +1233,7 @@ bail_qpn: rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); bail_rq_wq: - if (!qp->ip) - vfree(qp->r_rq.wq); + rvt_free_rq(&qp->r_rq); bail_driver_priv: rdi->driver_f.qp_priv_free(rdi, qp); @@ -1269,19 +1299,26 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) } wc.status = IB_WC_WR_FLUSH_ERR; - if (qp->r_rq.wq) { - struct rvt_rwq *wq; + if (qp->r_rq.kwq) { u32 head; u32 tail; + 
struct rvt_rwq *wq = NULL; + struct rvt_krwq *kwq = NULL; spin_lock(&qp->r_rq.lock); - + /* qp->ip used to validate if there is a user buffer mmaped */ + if (qp->ip) { + wq = qp->r_rq.wq; + head = RDMA_READ_UAPI_ATOMIC(wq->head); + tail = RDMA_READ_UAPI_ATOMIC(wq->tail); + } else { + kwq = qp->r_rq.kwq; + head = kwq->head; + tail = kwq->tail; + } /* sanity check pointers before trusting them */ - wq = qp->r_rq.wq; - head = wq->head; if (head >= qp->r_rq.size) head = 0; - tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; while (tail != head) { @@ -1290,8 +1327,10 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) tail = 0; rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); } - wq->tail = tail; - + if (qp->ip) + RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); + else + kwq->tail = tail; spin_unlock(&qp->r_rq.lock); } else if (qp->ibqp.event_handler) { ret = 1; @@ -1634,8 +1673,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (qp->ip) kref_put(&qp->ip->ref, rvt_release_mmap_info); - else - vfree(qp->r_rq.wq); + kvfree(qp->r_rq.kwq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); @@ -1721,7 +1759,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); - struct rvt_rwq *wq = qp->r_rq.wq; + struct rvt_krwq *wq = qp->r_rq.kwq; unsigned long flags; int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) && !qp->ibqp.srq; @@ -1746,7 +1784,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, next = wq->head + 1; if (next >= qp->r_rq.size) next = 0; - if (next == wq->tail) { + if (next == READ_ONCE(wq->tail)) { spin_unlock_irqrestore(&qp->r_rq.lock, flags); *bad_wr = wr; return -ENOMEM; @@ -1770,8 +1808,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, * Make sure queue entry is written * before the head index. 
*/ - smp_wmb(); - wq->head = next; + smp_store_release(&wq->head, next); } spin_unlock_irqrestore(&qp->r_rq.lock, flags); } @@ -2141,7 +2178,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); - struct rvt_rwq *wq; + struct rvt_krwq *wq; unsigned long flags; for (; wr; wr = wr->next) { @@ -2155,11 +2192,11 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, } spin_lock_irqsave(&srq->rq.lock, flags); - wq = srq->rq.wq; + wq = srq->rq.kwq; next = wq->head + 1; if (next >= srq->rq.size) next = 0; - if (next == wq->tail) { + if (next == READ_ONCE(wq->tail)) { spin_unlock_irqrestore(&srq->rq.lock, flags); *bad_wr = wr; return -ENOMEM; @@ -2171,8 +2208,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, for (i = 0; i < wr->num_sge; i++) wqe->sg_list[i] = wr->sg_list[i]; /* Make sure queue entry is written before the head index. */ - smp_wmb(); - wq->head = next; + smp_store_release(&wq->head, next); spin_unlock_irqrestore(&srq->rq.lock, flags); } return 0; @@ -2229,6 +2265,25 @@ bad_lkey: return 0; } +/** + * get_rvt_head - get head indices of the circular buffer + * @rq: data structure for request queue entry + * @ip: the QP + * + * Return - head index value + */ +static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip) +{ + u32 head; + + if (ip) + head = RDMA_READ_UAPI_ATOMIC(rq->wq->head); + else + head = rq->kwq->head; + + return head; +} + /** * rvt_get_rwqe - copy the next RWQE into the QP's RWQE * @qp: the QP @@ -2243,21 +2298,26 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) { unsigned long flags; struct rvt_rq *rq; + struct rvt_krwq *kwq; struct rvt_rwq *wq; struct rvt_srq *srq; struct rvt_rwqe *wqe; void (*handler)(struct ib_event *, void *); u32 tail; + u32 head; int ret; + void *ip = NULL; if (qp->ibqp.srq) { srq = ibsrq_to_rvtsrq(qp->ibqp.srq); handler = srq->ibsrq.event_handler; rq = 
&srq->rq; + ip = srq->ip; } else { srq = NULL; handler = NULL; rq = &qp->r_rq; + ip = qp->ip; } spin_lock_irqsave(&rq->lock, flags); @@ -2265,17 +2325,24 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) ret = 0; goto unlock; } + if (ip) { + wq = rq->wq; + tail = RDMA_READ_UAPI_ATOMIC(wq->tail); + } else { + kwq = rq->kwq; + tail = kwq->tail; + } - wq = rq->wq; - tail = wq->tail; /* Validate tail before using it since it is user writable. */ if (tail >= rq->size) tail = 0; - if (unlikely(tail == wq->head)) { + + head = get_rvt_head(rq, ip); + if (unlikely(tail == head)) { ret = 0; goto unlock; } - /* Make sure entry is read after head index is read. */ + /* Make sure entry is read after the count is read. */ smp_rmb(); wqe = rvt_get_rwqe_ptr(rq, tail); /* @@ -2285,7 +2352,10 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) */ if (++tail >= rq->size) tail = 0; - wq->tail = tail; + if (ip) + RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); + else + kwq->tail = tail; if (!wr_id_only && !init_sge(qp, wqe)) { ret = -1; goto unlock; @@ -2301,7 +2371,7 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) * Validate head pointer value and compute * the number of remaining WQEs. 
*/ - n = wq->head; + n = get_rvt_head(rq, ip); if (n >= rq->size) n = 0; if (n < tail) diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 6db1619389b0..2cdba1283bf6 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -68,4 +68,6 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); int rvt_wss_init(struct rvt_dev_info *rdi); void rvt_wss_exit(struct rvt_dev_info *rdi); +int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, + struct ib_udata *udata); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 09f0cf538be6..44cc7ee1b321 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -104,15 +104,19 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp) } else { u32 min, max, x; u32 credits; - struct rvt_rwq *wq = qp->r_rq.wq; u32 head; u32 tail; /* sanity check pointers before trusting them */ - head = wq->head; + if (qp->ip) { + head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head); + tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail); + } else { + head = READ_ONCE(qp->r_rq.kwq->head); + tail = READ_ONCE(qp->r_rq.kwq->tail); + } if (head >= qp->r_rq.size) head = 0; - tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; /* diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index 8d6b3e764255..d306f6547cba 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -52,7 +52,7 @@ #include "srq.h" #include "vt.h" - +#include "qp.h" /** * rvt_driver_srq_init - init srq resources on a per driver basis * @rdi: rvt dev structure @@ -97,11 +97,8 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr, srq->rq.max_sge = srq_init_attr->attr.max_sge; sz = sizeof(struct ib_sge) * srq->rq.max_sge + sizeof(struct rvt_rwqe); - srq->rq.wq = udata ? 
- vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) : - vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz, - dev->dparms.node); - if (!srq->rq.wq) { + if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz, + dev->dparms.node, udata)) { ret = -ENOMEM; goto bail_srq; } @@ -152,7 +149,7 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr, bail_ip: kfree(srq->ip); bail_wq: - vfree(srq->rq.wq); + rvt_free_rq(&srq->rq); bail_srq: return ret; } @@ -172,11 +169,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device); - struct rvt_rwq *wq; + struct rvt_rq tmp_rq = {}; int ret = 0; if (attr_mask & IB_SRQ_MAX_WR) { - struct rvt_rwq *owq; + struct rvt_krwq *okwq = NULL; + struct rvt_rwq *owq = NULL; struct rvt_rwqe *p; u32 sz, size, n, head, tail; @@ -185,17 +183,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, ((attr_mask & IB_SRQ_LIMIT) ? attr->srq_limit : srq->limit) > attr->max_wr) return -EINVAL; - sz = sizeof(struct rvt_rwqe) + srq->rq.max_sge * sizeof(struct ib_sge); size = attr->max_wr + 1; - wq = udata ? - vmalloc_user(sizeof(struct rvt_rwq) + size * sz) : - vzalloc_node(sizeof(struct rvt_rwq) + size * sz, - dev->dparms.node); - if (!wq) + if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node, + udata)) return -ENOMEM; - /* Check that we can write the offset to mmap. */ if (udata && udata->inlen >= sizeof(__u64)) { __u64 offset_addr; @@ -218,9 +211,15 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, * validate head and tail pointer values and compute * the number of remaining WQEs. 
*/ - owq = srq->rq.wq; - head = owq->head; - tail = owq->tail; + if (udata) { + owq = srq->rq.wq; + head = RDMA_READ_UAPI_ATOMIC(owq->head); + tail = RDMA_READ_UAPI_ATOMIC(owq->tail); + } else { + okwq = srq->rq.kwq; + head = okwq->head; + tail = okwq->tail; + } if (head >= srq->rq.size || tail >= srq->rq.size) { ret = -EINVAL; goto bail_unlock; @@ -235,7 +234,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, goto bail_unlock; } n = 0; - p = wq->wq; + p = tmp_rq.kwq->curr_wq; while (tail != head) { struct rvt_rwqe *wqe; int i; @@ -250,22 +249,29 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (++tail >= srq->rq.size) tail = 0; } - srq->rq.wq = wq; + srq->rq.kwq = tmp_rq.kwq; + if (udata) { + srq->rq.wq = tmp_rq.wq; + RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n); + RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0); + } else { + tmp_rq.kwq->head = n; + tmp_rq.kwq->tail = 0; + } srq->rq.size = size; - wq->head = n; - wq->tail = 0; if (attr_mask & IB_SRQ_LIMIT) srq->limit = attr->srq_limit; spin_unlock_irq(&srq->rq.lock); vfree(owq); + kvfree(okwq); if (srq->ip) { struct rvt_mmap_info *ip = srq->ip; struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device); u32 s = sizeof(struct rvt_rwq) + size * sz; - rvt_update_mmap_info(dev, ip, s, wq); + rvt_update_mmap_info(dev, ip, s, tmp_rq.wq); /* * Return the offset to mmap. 
@@ -301,7 +307,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, bail_unlock: spin_unlock_irq(&srq->rq.lock); bail_free: - vfree(wq); + rvt_free_rq(&tmp_rq); return ret; } @@ -336,6 +342,5 @@ void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) spin_unlock(&dev->n_srqs_lock); if (srq->ip) kref_put(&srq->ip->ref, rvt_release_mmap_info); - else - vfree(srq->rq.wq); + kvfree(srq->rq.kwq); } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 7fcd687af278..ee55fd04f6da 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -52,6 +52,7 @@ #include #include #include +#include /* * Atomic bit definitions for r_aflags. */ @@ -177,33 +178,27 @@ struct rvt_swqe { struct rvt_sge sg_list[0]; }; -/* - * Receive work request queue entry. - * The size of the sg_list is determined when the QP (or SRQ) is created - * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). +/** + * struct rvt_krwq - kernel struct receive work request + * @head: index of next entry to fill + * @tail: index of next entry to pull + * @count: approximate count of total receive entries posted + * @rvt_rwqe: struct of receive work request queue entry + * + * This structure is used to contain the head pointer, + * tail pointer and receive work queue entries for kernel + * mode user. */ -struct rvt_rwqe { - u64 wr_id; - u8 num_sge; - struct ib_sge sg_list[0]; -}; - -/* - * This structure is used to contain the head pointer, tail pointer, - * and receive work queue entries as a single memory allocation so - * it can be mmap'ed into user space. - * Note that the wq array elements are variable size so you can't - * just index into the array to get the N'th element; - * use get_rwqe_ptr() instead. - */ -struct rvt_rwq { +struct rvt_krwq { u32 head; /* new work requests posted to the head */ u32 tail; /* receives pull requests from here. 
*/ - struct rvt_rwqe wq[0]; + struct rvt_rwqe *curr_wq; + struct rvt_rwqe wq[]; }; struct rvt_rq { struct rvt_rwq *wq; + struct rvt_krwq *kwq; u32 size; /* size of RWQE array */ u8 max_sge; /* protect changes in this struct */ @@ -472,7 +467,7 @@ static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp, static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) { return (struct rvt_rwqe *) - ((char *)rq->wq->wq + + ((char *)rq->kwq->curr_wq + (sizeof(struct rvt_rwqe) + rq->max_sge * sizeof(struct ib_sge)) * n); } @@ -852,6 +847,21 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq) ibcq_to_rvtcq(send_cq)->kqueue->head; } +/** + * rvt_free_rq - free memory allocated for rvt_rq struct + * @rvt_rq: request queue data structure + * + * This function should only be called if the rvt_mmap_info() + * has not succeeded. + */ +static inline void rvt_free_rq(struct rvt_rq *rq) +{ + kvfree(rq->kwq); + rq->kwq = NULL; + vfree(rq->wq); + rq->wq = NULL; +} + struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, u64 v, void (*cb)(struct rvt_qp *qp, u64 v)); diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h index 8e5f7e0c15fe..d2e35d24f1a9 100644 --- a/include/uapi/rdma/rvt-abi.h +++ b/include/uapi/rdma/rvt-abi.h @@ -10,6 +10,7 @@ #include #include +#include #ifndef RDMA_ATOMIC_UAPI #define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name #endif @@ -29,4 +30,32 @@ struct rvt_cq_wc { struct ib_uverbs_wc uqueue[]; }; +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct rvt_rwqe { + __u64 wr_id; + __u8 num_sge; + __u8 padding[7]; + struct ib_sge sg_list[]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. 
+ * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() for user space and rvt_get_rwqe_ptr() + * for kernel space. + */ +struct rvt_rwq { + /* new work requests posted to the head */ + RDMA_ATOMIC_UAPI(__u32, head); + /* receives pull requests from here. */ + RDMA_ATOMIC_UAPI(__u32, tail); + struct rvt_rwqe wq[]; +}; #endif /* RVT_ABI_USER_H */ -- cgit v1.2.3 From 303ae1cdfdf7280ff4cfbbe65563b5ff15bb025b Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:27 +0200 Subject: rdma/siw: application interface Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_verbs.c | 1760 ++++++++++++++++++++++++++++++ drivers/infiniband/sw/siw/siw_verbs.h | 91 ++ include/uapi/rdma/rdma_user_ioctl_cmds.h | 1 + include/uapi/rdma/siw-abi.h | 185 ++++ 4 files changed, 2037 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_verbs.c create mode 100644 drivers/infiniband/sw/siw/siw_verbs.h create mode 100644 include/uapi/rdma/siw-abi.h (limited to 'include/uapi') diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c new file mode 100644 index 000000000000..32dc79d0e898 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -0,0 +1,1760 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = SIW_QP_STATE_IDLE, + [IB_QPS_INIT] = SIW_QP_STATE_IDLE, + [IB_QPS_RTR] = SIW_QP_STATE_RTR, + [IB_QPS_RTS] = SIW_QP_STATE_RTS, + [IB_QPS_SQD] = SIW_QP_STATE_CLOSING, + [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE, + [IB_QPS_ERR] = 
SIW_QP_STATE_ERROR +}; + +static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = { + [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", + [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", + [IB_QPS_ERR] = "ERR" +}; + +static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size) +{ + struct siw_uobj *uobj; + struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY); + u32 key; + + uobj = kzalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return SIW_INVAL_UOBJ_KEY; + + if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey, + GFP_KERNEL) < 0) { + kfree(uobj); + return SIW_INVAL_UOBJ_KEY; + } + uobj->size = PAGE_ALIGN(size); + uobj->addr = vaddr; + + return key; +} + +static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx, + unsigned long off, u32 size) +{ + struct siw_uobj *uobj = xa_load(&uctx->xa, off); + + if (uobj && uobj->size == size) + return uobj; + + return NULL; +} + +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) +{ + struct siw_ucontext *uctx = to_siw_ctx(ctx); + struct siw_uobj *uobj; + unsigned long off = vma->vm_pgoff; + int size = vma->vm_end - vma->vm_start; + int rv = -EINVAL; + + /* + * Must be page aligned + */ + if (vma->vm_start & (PAGE_SIZE - 1)) { + pr_warn("siw: mmap not page aligned\n"); + goto out; + } + uobj = siw_get_uobj(uctx, off, size); + if (!uobj) { + siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n", + off, size); + goto out; + } + rv = remap_vmalloc_range(vma, uobj->addr, 0); + if (rv) + pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size); +out: + return rv; +} + +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_ctx->device); + struct siw_ucontext *ctx = to_siw_ctx(base_ctx); + struct siw_uresp_alloc_ctx uresp = {}; + int rv; + + if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) { + rv = -ENOMEM; + goto err_out; + } + 
xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC); + ctx->uobj_nextkey = 0; + ctx->sdev = sdev; + + uresp.dev_id = sdev->vendor_part_id; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + + siw_dbg(base_ctx->device, "success. now %d context(s)\n", + atomic_read(&sdev->num_ctx)); + + return 0; + +err_out: + atomic_dec(&sdev->num_ctx); + siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv, + atomic_read(&sdev->num_ctx)); + + return rv; +} + +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx) +{ + struct siw_ucontext *uctx = to_siw_ctx(base_ctx); + void *entry; + unsigned long index; + + /* + * Make sure all user mmap objects are gone. Since QP, CQ + * and SRQ destroy routines destroy related objects, nothing + * should be found here. + */ + xa_for_each(&uctx->xa, index, entry) { + kfree(xa_erase(&uctx->xa, index)); + pr_warn("siw: dropping orphaned uobj at %lu\n", index); + } + xa_destroy(&uctx->xa); + atomic_dec(&uctx->sdev->num_ctx); +} + +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, + struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + if (udata->inlen || udata->outlen) + return -EINVAL; + + memset(attr, 0, sizeof(*attr)); + + /* Revisit atomic caps if RFC 7306 gets supported */ + attr->atomic_cap = 0; + attr->device_cap_flags = + IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG; + attr->max_cq = sdev->attrs.max_cq; + attr->max_cqe = sdev->attrs.max_cqe; + attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; + attr->max_fmr = sdev->attrs.max_fmr; + attr->max_mr = sdev->attrs.max_mr; + attr->max_mw = sdev->attrs.max_mw; + attr->max_mr_size = ~0ull; + attr->max_pd = sdev->attrs.max_pd; + attr->max_qp = sdev->attrs.max_qp; + attr->max_qp_init_rd_atom = sdev->attrs.max_ird; + attr->max_qp_rd_atom = sdev->attrs.max_ord; + attr->max_qp_wr = sdev->attrs.max_qp_wr; + attr->max_recv_sge = 
sdev->attrs.max_sge; + attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird; + attr->max_send_sge = sdev->attrs.max_sge; + attr->max_sge_rd = sdev->attrs.max_sge_rd; + attr->max_srq = sdev->attrs.max_srq; + attr->max_srq_sge = sdev->attrs.max_srq_sge; + attr->max_srq_wr = sdev->attrs.max_srq_wr; + attr->page_size_cap = PAGE_SIZE; + attr->vendor_id = SIW_VENDOR_ID; + attr->vendor_part_id = sdev->vendor_part_id; + + memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6); + + return 0; +} + +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + memset(attr, 0, sizeof(*attr)); + + attr->active_mtu = attr->max_mtu; + attr->active_speed = 2; + attr->active_width = 2; + attr->gid_tbl_len = 1; + attr->max_msg_sz = -1; + attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3; + attr->pkey_tbl_len = 1; + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; + attr->state = sdev->state; + /* + * All zero + * + * attr->lid = 0; + * attr->bad_pkey_cntr = 0; + * attr->qkey_viol_cntr = 0; + * attr->sm_lid = 0; + * attr->lmc = 0; + * attr->max_vl_num = 0; + * attr->sm_sl = 0; + * attr->subnet_timeout = 0; + * attr->init_type_reply = 0; + */ + return 0; +} + +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, + struct ib_port_immutable *port_immutable) +{ + struct ib_port_attr attr; + int rv = siw_query_port(base_dev, port, &attr); + + if (rv) + return rv; + + port_immutable->pkey_tbl_len = attr.pkey_tbl_len; + port_immutable->gid_tbl_len = attr.gid_tbl_len; + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + return 0; +} + +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey) +{ + /* Report the default pkey */ + *pkey = 0xffff; + return 0; +} + +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, + union ib_gid *gid) +{ + struct siw_device *sdev = 
to_siw_dev(base_dev); + + /* subnet_prefix == interface_id == 0; */ + memset(gid, 0, sizeof(*gid)); + memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6); + + return 0; +} + +int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + + if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) { + atomic_dec(&sdev->num_pd); + return -ENOMEM; + } + siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd)); + + return 0; +} + +void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + + siw_dbg_pd(pd, "free PD\n"); + atomic_dec(&sdev->num_pd); +} + +void siw_qp_get_ref(struct ib_qp *base_qp) +{ + siw_qp_get(to_siw_qp(base_qp)); +} + +void siw_qp_put_ref(struct ib_qp *base_qp) +{ + siw_qp_put(to_siw_qp(base_qp)); +} + +/* + * siw_create_qp() + * + * Create QP of requested size on given device. + * + * @pd: Protection Domain + * @attrs: Initial QP attributes. + * @udata: used to provide QP ID, SQ and RQ size back to user. 
+ */ + +struct ib_qp *siw_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct siw_qp *qp = NULL; + struct siw_base_qp *siw_base_qp = NULL; + struct ib_device *base_dev = pd->device; + struct siw_device *sdev = to_siw_dev(base_dev); + struct siw_ucontext *uctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + struct siw_cq *scq = NULL, *rcq = NULL; + unsigned long flags; + int num_sqe, num_rqe, rv = 0; + + siw_dbg(base_dev, "create new QP\n"); + + if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { + siw_dbg(base_dev, "too many QP's\n"); + rv = -ENOMEM; + goto err_out; + } + if (attrs->qp_type != IB_QPT_RC) { + siw_dbg(base_dev, "only RC QP's supported\n"); + rv = -EINVAL; + goto err_out; + } + if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || + (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || + (attrs->cap.max_send_sge > SIW_MAX_SGE) || + (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { + siw_dbg(base_dev, "QP size error\n"); + rv = -EINVAL; + goto err_out; + } + if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { + siw_dbg(base_dev, "max inline send: %d > %d\n", + attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); + rv = -EINVAL; + goto err_out; + } + /* + * NOTE: we allow for zero element SQ and RQ WQE's SGL's + * but not for a QP unable to hold any WQE (SQ + RQ) + */ + if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) { + siw_dbg(base_dev, "QP must have send or receive queue\n"); + rv = -EINVAL; + goto err_out; + } + scq = to_siw_cq(attrs->send_cq); + rcq = to_siw_cq(attrs->recv_cq); + + if (!scq || (!rcq && !attrs->srq)) { + siw_dbg(base_dev, "send CQ or receive CQ invalid\n"); + rv = -EINVAL; + goto err_out; + } + siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL); + if (!siw_base_qp) { + rv = -ENOMEM; + goto err_out; + } + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + rv = -ENOMEM; + goto err_out; + } + siw_base_qp->qp = qp; + qp->ib_qp = &siw_base_qp->base_qp; + + 
init_rwsem(&qp->state_lock); + spin_lock_init(&qp->sq_lock); + spin_lock_init(&qp->rq_lock); + spin_lock_init(&qp->orq_lock); + + qp->kernel_verbs = !udata; + qp->xa_sq_index = SIW_INVAL_UOBJ_KEY; + qp->xa_rq_index = SIW_INVAL_UOBJ_KEY; + + rv = siw_qp_add(sdev, qp); + if (rv) + goto err_out; + + /* All queue indices are derived from modulo operations + * on a free running 'get' (consumer) and 'put' (producer) + * unsigned counter. Having queue sizes at power of two + * avoids handling counter wrap around. + */ + num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); + num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr); + + if (qp->kernel_verbs) + qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe)); + else + qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); + + if (qp->sendq == NULL) { + siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe); + rv = -ENOMEM; + goto err_out_xa; + } + if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->attrs.flags |= SIW_SIGNAL_ALL_WR; + else { + rv = -EINVAL; + goto err_out_xa; + } + } + qp->pd = pd; + qp->scq = scq; + qp->rcq = rcq; + + if (attrs->srq) { + /* + * SRQ support. + * Verbs 6.3.7: ignore RQ size, if SRQ present + * Verbs 6.3.5: do not check PD of SRQ against PD of QP + */ + qp->srq = to_siw_srq(attrs->srq); + qp->attrs.rq_size = 0; + siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n", + qp->qp_num, qp->srq); + } else if (num_rqe) { + if (qp->kernel_verbs) + qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); + else + qp->recvq = + vmalloc_user(num_rqe * sizeof(struct siw_rqe)); + + if (qp->recvq == NULL) { + siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe); + rv = -ENOMEM; + goto err_out_xa; + } + qp->attrs.rq_size = num_rqe; + } + qp->attrs.sq_size = num_sqe; + qp->attrs.sq_max_sges = attrs->cap.max_send_sge; + qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; + + /* Make those two tunables fixed for now. 
*/ + qp->tx_ctx.gso_seg_limit = 1; + qp->tx_ctx.zcopy_tx = zcopy_tx; + + qp->attrs.state = SIW_QP_STATE_IDLE; + + if (udata) { + struct siw_uresp_create_qp uresp = {}; + + uresp.num_sqe = num_sqe; + uresp.num_rqe = num_rqe; + uresp.qp_id = qp_id(qp); + + if (qp->sendq) { + qp->xa_sq_index = + siw_create_uobj(uctx, qp->sendq, + num_sqe * sizeof(struct siw_sqe)); + } + if (qp->recvq) { + qp->xa_rq_index = + siw_create_uobj(uctx, qp->recvq, + num_rqe * sizeof(struct siw_rqe)); + } + if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY || + qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out_xa; + } + uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT; + uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out_xa; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out_xa; + } + qp->tx_cpu = siw_get_tx_cpu(sdev); + if (qp->tx_cpu < 0) { + rv = -EINVAL; + goto err_out_xa; + } + INIT_LIST_HEAD(&qp->devq); + spin_lock_irqsave(&sdev->lock, flags); + list_add_tail(&qp->devq, &sdev->qp_list); + spin_unlock_irqrestore(&sdev->lock, flags); + + return qp->ib_qp; + +err_out_xa: + xa_erase(&sdev->qp_xa, qp_id(qp)); +err_out: + kfree(siw_base_qp); + + if (qp) { + if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); + if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); + + vfree(qp->sendq); + vfree(qp->recvq); + kfree(qp); + } + atomic_dec(&sdev->num_qp); + + return ERR_PTR(rv); +} + +/* + * Minimum siw_query_qp() verb interface. 
+ * + * @qp_attr_mask is not used but all available information is provided + */ +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct siw_qp *qp; + struct siw_device *sdev; + + if (base_qp && qp_attr && qp_init_attr) { + qp = to_siw_qp(base_qp); + sdev = to_siw_dev(base_qp->device); + } else { + return -EINVAL; + } + qp_attr->cap.max_inline_data = SIW_MAX_INLINE; + qp_attr->cap.max_send_wr = qp->attrs.sq_size; + qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; + qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; + qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + qp_attr->max_rd_atomic = qp->attrs.irq_size; + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; + + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + qp_init_attr->qp_type = base_qp->qp_type; + qp_init_attr->send_cq = base_qp->send_cq; + qp_init_attr->recv_cq = base_qp->recv_cq; + qp_init_attr->srq = base_qp->srq; + + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct siw_qp_attrs new_attrs; + enum siw_qp_attr_mask siw_attr_mask = 0; + struct siw_qp *qp = to_siw_qp(base_qp); + int rv = 0; + + if (!attr_mask) + return 0; + + memset(&new_attrs, 0, sizeof(new_attrs)); + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; + + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + new_attrs.flags |= SIW_RDMA_READ_ENABLED; + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) + new_attrs.flags |= SIW_RDMA_BIND_ENABLED; + } + if (attr_mask & IB_QP_STATE) { + siw_dbg_qp(qp, "desired IB QP state: %s\n", + ib_qp_state_to_string[attr->qp_state]); + + new_attrs.state = 
ib_qp_state_to_siw_qp_state[attr->qp_state]; + + if (new_attrs.state > SIW_QP_STATE_RTS) + qp->tx_ctx.tx_suspend = 1; + + siw_attr_mask |= SIW_QP_ATTR_STATE; + } + if (!siw_attr_mask) + goto out; + + down_write(&qp->state_lock); + + rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); + + up_write(&qp->state_lock); +out: + return rv; +} + +int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp); + struct siw_ucontext *uctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + struct siw_qp_attrs qp_attrs; + + siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep); + + /* + * Mark QP as in process of destruction to prevent from + * any async callbacks to RDMA core + */ + qp->attrs.flags |= SIW_QP_IN_DESTROY; + qp->rx_stream.rx_suspend = 1; + + if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); + if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); + + down_write(&qp->state_lock); + + qp_attrs.state = SIW_QP_STATE_ERROR; + siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); + + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + up_write(&qp->state_lock); + + kfree(qp->tx_ctx.mpa_crc_hd); + kfree(qp->rx_stream.mpa_crc_hd); + + qp->scq = qp->rcq = NULL; + + siw_qp_put(qp); + kfree(siw_base_qp); + + return 0; +} + +/* + * siw_copy_inline_sgl() + * + * Prepare sgl of inlined data for sending. For userland callers + * function checks if given buffer addresses and len's are within + * process context bounds. + * Data from all provided sge's are copied together into the wqe, + * referenced by a single sge. 
+ */ +static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, + struct siw_sqe *sqe) +{ + struct ib_sge *core_sge = core_wr->sg_list; + void *kbuf = &sqe->sge[1]; + int num_sge = core_wr->num_sge, bytes = 0; + + sqe->sge[0].laddr = (u64)kbuf; + sqe->sge[0].lkey = 0; + + while (num_sge--) { + if (!core_sge->length) { + core_sge++; + continue; + } + bytes += core_sge->length; + if (bytes > SIW_MAX_INLINE) { + bytes = -EINVAL; + break; + } + memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, + core_sge->length); + + kbuf += core_sge->length; + core_sge++; + } + sqe->sge[0].length = bytes > 0 ? bytes : 0; + sqe->num_sge = bytes > 0 ? 1 : 0; + + return bytes; +} + +/* + * siw_post_send() + * + * Post a list of S-WR's to a SQ. + * + * @base_qp: Base QP contained in siw QP + * @wr: Null terminated list of user WR's + * @bad_wr: Points to failing WR in case of synchronous failure. + */ +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + struct siw_wqe *wqe = tx_wqe(qp); + + unsigned long flags; + int rv = 0; + + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. 
+ */ + if (!down_read_trylock(&qp->state_lock)) { + *bad_wr = wr; + siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); + return -ENOTCONN; + } + if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { + up_read(&qp->state_lock); + *bad_wr = wr; + siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); + return -ENOTCONN; + } + if (wr && !qp->kernel_verbs) { + siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + spin_lock_irqsave(&qp->sq_lock, flags); + + while (wr) { + u32 idx = qp->sq_put % qp->attrs.sq_size; + struct siw_sqe *sqe = &qp->sendq[idx]; + + if (sqe->flags) { + siw_dbg_qp(qp, "sq full\n"); + rv = -ENOMEM; + break; + } + if (wr->num_sge > qp->attrs.sq_max_sges) { + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); + rv = -EINVAL; + break; + } + sqe->id = wr->wr_id; + + if ((wr->send_flags & IB_SEND_SIGNALED) || + (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) + sqe->flags |= SIW_WQE_SIGNALLED; + + if (wr->send_flags & IB_SEND_FENCE) + sqe->flags |= SIW_WQE_READ_FENCE; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + sqe->flags |= SIW_WQE_SOLICITED; + + if (!(wr->send_flags & IB_SEND_INLINE)) { + siw_copy_sgl(wr->sg_list, sqe->sge, + wr->num_sge); + sqe->num_sge = wr->num_sge; + } else { + rv = siw_copy_inline_sgl(wr, sqe); + if (rv <= 0) { + rv = -EINVAL; + break; + } + sqe->flags |= SIW_WQE_INLINE; + sqe->num_sge = 1; + } + if (wr->opcode == IB_WR_SEND) + sqe->opcode = SIW_OP_SEND; + else { + sqe->opcode = SIW_OP_SEND_REMOTE_INV; + sqe->rkey = wr->ex.invalidate_rkey; + } + break; + + case IB_WR_RDMA_READ_WITH_INV: + case IB_WR_RDMA_READ: + /* + * iWarp restricts RREAD sink to SGL containing + * 1 SGE only. we could relax to SGL with multiple + * elements referring the SAME ltag or even sending + * a private per-rreq tag referring to a checked + * local sgl with MULTIPLE ltag's. 
+ */ + if (unlikely(wr->num_sge != 1)) { + rv = -EINVAL; + break; + } + siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); + /* + * NOTE: zero length RREAD is allowed! + */ + sqe->raddr = rdma_wr(wr)->remote_addr; + sqe->rkey = rdma_wr(wr)->rkey; + sqe->num_sge = 1; + + if (wr->opcode == IB_WR_RDMA_READ) + sqe->opcode = SIW_OP_READ; + else + sqe->opcode = SIW_OP_READ_LOCAL_INV; + break; + + case IB_WR_RDMA_WRITE: + if (!(wr->send_flags & IB_SEND_INLINE)) { + siw_copy_sgl(wr->sg_list, &sqe->sge[0], + wr->num_sge); + sqe->num_sge = wr->num_sge; + } else { + rv = siw_copy_inline_sgl(wr, sqe); + if (unlikely(rv < 0)) { + rv = -EINVAL; + break; + } + sqe->flags |= SIW_WQE_INLINE; + sqe->num_sge = 1; + } + sqe->raddr = rdma_wr(wr)->remote_addr; + sqe->rkey = rdma_wr(wr)->rkey; + sqe->opcode = SIW_OP_WRITE; + break; + + case IB_WR_REG_MR: + sqe->base_mr = (uint64_t)reg_wr(wr)->mr; + sqe->rkey = reg_wr(wr)->key; + sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; + sqe->opcode = SIW_OP_REG_MR; + break; + + case IB_WR_LOCAL_INV: + sqe->rkey = wr->ex.invalidate_rkey; + sqe->opcode = SIW_OP_INVAL_STAG; + break; + + default: + siw_dbg_qp(qp, "ib wr type %d unsupported\n", + wr->opcode); + rv = -EINVAL; + break; + } + siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n", + sqe->opcode, sqe->flags, (void *)sqe->id); + + if (unlikely(rv < 0)) + break; + + /* make SQE only valid after completely written */ + smp_wmb(); + sqe->flags |= SIW_WQE_VALID; + + qp->sq_put++; + wr = wr->next; + } + + /* + * Send directly if SQ processing is not in progress. + * Eventual immediate errors (rv < 0) do not affect the involved + * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ + * processing, if new work is already pending. But rv must be passed + * to caller. 
+ */ + if (wqe->wr_status != SIW_WR_IDLE) { + spin_unlock_irqrestore(&qp->sq_lock, flags); + goto skip_direct_sending; + } + rv = siw_activate_tx(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (rv <= 0) + goto skip_direct_sending; + + if (qp->kernel_verbs) { + rv = siw_sq_start(qp); + } else { + qp->tx_ctx.in_syscall = 1; + + if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) + siw_qp_cm_drop(qp, 0); + + qp->tx_ctx.in_syscall = 0; + } +skip_direct_sending: + + up_read(&qp->state_lock); + + if (rv >= 0) + return 0; + /* + * Immediate error + */ + siw_dbg_qp(qp, "error %d\n", rv); + + *bad_wr = wr; + return rv; +} + +/* + * siw_post_receive() + * + * Post a list of R-WR's to a RQ. + * + * @base_qp: Base QP contained in siw QP + * @wr: Null terminated list of user WR's + * @bad_wr: Points to failing WR in case of synchronous failure. + */ +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + unsigned long flags; + int rv = 0; + + if (qp->srq) { + *bad_wr = wr; + return -EOPNOTSUPP; /* what else from errno.h? */ + } + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. + */ + if (!down_read_trylock(&qp->state_lock)) { + *bad_wr = wr; + return -ENOTCONN; + } + if (!qp->kernel_verbs) { + siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + if (qp->attrs.state > SIW_QP_STATE_RTS) { + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + /* + * Serialize potentially multiple producers. + * Not needed for single threaded consumer side. 
+ */ + spin_lock_irqsave(&qp->rq_lock, flags); + + while (wr) { + u32 idx = qp->rq_put % qp->attrs.rq_size; + struct siw_rqe *rqe = &qp->recvq[idx]; + + if (rqe->flags) { + siw_dbg_qp(qp, "RQ full\n"); + rv = -ENOMEM; + break; + } + if (wr->num_sge > qp->attrs.rq_max_sges) { + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); + rv = -EINVAL; + break; + } + rqe->id = wr->wr_id; + rqe->num_sge = wr->num_sge; + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); + + /* make sure RQE is completely written before valid */ + smp_wmb(); + + rqe->flags = SIW_WQE_VALID; + + qp->rq_put++; + wr = wr->next; + } + spin_unlock_irqrestore(&qp->rq_lock, flags); + + up_read(&qp->state_lock); + + if (rv < 0) { + siw_dbg_qp(qp, "error %d\n", rv); + *bad_wr = wr; + } + return rv > 0 ? 0 : rv; +} + +void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + struct siw_device *sdev = to_siw_dev(base_cq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + siw_dbg_cq(cq, "free CQ resources\n"); + + siw_cq_flush(cq); + + if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); + + atomic_dec(&sdev->num_cq); + + vfree(cq->queue); +} + +/* + * siw_create_cq() + * + * Populate CQ of requested size + * + * @base_cq: CQ as allocated by RDMA midlayer + * @attr: Initial CQ attributes + * @udata: relates to user context + */ + +int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_cq->device); + struct siw_cq *cq = to_siw_cq(base_cq); + int rv, size = attr->cqe; + + if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { + siw_dbg(base_cq->device, "too many CQ's\n"); + rv = -ENOMEM; + goto err_out; + } + if (size < 1 || size > sdev->attrs.max_cqe) { + siw_dbg(base_cq->device, "CQ size error: %d\n", size); + rv = -EINVAL; + goto err_out; + } + size = 
roundup_pow_of_two(size); + cq->base_cq.cqe = size; + cq->num_cqe = size; + cq->xa_cq_index = SIW_INVAL_UOBJ_KEY; + + if (!udata) { + cq->kernel_verbs = 1; + cq->queue = vzalloc(size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + } else { + cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + } + if (cq->queue == NULL) { + rv = -ENOMEM; + goto err_out; + } + get_random_bytes(&cq->id, 4); + siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id); + + spin_lock_init(&cq->lock); + + cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; + + if (udata) { + struct siw_uresp_create_cq uresp = {}; + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + cq->xa_cq_index = + siw_create_uobj(ctx, cq->queue, + size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out; + } + uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT; + uresp.cq_id = cq->id; + uresp.num_cqe = size; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + return 0; + +err_out: + siw_dbg(base_cq->device, "CQ creation failed: %d", rv); + + if (cq && cq->queue) { + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); + vfree(cq->queue); + } + atomic_dec(&sdev->num_cq); + + return rv; +} + +/* + * siw_poll_cq() + * + * Reap CQ entries if available and copy work completion status into + * array of WC's provided by caller. Returns number of reaped CQE's. + * + * @base_cq: Base CQ contained in siw CQ. + * @num_cqe: Maximum number of CQE's to reap. + * @wc: Array of work completions to be filled by siw. 
+ */ +int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + int i; + + for (i = 0; i < num_cqe; i++) { + if (!siw_reap_cqe(cq, wc)) + break; + wc++; + } + return i; +} + +/* + * siw_req_notify_cq() + * + * Request notification for new CQE's added to that CQ. + * Defined flags: + * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification + * event if a WQE with notification flag set enters the CQ + * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification + * event if a WQE enters the CQ. + * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the + * number of not reaped CQE's regardless of its notification + * type and current or new CQ notification settings. + * + * @base_cq: Base CQ contained in siw CQ. + * @flags: Requested notification flags. + */ +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + + siw_dbg_cq(cq, "flags: 0x%02x\n", flags); + + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + /* CQ event for next solicited completion */ + smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); + else + /* CQ event for any signalled completion */ + smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); + + if (flags & IB_CQ_REPORT_MISSED_EVENTS) + return cq->cq_put - cq->cq_get; + + return 0; +} + +/* + * siw_dereg_mr() + * + * Release Memory Region. + * + * @base_mr: Base MR contained in siw MR. + * @udata: points to user context, unused. + */ +int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) +{ + struct siw_mr *mr = to_siw_mr(base_mr); + struct siw_device *sdev = to_siw_dev(base_mr->device); + + siw_dbg_mem(mr->mem, "deregister MR\n"); + + atomic_dec(&sdev->num_mr); + + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + + return 0; +} + +/* + * siw_reg_user_mr() + * + * Register Memory Region. 
+ * + * @pd: Protection Domain + * @start: starting address of MR (virtual address) + * @len: len of MR + * @rnic_va: not used by siw + * @rights: MR access rights + * @udata: user buffer to communicate STag and Key. + */ +struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, + u64 rnic_va, int rights, struct ib_udata *udata) +{ + struct siw_mr *mr = NULL; + struct siw_umem *umem = NULL; + struct siw_ureq_reg_mr ureq; + struct siw_device *sdev = to_siw_dev(pd->device); + + unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); + int rv; + + siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n", + (unsigned long long)start, (unsigned long long)rnic_va, + (unsigned long long)len); + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + if (!len) { + rv = -EINVAL; + goto err_out; + } + if (mem_limit != RLIM_INFINITY) { + unsigned long num_pages = + (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT; + mem_limit >>= PAGE_SHIFT; + + if (num_pages > mem_limit - current->mm->locked_vm) { + siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n", + num_pages, mem_limit, + current->mm->locked_vm); + rv = -ENOMEM; + goto err_out; + } + } + umem = siw_umem_get(start, len, ib_access_writable(rights)); + if (IS_ERR(umem)) { + rv = PTR_ERR(umem); + siw_dbg_pd(pd, "getting user memory failed: %d\n", rv); + umem = NULL; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, umem, start, len, rights); + if (rv) + goto err_out; + + if (udata) { + struct siw_uresp_reg_mr uresp = {}; + struct siw_mem *mem = mr->mem; + + if (udata->inlen < sizeof(ureq)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); + if (rv) + goto err_out; + + mr->base_mr.lkey |= ureq.stag_key; + mr->base_mr.rkey |= ureq.stag_key; + mem->stag |= ureq.stag_key; + uresp.stag = mem->stag; + + if 
(udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + mr->mem->stag_valid = 1; + + return &mr->base_mr; + +err_out: + atomic_dec(&sdev->num_mr); + if (mr) { + if (mr->mem) + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + } else { + if (umem) + siw_umem_release(umem, false); + } + return ERR_PTR(rv); +} + +struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_sge, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mr *mr = NULL; + struct siw_pbl *pbl = NULL; + int rv; + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + if (mr_type != IB_MR_TYPE_MEM_REG) { + siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type); + rv = -EOPNOTSUPP; + goto err_out; + } + if (max_sge > SIW_MAX_SGE_PBL) { + siw_dbg_pd(pd, "too many sge's: %d\n", max_sge); + rv = -ENOMEM; + goto err_out; + } + pbl = siw_pbl_alloc(max_sge); + if (IS_ERR(pbl)) { + rv = PTR_ERR(pbl); + siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv); + pbl = NULL; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0); + if (rv) + goto err_out; + + mr->mem->is_pbl = 1; + + siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); + + return &mr->base_mr; + +err_out: + atomic_dec(&sdev->num_mr); + + if (!mr) { + kfree(pbl); + } else { + if (mr->mem) + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + } + siw_dbg_pd(pd, "failed: %d\n", rv); + + return ERR_PTR(rv); +} + +/* Just used to count number of pages being mapped */ +static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) +{ + return 0; +} + +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, + unsigned int *sg_off) +{ + struct scatterlist *slp; + struct siw_mr *mr = 
to_siw_mr(base_mr); + struct siw_mem *mem = mr->mem; + struct siw_pbl *pbl = mem->pbl; + struct siw_pble *pble; + u64 pbl_size; + int i, rv; + + if (!pbl) { + siw_dbg_mem(mem, "no PBL allocated\n"); + return -EINVAL; + } + pble = pbl->pbe; + + if (pbl->max_buf < num_sle) { + siw_dbg_mem(mem, "too many SGE's: %d > %d\n", + mem->pbl->max_buf, num_sle); + return -ENOMEM; + } + for_each_sg(sl, slp, num_sle, i) { + if (sg_dma_len(slp) == 0) { + siw_dbg_mem(mem, "empty SGE\n"); + return -EINVAL; + } + if (i == 0) { + pble->addr = sg_dma_address(slp); + pble->size = sg_dma_len(slp); + pble->pbl_off = 0; + pbl_size = pble->size; + pbl->num_buf = 1; + } else { + /* Merge PBL entries if adjacent */ + if (pble->addr + pble->size == sg_dma_address(slp)) { + pble->size += sg_dma_len(slp); + } else { + pble++; + pbl->num_buf++; + pble->addr = sg_dma_address(slp); + pble->size = sg_dma_len(slp); + pble->pbl_off = pbl_size; + } + pbl_size += sg_dma_len(slp); + } + siw_dbg_mem(mem, + "sge[%d], size %llu, addr 0x%016llx, total %llu\n", + i, pble->size, pble->addr, pbl_size); + } + rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); + if (rv > 0) { + mem->len = base_mr->length; + mem->va = base_mr->iova; + siw_dbg_mem(mem, + "%llu bytes, start 0x%016llx, %u SLE to %u entries\n", + mem->len, mem->va, num_sle, pbl->num_buf); + } + return rv; +} + +/* + * siw_get_dma_mr() + * + * Create a (empty) DMA memory region, where no umem is attached. 
+ */ +struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mr *mr = NULL; + int rv; + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights); + if (rv) + goto err_out; + + mr->mem->stag_valid = 1; + + siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); + + return &mr->base_mr; + +err_out: + if (rv) + kfree(mr); + + atomic_dec(&sdev->num_mr); + + return ERR_PTR(rv); +} + +/* + * siw_create_srq() + * + * Create Shared Receive Queue of attributes @init_attrs + * within protection domain given by @pd. + * + * @base_srq: Base SRQ contained in siw SRQ. + * @init_attrs: SRQ init attributes. + * @udata: points to user context + */ +int siw_create_srq(struct ib_srq *base_srq, + struct ib_srq_init_attr *init_attrs, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + struct ib_srq_attr *attrs = &init_attrs->attr; + struct siw_device *sdev = to_siw_dev(base_srq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + int rv; + + if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { + siw_dbg_pd(base_srq->pd, "too many SRQ's\n"); + rv = -ENOMEM; + goto err_out; + } + if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || + attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { + rv = -EINVAL; + goto err_out; + } + srq->max_sge = attrs->max_sge; + srq->num_rqe = roundup_pow_of_two(attrs->max_wr); + srq->xa_srq_index = SIW_INVAL_UOBJ_KEY; + srq->limit = attrs->srq_limit; + if (srq->limit) + srq->armed = 1; + + srq->kernel_verbs = !udata; + + if (udata) + srq->recvq = + vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe)); + else + srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe)); 
+ + if (srq->recvq == NULL) { + rv = -ENOMEM; + goto err_out; + } + if (udata) { + struct siw_uresp_create_srq uresp = {}; + + srq->xa_srq_index = siw_create_uobj( + ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe)); + + if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out; + } + uresp.srq_key = srq->xa_srq_index; + uresp.num_rqe = srq->num_rqe; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + spin_lock_init(&srq->lock); + + siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq); + + return 0; + +err_out: + if (srq->recvq) { + if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); + vfree(srq->recvq); + } + atomic_dec(&sdev->num_srq); + + return rv; +} + +/* + * siw_modify_srq() + * + * Modify SRQ. The caller may resize SRQ and/or set/reset notification + * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. + * + * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE + * parameter. siw_modify_srq() does not check the attrs->max_sge param. + */ +int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&srq->lock, flags); + + if (attr_mask & IB_SRQ_MAX_WR) { + /* resize request not yet supported */ + rv = -EOPNOTSUPP; + goto out; + } + if (attr_mask & IB_SRQ_LIMIT) { + if (attrs->srq_limit) { + if (unlikely(attrs->srq_limit > srq->num_rqe)) { + rv = -EINVAL; + goto out; + } + srq->armed = 1; + } else { + srq->armed = 0; + } + srq->limit = attrs->srq_limit; + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return rv; +} + +/* + * siw_query_srq() + * + * Query SRQ attributes. 
+ */ +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + + spin_lock_irqsave(&srq->lock, flags); + + attrs->max_wr = srq->num_rqe; + attrs->max_sge = srq->max_sge; + attrs->srq_limit = srq->limit; + + spin_unlock_irqrestore(&srq->lock, flags); + + return 0; +} + +/* + * siw_destroy_srq() + * + * Destroy SRQ. + * It is assumed that the SRQ is not referenced by any + * QP anymore - the code trusts the RDMA core environment to keep track + * of QP references. + */ +void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + struct siw_device *sdev = to_siw_dev(base_srq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); + + vfree(srq->recvq); + atomic_dec(&sdev->num_srq); +} + +/* + * siw_post_srq_recv() + * + * Post a list of receive queue elements to SRQ. + * NOTE: The function does not check or lock a certain SRQ state + * during the post operation. The code simply trusts the + * RDMA core environment. + * + * @base_srq: Base SRQ contained in siw SRQ + * @wr: List of R-WR's + * @bad_wr: Updated to failing WR if posting fails. + */ +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + int rv = 0; + + if (unlikely(!srq->kernel_verbs)) { + siw_dbg_pd(base_srq->pd, + "[SRQ 0x%p]: no kernel post_recv for mapped srq\n", + srq); + rv = -EINVAL; + goto out; + } + /* + * Serialize potentially multiple producers. + * Also needed to serialize potentially multiple + * consumers. 
+ */ + spin_lock_irqsave(&srq->lock, flags); + + while (wr) { + u32 idx = srq->rq_put % srq->num_rqe; + struct siw_rqe *rqe = &srq->recvq[idx]; + + if (rqe->flags) { + siw_dbg_pd(base_srq->pd, "SRQ full\n"); + rv = -ENOMEM; + break; + } + if (unlikely(wr->num_sge > srq->max_sge)) { + siw_dbg_pd(base_srq->pd, + "[SRQ 0x%p]: too many sge's: %d\n", srq, + wr->num_sge); + rv = -EINVAL; + break; + } + rqe->id = wr->wr_id; + rqe->num_sge = wr->num_sge; + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); + + /* Make sure S-RQE is completely written before valid */ + smp_wmb(); + + rqe->flags = SIW_WQE_VALID; + + srq->rq_put++; + wr = wr->next; + } + spin_unlock_irqrestore(&srq->lock, flags); +out: + if (unlikely(rv < 0)) { + siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv); + *bad_wr = wr; + } + return rv; +} + +void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_qp *base_qp = qp->ib_qp; + + /* + * Do not report asynchronous errors on QP which gets + * destroyed via verbs interface (siw_destroy_qp()) + */ + if (qp->attrs.flags & SIW_QP_IN_DESTROY) + return; + + event.event = etype; + event.device = base_qp->device; + event.element.qp = base_qp; + + if (base_qp->event_handler) { + siw_dbg_qp(qp, "reporting event %d\n", etype); + base_qp->event_handler(&event, base_qp->qp_context); + } +} + +void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_cq *base_cq = &cq->base_cq; + + event.event = etype; + event.device = base_cq->device; + event.element.cq = base_cq; + + if (base_cq->event_handler) { + siw_dbg_cq(cq, "reporting CQ event %d\n", etype); + base_cq->event_handler(&event, base_cq->cq_context); + } +} + +void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_srq *base_srq = &srq->base_srq; + + event.event = etype; + event.device = base_srq->device; + event.element.srq = base_srq; + + if (base_srq->event_handler) 
{ + siw_dbg_pd(srq->base_srq.pd, + "reporting SRQ event %d\n", etype); + base_srq->event_handler(&event, base_srq->srq_context); + } +} + +void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype) +{ + struct ib_event event; + + event.event = etype; + event.device = &sdev->base_dev; + event.element.port_num = port; + + siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype); + + ib_dispatch_event(&event); +} diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h new file mode 100644 index 000000000000..1910869281cb --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_verbs.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_VERBS_H +#define _SIW_VERBS_H + +#include + +#include +#include +#include + +#include "siw.h" +#include "siw_cm.h" + +/* + * siw_copy_sgl() + * + * Copy SGL from RDMA core representation to local + * representation. 
+ */ +static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge, + int num_sge) +{ + while (num_sge--) { + siw_sge->laddr = sge->addr; + siw_sge->length = sge->length; + siw_sge->lkey = sge->lkey; + + siw_sge++; + sge++; + } +} + +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata); +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx); +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr); +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, + struct ib_port_immutable *port_immutable); +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, + struct ib_udata *udata); +int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr); +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey); +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, + union ib_gid *gid); +int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata); +void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata); +struct ib_qp *siw_create_qp(struct ib_pd *base_pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata); +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata); +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata); +int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc); +int siw_req_notify_cq(struct 
ib_cq *base_cq, enum ib_cq_notify_flags flags); +struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len, + u64 rnic_va, int rights, struct ib_udata *udata); +struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type, + u32 max_sge, struct ib_udata *udata); +struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights); +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, + unsigned int *sg_off); +int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata); +int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr, + struct ib_udata *udata); +int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, struct ib_udata *udata); +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr); +void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata); +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); +void siw_qp_event(struct siw_qp *qp, enum ib_event_type type); +void siw_cq_event(struct siw_cq *cq, enum ib_event_type type); +void siw_srq_event(struct siw_srq *srq, enum ib_event_type type); +void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type); + +#endif diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 26213f49f5c8..64c14cb0022f 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -103,6 +103,7 @@ enum rdma_driver_id { RDMA_DRIVER_HFI1, RDMA_DRIVER_QIB, RDMA_DRIVER_EFA, + RDMA_DRIVER_SIW, }; #endif diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h new file mode 100644 index 000000000000..3dd8071ace7b --- /dev/null +++ b/include/uapi/rdma/siw-abi.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ 
+/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_USER_H +#define _SIW_USER_H + +#include + +#define SIW_NODE_DESC_COMMON "Software iWARP stack" +#define SIW_ABI_VERSION 1 +#define SIW_MAX_SGE 6 +#define SIW_UOBJ_MAX_KEY 0x08FFFF +#define SIW_INVAL_UOBJ_KEY (SIW_UOBJ_MAX_KEY + 1) + +struct siw_uresp_create_cq { + __u32 cq_id; + __u32 num_cqe; + __aligned_u64 cq_key; +}; + +struct siw_uresp_create_qp { + __u32 qp_id; + __u32 num_sqe; + __u32 num_rqe; + __u32 pad; + __aligned_u64 sq_key; + __aligned_u64 rq_key; +}; + +struct siw_ureq_reg_mr { + __u8 stag_key; + __u8 reserved[3]; + __u32 pad; +}; + +struct siw_uresp_reg_mr { + __u32 stag; + __u32 pad; +}; + +struct siw_uresp_create_srq { + __u32 num_rqe; + __u32 pad; + __aligned_u64 srq_key; +}; + +struct siw_uresp_alloc_ctx { + __u32 dev_id; + __u32 pad; +}; + +enum siw_opcode { + SIW_OP_WRITE, + SIW_OP_READ, + SIW_OP_READ_LOCAL_INV, + SIW_OP_SEND, + SIW_OP_SEND_WITH_IMM, + SIW_OP_SEND_REMOTE_INV, + + /* Unsupported */ + SIW_OP_FETCH_AND_ADD, + SIW_OP_COMP_AND_SWAP, + + SIW_OP_RECEIVE, + /* provider internal SQE */ + SIW_OP_READ_RESPONSE, + /* + * below opcodes valid for + * in-kernel clients only + */ + SIW_OP_INVAL_STAG, + SIW_OP_REG_MR, + SIW_NUM_OPCODES +}; + +/* Keep it same as ibv_sge to allow for memcpy */ +struct siw_sge { + __aligned_u64 laddr; + __u32 length; + __u32 lkey; +}; + +/* + * Inline data are kept within the work request itself occupying + * the space of sge[1] .. sge[n]. Therefore, inline data cannot be + * supported if SIW_MAX_SGE is below 2 elements. 
+ */ +#define SIW_MAX_INLINE (sizeof(struct siw_sge) * (SIW_MAX_SGE - 1)) + +#if SIW_MAX_SGE < 2 +#error "SIW_MAX_SGE must be at least 2" +#endif + +enum siw_wqe_flags { + SIW_WQE_VALID = 1, + SIW_WQE_INLINE = (1 << 1), + SIW_WQE_SIGNALLED = (1 << 2), + SIW_WQE_SOLICITED = (1 << 3), + SIW_WQE_READ_FENCE = (1 << 4), + SIW_WQE_REM_INVAL = (1 << 5), + SIW_WQE_COMPLETED = (1 << 6) +}; + +/* Send Queue Element */ +struct siw_sqe { + __aligned_u64 id; + __u16 flags; + __u8 num_sge; + /* Contains enum siw_opcode values */ + __u8 opcode; + __u32 rkey; + union { + __aligned_u64 raddr; + __aligned_u64 base_mr; + }; + union { + struct siw_sge sge[SIW_MAX_SGE]; + __aligned_u64 access; + }; +}; + +/* Receive Queue Element */ +struct siw_rqe { + __aligned_u64 id; + __u16 flags; + __u8 num_sge; + /* + * only used by kernel driver, + * ignored if set by user + */ + __u8 opcode; + __u32 unused; + struct siw_sge sge[SIW_MAX_SGE]; +}; + +enum siw_notify_flags { + SIW_NOTIFY_NOT = (0), + SIW_NOTIFY_SOLICITED = (1 << 0), + SIW_NOTIFY_NEXT_COMPLETION = (1 << 1), + SIW_NOTIFY_MISSED_EVENTS = (1 << 2), + SIW_NOTIFY_ALL = SIW_NOTIFY_SOLICITED | SIW_NOTIFY_NEXT_COMPLETION | + SIW_NOTIFY_MISSED_EVENTS +}; + +enum siw_wc_status { + SIW_WC_SUCCESS, + SIW_WC_LOC_LEN_ERR, + SIW_WC_LOC_PROT_ERR, + SIW_WC_LOC_QP_OP_ERR, + SIW_WC_WR_FLUSH_ERR, + SIW_WC_BAD_RESP_ERR, + SIW_WC_LOC_ACCESS_ERR, + SIW_WC_REM_ACCESS_ERR, + SIW_WC_REM_INV_REQ_ERR, + SIW_WC_GENERAL_ERR, + SIW_NUM_WC_STATUS +}; + +struct siw_cqe { + __aligned_u64 id; + __u8 flags; + __u8 opcode; + __u16 status; + __u32 bytes; + union { + __aligned_u64 imm_data; + __u32 inval_stag; + }; + /* QP number or QP pointer */ + union { + struct ib_qp *base_qp; + __aligned_u64 qp_id; + }; +}; + +/* + * Shared structure between user and kernel + * to control CQ arming. 
+ */ +struct siw_cq_ctrl { + __aligned_u64 notify; +}; +#endif -- cgit v1.2.3 From 2afc5e1b9c340ff20848c8dd8fb60342617bce52 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 30 Jun 2019 19:23:29 +0300 Subject: IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD Introduce MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD and its initial implementation. This object is from type class FD and will be used to read DEVX async events. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 95 +++++++++++++++++++++++++++++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 10 ++++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 4 ++ 3 files changed, 109 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 931f587dfb8f..ed01523f0f02 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -33,6 +33,17 @@ struct devx_async_data { struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; }; +struct devx_async_event_file { + struct ib_uobject uobj; + /* Head of events that are subscribed to this FD */ + struct list_head subscribed_events_list; + spinlock_t lock; + wait_queue_head_t poll_wait; + struct list_head event_list; + struct mlx5_ib_dev *dev; + u8 omit_data:1; +}; + #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; @@ -1365,6 +1376,37 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( return 0; } +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE); + struct devx_async_event_file *ev_file; + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = 
to_mdev(c->ibucontext.device); + u32 flags; + int err; + + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, + MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA); + + if (err) + return err; + + ev_file = container_of(uobj, struct devx_async_event_file, + uobj); + spin_lock_init(&ev_file->lock); + INIT_LIST_HEAD(&ev_file->event_list); + init_waitqueue_head(&ev_file->poll_wait); + if (flags & MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA) + ev_file->omit_data = 1; + INIT_LIST_HEAD(&ev_file->subscribed_events_list); + ev_file->dev = dev; + return 0; +} + static void devx_query_callback(int status, struct mlx5_async_work *context) { struct devx_async_data *async_data = @@ -1719,6 +1761,32 @@ static const struct file_operations devx_async_cmd_event_fops = { .llseek = no_llseek, }; +static ssize_t devx_async_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + return -EINVAL; +} + +static __poll_t devx_async_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + return 0; +} + +static int devx_async_event_close(struct inode *inode, struct file *filp) +{ + uverbs_close_fd(filp); + return 0; +} + +static const struct file_operations devx_async_event_fops = { + .owner = THIS_MODULE, + .read = devx_async_event_read, + .poll = devx_async_event_poll, + .release = devx_async_event_close, + .llseek = no_llseek, +}; + static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, enum rdma_remove_reason why) { @@ -1738,6 +1806,12 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, return 0; }; +static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +}; + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_DEVX_UMEM_REG, UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, @@ -1903,6 +1977,24 @@ DECLARE_UVERBS_NAMED_OBJECT( O_RDONLY), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)); +DECLARE_UVERBS_NAMED_METHOD( + 
MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, + enum mlx5_ib_uapi_devx_create_event_channel_flags, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file), + devx_hot_unplug_async_event_file, + &devx_async_event_fops, "[devx_async_event]", + O_RDONLY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)); + static bool devx_is_supported(struct ib_device *device) { struct mlx5_ib_dev *dev = to_mdev(device); @@ -1923,5 +2015,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), {}, }; diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index d404c951954c..6ad8f4f11ddd 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -127,16 +127,26 @@ enum mlx5_ib_devx_async_cmd_fd_alloc_attrs { MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), }; +enum mlx5_ib_devx_async_event_fd_alloc_attrs { + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, +}; + enum mlx5_ib_devx_async_cmd_fd_methods { MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), }; +enum mlx5_ib_devx_async_event_fd_methods { + MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), +}; + enum mlx5_ib_objects { MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_OBJECT_DEVX_OBJ, MLX5_IB_OBJECT_DEVX_UMEM, MLX5_IB_OBJECT_FLOW_MATCHER, 
MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, }; enum mlx5_ib_flow_matcher_create_attrs { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index a8f34c237458..b44691315d39 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -63,5 +63,9 @@ enum mlx5_ib_uapi_dm_type { MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM, }; +enum mlx5_ib_uapi_devx_create_event_channel_flags { + MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA = 1 << 0, +}; + #endif -- cgit v1.2.3 From 7597385371425febdaa8c6a1da3625d4ffff16f5 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 30 Jun 2019 19:23:31 +0300 Subject: IB/mlx5: Enable subscription for device events over DEVX Enable subscription for device events over DEVX. Each subscription is added to the two-level xarray data structure according to its event number and the DEVX object information, in case it was given, along with the given target fd. Those events will be reported over the given fd once they occur. Downstream patches will manage the dispatching to any subscription.
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 560 ++++++++++++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 + 2 files changed, 562 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index a9affc905bfa..9c21cafc44a6 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -14,6 +14,7 @@ #include #include #include "mlx5_ib.h" +#include #define UVERBS_MODULE_NAME mlx5_ib #include @@ -33,6 +34,40 @@ struct devx_async_data { struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; }; +/* first level XA value data structure */ +struct devx_event { + struct xarray object_ids; /* second XA level, Key = object id */ + struct list_head unaffiliated_list; +}; + +/* second level XA value data structure */ +struct devx_obj_event { + struct rcu_head rcu; + struct list_head obj_sub_list; +}; + +struct devx_event_subscription { + struct list_head file_list; /* headed in ev_file-> + * subscribed_events_list + */ + struct list_head xa_list; /* headed in devx_event->unaffiliated_list or + * devx_obj_event->obj_sub_list + */ + struct list_head obj_list; /* headed in devx_object */ + struct list_head event_list; /* headed in ev_file->event_list or in + * temp list via subscription + */ + + u8 is_cleaned:1; + u32 xa_key_level1; + u32 xa_key_level2; + struct rcu_head rcu; + u64 cookie; + struct devx_async_event_file *ev_file; + struct file *filp; /* Upon hot unplug we need a direct access to */ + struct eventfd_ctx *eventfd; +}; + struct devx_async_event_file { struct ib_uobject uobj; /* Head of events that are subscribed to this FD */ @@ -55,6 +90,7 @@ struct devx_obj { struct mlx5_ib_devx_mr devx_mr; struct mlx5_core_dct core_dct; }; + struct list_head event_sub; /* holds devx_event_subscription entries */ }; struct devx_umem { @@ -160,6 +196,104 
@@ bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id) return false; } +static bool is_legacy_unaffiliated_event_num(u16 event_num) +{ + switch (event_num) { + case MLX5_EVENT_TYPE_PORT_CHANGE: + return true; + default: + return false; + } +} + +static bool is_legacy_obj_event_num(u16 event_num) +{ + switch (event_num) { + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_CQ_ERROR: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_DCT_DRAINED: + case MLX5_EVENT_TYPE_COMP: + return true; + default: + return false; + } +} + +static u16 get_legacy_obj_type(u16 opcode) +{ + switch (opcode) { + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_EVENT_QUEUE_TYPE_RQ; + case MLX5_CMD_OP_CREATE_QP: + return MLX5_EVENT_QUEUE_TYPE_QP; + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_EVENT_QUEUE_TYPE_SQ; + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_EVENT_QUEUE_TYPE_DCT; + default: + return 0; + } +} + +static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num) +{ + u16 opcode; + + opcode = (obj->obj_id >> 32) & 0xffff; + + if (is_legacy_obj_event_num(event_num)) + return get_legacy_obj_type(opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + return (obj->obj_id >> 48); + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_OBJ_TYPE_RQ; + case MLX5_CMD_OP_CREATE_QP: + return MLX5_OBJ_TYPE_QP; + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_OBJ_TYPE_SQ; + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_OBJ_TYPE_DCT; + case MLX5_CMD_OP_CREATE_TIR: + return MLX5_OBJ_TYPE_TIR; + case MLX5_CMD_OP_CREATE_TIS: + return MLX5_OBJ_TYPE_TIS; + case MLX5_CMD_OP_CREATE_PSV: + return MLX5_OBJ_TYPE_PSV; + case MLX5_OBJ_TYPE_MKEY: + return MLX5_OBJ_TYPE_MKEY; + case 
MLX5_CMD_OP_CREATE_RMP: + return MLX5_OBJ_TYPE_RMP; + case MLX5_CMD_OP_CREATE_XRC_SRQ: + return MLX5_OBJ_TYPE_XRC_SRQ; + case MLX5_CMD_OP_CREATE_XRQ: + return MLX5_OBJ_TYPE_XRQ; + case MLX5_CMD_OP_CREATE_RQT: + return MLX5_OBJ_TYPE_RQT; + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + return MLX5_OBJ_TYPE_FLOW_COUNTER; + case MLX5_CMD_OP_CREATE_CQ: + return MLX5_OBJ_TYPE_CQ; + default: + return 0; + } +} + +static u32 get_dec_obj_id(u64 obj_id) +{ + return (obj_id & 0xffffffff); +} + /* * As the obj_id in the firmware is not globally unique the object type * must be considered upon checking for a valid object id. @@ -1126,14 +1260,47 @@ static void devx_cleanup_mkey(struct devx_obj *obj) mlx5_base_mkey(obj->devx_mr.mmkey.key)); } +static void devx_cleanup_subscription(struct mlx5_ib_dev *dev, + struct devx_event_subscription *sub) +{ + struct devx_event *event; + struct devx_obj_event *xa_val_level2; + + if (sub->is_cleaned) + return; + + sub->is_cleaned = 1; + list_del_rcu(&sub->xa_list); + + if (list_empty(&sub->obj_list)) + return; + + list_del_rcu(&sub->obj_list); + /* check whether key level 1 for this obj_sub_list is empty */ + event = xa_load(&dev->devx_event_table.event_xa, + sub->xa_key_level1); + WARN_ON(!event); + + xa_val_level2 = xa_load(&event->object_ids, sub->xa_key_level2); + if (list_empty(&xa_val_level2->obj_sub_list)) { + xa_erase(&event->object_ids, + sub->xa_key_level2); + kfree_rcu(xa_val_level2, rcu); + } +} + static int devx_obj_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) { u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + struct mlx5_devx_event_table *devx_event_table; struct devx_obj *obj = uobject->object; + struct devx_event_subscription *sub_entry, *tmp; + struct mlx5_ib_dev *dev; int ret; + dev = mlx5_udata_to_mdev(&attrs->driver_udata); if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) devx_cleanup_mkey(obj); @@ -1145,10 +1312,14 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, 
if (ib_is_destroy_retryable(ret, why, uobject)) return ret; - if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { - struct mlx5_ib_dev *dev = - mlx5_udata_to_mdev(&attrs->driver_udata); + devx_event_table = &dev->devx_event_table; + + mutex_lock(&devx_event_table->event_xa_lock); + list_for_each_entry_safe(sub_entry, tmp, &obj->event_sub, obj_list) + devx_cleanup_subscription(dev, sub_entry); + mutex_unlock(&devx_event_table->event_xa_lock); + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu, devx_free_indirect_mkey); return ret; @@ -1220,6 +1391,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( uobj->object = obj; obj->mdev = dev->mdev; + INIT_LIST_HEAD(&obj->event_sub); devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj_id); WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); @@ -1404,6 +1576,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)( ev_file->omit_data = 1; INIT_LIST_HEAD(&ev_file->subscribed_events_list); ev_file->dev = dev; + get_device(&dev->ib_dev.dev); return 0; } @@ -1516,6 +1689,331 @@ sub_bytes: return err; } +static void +subscribe_event_xa_dealloc(struct mlx5_devx_event_table *devx_event_table, + u32 key_level1, + bool is_level2, + u32 key_level2) +{ + struct devx_event *event; + struct devx_obj_event *xa_val_level2; + + /* Level 1 is valid for future use, no need to free */ + if (!is_level2) + return; + + event = xa_load(&devx_event_table->event_xa, key_level1); + WARN_ON(!event); + + xa_val_level2 = xa_load(&event->object_ids, + key_level2); + if (list_empty(&xa_val_level2->obj_sub_list)) { + xa_erase(&event->object_ids, + key_level2); + kfree_rcu(xa_val_level2, rcu); + } +} + +static int +subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table, + u32 key_level1, + bool is_level2, + u32 key_level2) +{ + struct devx_obj_event *obj_event; + struct devx_event *event; + int err; + + event = 
xa_load(&devx_event_table->event_xa, key_level1); + if (!event) { + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + INIT_LIST_HEAD(&event->unaffiliated_list); + xa_init(&event->object_ids); + + err = xa_insert(&devx_event_table->event_xa, + key_level1, + event, + GFP_KERNEL); + if (err) { + kfree(event); + return err; + } + } + + if (!is_level2) + return 0; + + obj_event = xa_load(&event->object_ids, key_level2); + if (!obj_event) { + obj_event = kzalloc(sizeof(*obj_event), GFP_KERNEL); + if (!obj_event) + /* Level1 is valid for future use, no need to free */ + return -ENOMEM; + + err = xa_insert(&event->object_ids, + key_level2, + obj_event, + GFP_KERNEL); + if (err) + return err; + INIT_LIST_HEAD(&obj_event->obj_sub_list); + } + + return 0; +} + +static bool is_valid_events_legacy(int num_events, u16 *event_type_num_list, + struct devx_obj *obj) +{ + int i; + + for (i = 0; i < num_events; i++) { + if (obj) { + if (!is_legacy_obj_event_num(event_type_num_list[i])) + return false; + } else if (!is_legacy_unaffiliated_event_num( + event_type_num_list[i])) { + return false; + } + } + + return true; +} + +#define MAX_SUPP_EVENT_NUM 255 +static bool is_valid_events(struct mlx5_core_dev *dev, + int num_events, u16 *event_type_num_list, + struct devx_obj *obj) +{ + __be64 *aff_events; + __be64 *unaff_events; + int mask_entry; + int mask_bit; + int i; + + if (MLX5_CAP_GEN(dev, event_cap)) { + aff_events = MLX5_CAP_DEV_EVENT(dev, + user_affiliated_events); + unaff_events = MLX5_CAP_DEV_EVENT(dev, + user_unaffiliated_events); + } else { + return is_valid_events_legacy(num_events, event_type_num_list, + obj); + } + + for (i = 0; i < num_events; i++) { + if (event_type_num_list[i] > MAX_SUPP_EVENT_NUM) + return false; + + mask_entry = event_type_num_list[i] / 64; + mask_bit = event_type_num_list[i] % 64; + + if (obj) { + /* CQ completion */ + if (event_type_num_list[i] == 0) + continue; + + if (!(be64_to_cpu(aff_events[mask_entry]) & + (1ull 
<< mask_bit))) + return false; + + continue; + } + + if (!(be64_to_cpu(unaff_events[mask_entry]) & + (1ull << mask_bit))) + return false; + } + + return true; +} + +#define MAX_NUM_EVENTS 16 +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *devx_uobj = uverbs_attr_get_uobject( + attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + struct ib_uobject *fd_uobj; + struct devx_obj *obj = NULL; + struct devx_async_event_file *ev_file; + struct mlx5_devx_event_table *devx_event_table = &dev->devx_event_table; + u16 *event_type_num_list; + struct devx_event_subscription *event_sub, *tmp_sub; + struct list_head sub_list; + int redirect_fd; + bool use_eventfd = false; + int num_events; + int num_alloc_xa_entries = 0; + u16 obj_type = 0; + u64 cookie = 0; + u32 obj_id = 0; + int err; + int i; + + if (!c->devx_uid) + return -EINVAL; + + if (!IS_ERR(devx_uobj)) { + obj = (struct devx_obj *)devx_uobj->object; + if (obj) + obj_id = get_dec_obj_id(obj->obj_id); + } + + fd_uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE); + if (IS_ERR(fd_uobj)) + return PTR_ERR(fd_uobj); + + ev_file = container_of(fd_uobj, struct devx_async_event_file, + uobj); + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM)) { + err = uverbs_copy_from(&redirect_fd, attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM); + if (err) + return err; + + use_eventfd = true; + } + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE)) { + if (use_eventfd) + return -EINVAL; + + err = uverbs_copy_from(&cookie, attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE); + if (err) + return err; + } + + num_events = uverbs_attr_ptr_get_array_size( + attrs, 
MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + sizeof(u16)); + + if (num_events < 0) + return num_events; + + if (num_events > MAX_NUM_EVENTS) + return -EINVAL; + + event_type_num_list = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST); + + if (!is_valid_events(dev->mdev, num_events, event_type_num_list, obj)) + return -EINVAL; + + INIT_LIST_HEAD(&sub_list); + + /* Protect from concurrent subscriptions to same XA entries to allow + * both to succeed + */ + mutex_lock(&devx_event_table->event_xa_lock); + for (i = 0; i < num_events; i++) { + u32 key_level1; + + if (obj) + obj_type = get_dec_obj_type(obj, + event_type_num_list[i]); + key_level1 = event_type_num_list[i] | obj_type << 16; + + err = subscribe_event_xa_alloc(devx_event_table, + key_level1, + obj, + obj_id); + if (err) + goto err; + + num_alloc_xa_entries++; + event_sub = kzalloc(sizeof(*event_sub), GFP_KERNEL); + if (!event_sub) + goto err; + + list_add_tail(&event_sub->event_list, &sub_list); + if (use_eventfd) { + event_sub->eventfd = + eventfd_ctx_fdget(redirect_fd); + + if (IS_ERR(event_sub)) { + err = PTR_ERR(event_sub->eventfd); + event_sub->eventfd = NULL; + goto err; + } + } + + event_sub->cookie = cookie; + event_sub->ev_file = ev_file; + event_sub->filp = fd_uobj->object; + /* May be needed upon cleanup the devx object/subscription */ + event_sub->xa_key_level1 = key_level1; + event_sub->xa_key_level2 = obj_id; + INIT_LIST_HEAD(&event_sub->obj_list); + } + + /* Once all the allocations and the XA data insertions were done we + * can go ahead and add all the subscriptions to the relevant lists + * without concern of a failure. 
+ */ + list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) { + struct devx_event *event; + struct devx_obj_event *obj_event; + + list_del_init(&event_sub->event_list); + + spin_lock_irq(&ev_file->lock); + list_add_tail_rcu(&event_sub->file_list, + &ev_file->subscribed_events_list); + spin_unlock_irq(&ev_file->lock); + + event = xa_load(&devx_event_table->event_xa, + event_sub->xa_key_level1); + WARN_ON(!event); + + if (!obj) { + list_add_tail_rcu(&event_sub->xa_list, + &event->unaffiliated_list); + continue; + } + + obj_event = xa_load(&event->object_ids, obj_id); + WARN_ON(!obj_event); + list_add_tail_rcu(&event_sub->xa_list, + &obj_event->obj_sub_list); + list_add_tail_rcu(&event_sub->obj_list, + &obj->event_sub); + } + + mutex_unlock(&devx_event_table->event_xa_lock); + return 0; + +err: + list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) { + list_del(&event_sub->event_list); + + subscribe_event_xa_dealloc(devx_event_table, + event_sub->xa_key_level1, + obj, + obj_id); + + if (event_sub->eventfd) + eventfd_ctx_put(event_sub->eventfd); + + kfree(event_sub); + } + + mutex_unlock(&devx_event_table->event_xa_lock); + return err; +} + static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, struct uverbs_attr_bundle *attrs, struct devx_umem *obj) @@ -1682,14 +2180,21 @@ void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) { struct mlx5_devx_event_table *table = &dev->devx_event_table; + struct devx_event_subscription *sub, *tmp; + struct devx_event *event; void *entry; unsigned long id; mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb); - - xa_for_each(&table->event_xa, id, entry) + mutex_lock(&dev->devx_event_table.event_xa_lock); + xa_for_each(&table->event_xa, id, entry) { + event = entry; + list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list, + xa_list) + devx_cleanup_subscription(dev, sub); kfree(entry); - + } + 
mutex_unlock(&dev->devx_event_table.event_xa_lock); xa_destroy(&table->event_xa); } @@ -1805,7 +2310,26 @@ static __poll_t devx_async_event_poll(struct file *filp, static int devx_async_event_close(struct inode *inode, struct file *filp) { + struct devx_async_event_file *ev_file = filp->private_data; + struct devx_event_subscription *event_sub, *event_sub_tmp; + + mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock); + /* delete the subscriptions which are related to this FD */ + list_for_each_entry_safe(event_sub, event_sub_tmp, + &ev_file->subscribed_events_list, file_list) { + devx_cleanup_subscription(ev_file->dev, event_sub); + if (event_sub->eventfd) + eventfd_ctx_put(event_sub->eventfd); + + list_del_rcu(&event_sub->file_list); + /* subscription may not be used by the read API any more */ + kfree_rcu(event_sub, rcu); + } + + mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock); + uverbs_close_fd(filp); + put_device(&ev_file->dev->ib_dev.dev); return 0; } @@ -1973,10 +2497,32 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u64), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + UVERBS_ATTR_MIN_SIZE(sizeof(u16)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); + DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR), - &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN)); + 
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)); DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ, UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup), diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 6ad8f4f11ddd..d0da070cf0ab 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -51,6 +51,7 @@ enum mlx5_ib_devx_methods { MLX5_IB_METHOD_DEVX_OTHER = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_METHOD_DEVX_QUERY_UAR, MLX5_IB_METHOD_DEVX_QUERY_EQN, + MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT, }; enum mlx5_ib_devx_other_attrs { @@ -93,6 +94,14 @@ enum mlx5_ib_devx_obj_query_async_attrs { MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, }; +enum mlx5_ib_devx_subscribe_event_attrs { + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE, +}; + enum mlx5_ib_devx_query_eqn_attrs { MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, -- cgit v1.2.3 From 5ec9d8ee87c627a2c981d871e41f6e2a942f53fd Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 30 Jun 2019 19:23:32 +0300 Subject: IB/mlx5: Implement DEVX dispatching event Implement DEVX dispatching event by looking up for the applicable subscriptions for the reported event and using their target fd to signal/set the event. 
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 303 +++++++++++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_verbs.h | 5 + 2 files changed, 305 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 9c21cafc44a6..867b9778c063 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -34,6 +34,11 @@ struct devx_async_data { struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; }; +struct devx_async_event_data { + struct list_head list; /* headed in ev_file->event_list */ + struct mlx5_ib_uapi_devx_async_event_hdr hdr; +}; + /* first level XA value data structure */ struct devx_event { struct xarray object_ids; /* second XA level, Key = object id */ @@ -77,6 +82,8 @@ struct devx_async_event_file { struct list_head event_list; struct mlx5_ib_dev *dev; u8 omit_data:1; + u8 is_overflow_err:1; + u8 is_destroyed:1; }; #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) @@ -289,6 +296,29 @@ static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num) } } +static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe) +{ + switch (event_type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + return eqe->data.qp_srq.type; + case MLX5_EVENT_TYPE_CQ_ERROR: + return 0; + case MLX5_EVENT_TYPE_DCT_DRAINED: + return MLX5_EVENT_QUEUE_TYPE_DCT; + default: + return MLX5_GET(affiliated_event_header, &eqe->data, obj_type); + } +} + static u32 get_dec_obj_id(u64 obj_id) { return (obj_id & 
0xffffffff); @@ -2161,10 +2191,170 @@ static int devx_umem_cleanup(struct ib_uobject *uobject, return 0; } +static bool is_unaffiliated_event(struct mlx5_core_dev *dev, + unsigned long event_type) +{ + __be64 *unaff_events; + int mask_entry; + int mask_bit; + + if (!MLX5_CAP_GEN(dev, event_cap)) + return is_legacy_unaffiliated_event_num(event_type); + + unaff_events = MLX5_CAP_DEV_EVENT(dev, + user_unaffiliated_events); + WARN_ON(event_type > MAX_SUPP_EVENT_NUM); + + mask_entry = event_type / 64; + mask_bit = event_type % 64; + + if (!(be64_to_cpu(unaff_events[mask_entry]) & (1ull << mask_bit))) + return false; + + return true; +} + +static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data) +{ + struct mlx5_eqe *eqe = data; + u32 obj_id = 0; + + switch (event_type) { + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + break; + case MLX5_EVENT_TYPE_DCT_DRAINED: + obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + break; + case MLX5_EVENT_TYPE_CQ_ERROR: + obj_id = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff; + break; + default: + obj_id = MLX5_GET(affiliated_event_header, &eqe->data, obj_id); + break; + } + + return obj_id; +} + +static int deliver_event(struct devx_event_subscription *event_sub, + const void *data) +{ + struct devx_async_event_file *ev_file; + struct devx_async_event_data *event_data; + unsigned long flags; + + ev_file = event_sub->ev_file; + + if (ev_file->omit_data) { + spin_lock_irqsave(&ev_file->lock, flags); + if (!list_empty(&event_sub->event_list)) { + spin_unlock_irqrestore(&ev_file->lock, flags); + return 0; + } + + 
list_add_tail(&event_sub->event_list, &ev_file->event_list); + spin_unlock_irqrestore(&ev_file->lock, flags); + wake_up_interruptible(&ev_file->poll_wait); + return 0; + } + + event_data = kzalloc(sizeof(*event_data) + sizeof(struct mlx5_eqe), + GFP_ATOMIC); + if (!event_data) { + spin_lock_irqsave(&ev_file->lock, flags); + ev_file->is_overflow_err = 1; + spin_unlock_irqrestore(&ev_file->lock, flags); + return -ENOMEM; + } + + event_data->hdr.cookie = event_sub->cookie; + memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe)); + + spin_lock_irqsave(&ev_file->lock, flags); + list_add_tail(&event_data->list, &ev_file->event_list); + spin_unlock_irqrestore(&ev_file->lock, flags); + wake_up_interruptible(&ev_file->poll_wait); + + return 0; +} + +static void dispatch_event_fd(struct list_head *fd_list, + const void *data) +{ + struct devx_event_subscription *item; + + list_for_each_entry_rcu(item, fd_list, xa_list) { + if (!get_file_rcu(item->filp)) + continue; + + if (item->eventfd) { + eventfd_signal(item->eventfd, 1); + fput(item->filp); + continue; + } + + deliver_event(item, data); + fput(item->filp); + } +} + static int devx_event_notifier(struct notifier_block *nb, unsigned long event_type, void *data) { - return NOTIFY_DONE; + struct mlx5_devx_event_table *table; + struct mlx5_ib_dev *dev; + struct devx_event *event; + struct devx_obj_event *obj_event; + u16 obj_type = 0; + bool is_unaffiliated; + u32 obj_id; + + /* Explicit filtering to kernel events which may occur frequently */ + if (event_type == MLX5_EVENT_TYPE_CMD || + event_type == MLX5_EVENT_TYPE_PAGE_REQUEST) + return NOTIFY_OK; + + table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb); + dev = container_of(table, struct mlx5_ib_dev, devx_event_table); + is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type); + + if (!is_unaffiliated) + obj_type = get_event_obj_type(event_type, data); + + rcu_read_lock(); + event = xa_load(&table->event_xa, event_type | (obj_type << 16)); + 
if (!event) { + rcu_read_unlock(); + return NOTIFY_DONE; + } + + if (is_unaffiliated) { + dispatch_event_fd(&event->unaffiliated_list, data); + rcu_read_unlock(); + return NOTIFY_OK; + } + + obj_id = devx_get_obj_id_from_event(event_type, data); + obj_event = xa_load(&event->object_ids, obj_id); + if (!obj_event) { + rcu_read_unlock(); + return NOTIFY_DONE; + } + + dispatch_event_fd(&obj_event->obj_sub_list, data); + + rcu_read_unlock(); + return NOTIFY_OK; } void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) @@ -2299,19 +2489,108 @@ static const struct file_operations devx_async_cmd_event_fops = { static ssize_t devx_async_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { - return -EINVAL; + struct devx_async_event_file *ev_file = filp->private_data; + struct devx_event_subscription *event_sub; + struct devx_async_event_data *uninitialized_var(event); + int ret = 0; + size_t eventsz; + bool omit_data; + void *event_data; + + omit_data = ev_file->omit_data; + + spin_lock_irq(&ev_file->lock); + + if (ev_file->is_overflow_err) { + ev_file->is_overflow_err = 0; + spin_unlock_irq(&ev_file->lock); + return -EOVERFLOW; + } + + if (ev_file->is_destroyed) { + spin_unlock_irq(&ev_file->lock); + return -EIO; + } + + while (list_empty(&ev_file->event_list)) { + spin_unlock_irq(&ev_file->lock); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(ev_file->poll_wait, + (!list_empty(&ev_file->event_list) || + ev_file->is_destroyed))) { + return -ERESTARTSYS; + } + + spin_lock_irq(&ev_file->lock); + if (ev_file->is_destroyed) { + spin_unlock_irq(&ev_file->lock); + return -EIO; + } + } + + if (omit_data) { + event_sub = list_first_entry(&ev_file->event_list, + struct devx_event_subscription, + event_list); + eventsz = sizeof(event_sub->cookie); + event_data = &event_sub->cookie; + } else { + event = list_first_entry(&ev_file->event_list, + struct devx_async_event_data, list); + eventsz = sizeof(struct mlx5_eqe) 
+ + sizeof(struct mlx5_ib_uapi_devx_async_event_hdr); + event_data = &event->hdr; + } + + if (eventsz > count) { + spin_unlock_irq(&ev_file->lock); + return -EINVAL; + } + + if (omit_data) + list_del_init(&event_sub->event_list); + else + list_del(&event->list); + + spin_unlock_irq(&ev_file->lock); + + if (copy_to_user(buf, event_data, eventsz)) + /* This points to an application issue, not a kernel concern */ + ret = -EFAULT; + else + ret = eventsz; + + if (!omit_data) + kfree(event); + return ret; } static __poll_t devx_async_event_poll(struct file *filp, struct poll_table_struct *wait) { - return 0; + struct devx_async_event_file *ev_file = filp->private_data; + __poll_t pollflags = 0; + + poll_wait(filp, &ev_file->poll_wait, wait); + + spin_lock_irq(&ev_file->lock); + if (ev_file->is_destroyed) + pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + else if (!list_empty(&ev_file->event_list)) + pollflags = EPOLLIN | EPOLLRDNORM; + spin_unlock_irq(&ev_file->lock); + + return pollflags; } static int devx_async_event_close(struct inode *inode, struct file *filp) { struct devx_async_event_file *ev_file = filp->private_data; struct devx_event_subscription *event_sub, *event_sub_tmp; + struct devx_async_event_data *entry, *tmp; mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock); /* delete the subscriptions which are related to this FD */ @@ -2328,6 +2607,15 @@ static int devx_async_event_close(struct inode *inode, struct file *filp) mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock); + /* free the pending events allocation */ + if (!ev_file->omit_data) { + spin_lock_irq(&ev_file->lock); + list_for_each_entry_safe(entry, tmp, + &ev_file->event_list, list) + kfree(entry); /* read can't come any more */ + spin_unlock_irq(&ev_file->lock); + } + uverbs_close_fd(filp); put_device(&ev_file->dev->ib_dev.dev); return 0; @@ -2363,6 +2651,15 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, static int devx_hot_unplug_async_event_file(struct 
ib_uobject *uobj, enum rdma_remove_reason why) { + struct devx_async_event_file *ev_file = + container_of(uobj, struct devx_async_event_file, + uobj); + + spin_lock_irq(&ev_file->lock); + ev_file->is_destroyed = 1; + spin_unlock_irq(&ev_file->lock); + + wake_up_interruptible(&ev_file->poll_wait); return 0; }; diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index b44691315d39..7e9900b0e746 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -67,5 +67,10 @@ enum mlx5_ib_uapi_devx_create_event_channel_flags { MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA = 1 << 0, }; +struct mlx5_ib_uapi_devx_async_event_hdr { + __aligned_u64 cookie; + __u8 out_data[]; +}; + #endif -- cgit v1.2.3 From 413d3347503bc39e17577eaf16451fd492a68558 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:34 +0300 Subject: RDMA/counter: Add set/clear per-port auto mode support Add an API to support set/clear per-port auto mode. 
Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/counters.c | 74 ++++++++++++++++++++++++++++++++++++++ drivers/infiniband/core/device.c | 5 +++ include/rdma/ib_verbs.h | 2 ++ include/rdma/rdma_counter.h | 24 +++++++++++++ include/uapi/rdma/rdma_netlink.h | 26 ++++++++++++++ 6 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/core/counters.c (limited to 'include/uapi') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 42f1b2a4f746..09881bd5f12d 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - nldev.o restrack.o + nldev.o restrack.o counters.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c new file mode 100644 index 000000000000..6167914fba06 --- /dev/null +++ b/drivers/infiniband/core/counters.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. 
+ */ +#include +#include + +#include "core_priv.h" +#include "restrack.h" + +#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE) + +static int __counter_set_mode(struct rdma_counter_mode *curr, + enum rdma_nl_counter_mode new_mode, + enum rdma_nl_counter_mask new_mask) +{ + if ((new_mode == RDMA_COUNTER_MODE_AUTO) && + ((new_mask & (~ALL_AUTO_MODE_MASKS)) || + (curr->mode != RDMA_COUNTER_MODE_NONE))) + return -EINVAL; + + curr->mode = new_mode; + curr->mask = new_mask; + return 0; +} + +/** + * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode + * + * When @on is true, the @mask must be set + */ +int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, + bool on, enum rdma_nl_counter_mask mask) +{ + struct rdma_port_counter *port_counter; + int ret; + + port_counter = &dev->port_data[port].port_counter; + mutex_lock(&port_counter->lock); + if (on) { + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_AUTO, mask); + } else { + if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) { + ret = -EINVAL; + goto out; + } + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_NONE, 0); + } + +out: + mutex_unlock(&port_counter->lock); + return ret; +} + +void rdma_counter_init(struct ib_device *dev) +{ + struct rdma_port_counter *port_counter; + u32 port; + + if (!dev->ops.alloc_hw_stats || !dev->port_data) + return; + + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; + mutex_init(&port_counter->lock); + } +} + +void rdma_counter_release(struct ib_device *dev) +{ +} diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8a6ccb936dfe..6579865e4866 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "core_priv.h" #include "restrack.h" @@ -492,10 +493,12 @@ static void ib_device_release(struct device *device) if 
(dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); + rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } + xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); @@ -1316,6 +1319,8 @@ int ib_register_device(struct ib_device *device, const char *name) ib_device_register_rdmacg(device); + rdma_counter_init(device); + /* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet. diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 26e9c2594913..3d19c056fbc0 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -2119,6 +2120,7 @@ struct ib_port_data { spinlock_t netdev_lock; struct net_device __rcu *netdev; struct hlist_node ndev_hash_link; + struct rdma_port_counter port_counter; }; /* rdma netdev type - specifies protocol type */ diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 283ac1a0cdb7..8dd2619c015d 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -6,8 +6,26 @@ #ifndef _RDMA_COUNTER_H_ #define _RDMA_COUNTER_H_ +#include + #include #include +#include + +struct auto_mode_param { + int qp_type; +}; + +struct rdma_counter_mode { + enum rdma_nl_counter_mode mode; + enum rdma_nl_counter_mask mask; + struct auto_mode_param param; +}; + +struct rdma_port_counter { + struct rdma_counter_mode mode; + struct mutex lock; +}; struct rdma_counter { struct rdma_restrack_entry res; @@ -15,4 +33,10 @@ struct rdma_counter { uint32_t id; u8 port; }; + +void rdma_counter_init(struct ib_device *dev); +void rdma_counter_release(struct ib_device *dev); +int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, + bool on, enum rdma_nl_counter_mask mask); + #endif /* _RDMA_COUNTER_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h 
b/include/uapi/rdma/rdma_netlink.h index 650cee8c4bf1..e3cd912e9cef 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -507,4 +507,30 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_MAX }; + +/* + * Supported counter bind modes. All modes are mutual-exclusive. + */ +enum rdma_nl_counter_mode { + RDMA_COUNTER_MODE_NONE, + + /* + * A qp is bound with a counter automatically during initialization + * based on the auto mode (e.g., qp type, ...) + */ + RDMA_COUNTER_MODE_AUTO, + + /* + * Always the end + */ + RDMA_COUNTER_MODE_MAX, +}; + +/* + * Supported criteria in counter auto mode. + * Currently only "qp type" is supported + */ +enum rdma_nl_counter_mask { + RDMA_COUNTER_MASK_QP_TYPE = 1, +}; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From b47ae6f803b727952dfb37afd83e51c465147b85 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:39 +0300 Subject: RDMA/nldev: Allow counter auto mode configuration through RDMA netlink Provide an option to enable/disable per-port counter auto mode through RDMA netlink. Limit it to users with ADMIN capability only. 
Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 78 ++++++++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 8 +++++ 2 files changed, 86 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index d9ebfb50962b..9a4cf285f447 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -126,6 +126,9 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, @@ -1482,6 +1485,78 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } +static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + u32 index, port, mode, mask = 0; + struct ib_device *device; + struct sk_buff *msg; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + /* Currently only counter for QP is supported */ + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE]) + return -EINVAL; + + if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = 
nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + + mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); + if (mode != RDMA_COUNTER_MODE_AUTO) { + ret = -EMSGSIZE; + goto err_msg; + } + + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + + ret = rdma_counter_set_auto_mode(device, port, + mask ? true : false, mask); + if (ret) + goto err_msg; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { + ret = -EMSGSIZE; + goto err_msg; + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -1535,6 +1610,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { }, [RDMA_NLDEV_CMD_SYS_SET] = { .doit = nldev_set_sys_set_doit, + }, + [RDMA_NLDEV_CMD_STAT_SET] = { + .doit = nldev_stat_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, }; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index e3cd912e9cef..0cb47d23fd86 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -281,6 +281,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_GET_CHARDEV, + RDMA_NLDEV_CMD_STAT_SET, + RDMA_NLDEV_NUM_OPS }; @@ -488,6 +490,12 @@ enum rdma_nldev_attr { * File descriptor handle of the net namespace object */ RDMA_NLDEV_NET_NS_FD, /* u32 */ + /* + * Counter-specific attributes. 
+ */ + RDMA_NLDEV_ATTR_STAT_MODE, /* u32 */ + RDMA_NLDEV_ATTR_STAT_RES, /* u32 */ + RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, /* u32 */ /* * Information about a chardev. -- cgit v1.2.3 From c4ffee7c9bdba7b189df3251e375c4c7e93a91ac Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:40 +0300 Subject: RDMA/netlink: Implement counter dumpit callback This patch adds the ability to return all available counters together with their properties and hwstats. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 26 ++++- drivers/infiniband/core/device.c | 2 + drivers/infiniband/core/nldev.c | 213 +++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 10 ++ include/rdma/rdma_counter.h | 3 + include/uapi/rdma/rdma_netlink.h | 22 ++-- 6 files changed, 268 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 615ee731a1de..3741b9e5126a 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -59,7 +59,7 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, { struct rdma_counter *counter; - if (!dev->ops.counter_dealloc) + if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -69,16 +69,25 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, counter->device = dev; counter->port = port; counter->res.type = RDMA_RESTRACK_COUNTER; + counter->stats = dev->ops.counter_alloc_stats(counter); + if (!counter->stats) + goto err_stats; + counter->mode.mode = mode; kref_init(&counter->kref); mutex_init(&counter->lock); return counter; + +err_stats: + kfree(counter); + return NULL; } static void rdma_counter_free(struct rdma_counter *counter) { rdma_restrack_del(&counter->res); + kfree(counter->stats); 
kfree(counter); } @@ -275,6 +284,21 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) return 0; } +int rdma_counter_query_stats(struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + int ret; + + if (!dev->ops.counter_update_stats) + return -EINVAL; + + mutex_lock(&counter->lock); + ret = dev->ops.counter_update_stats(counter); + mutex_unlock(&counter->lock); + + return ret; +} + void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index f3181b74c863..bdf61499e6d5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2471,9 +2471,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); + SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_unbind_qp); + SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 9a4cf285f447..cebc15b23b15 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -129,6 +129,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = 
NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, @@ -636,6 +643,152 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, err: return -EMSGSIZE; } +static int fill_stat_counter_mode(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_counter_mode *m = &counter->mode; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) + return -EMSGSIZE; + + if (m->mode == RDMA_COUNTER_MODE_AUTO) + if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) + return -EMSGSIZE; + + return 0; +} + +static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_stat_counter_qps(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct nlattr *table_attr; + struct ib_qp *qp = NULL; + unsigned long id = 0; + int ret = 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + + rt = &counter->device->res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_is_visible_in_pid_ns(res)) + continue; + + qp = container_of(res, struct ib_qp, res); + if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + continue; + + if (!qp->counter || (qp->counter->id != counter->id)) + continue; + + ret = fill_stat_counter_qp_entry(msg, qp->qp_num); + if (ret) + goto err; + } + + 
xa_unlock(&rt->xa); + nla_nest_end(msg, table_attr); + return 0; + +err: + xa_unlock(&rt->xa); + nla_nest_cancel(msg, table_attr); + return ret; +} + +static int fill_stat_hwcounter_entry(struct sk_buff *msg, + const char *name, u64 value) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + name)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, + value, RDMA_NLDEV_ATTR_PAD)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_stat_counter_hwcounters(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_hw_stats *st = counter->stats; + struct nlattr *table_attr; + int i; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) + return -EMSGSIZE; + + for (i = 0; i < st->num_counters; i++) + if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i])) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, + uint32_t port) +{ + struct rdma_counter *counter = + container_of(res, struct rdma_counter, res); + + if (port && port != counter->port) + return 0; + + /* Dump it even query failed */ + rdma_counter_query_stats(counter); + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || + fill_res_name_pid(msg, &counter->res) || + fill_stat_counter_mode(msg, counter) || + fill_stat_counter_qps(msg, counter) || + fill_stat_counter_hwcounters(msg, counter)) + return -EMSGSIZE; + + return 0; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, 
struct netlink_ext_ack *extack) { @@ -1003,6 +1156,13 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, + [RDMA_RESTRACK_COUNTER] = { + .fill_res_func = fill_res_counter_entry, + .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET, + .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, + .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, + .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, + }, }; static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1239,6 +1399,7 @@ RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); static LIST_HEAD(link_ops); static DECLARE_RWSEM(link_ops_rwsem); @@ -1557,6 +1718,54 @@ err: return ret; } +static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) + return -EINVAL; + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = nldev_res_get_counter_doit(skb, nlh, extack); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int nldev_stat_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) + return -EINVAL; + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = nldev_res_get_counter_dumpit(skb, cb); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = 
nldev_get_doit, @@ -1615,6 +1824,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_stat_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_STAT_GET] = { + .doit = nldev_stat_get_doit, + .dump = nldev_stat_get_dumpit, + }, }; void __init nldev_init(void) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0205472eb73a..0c5151a12ae4 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2503,6 +2503,16 @@ struct ib_device_ops { * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); + /** + * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in + * the driver initialized data. + */ + struct rdma_hw_stats *(*counter_alloc_stats)( + struct rdma_counter *counter); + /** + * counter_update_stats - Query the stats value of this counter + */ + int (*counter_update_stats)(struct rdma_counter *counter); DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_cq); diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 9f93a2403c9c..f2a5c8efc404 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -37,6 +37,7 @@ struct rdma_counter { struct kref kref; struct rdma_counter_mode mode; struct mutex lock; + struct rdma_hw_stats *stats; u8 port; }; @@ -47,4 +48,6 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port); int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); +int rdma_counter_query_stats(struct rdma_counter *counter); + #endif /* _RDMA_COUNTER_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 0cb47d23fd86..18dd88c0add8 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -283,6 +283,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_STAT_SET, + RDMA_NLDEV_CMD_STAT_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -490,13 +492,6 @@ enum 
rdma_nldev_attr { * File descriptor handle of the net namespace object */ RDMA_NLDEV_NET_NS_FD, /* u32 */ - /* - * Counter-specific attributes. - */ - RDMA_NLDEV_ATTR_STAT_MODE, /* u32 */ - RDMA_NLDEV_ATTR_STAT_RES, /* u32 */ - RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, /* u32 */ - /* * Information about a chardev. * CHARDEV_TYPE is the name of the chardev ABI (ie uverbs, umad, etc) @@ -509,6 +504,19 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_CHARDEV_ABI, /* u64 */ RDMA_NLDEV_ATTR_CHARDEV, /* u64 */ RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, /* u64 */ + /* + * Counter-specific attributes. + */ + RDMA_NLDEV_ATTR_STAT_MODE, /* u32 */ + RDMA_NLDEV_ATTR_STAT_RES, /* u32 */ + RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, /* u32 */ + RDMA_NLDEV_ATTR_STAT_COUNTER, /* nested table */ + RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_STAT_COUNTER_ID, /* u32 */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTERS, /* nested table */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, /* string */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, /* u64 */ /* * Always the end -- cgit v1.2.3 From 1bd8e0a9d0fd1be03d2833a0c15ac676bdf275d8 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:43 +0300 Subject: RDMA/counter: Allow manual mode configuration support In manual mode a QP is bound to a counter manually. If counter is not specified then a new one will be allocated. Manual mode is enabled when user binds a QP, and disabled when the last manually bound QP is unbound. When auto-mode is turned off and there are counters left, manual mode is enabled so that the user is able to access these counters. 
Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 219 ++++++++++++++++++++++++++++++++++++- include/rdma/rdma_counter.h | 7 ++ include/uapi/rdma/rdma_netlink.h | 6 + 3 files changed, 229 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 8810a8a8d1f5..0ebe36e9fa7b 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -27,7 +27,9 @@ static int __counter_set_mode(struct rdma_counter_mode *curr, /** * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode * - * When @on is true, the @mask must be set + * When @on is true, the @mask must be set; When @on is false, it goes + * into manual mode if there's any counter, so that the user is able to + * manually access them. */ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, bool on, enum rdma_nl_counter_mask mask) @@ -45,8 +47,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, ret = -EINVAL; goto out; } - ret = __counter_set_mode(&port_counter->mode, - RDMA_COUNTER_MODE_NONE, 0); + + if (port_counter->num_counters) + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_MANUAL, 0); + else + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_NONE, 0); } out: @@ -57,7 +64,9 @@ out: static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, enum rdma_nl_counter_mode mode) { + struct rdma_port_counter *port_counter; struct rdma_counter *counter; + int ret; if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; @@ -73,12 +82,27 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, if (!counter->stats) goto err_stats; + port_counter = &dev->port_data[port].port_counter; + mutex_lock(&port_counter->lock); + if (mode == RDMA_COUNTER_MODE_MANUAL) { + ret = 
__counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_MANUAL, 0); + if (ret) + goto err_mode; + } + + port_counter->num_counters++; + mutex_unlock(&port_counter->lock); + counter->mode.mode = mode; kref_init(&counter->kref); mutex_init(&counter->lock); return counter; +err_mode: + mutex_unlock(&port_counter->lock); + kfree(counter->stats); err_stats: kfree(counter); return NULL; @@ -86,6 +110,18 @@ err_stats: static void rdma_counter_free(struct rdma_counter *counter) { + struct rdma_port_counter *port_counter; + + port_counter = &counter->device->port_data[counter->port].port_counter; + mutex_lock(&port_counter->lock); + port_counter->num_counters--; + if (!port_counter->num_counters && + (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) + __counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE, + 0); + + mutex_unlock(&port_counter->lock); + rdma_restrack_del(&counter->res); kfree(counter->stats); kfree(counter); @@ -363,6 +399,183 @@ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index) return sum; } +static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) +{ + struct rdma_restrack_entry *res = NULL; + struct ib_qp *qp = NULL; + + res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); + if (IS_ERR(res)) + return NULL; + + if (!rdma_is_visible_in_pid_ns(res)) + goto err; + + qp = container_of(res, struct ib_qp, res); + if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + goto err; + + return qp; + +err: + rdma_restrack_put(&qp->res); + return NULL; +} + +static int rdma_counter_bind_qp_manual(struct rdma_counter *counter, + struct ib_qp *qp) +{ + if ((counter->device != qp->device) || (counter->port != qp->port)) + return -EINVAL; + + return __rdma_counter_bind_qp(counter, qp); +} + +static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, + u32 counter_id) +{ + struct rdma_restrack_entry *res; + struct rdma_counter *counter; + + res = rdma_restrack_get_byid(dev, 
RDMA_RESTRACK_COUNTER, counter_id); + if (IS_ERR(res)) + return NULL; + + if (!rdma_is_visible_in_pid_ns(res)) { + rdma_restrack_put(res); + return NULL; + } + + counter = container_of(res, struct rdma_counter, res); + kref_get(&counter->kref); + rdma_restrack_put(res); + + return counter; +} + +/** + * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id + */ +int rdma_counter_bind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + counter = rdma_get_counter_by_id(dev, counter_id); + if (!counter) { + ret = -ENOENT; + goto err; + } + + if (counter->res.task != qp->res.task) { + ret = -EINVAL; + goto err_task; + } + + ret = rdma_counter_bind_qp_manual(counter, qp); + if (ret) + goto err_task; + + rdma_restrack_put(&qp->res); + return 0; + +err_task: + kref_put(&counter->kref, counter_release); +err: + rdma_restrack_put(&qp->res); + return ret; +} + +/** + * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it + * The id of new counter is returned in @counter_id + */ +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, + u32 qp_num, u32 *counter_id) +{ + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto err; + } + + counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL); + if (!counter) { + ret = -ENOMEM; + goto err; + } + + ret = rdma_counter_bind_qp_manual(counter, qp); + if (ret) + goto err_bind; + + if (counter_id) + *counter_id = counter->id; + + rdma_counter_res_add(counter, qp); + + rdma_restrack_put(&qp->res); + return ret; + +err_bind: + rdma_counter_free(counter); +err: + rdma_restrack_put(&qp->res); + return ret; +} + 
+/** + * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter + */ +int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_port_counter *port_counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto out; + } + + port_counter = &dev->port_data[port].port_counter; + if (!qp->counter || qp->counter->id != counter_id || + port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { + ret = -EINVAL; + goto out; + } + + ret = rdma_counter_unbind_qp(qp, false); + +out: + rdma_restrack_put(&qp->res); + return ret; +} + void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index bf2c3578768f..6603e10eb352 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -28,6 +28,7 @@ struct rdma_counter_mode { struct rdma_port_counter { struct rdma_counter_mode mode; struct rdma_hw_stats *hstats; + unsigned int num_counters; struct mutex lock; }; @@ -51,5 +52,11 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); int rdma_counter_query_stats(struct rdma_counter *counter); u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index); +int rdma_counter_bind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id); +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, + u32 qp_num, u32 *counter_id); +int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id); #endif /* _RDMA_COUNTER_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 18dd88c0add8..ec86fab3d040 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -536,6 +536,12 @@ enum rdma_nl_counter_mode { 
*/ RDMA_COUNTER_MODE_AUTO, + /* + * Which qp are bound with which counter is explicitly specified + * by the user + */ + RDMA_COUNTER_MODE_MANUAL, + /* * Always the end */ -- cgit v1.2.3 From b389327df90530d47931d0f5616b5cd6abb96c96 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:44 +0300 Subject: RDMA/nldev: Allow counter manual mode configuration through RDMA netlink Provide an option to allow users to manually bind a qp with a counter through RDMA netlink. Limit it to users with ADMIN capability only. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 111 ++++++++++++++++++++++++++++++++++----- include/rdma/rdma_counter.h | 3 ++ include/uapi/rdma/rdma_netlink.h | 2 + 3 files changed, 103 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index cebc15b23b15..3d750eca53d5 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1649,8 +1649,8 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + u32 index, port, mode, mask = 0, qpn, cntn = 0; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; - u32 index, port, mode, mask = 0; struct ib_device *device; struct sk_buff *msg; int ret; @@ -1688,30 +1688,111 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, 0, 0); mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); - if (mode != RDMA_COUNTER_MODE_AUTO) { - ret = -EMSGSIZE; - goto err_msg; + if (mode == RDMA_COUNTER_MODE_AUTO) { + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32( + tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + + ret = rdma_counter_set_auto_mode(device, port, + mask ? 
true : false, mask); + if (ret) + goto err_msg; + } else { + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + ret = rdma_counter_bind_qpn(device, port, qpn, cntn); + } else { + ret = rdma_counter_bind_qpn_alloc(device, port, + qpn, &cntn); + } + if (ret) + goto err_msg; + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } } - if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) - mask = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_fill: + rdma_counter_unbind_qpn(device, port, qpn, cntn); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port, qpn, cntn; + int ret; - ret = rdma_counter_set_auto_mode(device, port, - mask ? 
true : false, mask); + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || + !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] || + !tb[RDMA_NLDEV_ATTR_RES_LQPN]) + return -EINVAL; + + if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + ret = rdma_counter_unbind_qpn(device, port, qpn, cntn); if (ret) - goto err_msg; + goto err_unbind; - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode) || - nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { ret = -EMSGSIZE; - goto err_msg; + goto err_fill; } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); -err_msg: +err_fill: + rdma_counter_bind_qpn(device, port, qpn, cntn); +err_unbind: nlmsg_free(msg); err: ib_device_put(device); @@ -1828,6 +1909,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_stat_get_doit, .dump = nldev_stat_get_dumpit, }, + [RDMA_NLDEV_CMD_STAT_DEL] = { + .doit = nldev_stat_del_doit, + .flags = 
RDMA_NL_ADMIN_PERM, + }, }; void __init nldev_init(void) diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 6603e10eb352..68827700ba95 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -58,5 +58,8 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, u32 qp_num, u32 *counter_id); int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, u32 qp_num, u32 counter_id); +int rdma_counter_get_mode(struct ib_device *dev, u8 port, + enum rdma_nl_counter_mode *mode, + enum rdma_nl_counter_mask *mask); #endif /* _RDMA_COUNTER_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index ec86fab3d040..ce6fd66e7aa3 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -285,6 +285,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_STAT_GET, /* can dump */ + RDMA_NLDEV_CMD_STAT_DEL, + RDMA_NLDEV_NUM_OPS }; -- cgit v1.2.3 From f10ff380fd7dfba4a36d40f8dd00fe17da8a1a10 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 8 Jul 2019 12:17:48 -0300 Subject: RDMA/rvt: Do not use a kernel header in the ABI rvt was using ib_sge as part of its ABI, which is not allowed. Introduce a new struct with the same layout and use it instead. 
Fixes: dabac6e460ce ("IB/hfi1: Move receive work queue struct into uapi directory") Reported-by: Stephen Rothwell Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 32 +++++++++++++++++++++++++++----- include/uapi/rdma/rvt-abi.h | 9 +++++++-- 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 11b4d3c1efd4..0b0a241c57ff 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1847,8 +1847,11 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head); wqe->wr_id = wr->wr_id; wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; + for (i = 0; i < wr->num_sge; i++) { + wqe->sg_list[i].addr = wr->sg_list[i].addr; + wqe->sg_list[i].length = wr->sg_list[i].length; + wqe->sg_list[i].lkey = wr->sg_list[i].lkey; + } /* * Make sure queue entry is written * before the head index. @@ -2250,8 +2253,11 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head); wqe->wr_id = wr->wr_id; wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; + for (i = 0; i < wr->num_sge; i++) { + wqe->sg_list[i].addr = wr->sg_list[i].addr; + wqe->sg_list[i].length = wr->sg_list[i].length; + wqe->sg_list[i].lkey = wr->sg_list[i].lkey; + } /* Make sure queue entry is written before the head index. */ smp_store_release(&wq->head, next); spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); @@ -2259,6 +2265,22 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, return 0; } +/* + * rvt used the internal kernel struct as part of its ABI, for now make sure + * the kernel struct does not change layout. FIXME: rvt should never cast the + * user struct to a kernel struct. 
+ */ +static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge) +{ + BUILD_BUG_ON(offsetof(struct ib_sge, addr) != + offsetof(struct rvt_wqe_sge, addr)); + BUILD_BUG_ON(offsetof(struct ib_sge, length) != + offsetof(struct rvt_wqe_sge, length)); + BUILD_BUG_ON(offsetof(struct ib_sge, lkey) != + offsetof(struct rvt_wqe_sge, lkey)); + return (struct ib_sge *)sge; +} + /* * Validate a RWQE and fill in the SGE state. * Return 1 if OK. @@ -2282,7 +2304,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) continue; /* Check LKEY */ ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - NULL, &wqe->sg_list[i], + NULL, rvt_cast_sge(&wqe->sg_list[i]), IB_ACCESS_LOCAL_WRITE); if (unlikely(ret <= 0)) goto bad_lkey; diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h index d2e35d24f1a9..7328293c715c 100644 --- a/include/uapi/rdma/rvt-abi.h +++ b/include/uapi/rdma/rvt-abi.h @@ -10,11 +10,16 @@ #include #include -#include #ifndef RDMA_ATOMIC_UAPI #define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name #endif +struct rvt_wqe_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + /* * This structure is used to contain the head pointer, tail pointer, * and completion queue entries as a single memory allocation so @@ -39,7 +44,7 @@ struct rvt_rwqe { __u64 wr_id; __u8 num_sge; __u8 padding[7]; - struct ib_sge sg_list[]; + struct rvt_wqe_sge sg_list[]; }; /* -- cgit v1.2.3 From f8fc8cd9c612c31f92b19b72f619fa043ec76e5e Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Mon, 8 Jul 2019 13:59:04 +0300 Subject: RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink Added parameter in ib_device for enabling dynamic interrupt moderation so that it can be configured in userspace using rdma tool. In order to set adaptive-moderation for an ib device the command is: rdma dev set [DEV] adaptive-moderation [on|off] Please set on/off. 
rdma dev show 0: mlx5_0: node_type ca fw 16.26.0055 node_guid 248a:0703:00a5:29d0 sys_image_guid 248a:0703:00a5:29d0 adaptive-moderation on rdma resource show cq dev mlx5_0 cqn 0 cqe 1023 users 4 poll-ctx UNBOUND_WORKQUEUE adaptive-moderation off comm [ib_core] Signed-off-by: Yamin Friedman Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/device.c | 9 +++++++++ drivers/infiniband/core/nldev.c | 14 ++++++++++++++ include/uapi/rdma/rdma_netlink.h | 5 +++++ 5 files changed, 30 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index f277cb7aea29..85e103b147cc 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -7,6 +7,7 @@ menuconfig INFINIBAND depends on m || IPV6 != m depends on !ALPHA select IRQ_POLL + select DIMLIB ---help--- Core support for InfiniBand (IB). Make sure to also select any protocols you wish to use as well as drivers for your diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index a953c2fa2e78..888d89ce81df 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -60,6 +60,7 @@ extern bool ib_devices_shared_netns; int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); int ib_device_rename(struct ib_device *ibdev, const char *name); +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim); typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index bdf61499e6d5..7f4affe8a10d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -448,6 +448,15 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) return 0; } 
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) +{ + if (use_dim > 1) + return -EINVAL; + ibdev->use_cq_dim = use_dim; + + return 0; +} + static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index a4431ed566b6..d9f2a30e6467 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -52,6 +52,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, + [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX }, @@ -252,6 +253,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) + return -EMSGSIZE; /* * Link type is determined on first port and mlx4 device @@ -552,6 +555,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) goto err; if (!rdma_is_kernel_res(res) && @@ -870,6 +876,14 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_done; } + if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) { + u8 use_dim; + + use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]); + err = ib_device_set_dim(device, use_dim); + goto done; + } + done: ib_device_put(device); put_done: diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index ce6fd66e7aa3..8e277783fa96 100644 --- 
a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -520,6 +520,11 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, /* string */ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, /* u64 */ + /* + * CQ adaptive moderation (DIM) + */ + RDMA_NLDEV_ATTR_DEV_DIM, /* u8 */ + /* + * Always the end */ -- cgit v1.2.3