diff options
Diffstat (limited to 'net')
79 files changed, 1973 insertions, 926 deletions
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 1852e383afd6..553ed4ecb6a0 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -680,7 +680,7 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) goto error; /* Create the Protection Domain */ - rdma->pd = ib_alloc_pd(rdma->cm_id->device); + rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0); if (IS_ERR(rdma->pd)) goto error; diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index c68ff3dcb926..e49121ee55f6 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -20,8 +20,6 @@ #include "main.h" -#include <linux/kconfig.h> - struct net_device; #define BATADV_DEBUGFS_SUBDIR "batman_adv" diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index c8135680c43e..e2288421fe6b 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -21,8 +21,6 @@ SOFTWARE IS DISCLAIMED. */ -#include <asm/unaligned.h> - #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> @@ -973,33 +971,58 @@ void __hci_req_enable_advertising(struct hci_request *req) static u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) { - size_t name_len; + size_t complete_len; + size_t short_len; int max_len; max_len = HCI_MAX_AD_LENGTH - ad_len - 2; - name_len = strlen(hdev->dev_name); - if (name_len > 0 && max_len > 0) { - - if (name_len > max_len) { - name_len = max_len; - ptr[1] = EIR_NAME_SHORT; - } else - ptr[1] = EIR_NAME_COMPLETE; - - ptr[0] = name_len + 1; + complete_len = strlen(hdev->dev_name); + short_len = strlen(hdev->short_name); + + /* no space left for name */ + if (max_len < 1) + return ad_len; + + /* no name set */ + if (!complete_len) + return ad_len; + + /* complete name fits and is eq to max short name len or smaller */ + if (complete_len <= max_len && + complete_len <= HCI_MAX_SHORT_NAME_LENGTH) { + return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE, + hdev->dev_name, complete_len); + } - memcpy(ptr + 2, hdev->dev_name, name_len); + /* short name set and fits */ + if (short_len && short_len <= max_len) { + return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, + hdev->short_name, short_len); + } - ad_len += (name_len + 2); - ptr += (name_len + 2); + /* no short name set so shorten complete name */ + if (!short_len) { + return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, + hdev->dev_name, max_len); } return ad_len; } +static u8 append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len) +{ + return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance); +} + static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) { - return append_local_name(hdev, ptr, 0); + u8 scan_rsp_len = 0; + + if (hdev->appearance) { + scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len); + } + + return append_local_name(hdev, ptr, scan_rsp_len); } static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, @@ -1016,18 +1039,13 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, instance_flags = adv_instance->flags; if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) { - ptr[0] = 3; - ptr[1] = EIR_APPEARANCE; - put_unaligned_le16(hdev->appearance, ptr + 2); - scan_rsp_len += 4; - ptr += 4; + scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len); } - memcpy(ptr, adv_instance->scan_rsp_data, + memcpy(&ptr[scan_rsp_len], adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); scan_rsp_len += adv_instance->scan_rsp_len; - ptr += adv_instance->scan_rsp_len; if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME) scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len); diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index ac1e11006f38..6b06629245a8 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -20,6 +20,8 @@ SOFTWARE IS DISCLAIMED. */ +#include <asm/unaligned.h> + #define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock) #define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock) @@ -103,3 +105,24 @@ static inline void hci_update_background_scan(struct hci_dev *hdev) void hci_request_setup(struct hci_dev *hdev); void hci_request_cancel_all(struct hci_dev *hdev); + +static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, + u8 *data, u8 data_len) +{ + eir[eir_len++] = sizeof(type) + data_len; + eir[eir_len++] = type; + memcpy(&eir[eir_len], data, data_len); + eir_len += data_len; + + return eir_len; +} + +static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) +{ + eir[eir_len++] = sizeof(type) + sizeof(data); + eir[eir_len++] = type; + put_unaligned_le16(data, &eir[eir_len]); + eir_len += sizeof(data); + + return eir_len; +} diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 19b8a5e9420d..736038085feb 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -867,27 +867,6 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, sizeof(rp)); } -static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, - u8 data_len) -{ - eir[eir_len++] = sizeof(type) + data_len; - eir[eir_len++] = type; - memcpy(&eir[eir_len], data, data_len); - eir_len += data_len; - - return eir_len; -} - -static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) -{ - eir[eir_len++] = sizeof(type) + sizeof(data); - eir[eir_len++] = type; - put_unaligned_le16(data, &eir[eir_len]); - eir_len += sizeof(data); - - return eir_len; -} - static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) { u16 eir_len = 0; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index e657258e1f2c..8bd569695e76 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -217,6 +217,7 @@ static const struct brport_attribute *brport_attrs[] = { #endif &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, + &brport_attr_multicast_flood, NULL }; diff --git a/net/ceph/Makefile b/net/ceph/Makefile index 84cbed630c4b..6a5180903e7b 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_CEPH_LIB) += libceph.o libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ mon_client.o \ + cls_lock_client.o \ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ debugfs.o \ auth.o auth_none.o \ diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 2bc5965fdd1e..c822b3ae1bd3 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -82,7 +82,10 @@ void ceph_auth_reset(struct ceph_auth_client *ac) mutex_unlock(&ac->mutex); } -int ceph_entity_name_encode(const char *name, void **p, void *end) +/* + * EntityName, not to be confused with entity_name_t + */ +int ceph_auth_entity_name_encode(const char *name, void **p, void *end) { int len = strlen(name); @@ -111,7 +114,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) monhdr->session_mon = cpu_to_le16(-1); monhdr->session_mon_tid = 0; - ceph_encode_32(&p, 0); /* no protocol, yet */ + ceph_encode_32(&p, CEPH_AUTH_UNKNOWN); /* no protocol, yet */ lenp = p; p += sizeof(u32); @@ -124,7 +127,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) for (i = 0; i < num; i++) ceph_encode_32(&p, supported_protocols[i]); - ret = ceph_entity_name_encode(ac->name, &p, end); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); if (ret < 0) goto out; ceph_decode_need(&p, end, sizeof(u64), bad); @@ -259,9 +262,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, int ret = 0; mutex_lock(&ac->mutex); - if (!ac->protocol) - ret = ceph_auth_build_hello(ac, msg_buf, msg_len); - else if (ac->ops->should_authenticate(ac)) + if (ac->ops->should_authenticate(ac)) ret = ceph_build_auth_request(ac, msg_buf, msg_len); mutex_unlock(&ac->mutex); return ret; diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 5f836f02ae36..df45e467c81f 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -46,7 +46,7 @@ static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac, int ret; ceph_encode_8_safe(&p, end, 1, e_range); - ret = ceph_entity_name_encode(ac->name, &p, end); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); if (ret < 0) return ret; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index bddfcf6f09c2..464e88599b9d 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -566,11 +566,17 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) } EXPORT_SYMBOL(ceph_print_client_options); -u64 ceph_client_id(struct ceph_client *client) +struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client) +{ + return &client->msgr.inst.addr; +} +EXPORT_SYMBOL(ceph_client_addr); + +u64 ceph_client_gid(struct ceph_client *client) { return client->monc.auth->global_id; } -EXPORT_SYMBOL(ceph_client_id); +EXPORT_SYMBOL(ceph_client_gid); /* * create a fresh client instance @@ -685,7 +691,8 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) return client->auth_err; } - pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid); + pr_info("client%llu fsid %pU\n", ceph_client_gid(client), + &client->fsid); ceph_debugfs_client_init(client); return 0; diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 3773a4fa11e3..19b7d8aa915c 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -15,6 +15,7 @@ const char *ceph_entity_type_name(int type) default: return "unknown"; } } +EXPORT_SYMBOL(ceph_entity_type_name); const char *ceph_osd_op_name(int op) { diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c new file mode 100644 index 000000000000..50f040fdb2a9 --- /dev/null +++ b/net/ceph/cls_lock_client.c @@ -0,0 +1,325 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/types.h> +#include <linux/slab.h> + +#include <linux/ceph/cls_lock_client.h> +#include <linux/ceph/decode.h> + +/** + * ceph_cls_lock - grab rados lock for object + * @oid, @oloc: object to lock + * @lock_name: the name of the lock + * @type: lock type (CEPH_CLS_LOCK_EXCLUSIVE or CEPH_CLS_LOCK_SHARED) + * @cookie: user-defined identifier for this instance of the lock + * @tag: user-defined tag + * @desc: user-defined lock description + * @flags: lock flags + * + * All operations on the same lock should use the same tag. + */ +int ceph_cls_lock(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *cookie, + char *tag, char *desc, u8 flags) +{ + int lock_op_buf_size; + int name_len = strlen(lock_name); + int cookie_len = strlen(cookie); + int tag_len = strlen(tag); + int desc_len = strlen(desc); + void *p, *end; + struct page *lock_op_page; + struct timespec mtime; + int ret; + + lock_op_buf_size = name_len + sizeof(__le32) + + cookie_len + sizeof(__le32) + + tag_len + sizeof(__le32) + + desc_len + sizeof(__le32) + + sizeof(struct ceph_timespec) + + /* flag and type */ + sizeof(u8) + sizeof(u8) + + CEPH_ENCODING_START_BLK_LEN; + if (lock_op_buf_size > PAGE_SIZE) + return -E2BIG; + + lock_op_page = alloc_page(GFP_NOIO); + if (!lock_op_page) + return -ENOMEM; + + p = page_address(lock_op_page); + end = p + lock_op_buf_size; + + /* encode cls_lock_lock_op struct */ + ceph_start_encoding(&p, 1, 1, + lock_op_buf_size - CEPH_ENCODING_START_BLK_LEN); + ceph_encode_string(&p, end, lock_name, name_len); + ceph_encode_8(&p, type); + ceph_encode_string(&p, end, cookie, cookie_len); + ceph_encode_string(&p, end, tag, tag_len); + ceph_encode_string(&p, end, desc, desc_len); + /* only support infinite duration */ + memset(&mtime, 0, sizeof(mtime)); + ceph_encode_timespec(p, &mtime); + p += sizeof(struct ceph_timespec); + ceph_encode_8(&p, flags); + + dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", + __func__, lock_name, type, cookie, tag, desc, flags); + ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + lock_op_page, lock_op_buf_size, NULL, NULL); + + dout("%s: status %d\n", __func__, ret); + __free_page(lock_op_page); + return ret; +} +EXPORT_SYMBOL(ceph_cls_lock); + +/** + * ceph_cls_unlock - release rados lock for object + * @oid, @oloc: object to lock + * @lock_name: the name of the lock + * @cookie: user-defined identifier for this instance of the lock + */ +int ceph_cls_unlock(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, char *cookie) +{ + int unlock_op_buf_size; + int name_len = strlen(lock_name); + int cookie_len = strlen(cookie); + void *p, *end; + struct page *unlock_op_page; + int ret; + + unlock_op_buf_size = name_len + sizeof(__le32) + + cookie_len + sizeof(__le32) + + CEPH_ENCODING_START_BLK_LEN; + if (unlock_op_buf_size > PAGE_SIZE) + return -E2BIG; + + unlock_op_page = alloc_page(GFP_NOIO); + if (!unlock_op_page) + return -ENOMEM; + + p = page_address(unlock_op_page); + end = p + unlock_op_buf_size; + + /* encode cls_lock_unlock_op struct */ + ceph_start_encoding(&p, 1, 1, + unlock_op_buf_size - CEPH_ENCODING_START_BLK_LEN); + ceph_encode_string(&p, end, lock_name, name_len); + ceph_encode_string(&p, end, cookie, cookie_len); + + dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); + ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + unlock_op_page, unlock_op_buf_size, NULL, NULL); + + dout("%s: status %d\n", __func__, ret); + __free_page(unlock_op_page); + return ret; +} +EXPORT_SYMBOL(ceph_cls_unlock); + +/** + * ceph_cls_break_lock - release rados lock for object for specified client + * @oid, @oloc: object to lock + * @lock_name: the name of the lock + * @cookie: user-defined identifier for this instance of the lock + * @locker: current lock owner + */ +int ceph_cls_break_lock(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, char *cookie, + struct ceph_entity_name *locker) +{ + int break_op_buf_size; + int name_len = strlen(lock_name); + int cookie_len = strlen(cookie); + struct page *break_op_page; + void *p, *end; + int ret; + + break_op_buf_size = name_len + sizeof(__le32) + + cookie_len + sizeof(__le32) + + sizeof(u8) + sizeof(__le64) + + CEPH_ENCODING_START_BLK_LEN; + if (break_op_buf_size > PAGE_SIZE) + return -E2BIG; + + break_op_page = alloc_page(GFP_NOIO); + if (!break_op_page) + return -ENOMEM; + + p = page_address(break_op_page); + end = p + break_op_buf_size; + + /* encode cls_lock_break_op struct */ + ceph_start_encoding(&p, 1, 1, + break_op_buf_size - CEPH_ENCODING_START_BLK_LEN); + ceph_encode_string(&p, end, lock_name, name_len); + ceph_encode_copy(&p, locker, sizeof(*locker)); + ceph_encode_string(&p, end, cookie, cookie_len); + + dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, + cookie, ENTITY_NAME(*locker)); + ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + break_op_page, break_op_buf_size, NULL, NULL); + + dout("%s: status %d\n", __func__, ret); + __free_page(break_op_page); + return ret; +} +EXPORT_SYMBOL(ceph_cls_break_lock); + +void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers) +{ + int i; + + for (i = 0; i < num_lockers; i++) + kfree(lockers[i].id.cookie); + kfree(lockers); +} +EXPORT_SYMBOL(ceph_free_lockers); + +static int decode_locker(void **p, void *end, struct ceph_locker *locker) +{ + u8 struct_v; + u32 len; + char *s; + int ret; + + ret = ceph_start_decoding(p, end, 1, "locker_id_t", &struct_v, &len); + if (ret) + return ret; + + ceph_decode_copy(p, &locker->id.name, sizeof(locker->id.name)); + s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO); + if (IS_ERR(s)) + return PTR_ERR(s); + + locker->id.cookie = s; + + ret = ceph_start_decoding(p, end, 1, "locker_info_t", &struct_v, &len); + if (ret) + return ret; + + *p += sizeof(struct ceph_timespec); /* skip expiration */ + ceph_decode_copy(p, &locker->info.addr, sizeof(locker->info.addr)); + ceph_decode_addr(&locker->info.addr); + len = ceph_decode_32(p); + *p += len; /* skip description */ + + dout("%s %s%llu cookie %s addr %s\n", __func__, + ENTITY_NAME(locker->id.name), locker->id.cookie, + ceph_pr_addr(&locker->info.addr.in_addr)); + return 0; +} + +static int decode_lockers(void **p, void *end, u8 *type, char **tag, + struct ceph_locker **lockers, u32 *num_lockers) +{ + u8 struct_v; + u32 struct_len; + char *s; + int i; + int ret; + + ret = ceph_start_decoding(p, end, 1, "cls_lock_get_info_reply", + &struct_v, &struct_len); + if (ret) + return ret; + + *num_lockers = ceph_decode_32(p); + *lockers = kcalloc(*num_lockers, sizeof(**lockers), GFP_NOIO); + if (!*lockers) + return -ENOMEM; + + for (i = 0; i < *num_lockers; i++) { + ret = decode_locker(p, end, *lockers + i); + if (ret) + goto err_free_lockers; + } + + *type = ceph_decode_8(p); + s = ceph_extract_encoded_string(p, end, NULL, GFP_NOIO); + if (IS_ERR(s)) { + ret = PTR_ERR(s); + goto err_free_lockers; + } + + *tag = s; + return 0; + +err_free_lockers: + ceph_free_lockers(*lockers, *num_lockers); + return ret; +} + +/* + * On success, the caller is responsible for: + * + * kfree(tag); + * ceph_free_lockers(lockers, num_lockers); + */ +int ceph_cls_lock_info(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 *type, char **tag, + struct ceph_locker **lockers, u32 *num_lockers) +{ + int get_info_op_buf_size; + int name_len = strlen(lock_name); + struct page *get_info_op_page, *reply_page; + size_t reply_len; + void *p, *end; + int ret; + + get_info_op_buf_size = name_len + sizeof(__le32) + + CEPH_ENCODING_START_BLK_LEN; + if (get_info_op_buf_size > PAGE_SIZE) + return -E2BIG; + + get_info_op_page = alloc_page(GFP_NOIO); + if (!get_info_op_page) + return -ENOMEM; + + reply_page = alloc_page(GFP_NOIO); + if (!reply_page) { + __free_page(get_info_op_page); + return -ENOMEM; + } + + p = page_address(get_info_op_page); + end = p + get_info_op_buf_size; + + /* encode cls_lock_get_info_op struct */ + ceph_start_encoding(&p, 1, 1, + get_info_op_buf_size - CEPH_ENCODING_START_BLK_LEN); + ceph_encode_string(&p, end, lock_name, name_len); + + dout("%s lock_name %s\n", __func__, lock_name); + ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info", + CEPH_OSD_FLAG_READ, get_info_op_page, + get_info_op_buf_size, reply_page, &reply_len); + + dout("%s: status %d\n", __func__, ret); + if (ret >= 0) { + p = page_address(reply_page); + end = p + reply_len; + + ret = decode_lockers(&p, end, type, tag, lockers, num_lockers); + } + + __free_page(get_info_op_page); + __free_page(reply_page); + return ret; +} +EXPORT_SYMBOL(ceph_cls_lock_info); diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 5fcfb98f309e..a421e905331a 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -245,7 +245,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, /* compute 2^44*log2(input+1) */ static __u64 crush_ln(unsigned int xin) { - unsigned int x = xin, x1; + unsigned int x = xin; int iexpon, index1, index2; __u64 RH, LH, LL, xl64, result; @@ -253,9 +253,15 @@ static __u64 crush_ln(unsigned int xin) /* normalize input */ iexpon = 15; - while (!(x & 0x18000)) { - x <<= 1; - iexpon--; + + /* + * figure out number of bits we need to shift and + * do it in one step instead of iteratively + */ + if (!(x & 0x18000)) { + int bits = __builtin_clz(x & 0x1FFFF) - 16; + x <<= bits; + iexpon = 15 - bits; } index1 = (x >> 8) << 1; @@ -267,12 +273,11 @@ static __u64 crush_ln(unsigned int xin) /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */ xl64 = (__s64)x * RH; xl64 >>= 48; - x1 = xl64; result = iexpon; result <<= (12 + 32); - index2 = x1 & 0xff; + index2 = xl64 & 0xff; /* LL ~ 2^48*log2(1.0+index2/2^15) */ LL = __LL_tbl[index2]; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index ef34a02719d7..a8effc8b7280 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -835,6 +835,83 @@ int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, } EXPORT_SYMBOL(ceph_monc_get_version_async); +static void handle_command_ack(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_generic_request *req; + void *p = msg->front.iov_base; + void *const end = p + msg->front_alloc_len; + u64 tid = le64_to_cpu(msg->hdr.tid); + + dout("%s msg %p tid %llu\n", __func__, msg, tid); + + ceph_decode_need(&p, end, sizeof(struct ceph_mon_request_header) + + sizeof(u32), bad); + p += sizeof(struct ceph_mon_request_header); + + mutex_lock(&monc->mutex); + req = lookup_generic_request(&monc->generic_request_tree, tid); + if (!req) { + mutex_unlock(&monc->mutex); + return; + } + + req->result = ceph_decode_32(&p); + __finish_generic_request(req); + mutex_unlock(&monc->mutex); + + complete_generic_request(req); + return; + +bad: + pr_err("corrupt mon_command ack, tid %llu\n", tid); + ceph_msg_dump(msg); +} + +int ceph_monc_blacklist_add(struct ceph_mon_client *monc, + struct ceph_entity_addr *client_addr) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_command *h; + int ret = -ENOMEM; + int len; + + req = alloc_generic_request(monc, GFP_NOIO); + if (!req) + goto out; + + req->request = ceph_msg_new(CEPH_MSG_MON_COMMAND, 256, GFP_NOIO, true); + if (!req->request) + goto out; + + req->reply = ceph_msg_new(CEPH_MSG_MON_COMMAND_ACK, 512, GFP_NOIO, + true); + if (!req->reply) + goto out; + + mutex_lock(&monc->mutex); + register_generic_request(req); + h = req->request->front.iov_base; + h->monhdr.have_version = 0; + h->monhdr.session_mon = cpu_to_le16(-1); + h->monhdr.session_mon_tid = 0; + h->fsid = monc->monmap->fsid; + h->num_strs = cpu_to_le32(1); + len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \ + \"blacklistop\": \"add\", \ + \"addr\": \"%pISpc/%u\" }", + &client_addr->in_addr, le32_to_cpu(client_addr->nonce)); + h->str_len = cpu_to_le32(len); + send_generic_request(monc, req); + mutex_unlock(&monc->mutex); + + ret = wait_generic_request(req); +out: + put_generic_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_monc_blacklist_add); + /* * Resend pending generic requests. */ @@ -1139,6 +1216,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_get_version_reply(monc, msg); break; + case CEPH_MSG_MON_COMMAND_ACK: + handle_command_ack(monc, msg); + break; + case CEPH_MSG_MON_MAP: ceph_monc_handle_map(monc, msg); break; @@ -1178,6 +1259,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, m = ceph_msg_get(monc->m_subscribe_ack); break; case CEPH_MSG_STATFS_REPLY: + case CEPH_MSG_MON_COMMAND_ACK: return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: m = ceph_msg_get(monc->m_auth_reply); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a97e7b506612..d9bf7a1d0a58 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -338,6 +338,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, ceph_osd_data_release(&op->notify.request_data); ceph_osd_data_release(&op->notify.response_data); break; + case CEPH_OSD_OP_LIST_WATCHERS: + ceph_osd_data_release(&op->list_watchers.response_data); + break; default: break; } @@ -863,6 +866,8 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_NOTIFY: dst->notify.cookie = cpu_to_le64(src->notify.cookie); break; + case CEPH_OSD_OP_LIST_WATCHERS: + break; case CEPH_OSD_OP_SETALLOCHINT: dst->alloc_hint.expected_object_size = cpu_to_le64(src->alloc_hint.expected_object_size); @@ -1445,6 +1450,10 @@ static void setup_request_data(struct ceph_osd_request *req, ceph_osdc_msg_data_add(req->r_reply, &op->extent.osd_data); break; + case CEPH_OSD_OP_LIST_WATCHERS: + ceph_osdc_msg_data_add(req->r_reply, + &op->list_watchers.response_data); + break; /* both */ case CEPH_OSD_OP_CALL: @@ -3891,12 +3900,121 @@ int ceph_osdc_watch_check(struct ceph_osd_client *osdc, return ret; } +static int decode_watcher(void **p, void *end, struct ceph_watch_item *item) +{ + u8 struct_v; + u32 struct_len; + int ret; + + ret = ceph_start_decoding(p, end, 2, "watch_item_t", + &struct_v, &struct_len); + if (ret) + return ret; + + ceph_decode_copy(p, &item->name, sizeof(item->name)); + item->cookie = ceph_decode_64(p); + *p += 4; /* skip timeout_seconds */ + if (struct_v >= 2) { + ceph_decode_copy(p, &item->addr, sizeof(item->addr)); + ceph_decode_addr(&item->addr); + } + + dout("%s %s%llu cookie %llu addr %s\n", __func__, + ENTITY_NAME(item->name), item->cookie, + ceph_pr_addr(&item->addr.in_addr)); + return 0; +} + +static int decode_watchers(void **p, void *end, + struct ceph_watch_item **watchers, + u32 *num_watchers) +{ + u8 struct_v; + u32 struct_len; + int i; + int ret; + + ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t", + &struct_v, &struct_len); + if (ret) + return ret; + + *num_watchers = ceph_decode_32(p); + *watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO); + if (!*watchers) + return -ENOMEM; + + for (i = 0; i < *num_watchers; i++) { + ret = decode_watcher(p, end, *watchers + i); + if (ret) { + kfree(*watchers); + return ret; + } + } + + return 0; +} + +/* + * On success, the caller is responsible for: + * + * kfree(watchers); + */ +int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_watch_item **watchers, + u32 *num_watchers) +{ + struct ceph_osd_request *req; + struct page **pages; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return -ENOMEM; + + ceph_oid_copy(&req->r_base_oid, oid); + ceph_oloc_copy(&req->r_base_oloc, oloc); + req->r_flags = CEPH_OSD_FLAG_READ; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + + pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out_put_req; + } + + osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0); + ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers, + response_data), + pages, PAGE_SIZE, 0, false, true); + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + if (ret >= 0) { + void *p = page_address(pages[0]); + void *const end = p + req->r_ops[0].outdata_len; + + ret = decode_watchers(&p, end, watchers, num_watchers); + } + +out_put_req: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_list_watchers); + /* * Call all pending notify callbacks - for use after a watch is * unregistered, to make sure no more callbacks for it will be invoked */ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) { + dout("%s osdc %p\n", __func__, osdc); flush_workqueue(osdc->notify_wq); } EXPORT_SYMBOL(ceph_osdc_flush_notifies); @@ -3910,6 +4028,57 @@ void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc) EXPORT_SYMBOL(ceph_osdc_maybe_request_map); /* + * Execute an OSD class method on an object. + * + * @flags: CEPH_OSD_FLAG_* + * @resp_len: out param for reply length + */ +int ceph_osdc_call(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + const char *class, const char *method, + unsigned int flags, + struct page *req_page, size_t req_len, + struct page *resp_page, size_t *resp_len) +{ + struct ceph_osd_request *req; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return -ENOMEM; + + ceph_oid_copy(&req->r_base_oid, oid); + ceph_oloc_copy(&req->r_base_oloc, oloc); + req->r_flags = flags; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + + osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); + if (req_page) + osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len, + 0, false, false); + if (resp_page) + osd_req_op_cls_response_data_pages(req, 0, &resp_page, + PAGE_SIZE, 0, false, false); + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + if (ret >= 0) { + ret = req->r_ops[0].rval; + if (resp_page) + *resp_len = req->r_ops[0].outdata_len; + } + +out_put_req: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_call); + +/* * init, shutdown */ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 00d2601407c5..1a7c9a79a53c 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -26,7 +26,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_unlocked( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, 0, pages + got); + num_pages - got, pages + got, write_page ? FOLL_WRITE : 0); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/core/dev.c b/net/core/dev.c index f1fe26f66458..4bc19a164ba5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3845,7 +3845,7 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); -static void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5198,7 +5198,7 @@ out_unlock: return work; } -static void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b06d2f46b83e..fb7348f13501 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1144,6 +1144,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; + memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); + vf_mac.vf = vf_vlan.vf = vf_vlan_info.vf = diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cbd19d250947..1e3e0087245b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1962,37 +1962,13 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return false; } -ssize_t skb_socket_splice(struct sock *sk, - struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) -{ - int ret; - - /* Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, spd); - lock_sock(sk); - - return ret; -} - /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags, - ssize_t (*splice_cb)(struct sock *, - struct pipe_inode_info *, - struct splice_pipe_desc *)) + unsigned int flags) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -2009,7 +1985,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); if (spd.nr_pages) - ret = splice_cb(sk, pipe, &spd); + ret = splice_to_pipe(pipe, &spd); return ret; } diff --git a/net/core/sock.c b/net/core/sock.c index 038e660ef844..c73e28fc9c2a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1363,6 +1363,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); + mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); @@ -1399,6 +1400,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); @@ -1545,6 +1547,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + mem_cgroup_sk_alloc(newsk); cgroup_sk_alloc(&newsk->sk_cgrp_data); /* @@ -1569,9 +1572,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - sock_update_memcg(newsk); - if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 66ddcb60519a..7cf7d6e380c2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -258,7 +258,7 @@ int ping_init_sock(struct sock *sk) struct net *net = sock_net(sk); kgid_t group = current_egid(); struct group_info *group_info; - int i, j, count; + int i; kgid_t low, high; int ret = 0; @@ -270,16 +270,11 @@ int ping_init_sock(struct sock *sk) return 0; group_info = get_current_groups(); - count = group_info->ngroups; - for (i = 0; i < group_info->nblocks; i++) { - int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); - for (j = 0; j < cp_count; j++) { - kgid_t gid = group_info->blocks[i][j]; - if (gid_lte(low, gid) && gid_lte(gid, high)) - goto out_release_group; - } + for (i = 0; i < group_info->ngroups; i++) { + kgid_t gid = group_info->gid[i]; - count -= cp_count; + if (gid_lte(low, gid) && gid_lte(gid, high)) + goto out_release_group; } ret = -EACCES; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f2be689a6c85..62d4d90c1389 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2265,7 +2265,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif) { + if (fl4->flowi4_oif && + !netif_index_is_l3_master(net, fl4->flowi4_oif)) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f253e5019d22..3251fe71f39f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -424,8 +424,6 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - if (mem_cgroup_sockets_enabled) - sock_update_memcg(sk); sk_sockets_allocated_inc(sk); local_bh_enable(); } @@ -691,8 +689,7 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, int ret; ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, - min(rd_desc->count, len), tss->flags, - skb_socket_splice); + min(rd_desc->count, len), tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7ac37c314312..bd5e8d10893f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1871,9 +1871,6 @@ void tcp_v4_destroy_sock(struct sock *sk) local_bh_disable(); sk_sockets_allocated_dec(sk); local_bh_enable(); - - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index cbd9343751a2..d8983e15f859 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5729,6 +5729,7 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, return ret; } +static int minus_one = -1; static const int one = 1; static const int two_five_five = 255; @@ -5789,7 +5790,8 @@ static const struct ctl_table addrconf_sysctl[] = { .data = &ipv6_devconf.rtr_solicits, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minus_one, }, { .procname = "router_solicitation_interval", diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 54cf7197c7ab..5a27ab4eab39 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1190,6 +1190,16 @@ out: return NULL; } +static void tcp_v6_restore_cb(struct sk_buff *skb) +{ + /* We need to move header back to the beginning if xfrm6_policy_check() + * and tcp_v6_fill_cb() are going to be called again. + * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. + */ + memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, + sizeof(struct inet6_skb_parm)); +} + /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * @@ -1319,6 +1329,7 @@ ipv6_pktoptions: np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { skb_set_owner_r(opt_skb, sk); + tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { __kfree_skb(opt_skb); @@ -1352,15 +1363,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->sacked = 0; } -static void tcp_v6_restore_cb(struct sk_buff *skb) -{ - /* We need to move header back to the beginning if xfrm6_policy_check() - * and tcp_v6_fill_cb() are going to be called again. - */ - memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, - sizeof(struct inet6_skb_parm)); -} - static int tcp_v6_rcv(struct sk_buff *skb) { const struct tcphdr *th; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index b7f869a85ab7..7e08a4d3d77d 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1160,19 +1160,6 @@ out: return copied ? : err; } -static ssize_t kcm_sock_splice(struct sock *sk, - struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) -{ - int ret; - - release_sock(sk); - ret = splice_to_pipe(pipe, spd); - lock_sock(sk); - - return ret; -} - static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1202,8 +1189,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, if (len > rxm->full_len) len = rxm->full_len; - copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags, - kcm_sock_splice); + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index c9d90eb64046..fcb5d1df11e9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -65,49 +65,24 @@ static DEFINE_MUTEX(nf_hook_mutex); #define nf_entry_dereference(e) \ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) -static struct nf_hook_entry *nf_hook_entry_head(struct net *net, - const struct nf_hook_ops *reg) +static struct nf_hook_entry __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg) { - struct nf_hook_entry *hook_head = NULL; - if (reg->pf != NFPROTO_NETDEV) - hook_head = nf_entry_dereference(net->nf.hooks[reg->pf] - [reg->hooknum]); - else if (reg->hooknum == NF_NETDEV_INGRESS) { + return net->nf.hooks[reg->pf]+reg->hooknum; + #ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { if (reg->dev && dev_net(reg->dev) == net) - hook_head = - nf_entry_dereference( - reg->dev->nf_hooks_ingress); -#endif + return ®->dev->nf_hooks_ingress; } - return hook_head; -} - -/* must hold nf_hook_mutex */ -static void nf_set_hooks_head(struct net *net, const struct nf_hook_ops *reg, - struct nf_hook_entry *entry) -{ - switch (reg->pf) { - case NFPROTO_NETDEV: -#ifdef CONFIG_NETFILTER_INGRESS - /* We already checked in nf_register_net_hook() that this is - * used from ingress. - */ - rcu_assign_pointer(reg->dev->nf_hooks_ingress, entry); #endif - break; - default: - rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum], - entry); - break; - } + return NULL; } int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct nf_hook_entry *hooks_entry; - struct nf_hook_entry *entry; + struct nf_hook_entry __rcu **pp; + struct nf_hook_entry *entry, *p; if (reg->pf == NFPROTO_NETDEV) { #ifndef CONFIG_NETFILTER_INGRESS @@ -119,6 +94,10 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) return -EINVAL; } + pp = nf_hook_entry_head(net, reg); + if (!pp) + return -EINVAL; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; @@ -128,26 +107,15 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) entry->next = NULL; mutex_lock(&nf_hook_mutex); - hooks_entry = nf_hook_entry_head(net, reg); - - if (hooks_entry && hooks_entry->orig_ops->priority > reg->priority) { - /* This is the case where we need to insert at the head */ - entry->next = hooks_entry; - hooks_entry = NULL; - } - - while (hooks_entry && - reg->priority >= hooks_entry->orig_ops->priority && - nf_entry_dereference(hooks_entry->next)) { - hooks_entry = nf_entry_dereference(hooks_entry->next); - } - if (hooks_entry) { - entry->next = nf_entry_dereference(hooks_entry->next); - rcu_assign_pointer(hooks_entry->next, entry); - } else { - nf_set_hooks_head(net, reg, entry); + /* Find the spot in the list */ + while ((p = nf_entry_dereference(*pp)) != NULL) { + if (reg->priority < p->orig_ops->priority) + break; + pp = &p->next; } + rcu_assign_pointer(entry->next, p); + rcu_assign_pointer(*pp, entry); mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS @@ -163,33 +131,23 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct nf_hook_entry *hooks_entry; + struct nf_hook_entry __rcu **pp; + struct nf_hook_entry *p; - mutex_lock(&nf_hook_mutex); - hooks_entry = nf_hook_entry_head(net, reg); - if (hooks_entry && hooks_entry->orig_ops == reg) { - nf_set_hooks_head(net, reg, - nf_entry_dereference(hooks_entry->next)); - goto unlock; - } - while (hooks_entry && nf_entry_dereference(hooks_entry->next)) { - struct nf_hook_entry *next = - nf_entry_dereference(hooks_entry->next); - struct nf_hook_entry *nnext; + pp = nf_hook_entry_head(net, reg); + if (WARN_ON_ONCE(!pp)) + return; - if (next->orig_ops != reg) { - hooks_entry = next; - continue; + mutex_lock(&nf_hook_mutex); + while ((p = nf_entry_dereference(*pp)) != NULL) { + if (p->orig_ops == reg) { + rcu_assign_pointer(*pp, p->next); + break; } - nnext = nf_entry_dereference(next->next); - rcu_assign_pointer(hooks_entry->next, nnext); - hooks_entry = next; - break; + pp = &p->next; } - -unlock: mutex_unlock(&nf_hook_mutex); - if (!hooks_entry) { + if (!p) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); return; } @@ -201,10 +159,10 @@ unlock: static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(net, hooks_entry); + nf_queue_nf_hook_drop(net, p); /* other cpu might still process nfqueue verdict that used reg */ synchronize_net(); - kfree(hooks_entry); + kfree(p); } EXPORT_SYMBOL(nf_unregister_net_hook); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 627f898c05b9..62bea4591054 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1832,7 +1832,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* Record the max length of recvmsg() calls for future allocations */ nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, - 16384); + SKB_WITH_OVERHEAD(32768)); copied = data_skb->len; if (len < copied) { @@ -2083,8 +2083,9 @@ static int netlink_dump(struct sock *sk) if (alloc_min_size < nlk->max_recvmsg_len) { alloc_size = nlk->max_recvmsg_len; - skb = alloc_skb(alloc_size, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY); + skb = alloc_skb(alloc_size, + (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index c8c82e109c68..22087062bd10 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -343,7 +343,7 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) key->eth.cvlan.tci = 0; key->eth.cvlan.tpid = 0; - if (likely(skb_vlan_tag_present(skb))) { + if (skb_vlan_tag_present(skb)) { key->eth.vlan.tci = htons(skb->vlan_tci); key->eth.vlan.tpid = skb->vlan_proto; } else { diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 95c36147a6e1..e7da29021b38 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -176,7 +176,7 @@ static void do_setup(struct net_device *netdev) netdev->vlan_features = netdev->features; netdev->hw_enc_features = netdev->features; - netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; netdev->hw_features = netdev->features & ~NETIF_F_LLTX; eth_hw_addr_random(netdev); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 8f198437c724..7387418ac514 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -485,7 +485,8 @@ static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; - if (skb_vlan_tagged(skb)) + if (!skb_vlan_tag_present(skb) && + eth_type_vlan(skb->protocol)) length -= VLAN_HLEN; /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 33a4697d5539..11db0d619c00 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3952,6 +3952,7 @@ static int packet_notifier(struct notifier_block *this, } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); + fanout_release(sk); po->ifindex = -1; if (po->prot_hook.dev) dev_put(po->prot_hook.dev); diff --git a/net/rds/ib.c b/net/rds/ib.c index 7eaf887e46f8..5680d90b0b77 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -160,7 +160,7 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; rds_ibdev->dev = device; - rds_ibdev->pd = ib_alloc_pd(device); + rds_ibdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(rds_ibdev->pd)) { rds_ibdev->pd = NULL; goto put_dev; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 44c9c2b0b190..2d59c9be40e1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -678,9 +678,9 @@ static int rxrpc_release_sock(struct sock *sk) sk->sk_state = RXRPC_CLOSE; spin_unlock_bh(&sk->sk_receive_queue.lock); - if (rx->local && rx->local->service == rx) { + if (rx->local && rcu_access_pointer(rx->local->service) == rx) { write_lock(&rx->local->services_lock); - rx->local->service = NULL; + rcu_assign_pointer(rx->local->service, NULL); write_unlock(&rx->local->services_lock); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d38dffd78085..f60e35576526 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -398,6 +398,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ + RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_PINGING, /* Ping in process */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ }; @@ -410,6 +411,7 @@ enum rxrpc_call_event { RXRPC_CALL_EV_ABORT, /* need to generate abort */ RXRPC_CALL_EV_TIMER, /* Timer expired */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ + RXRPC_CALL_EV_PING, /* Ping send required */ }; /* @@ -466,6 +468,7 @@ struct rxrpc_call { struct rxrpc_sock __rcu *socket; /* socket responsible */ ktime_t ack_at; /* When deferred ACK needs to happen */ ktime_t resend_at; /* When next resend needs to happen */ + ktime_t ping_at; /* When next to send a ping */ ktime_t expire_at; /* When the call times out */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ @@ -558,8 +561,10 @@ struct rxrpc_call { rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */ rxrpc_seq_t ackr_seen; /* Highest packet shown seen */ - rxrpc_serial_t ackr_ping; /* Last ping sent */ - ktime_t ackr_ping_time; /* Time last ping sent */ + + /* ping management */ + rxrpc_serial_t ping_serial; /* Last ping sent */ + ktime_t ping_time; /* Time last ping sent */ /* transmission-phase ACK management */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ @@ -728,8 +733,10 @@ extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; enum rxrpc_timer_trace { rxrpc_timer_begin, rxrpc_timer_init_for_reply, + rxrpc_timer_init_for_send_reply, rxrpc_timer_expired, rxrpc_timer_set_for_ack, + rxrpc_timer_set_for_ping, rxrpc_timer_set_for_resend, rxrpc_timer_set_for_send, rxrpc_timer__nr_trace @@ -743,6 +750,7 @@ enum rxrpc_propose_ack_trace { rxrpc_propose_ack_ping_for_lost_ack, rxrpc_propose_ack_ping_for_lost_reply, rxrpc_propose_ack_ping_for_params, + rxrpc_propose_ack_processing_op, rxrpc_propose_ack_respond_to_ack, rxrpc_propose_ack_respond_to_ping, rxrpc_propose_ack_retry_tx, @@ -777,7 +785,7 @@ extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10]; extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9]; extern const char *const rxrpc_pkts[]; -extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; +extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; #include <trace/events/rxrpc.h> @@ -805,6 +813,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ +void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, enum rxrpc_propose_ack_trace); @@ -1068,7 +1077,8 @@ extern const s8 rxrpc_ack_priority[]; /* * output.c */ -int rxrpc_send_call_packet(struct rxrpc_call *, u8); +int rxrpc_send_ack_packet(struct rxrpc_call *, bool); +int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3cac231d8405..832d854c2d5c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -337,7 +337,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, /* Get the socket providing the service */ rx = rcu_dereference(local->service); - if (service_id == rx->srx.srx_service) + if (rx && service_id == rx->srx.srx_service) goto found_service; trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, @@ -565,7 +565,7 @@ out_discard: write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); if (abort) { - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_send_abort_packet(call); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 4f00476630b9..97a17ada4431 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,19 +24,20 @@ /* * Set the timer */ -void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, - ktime_t now) +void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, + ktime_t now) { unsigned long t_j, now_j = jiffies; ktime_t t; bool queue = false; - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { t = call->expire_at; - if (!ktime_after(t, now)) + if (!ktime_after(t, now)) { + trace_rxrpc_timer(call, why, now, now_j); + queue = true; goto out; + } if (!ktime_after(call->resend_at, now)) { call->resend_at = call->expire_at; @@ -54,6 +55,14 @@ void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, t = call->ack_at; } + if (!ktime_after(call->ping_at, now)) { + call->ping_at = call->expire_at; + if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events)) + queue = true; + } else if (ktime_before(call->ping_at, t)) { + t = call->ping_at; + } + t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now))); t_j += jiffies; @@ -68,16 +77,46 @@ void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, mod_timer(&call->timer, t_j); trace_rxrpc_timer(call, why, now, now_j); } - - if (queue) - rxrpc_queue_call(call); } out: + if (queue) + rxrpc_queue_call(call); +} + +/* + * Set the timer + */ +void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, + ktime_t now) +{ + read_lock_bh(&call->state_lock); + __rxrpc_set_timer(call, why, now); read_unlock_bh(&call->state_lock); } /* + * Propose a PING ACK be sent. + */ +static void rxrpc_propose_ping(struct rxrpc_call *call, + bool immediate, bool background) +{ + if (immediate) { + if (background && + !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events)) + rxrpc_queue_call(call); + } else { + ktime_t now = ktime_get_real(); + ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay); + + if (ktime_before(ping_at, call->ping_at)) { + call->ping_at = ping_at; + rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now); + } + } +} + +/* * propose an ACK be sent */ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, @@ -90,6 +129,14 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, ktime_t now, ack_at; s8 prior = rxrpc_ack_priority[ack_reason]; + /* Pings are handled specially because we don't want to accidentally + * lose a ping response by subsuming it into a ping. + */ + if (ack_reason == RXRPC_ACK_PING) { + rxrpc_propose_ping(call, immediate, background); + goto trace; + } + /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial * numbers, but we don't alter the timeout. */ @@ -125,7 +172,6 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, expiry = rxrpc_soft_ack_delay; break; - case RXRPC_ACK_PING: case RXRPC_ACK_IDLE: if (rxrpc_idle_ack_delay < expiry) expiry = rxrpc_idle_ack_delay; @@ -253,7 +299,7 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) goto out; rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, rxrpc_propose_ack_ping_for_lost_ack); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + rxrpc_send_ack_packet(call, true); goto out; } @@ -328,12 +374,13 @@ void rxrpc_process_call(struct work_struct *work) recheck_state: if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_send_abort_packet(call); goto recheck_state; } if (call->state == RXRPC_CALL_COMPLETE) { del_timer_sync(&call->timer); + rxrpc_notify_socket(call); goto out_put; } @@ -345,13 +392,17 @@ recheck_state: } if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) { - call->ack_at = call->expire_at; if (call->ackr_reason) { - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + rxrpc_send_ack_packet(call, false); goto recheck_state; } } + if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) { + rxrpc_send_ack_packet(call, true); + goto recheck_state; + } + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) { rxrpc_resend(call, now); goto recheck_state; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 364b42dc3dce..4353a29f3b57 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -205,6 +205,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime); call->expire_at = expire_at; call->ack_at = expire_at; + call->ping_at = expire_at; call->resend_at = expire_at; call->timer.expires = jiffies + LONG_MAX / 2; rxrpc_set_timer(call, rxrpc_timer_begin, now); @@ -498,7 +499,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) struct rxrpc_call, sock_link); rxrpc_get_call(call, rxrpc_call_got); rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_send_abort_packet(call); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 3ad9f75031e3..44fb8d893c7d 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -625,9 +625,9 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call, rxrpc_serial_t ping_serial; ktime_t ping_time; - ping_time = call->ackr_ping_time; + ping_time = call->ping_time; smp_rmb(); - ping_serial = call->ackr_ping; + ping_serial = call->ping_serial; if (!test_bit(RXRPC_CALL_PINGING, &call->flags) || before(orig_serial, ping_serial)) @@ -847,7 +847,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & RXRPC_TX_ANNO_LAST && - summary.nr_acks == call->tx_top - hard_ack) + summary.nr_acks == call->tx_top - hard_ack && + rxrpc_is_client_call(call)) rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, false, true, rxrpc_propose_ack_ping_for_lost_reply); @@ -938,6 +939,33 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, } /* + * Handle a new call on a channel implicitly completing the preceding call on + * that channel. + * + * TODO: If callNumber > call_id + 1, renegotiate security. + */ +static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, + struct rxrpc_call *call) +{ + switch (call->state) { + case RXRPC_CALL_SERVER_AWAIT_ACK: + rxrpc_call_completed(call); + break; + case RXRPC_CALL_COMPLETE: + break; + default: + if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, ESHUTDOWN)) { + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } + break; + } + + __rxrpc_disconnect_call(conn, call); + rxrpc_notify_socket(call); +} + +/* * post connection-level events to the connection * - this includes challenges, responses, some aborts and call terminal packet * retransmission. @@ -1145,6 +1173,16 @@ void rxrpc_data_ready(struct sock *udp_sk) } call = rcu_dereference(chan->call); + + if (sp->hdr.callNumber > chan->call_id) { + if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) { + rcu_read_unlock(); + goto reject_packet; + } + if (call) + rxrpc_input_implicit_end_call(conn, call); + call = NULL; + } } else { skew = 0; call = NULL; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 9d1c721bc4e8..6dee55fad2d3 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -93,10 +93,9 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_EXCEEDS_WINDOW] = 6, [RXRPC_ACK_NOSPACE] = 7, [RXRPC_ACK_PING_RESPONSE] = 8, - [RXRPC_ACK_PING] = 9, }; -const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = { +const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = { "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", "IDL", "-?-" }; @@ -196,7 +195,9 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { [rxrpc_timer_begin] = "Begin ", [rxrpc_timer_expired] = "*EXPR*", [rxrpc_timer_init_for_reply] = "IniRpl", + [rxrpc_timer_init_for_send_reply] = "SndRpl", [rxrpc_timer_set_for_ack] = "SetAck", + [rxrpc_timer_set_for_ping] = "SetPng", [rxrpc_timer_set_for_send] = "SetTx ", [rxrpc_timer_set_for_resend] = "SetRTx", }; @@ -207,6 +208,7 @@ const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck", [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", [rxrpc_propose_ack_ping_for_params] = "Params ", + [rxrpc_propose_ack_processing_op] = "ProcOp ", [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", [rxrpc_propose_ack_retry_tx] = "RetryTx", diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0d47db886f6e..5dab1ff3a6c2 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -19,26 +19,27 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -struct rxrpc_pkt_buffer { +struct rxrpc_ack_buffer { struct rxrpc_wire_header whdr; - union { - struct { - struct rxrpc_ackpacket ack; - u8 acks[255]; - u8 pad[3]; - }; - __be32 abort_code; - }; + struct rxrpc_ackpacket ack; + u8 acks[255]; + u8 pad[3]; struct rxrpc_ackinfo ackinfo; }; +struct rxrpc_abort_buffer { + struct rxrpc_wire_header whdr; + __be32 abort_code; +}; + /* * Fill out an ACK packet. */ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, - struct rxrpc_pkt_buffer *pkt, + struct rxrpc_ack_buffer *pkt, rxrpc_seq_t *_hard_ack, - rxrpc_seq_t *_top) + rxrpc_seq_t *_top, + u8 reason) { rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; @@ -58,10 +59,10 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); pkt->ack.serial = htonl(serial); - pkt->ack.reason = call->ackr_reason; + pkt->ack.reason = reason; pkt->ack.nAcks = top - hard_ack; - if (pkt->ack.reason == RXRPC_ACK_PING) + if (reason == RXRPC_ACK_PING) pkt->whdr.flags |= RXRPC_REQUEST_ACK; if (after(top, hard_ack)) { @@ -91,22 +92,19 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, } /* - * Send an ACK or ABORT call packet. + * Send an ACK call packet. */ -int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) +int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping) { struct rxrpc_connection *conn = NULL; - struct rxrpc_pkt_buffer *pkt; + struct rxrpc_ack_buffer *pkt; struct msghdr msg; struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; size_t len, n; - bool ping = false; - int ioc, ret; - u32 abort_code; - - _enter("%u,%s", call->debug_id, rxrpc_pkts[type]); + int ret; + u8 reason; spin_lock_bh(&call->lock); if (call->conn) @@ -131,68 +129,44 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) pkt->whdr.cid = htonl(call->cid); pkt->whdr.callNumber = htonl(call->call_id); pkt->whdr.seq = 0; - pkt->whdr.type = type; - pkt->whdr.flags = conn->out_clientflag; + pkt->whdr.type = RXRPC_PACKET_TYPE_ACK; + pkt->whdr.flags = RXRPC_SLOW_START_OK | conn->out_clientflag; pkt->whdr.userStatus = 0; pkt->whdr.securityIndex = call->security_ix; pkt->whdr._rsvd = 0; pkt->whdr.serviceId = htons(call->service_id); - iov[0].iov_base = pkt; - iov[0].iov_len = sizeof(pkt->whdr); - len = sizeof(pkt->whdr); - - switch (type) { - case RXRPC_PACKET_TYPE_ACK: - spin_lock_bh(&call->lock); + spin_lock_bh(&call->lock); + if (ping) { + reason = RXRPC_ACK_PING; + } else { + reason = call->ackr_reason; if (!call->ackr_reason) { spin_unlock_bh(&call->lock); ret = 0; goto out; } - ping = (call->ackr_reason == RXRPC_ACK_PING); - n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top); call->ackr_reason = 0; + } + n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top, reason); - spin_unlock_bh(&call->lock); - - - pkt->whdr.flags |= RXRPC_SLOW_START_OK; - - iov[0].iov_len += sizeof(pkt->ack) + n; - iov[1].iov_base = &pkt->ackinfo; - iov[1].iov_len = sizeof(pkt->ackinfo); - len += sizeof(pkt->ack) + n + sizeof(pkt->ackinfo); - ioc = 2; - break; - - case RXRPC_PACKET_TYPE_ABORT: - abort_code = call->abort_code; - pkt->abort_code = htonl(abort_code); - iov[0].iov_len += sizeof(pkt->abort_code); - len += sizeof(pkt->abort_code); - ioc = 1; - break; + spin_unlock_bh(&call->lock); - default: - BUG(); - ret = -ENOANO; - goto out; - } + iov[0].iov_base = pkt; + iov[0].iov_len = sizeof(pkt->whdr) + sizeof(pkt->ack) + n; + iov[1].iov_base = &pkt->ackinfo; + iov[1].iov_len = sizeof(pkt->ackinfo); + len = iov[0].iov_len + iov[1].iov_len; serial = atomic_inc_return(&conn->serial); pkt->whdr.serial = htonl(serial); - switch (type) { - case RXRPC_PACKET_TYPE_ACK: - trace_rxrpc_tx_ack(call, serial, - ntohl(pkt->ack.firstPacket), - ntohl(pkt->ack.serial), - pkt->ack.reason, pkt->ack.nAcks); - break; - } + trace_rxrpc_tx_ack(call, serial, + ntohl(pkt->ack.firstPacket), + ntohl(pkt->ack.serial), + pkt->ack.reason, pkt->ack.nAcks); if (ping) { - call->ackr_ping = serial; + call->ping_serial = serial; smp_wmb(); /* We need to stick a time in before we send the packet in case * the reply gets back before kernel_sendmsg() completes - but @@ -201,19 +175,19 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) * the packet transmission is more likely to happen towards the * end of the kernel_sendmsg() call. */ - call->ackr_ping_time = ktime_get_real(); + call->ping_time = ktime_get_real(); set_bit(RXRPC_CALL_PINGING, &call->flags); trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial); } - ret = kernel_sendmsg(conn->params.local->socket, - &msg, iov, ioc, len); + + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ping) - call->ackr_ping_time = ktime_get_real(); + call->ping_time = ktime_get_real(); - if (type == RXRPC_PACKET_TYPE_ACK && - call->state < RXRPC_CALL_COMPLETE) { + if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { - clear_bit(RXRPC_CALL_PINGING, &call->flags); + if (ping) + clear_bit(RXRPC_CALL_PINGING, &call->flags); rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), @@ -236,6 +210,56 @@ out: } /* + * Send an ABORT call packet. + */ +int rxrpc_send_abort_packet(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = NULL; + struct rxrpc_abort_buffer pkt; + struct msghdr msg; + struct kvec iov[1]; + rxrpc_serial_t serial; + int ret; + + spin_lock_bh(&call->lock); + if (call->conn) + conn = rxrpc_get_connection_maybe(call->conn); + spin_unlock_bh(&call->lock); + if (!conn) + return -ECONNRESET; + + msg.msg_name = &call->peer->srx.transport; + msg.msg_namelen = call->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + pkt.whdr.epoch = htonl(conn->proto.epoch); + pkt.whdr.cid = htonl(call->cid); + pkt.whdr.callNumber = htonl(call->call_id); + pkt.whdr.seq = 0; + pkt.whdr.type = RXRPC_PACKET_TYPE_ABORT; + pkt.whdr.flags = conn->out_clientflag; + pkt.whdr.userStatus = 0; + pkt.whdr.securityIndex = call->security_ix; + pkt.whdr._rsvd = 0; + pkt.whdr.serviceId = htons(call->service_id); + pkt.abort_code = htonl(call->abort_code); + + iov[0].iov_base = &pkt; + iov[0].iov_len = sizeof(pkt); + + serial = atomic_inc_return(&conn->serial); + pkt.whdr.serial = htonl(serial); + + ret = kernel_sendmsg(conn->params.local->socket, + &msg, iov, 1, sizeof(pkt)); + + rxrpc_put_connection(conn); + return ret; +} + +/* * send a packet through the transport endpoint */ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, @@ -283,11 +307,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. */ - if (retrans || - call->cong_mode == RXRPC_CALL_SLOW_START || - (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), - ktime_get_real())) + if (!(sp->hdr.flags & RXRPC_LAST_PACKET) && + (retrans || + call->cong_mode == RXRPC_CALL_SLOW_START || + (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + ktime_get_real()))) whdr.flags |= RXRPC_REQUEST_ACK; if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index f05ea0a88076..c29362d50a92 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -143,7 +143,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, rxrpc_propose_ack_terminal_ack); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + rxrpc_send_ack_packet(call, false); } write_lock_bh(&call->state_lock); @@ -151,17 +151,21 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) switch (call->state) { case RXRPC_CALL_CLIENT_RECV_REPLY: __rxrpc_call_completed(call); + write_unlock_bh(&call->state_lock); break; case RXRPC_CALL_SERVER_RECV_REQUEST: call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + call->ack_at = call->expire_at; + write_unlock_bh(&call->state_lock); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true, + rxrpc_propose_ack_processing_op); break; default: + write_unlock_bh(&call->state_lock); break; } - - write_unlock_bh(&call->state_lock); } /* @@ -212,7 +216,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) true, false, rxrpc_propose_ack_rotate_rx); if (call->ackr_reason) - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + rxrpc_send_ack_packet(call, false); } } @@ -652,7 +656,7 @@ excess_data: goto out; call_complete: *_abort = call->abort_code; - ret = call->error; + ret = -call->error; if (call->completion == RXRPC_CALL_SUCCEEDED) { ret = 1; if (size > 0) diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 627abed5f999..4374e7b9c7bf 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -381,7 +381,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, return 0; protocol_error: - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_send_abort_packet(call); _leave(" = -EPROTO"); return -EPROTO; @@ -471,7 +471,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, return 0; protocol_error: - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_send_abort_packet(call); _leave(" = -EPROTO"); return -EPROTO; @@ -523,7 +523,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, if (cksum != expected_cksum) { rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_send_abort_packet(call); _leave(" = -EPROTO [csum failed]"); return -EPROTO; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 3322543d460a..b214a4d4a641 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -130,6 +130,11 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, break; case RXRPC_CALL_SERVER_ACK_REQUEST: call->state = RXRPC_CALL_SERVER_SEND_REPLY; + call->ack_at = call->expire_at; + if (call->ackr_reason == RXRPC_ACK_DELAY) + call->ackr_reason = 0; + __rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply, + ktime_get_real()); if (!last) break; case RXRPC_CALL_SERVER_SEND_REPLY: @@ -197,7 +202,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, do { /* Check to see if there's a ping ACK to reply to. */ if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + rxrpc_send_ack_packet(call, false); if (!skb) { size_t size, chunk, max, space; @@ -514,8 +519,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } else if (cmd == RXRPC_CMD_SEND_ABORT) { ret = 0; if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED)) - ret = rxrpc_send_call_packet(call, - RXRPC_PACKET_TYPE_ABORT); + ret = rxrpc_send_abort_packet(call); } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else if (rxrpc_is_client_call(call) && @@ -597,7 +601,7 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, lock_sock(sock->sk); if (rxrpc_abort_call(why, call, 0, abort_code, error)) - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_send_abort_packet(call); release_sock(sock->sk); _leave(""); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index c9102172ce3b..a512b18c0088 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -341,22 +341,25 @@ int tcf_register_action(struct tc_action_ops *act, if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup) return -EINVAL; + /* We have to register pernet ops before making the action ops visible, + * otherwise tcf_action_init_1() could get a partially initialized + * netns. + */ + ret = register_pernet_subsys(ops); + if (ret) + return ret; + write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); + unregister_pernet_subsys(ops); return -EEXIST; } } list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); - ret = register_pernet_subsys(ops); - if (ret) { - tcf_unregister_action(act, ops); - return ret; - } - return 0; } EXPORT_SYMBOL(tcf_register_action); @@ -367,8 +370,6 @@ int tcf_unregister_action(struct tc_action_ops *act, struct tc_action_ops *a; int err = -ENOENT; - unregister_pernet_subsys(ops); - write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (a == act) { @@ -378,6 +379,8 @@ int tcf_unregister_action(struct tc_action_ops *act, } } write_unlock(&act_mod_lock); + if (!err) + unregister_pernet_subsys(ops); return err; } EXPORT_SYMBOL(tcf_unregister_action); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 11da7da0b7c4..2ee29a3375f6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -101,7 +101,7 @@ EXPORT_SYMBOL(unregister_tcf_proto_ops); static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - unsigned long fh, int event); + unsigned long fh, int event, bool unicast); static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, @@ -112,7 +112,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL; it_chain = &tp->next) - tfilter_notify(net, oskb, n, tp, 0, event); + tfilter_notify(net, oskb, n, tp, 0, event, false); } /* Select new prio value from the range, managed by kernel. */ @@ -319,7 +319,8 @@ replay: RCU_INIT_POINTER(*back, next); - tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, + RTM_DELTFILTER, false); tcf_destroy(tp, true); err = 0; goto errout; @@ -345,14 +346,14 @@ replay: struct tcf_proto *next = rtnl_dereference(tp->next); tfilter_notify(net, skb, n, tp, fh, - RTM_DELTFILTER); + RTM_DELTFILTER, false); if (tcf_destroy(tp, false)) RCU_INIT_POINTER(*back, next); } goto errout; case RTM_GETTFILTER: err = tfilter_notify(net, skb, n, tp, fh, - RTM_NEWTFILTER); + RTM_NEWTFILTER, true); goto errout; default: err = -EINVAL; @@ -367,7 +368,7 @@ replay: RCU_INIT_POINTER(tp->next, rtnl_dereference(*back)); rcu_assign_pointer(*back, tp); } - tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); + tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); } else { if (tp_created) tcf_destroy(tp, true); @@ -419,7 +420,7 @@ nla_put_failure: static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - unsigned long fh, int event) + unsigned long fh, int event, bool unicast) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -433,6 +434,9 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, return -EINVAL; } + if (unicast) + return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } diff --git a/net/socket.c b/net/socket.c index a1bd16106625..5a9bf5ee2464 100644 --- a/net/socket.c +++ b/net/socket.c @@ -320,11 +320,38 @@ static const struct dentry_operations sockfs_dentry_operations = { .d_dname = sockfs_dname, }; +static int sockfs_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *suffix, void *value, size_t size) +{ + if (value) { + if (dentry->d_name.len + 1 > size) + return -ERANGE; + memcpy(value, dentry->d_name.name, dentry->d_name.len + 1); + } + return dentry->d_name.len + 1; +} + +#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" +#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) +#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) + +static const struct xattr_handler sockfs_xattr_handler = { + .name = XATTR_NAME_SOCKPROTONAME, + .get = sockfs_xattr_get, +}; + +static const struct xattr_handler *sockfs_xattr_handlers[] = { + &sockfs_xattr_handler, + NULL +}; + static struct dentry *sockfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "socket:", &sockfs_ops, - &sockfs_dentry_operations, SOCKFS_MAGIC); + return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops, + sockfs_xattr_handlers, + &sockfs_dentry_operations, SOCKFS_MAGIC); } static struct vfsmount *sock_mnt __read_mostly; @@ -463,35 +490,6 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) return NULL; } -#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" -#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) -#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) -static ssize_t sockfs_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size) -{ - const char *proto_name; - size_t proto_size; - int error; - - error = -ENODATA; - if (!strncmp(name, XATTR_NAME_SOCKPROTONAME, XATTR_NAME_SOCKPROTONAME_LEN)) { - proto_name = dentry->d_name.name; - proto_size = strlen(proto_name); - - if (value) { - error = -ERANGE; - if (proto_size + 1 > size) - goto out; - - strncpy(value, proto_name, proto_size + 1); - } - error = proto_size + 1; - } - -out: - return error; -} - static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { @@ -521,7 +519,6 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, } static const struct inode_operations sockfs_inode_ops = { - .getxattr = sockfs_getxattr, .listxattr = sockfs_listxattr, }; diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 5c7549b5b92c..41adf362936d 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -246,7 +246,7 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, } else { strp->rx_interrupted = 1; } - strp_parser_err(strp, err, desc); + strp_parser_err(strp, len, desc); break; } else if (len > strp->sk->sk_rcvbuf) { /* Message length exceeds maximum allowed */ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index a7e42f9a405c..2bff63a73cf8 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -551,7 +551,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, *entry, *new; unsigned int nr; - nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits); + nr = auth->au_ops->hash_cred(acred, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 168219535a34..f1df9837f1ac 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -78,6 +78,14 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task, return auth->au_ops->lookup_cred(auth, acred, lookupflags); } +static int +generic_hash_cred(struct auth_cred *acred, unsigned int hashbits) +{ + return hash_64(from_kgid(&init_user_ns, acred->gid) | + ((u64)from_kuid(&init_user_ns, acred->uid) << + (sizeof(gid_t) * 8)), hashbits); +} + /* * Lookup generic creds for current process */ @@ -176,8 +184,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) goto out_nomatch; for (i = 0; i < gcred->acred.group_info->ngroups; i++) { - if (!gid_eq(GROUP_AT(gcred->acred.group_info, i), - GROUP_AT(acred->group_info, i))) + if (!gid_eq(gcred->acred.group_info->gid[i], + acred->group_info->gid[i])) goto out_nomatch; } out_match: @@ -258,6 +266,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) static const struct rpc_authops generic_auth_ops = { .owner = THIS_MODULE, .au_name = "Generic", + .hash_cred = generic_hash_cred, .lookup_cred = generic_lookup_cred, .crcreate = generic_create_cred, .key_timeout = generic_key_timeout, diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 976c7812bbd5..d8bd97a5a7c9 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1298,6 +1298,12 @@ gss_destroy_cred(struct rpc_cred *cred) gss_destroy_nullcred(cred); } +static int +gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) +{ + return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits); +} + /* * Lookup RPCSEC_GSS cred for the current process */ @@ -1982,6 +1988,7 @@ static const struct rpc_authops authgss_ops = { .au_name = "RPCSEC_GSS", .create = gss_create, .destroy = gss_destroy, + .hash_cred = gss_hash_cred, .lookup_cred = gss_lookup_cred, .crcreate = gss_create_cred, .list_pseudoflavors = gss_mech_list_pseudoflavors, diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index eeeba5adee6d..dc6fb79a361f 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -229,7 +229,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, kgid = make_kgid(&init_user_ns, tmp); if (!gid_valid(kgid)) goto out_free_groups; - GROUP_AT(creds->cr_group_info, i) = kgid; + creds->cr_group_info->gid[i] = kgid; } return 0; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index d8582028b346..d67f7e1bc82d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -479,7 +479,7 @@ static int rsc_parse(struct cache_detail *cd, kgid = make_kgid(&init_user_ns, id); if (!gid_valid(kgid)) goto out; - GROUP_AT(rsci.cred.cr_group_info, i) = kgid; + rsci.cred.cr_group_info->gid[i] = kgid; } /* mech name */ diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index a99278c984e8..306fc0f54596 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -46,6 +46,14 @@ unx_destroy(struct rpc_auth *auth) rpcauth_clear_credcache(auth->au_credcache); } +static int +unx_hash_cred(struct auth_cred *acred, unsigned int hashbits) +{ + return hash_64(from_kgid(&init_user_ns, acred->gid) | + ((u64)from_kuid(&init_user_ns, acred->uid) << + (sizeof(gid_t) * 8)), hashbits); +} + /* * Lookup AUTH_UNIX creds for current process */ @@ -79,7 +87,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t cred->uc_gid = acred->gid; for (i = 0; i < groups; i++) - cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + cred->uc_gids[i] = acred->group_info->gid[i]; if (i < NFS_NGROUPS) cred->uc_gids[i] = INVALID_GID; @@ -127,7 +135,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) if (groups > NFS_NGROUPS) groups = NFS_NGROUPS; for (i = 0; i < groups ; i++) - if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i))) + if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) return 0; if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) return 0; @@ -220,6 +228,7 @@ const struct rpc_authops authunix_ops = { .au_name = "UNIX", .create = unx_create, .destroy = unx_destroy, + .hash_cred = unx_hash_cred, .lookup_cred = unx_lookup_cred, .crcreate = unx_create_cred, }; diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 229956bf8457..ac701c28f44f 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) page = alloc_page(gfp_flags); if (page == NULL) return -ENOMEM; - buf->head[0].iov_base = page_address(page); - buf->head[0].iov_len = PAGE_SIZE; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = PAGE_SIZE; + xdr_buf_init(buf, page_address(page), PAGE_SIZE); return 0; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 4d8e11f94a35..8aabe12201f8 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -353,7 +353,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) spin_unlock(&cache_list_lock); /* start the cleaning process */ - schedule_delayed_work(&cache_cleaner, 0); + queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0); } EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail); @@ -476,7 +476,8 @@ static void do_cache_clean(struct work_struct *work) delay = 0; if (delay) - schedule_delayed_work(&cache_cleaner, delay); + queue_delayed_work(system_power_efficient_wq, + &cache_cleaner, delay); } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 66f23b376fa0..34dd7b26ee5f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -184,7 +184,6 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { struct dentry *dentry; - int err = 0; switch (event) { case RPC_PIPEFS_MOUNT: @@ -201,7 +200,7 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event); return -ENOTSUPP; } - return err; + return 0; } static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, @@ -988,7 +987,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) { if (clnt != NULL) { - rpc_task_release_client(task); if (task->tk_xprt == NULL) task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi); task->tk_client = clnt; @@ -1693,6 +1691,7 @@ call_allocate(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + int status; dprint_status(task); @@ -1718,11 +1717,14 @@ call_allocate(struct rpc_task *task) req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen; req->rq_rcvsize <<= 2; - req->rq_buffer = xprt->ops->buf_alloc(task, - req->rq_callsize + req->rq_rcvsize); - if (req->rq_buffer != NULL) - return; + status = xprt->ops->buf_alloc(task); xprt_inject_disconnect(xprt); + if (status == 0) + return; + if (status != -ENOMEM) { + rpc_exit(task, status); + return; + } dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); @@ -1748,18 +1750,6 @@ rpc_task_force_reencode(struct rpc_task *task) task->tk_rqstp->rq_bytes_sent = 0; } -static inline void -rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) -{ - buf->head[0].iov_base = start; - buf->head[0].iov_len = len; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->flags = 0; - buf->len = 0; - buf->buflen = len; -} - /* * 3. Encode arguments of an RPC call */ @@ -1772,12 +1762,12 @@ rpc_xdr_encode(struct rpc_task *task) dprint_status(task); - rpc_xdr_buf_init(&req->rq_snd_buf, - req->rq_buffer, - req->rq_callsize); - rpc_xdr_buf_init(&req->rq_rcv_buf, - (char *)req->rq_buffer + req->rq_callsize, - req->rq_rcvsize); + xdr_buf_init(&req->rq_snd_buf, + req->rq_buffer, + req->rq_callsize); + xdr_buf_init(&req->rq_rcv_buf, + req->rq_rbuffer, + req->rq_rcvsize); p = rpc_encode_header(task); if (p == NULL) { @@ -2616,6 +2606,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); /** + * rpc_clnt_setup_test_and_add_xprt() + * + * This is an rpc_clnt_add_xprt setup() function which returns 1 so: + * 1) caller of the test function must dereference the rpc_xprt_switch + * and the rpc_xprt. + * 2) test function must call rpc_xprt_switch_add_xprt, usually in + * the rpc_call_done routine. + * + * Upon success (return of 1), the test function adds the new + * transport to the rpc_clnt xprt switch + * + * @clnt: struct rpc_clnt to get the new transport + * @xps: the rpc_xprt_switch to hold the new transport + * @xprt: the rpc_xprt to test + * @data: a struct rpc_add_xprt_test pointer that holds the test function + * and test function call data + */ +int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt, + void *data) +{ + struct rpc_cred *cred; + struct rpc_task *task; + struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data; + int status = -EADDRINUSE; + + xprt = xprt_get(xprt); + xprt_switch_get(xps); + + if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) + goto out_err; + + /* Test the connection */ + cred = authnull_ops.lookup_cred(NULL, NULL, 0); + task = rpc_call_null_helper(clnt, xprt, cred, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + NULL, NULL); + put_rpccred(cred); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out_err; + } + status = task->tk_status; + rpc_put_task(task); + + if (status < 0) + goto out_err; + + /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ + xtest->add_xprt_test(clnt, xprt, xtest->data); + + /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ + return 1; +out_err: + xprt_put(xprt); + xprt_switch_put(xps); + pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n", + status, xprt->address_strings[RPC_DISPLAY_ADDR]); + return status; +} +EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt); + +/** * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt * @clnt: pointer to struct rpc_clnt * @xprtargs: pointer to struct xprt_create @@ -2697,6 +2751,34 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) } EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); +void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) +{ + xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); + +void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), + xprt); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); + +bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, + const struct sockaddr *sap) +{ + struct rpc_xprt_switch *xps; + bool ret; + + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + + rcu_read_lock(); + ret = rpc_xprt_switch_has_addr(xps, sap); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 84f98cbe31c3..61a504fb1ae2 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -477,7 +477,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode) return NULL; inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9ae588511aaf..5db68b371db2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work) } /** - * rpc_malloc - allocate an RPC buffer - * @task: RPC task that will use this buffer - * @size: requested byte size + * rpc_malloc - allocate RPC buffer resources + * @task: RPC task + * + * A single memory region is allocated, which is split between the + * RPC call and RPC reply that this task is being used for. When + * this RPC is retired, the memory is released by calling rpc_free. * * To prevent rpciod from hanging, this allocator never sleeps, - * returning NULL and suppressing warning if the request cannot be serviced - * immediately. - * The caller can arrange to sleep in a way that is safe for rpciod. + * returning -ENOMEM and suppressing warning if the request cannot + * be serviced immediately. The caller can arrange to sleep in a + * way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a * mempool, ensuring that NFS reads and writes can always proceed, @@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work) * In order to avoid memory starvation triggering more writebacks of * NFS requests, we avoid using GFP_KERNEL. */ -void *rpc_malloc(struct rpc_task *task, size_t size) +int rpc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpc_buffer *buf; gfp_t gfp = GFP_NOIO | __GFP_NOWARN; @@ -880,28 +885,28 @@ void *rpc_malloc(struct rpc_task *task, size_t size) buf = kmalloc(size, gfp); if (!buf) - return NULL; + return -ENOMEM; buf->len = size; dprintk("RPC: %5u allocated buffer of size %zu at %p\n", task->tk_pid, size, buf); - return &buf->data; + rqst->rq_buffer = buf->data; + rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; + return 0; } EXPORT_SYMBOL_GPL(rpc_malloc); /** - * rpc_free - free buffer allocated via rpc_malloc - * @buffer: buffer to free + * rpc_free - free RPC buffer resources allocated via rpc_malloc + * @task: RPC task * */ -void rpc_free(void *buffer) +void rpc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; size_t size; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); size = buf->len; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c5b0cb4f4056..7c8070ec93c8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -401,6 +401,21 @@ int svc_bind(struct svc_serv *serv, struct net *net) } EXPORT_SYMBOL_GPL(svc_bind); +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static void +__svc_init_bc(struct svc_serv *serv) +{ + INIT_LIST_HEAD(&serv->sv_cb_list); + spin_lock_init(&serv->sv_cb_lock); + init_waitqueue_head(&serv->sv_cb_waitq); +} +#else +static void +__svc_init_bc(struct svc_serv *serv) +{ +} +#endif + /* * Create an RPC service */ @@ -443,6 +458,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, init_timer(&serv->sv_temptimer); spin_lock_init(&serv->sv_lock); + __svc_init_bc(serv); + serv->sv_nrpools = npools; serv->sv_pools = kcalloc(serv->sv_nrpools, sizeof(struct svc_pool), diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index dfacdc95b3f5..64af4f034de6 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -517,7 +517,7 @@ static int unix_gid_parse(struct cache_detail *cd, kgid = make_kgid(&init_user_ns, gid); if (!gid_valid(kgid)) goto out; - GROUP_AT(ug.gi, i) = kgid; + ug.gi->gid[i] = kgid; } ugp = unix_gid_lookup(cd, uid); @@ -564,7 +564,7 @@ static int unix_gid_show(struct seq_file *m, seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen); for (i = 0; i < glen; i++) - seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i))); + seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i])); seq_printf(m, "\n"); return 0; } @@ -817,7 +817,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_CLOSE; for (i = 0; i < slen; i++) { kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); - GROUP_AT(cred->cr_group_info, i) = kgid; + cred->cr_group_info->gid[i] = kgid; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index c4f3cc0c0775..7f1071e103ca 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -767,7 +767,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase -= xdr->buf->page_base; if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len); + xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) @@ -776,7 +776,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr) xdr_set_next_page(xdr); else if (xdr->iov == xdr->buf->head) { if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len); + xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); } return xdr->p != xdr->end; } @@ -859,12 +859,15 @@ EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer); static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; - void *cpdest = xdr->scratch.iov_base; + char *cpdest = xdr->scratch.iov_base; size_t cplen = (char *)xdr->end - (char *)xdr->p; if (nbytes > xdr->scratch.iov_len) return NULL; - memcpy(cpdest, xdr->p, cplen); + p = __xdr_inline_decode(xdr, cplen); + if (p == NULL) + return NULL; + memcpy(cpdest, p, cplen); cpdest += cplen; nbytes -= cplen; if (!xdr_set_next_buffer(xdr)) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ea244b29138b..685e6d225414 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1295,7 +1295,7 @@ void xprt_release(struct rpc_task *task) xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) - xprt->ops->buf_free(req->rq_buffer); + xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 66c9d63f4797..ae92a9e9ba52 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -15,6 +15,7 @@ #include <asm/cmpxchg.h> #include <linux/spinlock.h> #include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtmultipath.h> typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head, @@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xprt == NULL) return; spin_lock(&xps->xps_lock); - if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) + if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) && + !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); } @@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) return xprt_switch_find_current_entry(head, xpi->xpi_cursor); } +bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) +{ + struct list_head *head; + struct rpc_xprt *pos; + + if (xps == NULL || sap == NULL) + return false; + + head = &xps->xps_xprt_list; + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) { + pr_info("RPC: addr %s already in xprt switch\n", + pos->address_strings[RPC_DISPLAY_ADDR]); + return true; + } + } + return false; +} + static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 87762d976b63..2c472e1b4827 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -27,7 +27,7 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, list_del(&req->rl_all); spin_unlock(&buf->rb_reqslock); - rpcrdma_destroy_req(&r_xprt->rx_ia, req); + rpcrdma_destroy_req(req); kfree(rqst); } @@ -35,10 +35,8 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; - struct xdr_buf *buf; size_t size; req = rpcrdma_create_req(r_xprt); @@ -46,30 +44,19 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, return PTR_ERR(req); req->rl_backchannel = true; - size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); - rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, + DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; req->rl_rdmabuf = rb; - size += RPCRDMA_INLINE_READ_THRESHOLD(rqst); - rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + size = r_xprt->rx_data.inline_rsize; + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; - rb->rg_owner = req; req->rl_sendbuf = rb; - /* so that rpcr_to_rdmar works when receiving a request */ - rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; - - buf = &rqst->rq_snd_buf; - buf->head[0].iov_base = rqst->rq_buffer; - buf->head[0].iov_len = 0; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = size; - + xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size); + rpcrdma_set_xprtdata(rqst, req); return 0; out_fail: @@ -219,7 +206,6 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_msg *headerp; - size_t rpclen; headerp = rdmab_to_msg(req->rl_rdmabuf); headerp->rm_xid = rqst->rq_xid; @@ -231,26 +217,9 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) headerp->rm_body.rm_chunks[1] = xdr_zero; headerp->rm_body.rm_chunks[2] = xdr_zero; - rpclen = rqst->rq_svec[0].iov_len; - -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", - __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); - pr_info("RPC: %s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); - pr_info("RPC: %s: RPC: %*ph\n", - __func__, (int)rpclen, rqst->rq_svec[0].iov_base); -#endif - - req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); - req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; - req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); - - req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); - req->rl_send_iov[1].length = rpclen; - req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); - - req->rl_niovs = 2; + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, + &rqst->rq_snd_buf, rpcrdma_noch)) + return -EIO; return 0; } @@ -402,7 +371,7 @@ out_overflow: out_short: pr_warn("RPC/RDMA short backward direction call\n"); - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) xprt_disconnect_done(xprt); else pr_warn("RPC: %s: reposting rep %p\n", diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 21cb3b150b37..1ebb09e1ac4f 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -160,9 +160,8 @@ static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, - RPCRDMA_MAX_DATA_SEGS / - RPCRDMA_MAX_FMR_SGES)); + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + RPCRDMA_MAX_FMR_SGES); return 0; } @@ -274,6 +273,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ list_for_each_entry(mw, &req->rl_registered, mw_list) list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); + r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) goto out_reset; @@ -331,4 +331,5 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_init_mr = fmr_op_init_mr, .ro_release_mr = fmr_op_release_mr, .ro_displayname = "fmr", + .ro_send_w_inv_ok = 0, }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 892b5e1d9b09..210949562786 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -67,6 +67,8 @@ * pending send queue WRs before the transport is reconnected. */ +#include <linux/sunrpc/rpc_rdma.h> + #include "xprt_rdma.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -161,7 +163,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) return PTR_ERR(f->fr_mr); } - dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); + dprintk("RPC: %s: recovered FRMR %p\n", __func__, f); f->fr_state = FRMR_IS_INVALID; return 0; } @@ -242,9 +244,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, depth; } - rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, - RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frmr_depth)); + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + ia->ri_max_frmr_depth); return 0; } @@ -329,7 +330,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); if (wc->status != IB_WC_SUCCESS) __frwr_sendcompletion_flush(wc, frmr, "localinv"); - complete_all(&frmr->fr_linv_done); + complete(&frmr->fr_linv_done); } /* Post a REG_MR Work Request to register a memory region @@ -396,7 +397,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, goto out_mapmr_err; dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", - __func__, mw, mw->mw_nents, mr->length); + __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); @@ -449,6 +450,8 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw) struct rpcrdma_frmr *f = &mw->frmr; struct ib_send_wr *invalidate_wr; + dprintk("RPC: %s: invalidating frmr %p\n", __func__, f); + f->fr_state = FRMR_IS_INVALID; invalidate_wr = &f->fr_invwr; @@ -472,6 +475,7 @@ static void frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; + struct rpcrdma_rep *rep = req->rl_reply; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw, *tmp; struct rpcrdma_frmr *f; @@ -487,6 +491,12 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) f = NULL; invalidate_wrs = pos = prev = NULL; list_for_each_entry(mw, &req->rl_registered, mw_list) { + if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) && + (mw->mw_handle == rep->rr_inv_rkey)) { + mw->frmr.fr_state = FRMR_IS_INVALID; + continue; + } + pos = __frwr_prepare_linv_wr(mw); if (!invalidate_wrs) @@ -496,6 +506,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) prev = pos; f = &mw->frmr; } + if (!f) + goto unmap; /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain @@ -510,6 +522,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * replaces the QP. The RPC reply handler won't call us * unless ri_id->qp is a valid pointer. */ + r_xprt->rx_stats.local_inv_needed++; rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); if (rc) goto reset_mrs; @@ -521,6 +534,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ unmap: list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + dprintk("RPC: %s: unmapping frmr %p\n", + __func__, &mw->frmr); list_del_init(&mw->mw_list); ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); @@ -576,4 +591,5 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_init_mr = frwr_op_init_mr, .ro_release_mr = frwr_op_release_mr, .ro_displayname = "frwr", + .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK, }; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a47f170b20ef..d987c2d3dd6e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,14 +53,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - static const char transfertypes[][12] = { "inline", /* no chunks */ "read list", /* some argument via rdma read */ @@ -118,10 +110,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) return size; } -void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia, - struct rpcrdma_create_data_internal *cdata, - unsigned int maxsegs) +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int maxsegs = ia->ri_max_segs; + ia->ri_max_inline_write = cdata->inline_wsize - rpcrdma_max_call_header_size(maxsegs); ia->ri_max_inline_read = cdata->inline_rsize - @@ -155,42 +149,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; } -static int -rpcrdma_tail_pullup(struct xdr_buf *buf) -{ - size_t tlen = buf->tail[0].iov_len; - size_t skip = tlen & 3; - - /* Do not include the tail if it is only an XDR pad */ - if (tlen < 4) - return 0; - - /* xdr_write_pages() adds a pad at the beginning of the tail - * if the content in "buf->pages" is unaligned. Force the - * tail's actual content to land at the next XDR position - * after the head instead. - */ - if (skip) { - unsigned char *src, *dst; - unsigned int count; - - src = buf->tail[0].iov_base; - dst = buf->head[0].iov_base; - dst += buf->head[0].iov_len; - - src += skip; - tlen -= skip; - - dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n", - __func__, skip, dst, src, tlen); - - for (count = tlen; count; count--) - *dst++ = *src++; - } - - return tlen; -} - /* Split "vec" on page boundaries into segments. FMR registers pages, * not a byte range. Other modes coalesce these segments into a single * MR when they can. @@ -229,7 +187,8 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) static int rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, + bool reminv_expected) { int len, n, p, page_base; struct page **ppages; @@ -271,6 +230,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (type == rpcrdma_readch) return n; + /* When encoding the Write list, some servers need to see an extra + * segment for odd-length Write chunks. The upper layer provides + * space in the tail iovec for this purpose. + */ + if (type == rpcrdma_writech && reminv_expected) + return n; + if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing * xdr pad bytes, saving the server an RDMA operation. */ @@ -327,7 +293,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, if (rtype == rpcrdma_areadch) pos = 0; seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); + nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false); if (nsegs < 0) return ERR_PTR(nsegs); @@ -391,7 +357,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg); + wtype, seg, + r_xprt->rx_ia.ri_reminv_expected); if (nsegs < 0) return ERR_PTR(nsegs); @@ -456,7 +423,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, } seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, + r_xprt->rx_ia.ri_reminv_expected); if (nsegs < 0) return ERR_PTR(nsegs); @@ -491,74 +459,184 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return iptr; } -/* - * Copy write data inline. - * This function is used for "small" requests. Data which is passed - * to RPC via iovecs (or page list) is copied directly into the - * pre-registered memory buffer for this request. For small amounts - * of data, this is efficient. The cutoff value is tunable. +/* Prepare the RPC-over-RDMA header SGE. */ -static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) +static bool +rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + u32 len) { - int i, npages, curlen; - int copy_len; - unsigned char *srcp, *destp; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - int page_base; - struct page **ppages; + struct rpcrdma_regbuf *rb = req->rl_rdmabuf; + struct ib_sge *sge = &req->rl_send_sge[0]; + + if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) { + if (!__rpcrdma_dma_map_regbuf(ia, rb)) + return false; + sge->addr = rdmab_addr(rb); + sge->lkey = rdmab_lkey(rb); + } + sge->length = len; + + ib_dma_sync_single_for_device(ia->ri_device, sge->addr, + sge->length, DMA_TO_DEVICE); + req->rl_send_wr.num_sge++; + return true; +} - destp = rqst->rq_svec[0].iov_base; - curlen = rqst->rq_svec[0].iov_len; - destp += curlen; +/* Prepare the Send SGEs. The head and tail iovec, and each entry + * in the page list, gets its own SGE. + */ +static bool +rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +{ + unsigned int sge_no, page_base, len, remaining; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + struct ib_device *device = ia->ri_device; + struct ib_sge *sge = req->rl_send_sge; + u32 lkey = ia->ri_pd->local_dma_lkey; + struct page *page, **ppages; + + /* The head iovec is straightforward, as it is already + * DMA-mapped. Sync the content that has changed. + */ + if (!rpcrdma_dma_map_regbuf(ia, rb)) + return false; + sge_no = 1; + sge[sge_no].addr = rdmab_addr(rb); + sge[sge_no].length = xdr->head[0].iov_len; + sge[sge_no].lkey = rdmab_lkey(rb); + ib_dma_sync_single_for_device(device, sge[sge_no].addr, + sge[sge_no].length, DMA_TO_DEVICE); + + /* If there is a Read chunk, the page list is being handled + * via explicit RDMA, and thus is skipped here. However, the + * tail iovec may include an XDR pad for the page list, as + * well as additional content, and may not reside in the + * same page as the head iovec. + */ + if (rtype == rpcrdma_readch) { + len = xdr->tail[0].iov_len; - dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n", - __func__, destp, rqst->rq_slen, curlen); + /* Do not include the tail if it is only an XDR pad */ + if (len < 4) + goto out; - copy_len = rqst->rq_snd_buf.page_len; + page = virt_to_page(xdr->tail[0].iov_base); + page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; - if (rqst->rq_snd_buf.tail[0].iov_len) { - curlen = rqst->rq_snd_buf.tail[0].iov_len; - if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { - memmove(destp + copy_len, - rqst->rq_snd_buf.tail[0].iov_base, curlen); - r_xprt->rx_stats.pullup_copy_count += curlen; + /* If the content in the page list is an odd length, + * xdr_write_pages() has added a pad at the beginning + * of the tail iovec. Force the tail's non-pad content + * to land at the next XDR position in the Send message. + */ + page_base += len & 3; + len -= len & 3; + goto map_tail; + } + + /* If there is a page list present, temporarily DMA map + * and prepare an SGE for each page to be sent. + */ + if (xdr->page_len) { + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + sge_no++; + if (sge_no > RPCRDMA_MAX_SEND_SGES - 2) + goto out_mapping_overflow; + + len = min_t(u32, PAGE_SIZE - page_base, remaining); + sge[sge_no].addr = ib_dma_map_page(device, *ppages, + page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, sge[sge_no].addr)) + goto out_mapping_err; + sge[sge_no].length = len; + sge[sge_no].lkey = lkey; + + req->rl_mapped_sges++; + ppages++; + remaining -= len; + page_base = 0; } - dprintk("RPC: %s: tail destp 0x%p len %d\n", - __func__, destp + copy_len, curlen); - rqst->rq_svec[0].iov_len += curlen; } - r_xprt->rx_stats.pullup_copy_count += copy_len; - page_base = rqst->rq_snd_buf.page_base; - ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT); - page_base &= ~PAGE_MASK; - npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT; - for (i = 0; copy_len && i < npages; i++) { - curlen = PAGE_SIZE - page_base; - if (curlen > copy_len) - curlen = copy_len; - dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", - __func__, i, destp, copy_len, curlen); - srcp = kmap_atomic(ppages[i]); - memcpy(destp, srcp+page_base, curlen); - kunmap_atomic(srcp); - rqst->rq_svec[0].iov_len += curlen; - destp += curlen; - copy_len -= curlen; - page_base = 0; + /* The tail iovec is not always constructed in the same + * page where the head iovec resides (see, for example, + * gss_wrap_req_priv). To neatly accommodate that case, + * DMA map it separately. + */ + if (xdr->tail[0].iov_len) { + page = virt_to_page(xdr->tail[0].iov_base); + page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + len = xdr->tail[0].iov_len; + +map_tail: + sge_no++; + sge[sge_no].addr = ib_dma_map_page(device, page, + page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, sge[sge_no].addr)) + goto out_mapping_err; + sge[sge_no].length = len; + sge[sge_no].lkey = lkey; + req->rl_mapped_sges++; } - /* header now contains entire send message */ + +out: + req->rl_send_wr.num_sge = sge_no + 1; + return true; + +out_mapping_overflow: + pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); + return false; + +out_mapping_err: + pr_err("rpcrdma: Send mapping error\n"); + return false; +} + +bool +rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + u32 hdrlen, struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype) +{ + req->rl_send_wr.num_sge = 0; + req->rl_mapped_sges = 0; + + if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen)) + goto out_map; + + if (rtype != rpcrdma_areadch) + if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype)) + goto out_map; + + return true; + +out_map: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; +} + +void +rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_device *device = ia->ri_device; + struct ib_sge *sge; + int count; + + sge = &req->rl_send_sge[2]; + for (count = req->rl_mapped_sges; count--; sge++) + ib_dma_unmap_page(device, sge->addr, sge->length, + DMA_TO_DEVICE); + req->rl_mapped_sges = 0; } /* * Marshal a request: the primary job of this routine is to choose * the transfer modes. See comments below. * - * Prepares up to two IOVs per Call message: - * - * [0] -- RPC RDMA header - * [1] -- the RPC header/data - * * Returns zero on success, otherwise a negative errno. */ @@ -626,12 +704,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rpcrdma_args_inline(r_xprt, rqst)) { rtype = rpcrdma_noch; - rpcrdma_inline_pullup(rqst); - rpclen = rqst->rq_svec[0].iov_len; + rpclen = rqst->rq_snd_buf.len; } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; - rpclen = rqst->rq_svec[0].iov_len; - rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); + rpclen = rqst->rq_snd_buf.head[0].iov_len + + rqst->rq_snd_buf.tail[0].iov_len; } else { r_xprt->rx_stats.nomsg_call_count++; headerp->rm_type = htonl(RDMA_NOMSG); @@ -673,34 +750,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) goto out_unmap; hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; - if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - goto out_overflow; - dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", rqst->rq_task->tk_pid, __func__, transfertypes[rtype], transfertypes[wtype], hdrlen, rpclen); - req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); - req->rl_send_iov[0].length = hdrlen; - req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); - - req->rl_niovs = 1; - if (rtype == rpcrdma_areadch) - return 0; - - req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); - req->rl_send_iov[1].length = rpclen; - req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); - - req->rl_niovs = 2; + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, + &rqst->rq_snd_buf, rtype)) { + iptr = ERR_PTR(-EIO); + goto out_unmap; + } return 0; -out_overflow: - pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", - hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); - iptr = ERR_PTR(-EIO); - out_unmap: r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); return PTR_ERR(iptr); @@ -916,8 +977,10 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) * allowed to timeout, to discover the errors at that time. */ void -rpcrdma_reply_handler(struct rpcrdma_rep *rep) +rpcrdma_reply_handler(struct work_struct *work) { + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_msg *headerp; struct rpcrdma_req *req; struct rpc_rqst *rqst; @@ -1132,6 +1195,6 @@ out_duplicate: repost: r_xprt->rx_stats.bad_reply_count++; - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) rpcrdma_recv_buffer_put(rep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a2a7519b0f23..2d8545c34095 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -129,7 +129,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, ret = -EIO; goto out_unmap; } - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); memset(&send_wr, 0, sizeof(send_wr)); ctxt->cqe.done = svc_rdma_wc_send; @@ -159,33 +159,34 @@ out_unmap: /* Server-side transport endpoint wants a whole page for its send * buffer. The client RPC code constructs the RPC header in this * buffer before it invokes ->send_request. - * - * Returns NULL if there was a temporary allocation failure. */ -static void * -xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +static int +xprt_rdma_bc_allocate(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + size_t size = rqst->rq_callsize; struct svcxprt_rdma *rdma; struct page *page; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - /* Prevent an infinite loop: try to make this case work */ - if (size > PAGE_SIZE) + if (size > PAGE_SIZE) { WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", size); + return -EINVAL; + } page = alloc_page(RPCRDMA_DEF_GFP); if (!page) - return NULL; + return -ENOMEM; - return page_address(page); + rqst->rq_buffer = page_address(page); + return 0; } static void -xprt_rdma_bc_free(void *buffer) +xprt_rdma_bc_free(struct rpc_task *task) { /* No-op: ctxt and page have already been freed. */ } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2c25606f2561..ad1df979b3f0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -159,7 +159,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, ctxt->sge[pno].addr); if (ret) goto err; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 54d533300620..f5a91edcd233 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -225,6 +225,48 @@ svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp, return rp_ary; } +/* RPC-over-RDMA Version One private extension: Remote Invalidation. + * Responder's choice: requester signals it can handle Send With + * Invalidate, and responder chooses one rkey to invalidate. + * + * Find a candidate rkey to invalidate when sending a reply. Picks the + * first rkey it finds in the chunks lists. + * + * Returns zero if RPC's chunk lists are empty. + */ +static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, + struct rpcrdma_write_array *wr_ary, + struct rpcrdma_write_array *rp_ary) +{ + struct rpcrdma_read_chunk *rd_ary; + struct rpcrdma_segment *arg_ch; + u32 inv_rkey; + + inv_rkey = 0; + + rd_ary = svc_rdma_get_read_chunk(rdma_argp); + if (rd_ary) { + inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle); + goto out; + } + + if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { + arg_ch = &wr_ary->wc_array[0].wc_target; + inv_rkey = be32_to_cpu(arg_ch->rs_handle); + goto out; + } + + if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { + arg_ch = &rp_ary->wc_array[0].wc_target; + inv_rkey = be32_to_cpu(arg_ch->rs_handle); + goto out; + } + +out: + dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey); + return inv_rkey; +} + /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ @@ -280,7 +322,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge[sge_no].addr)) goto err; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; @@ -464,7 +506,8 @@ static int send_reply(struct svcxprt_rdma *rdma, struct page *page, struct rpcrdma_msg *rdma_resp, struct svc_rdma_req_map *vec, - int byte_count) + int byte_count, + u32 inv_rkey) { struct svc_rdma_op_ctxt *ctxt; struct ib_send_wr send_wr; @@ -489,7 +532,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[0].length, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) goto err; - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); ctxt->direction = DMA_TO_DEVICE; @@ -505,7 +548,7 @@ static int send_reply(struct svcxprt_rdma *rdma, if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[sge_no].addr)) goto err; - atomic_inc(&rdma->sc_dma_used); + svc_rdma_count_mappings(rdma, ctxt); ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } @@ -523,23 +566,9 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; ctxt->count++; rqstp->rq_respages[page_no] = NULL; - /* - * If there are more pages than SGE, terminate SGE - * list so that svc_rdma_unmap_dma doesn't attempt to - * unmap garbage. - */ - if (page_no+1 >= sge_no) - ctxt->sge[page_no+1].length = 0; } rqstp->rq_next_page = rqstp->rq_respages + 1; - /* The loop above bumps sc_dma_used for each sge. The - * xdr_buf.tail gets a separate sge, but resides in the - * same page as xdr_buf.head. Don't count it twice. - */ - if (sge_no > ctxt->count) - atomic_dec(&rdma->sc_dma_used); - if (sge_no > rdma->sc_max_sge) { pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; @@ -549,7 +578,11 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.wr_cqe = &ctxt->cqe; send_wr.sg_list = ctxt->sge; send_wr.num_sge = sge_no; - send_wr.opcode = IB_WR_SEND; + if (inv_rkey) { + send_wr.opcode = IB_WR_SEND_WITH_INV; + send_wr.ex.invalidate_rkey = inv_rkey; + } else + send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; ret = svc_rdma_send(rdma, &send_wr); @@ -581,6 +614,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; + u32 inv_rkey; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -591,6 +625,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) wr_ary = svc_rdma_get_write_array(rdma_argp); rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary); + /* Build an req vec for the XDR */ vec = svc_rdma_get_req_map(rdma); ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); @@ -633,9 +671,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err1; ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, - inline_bytes); + inline_bytes, inv_rkey); if (ret < 0) - goto err1; + goto err0; svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); @@ -692,7 +730,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, svc_rdma_put_context(ctxt, 1); return; } - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); /* Prepare SEND WR */ memset(&err_wr, 0, sizeof(err_wr)); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index dd9440137834..6864fb967038 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -198,6 +198,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) out: ctxt->count = 0; + ctxt->mapped_sges = 0; ctxt->frmr = NULL; return ctxt; @@ -221,22 +222,27 @@ out_empty: void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; - int i; - for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + struct ib_device *device = xprt->sc_cm_id->device; + u32 lkey = xprt->sc_pd->local_dma_lkey; + unsigned int i, count; + + for (count = 0, i = 0; i < ctxt->mapped_sges; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { - atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_page(xprt->sc_cm_id->device, + if (ctxt->sge[i].lkey == lkey) { + count++; + ib_dma_unmap_page(device, ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); } } + ctxt->mapped_sges = 0; + atomic_sub(count, &xprt->sc_dma_used); } void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) @@ -600,7 +606,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) goto err_put_ctxt; - atomic_inc(&xprt->sc_dma_used); + svc_rdma_count_mappings(xprt, ctxt); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; @@ -642,6 +648,26 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags) return ret; } +static void +svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, + struct rdma_conn_param *param) +{ + const struct rpcrdma_connect_private *pmsg = param->private_data; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + newxprt->sc_snd_w_inv = pmsg->cp_flags & + RPCRDMA_CMP_F_SND_W_INV_OK; + + dprintk("svcrdma: client send_size %u, recv_size %u " + "remote inv %ssupported\n", + rpcrdma_decode_buffer_size(pmsg->cp_send_size), + rpcrdma_decode_buffer_size(pmsg->cp_recv_size), + newxprt->sc_snd_w_inv ? "" : "un"); + } +} + /* * This function handles the CONNECT_REQUEST event on a listening * endpoint. It is passed the cma_id for the _new_ connection. The context in @@ -653,7 +679,8 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags) * will call the recvfrom method on the listen xprt which will accept the new * connection. */ -static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) +static void handle_connect_req(struct rdma_cm_id *new_cma_id, + struct rdma_conn_param *param) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; @@ -669,9 +696,10 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) new_cma_id->context = newxprt; dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + svc_rdma_parse_connect_private(newxprt, param); /* Save client advertised inbound read limit for use later in accept. */ - newxprt->sc_ord = client_ird; + newxprt->sc_ord = param->initiator_depth; /* Set the local and remote addresses in the transport */ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; @@ -706,8 +734,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event = %s (%d)\n", cma_id, cma_id->context, rdma_event_msg(event->event), event->event); - handle_connect_req(cma_id, - event->param.conn.initiator_depth); + handle_connect_req(cma_id, &event->param.conn); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -941,6 +968,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; + struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; struct ib_device *dev; unsigned int i; @@ -993,7 +1021,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); - newxprt->sc_pd = ib_alloc_pd(dev); + newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); goto errout; @@ -1070,7 +1098,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; - } + } else + newxprt->sc_snd_w_inv = false; /* * Determine if a DMA MR is required and if so, what privs are required @@ -1094,11 +1123,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; + /* Construct RDMA-CM private message */ + pmsg.cp_magic = rpcrdma_cmp_magic; + pmsg.cp_version = RPCRDMA_CMP_VERSION; + pmsg.cp_flags = 0; + pmsg.cp_send_size = pmsg.cp_recv_size = + rpcrdma_encode_buffer_size(newxprt->sc_max_req_size); + /* Accept Connection */ set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 0; conn_param.initiator_depth = newxprt->sc_ord; + conn_param.private_data = &pmsg; + conn_param.private_data_len = sizeof(pmsg); ret = rdma_accept(newxprt->sc_cm_id, &conn_param); if (ret) { dprintk("svcrdma: failed to accept new connection, ret=%d\n", diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 81f0e879f019..ed5e285fd2ea 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = { .data = &xprt_rdma_max_inline_read, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_inline_size, .extra2 = &max_inline_size, }, @@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = { .data = &xprt_rdma_max_inline_write, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_inline_size, .extra2 = &max_inline_size, }, @@ -477,115 +477,152 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -/* - * The RDMA allocate/free functions need the task structure as a place - * to hide the struct rpcrdma_req, which is necessary for the actual send/recv - * sequence. +/* Allocate a fixed-size buffer in which to construct and send the + * RPC-over-RDMA header for this request. + */ +static bool +rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + gfp_t flags) +{ + size_t size = RPCRDMA_HDRBUF_SIZE; + struct rpcrdma_regbuf *rb; + + if (req->rl_rdmabuf) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); + if (IS_ERR(rb)) + return false; + + r_xprt->rx_stats.hardway_register_count += size; + req->rl_rdmabuf = rb; + return true; +} + +static bool +rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(req->rl_sendbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_sendbuf = rb; + return true; +} + +/* The rq_rcv_buf is used only if a Reply chunk is necessary. + * The decision to use a Reply chunk is made later in + * rpcrdma_marshal_req. This buffer is registered at that time. * - * The RPC layer allocates both send and receive buffers in the same call - * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). - * We may register rq_rcv_buf when using reply chunks. + * Otherwise, the associated RPC Reply arrives in a separate + * Receive buffer, arbitrarily chosen by the HCA. The buffer + * allocated here for the RPC Reply is not utilized in that + * case. See rpcrdma_inline_fixup. + * + * A regbuf is used here to remember the buffer size. */ -static void * -xprt_rdma_allocate(struct rpc_task *task, size_t size) +static bool +rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_regbuf *rb; + + if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(req->rl_recvbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_recvbuf = rb; + return true; +} + +/** + * xprt_rdma_allocate - allocate transport resources for an RPC + * @task: RPC task + * + * Return values: + * 0: Success; rq_buffer points to RPC buffer to use + * ENOMEM: Out of memory, call again later + * EIO: A permanent error occurred, do not retry + * + * The RDMA allocate/free functions need the task structure as a place + * to hide the struct rpcrdma_req, which is necessary for the actual + * send/recv sequence. + * + * xprt_rdma_allocate provides buffers that are already mapped for + * DMA, and a local DMA lkey is provided for each. + */ +static int +xprt_rdma_allocate(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req; - size_t min_size; gfp_t flags; req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) - return NULL; + return -ENOMEM; flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; - if (req->rl_rdmabuf == NULL) - goto out_rdmabuf; - if (req->rl_sendbuf == NULL) - goto out_sendbuf; - if (size > req->rl_sendbuf->rg_size) - goto out_sendbuf; - -out: - dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); - req->rl_connect_cookie = 0; /* our reserved value */ - req->rl_task = task; - return req->rl_sendbuf->rg_base; - -out_rdmabuf: - min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags); - if (IS_ERR(rb)) + if (!rpcrdma_get_rdmabuf(r_xprt, req, flags)) goto out_fail; - req->rl_rdmabuf = rb; - -out_sendbuf: - /* XDR encoding and RPC/RDMA marshaling of this request has not - * yet occurred. Thus a lower bound is needed to prevent buffer - * overrun during marshaling. - * - * RPC/RDMA marshaling may choose to send payload bearing ops - * inline, if the result is smaller than the inline threshold. - * The value of the "size" argument accounts for header - * requirements but not for the payload in these cases. - * - * Likewise, allocate enough space to receive a reply up to the - * size of the inline threshold. - * - * It's unlikely that both the send header and the received - * reply will be large, but slush is provided here to allow - * flexibility when marshaling. - */ - min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp); - min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); - if (size < min_size) - size = min_size; - - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); - if (IS_ERR(rb)) + if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) + goto out_fail; + if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - rb->rg_owner = req; - r_xprt->rx_stats.hardway_register_count += size; - rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); - req->rl_sendbuf = rb; - goto out; + dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n", + task->tk_pid, __func__, rqst->rq_callsize, + rqst->rq_rcvsize, req); + + req->rl_connect_cookie = 0; /* our reserved value */ + rpcrdma_set_xprtdata(rqst, req); + rqst->rq_buffer = req->rl_sendbuf->rg_base; + rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + return 0; out_fail: rpcrdma_buffer_put(req); - return NULL; + return -ENOMEM; } -/* - * This function returns all RDMA resources to the pool. +/** + * xprt_rdma_free - release resources allocated by xprt_rdma_allocate + * @task: RPC task + * + * Caller guarantees rqst->rq_buffer is non-NULL. */ static void -xprt_rdma_free(void *buffer) +xprt_rdma_free(struct rpc_task *task) { - struct rpcrdma_req *req; - struct rpcrdma_xprt *r_xprt; - struct rpcrdma_regbuf *rb; - - if (buffer == NULL) - return; + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); - req = rb->rg_owner; if (req->rl_backchannel) return; - r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, - !RPC_IS_ASYNC(req->rl_task)); - + ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); + rpcrdma_unmap_sges(ia, req); rpcrdma_buffer_put(req); } @@ -685,10 +722,11 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); - seq_printf(seq, "%lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %lu\n", r_xprt->rx_stats.mrs_recovered, r_xprt->rx_stats.mrs_orphaned, - r_xprt->rx_stats.mrs_allocated); + r_xprt->rx_stats.mrs_allocated, + r_xprt->rx_stats.local_inv_needed); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 799cce6cbe45..ec74289af7ec 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -129,15 +129,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) wc->status, wc->vendor_err); } -static void -rpcrdma_receive_worker(struct work_struct *work) -{ - struct rpcrdma_rep *rep = - container_of(work, struct rpcrdma_rep, rr_work); - - rpcrdma_reply_handler(rep); -} - /* Perform basic sanity checking to avoid using garbage * to update the credit grant value. */ @@ -161,13 +152,13 @@ rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) } /** - * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC + * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC * @cq: completion queue (ignored) * @wc: completed WR * */ static void -rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc) +rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, @@ -185,6 +176,9 @@ rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc) __func__, rep, wc->byte_len); rep->rr_len = wc->byte_len; + rep->rr_wc_flags = wc->wc_flags; + rep->rr_inv_rkey = wc->ex.invalidate_rkey; + ib_dma_sync_single_for_cpu(rep->rr_device, rdmab_addr(rep->rr_rdmabuf), rep->rr_len, DMA_FROM_DEVICE); @@ -204,6 +198,36 @@ out_fail: goto out_schedule; } +static void +rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, + struct rdma_conn_param *param) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + const struct rpcrdma_connect_private *pmsg = param->private_data; + unsigned int rsize, wsize; + + /* Default settings for RPC-over-RDMA Version One */ + r_xprt->rx_ia.ri_reminv_expected = false; + rsize = RPCRDMA_V1_DEF_INLINE_SIZE; + wsize = RPCRDMA_V1_DEF_INLINE_SIZE; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + r_xprt->rx_ia.ri_reminv_expected = true; + rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); + wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); + } + + if (rsize < cdata->inline_rsize) + cdata->inline_rsize = rsize; + if (wsize < cdata->inline_wsize) + cdata->inline_wsize = wsize; + pr_info("rpcrdma: max send %u, max recv %u\n", + cdata->inline_wsize, cdata->inline_rsize); + rpcrdma_set_max_header_sizes(r_xprt); +} + static int rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -244,6 +268,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) " (%d initiator)\n", __func__, attr->max_dest_rd_atomic, attr->max_rd_atomic); + rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: connstate = -ENOTCONN; @@ -387,7 +412,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } ia->ri_device = ia->ri_id->device; - ia->ri_pd = ib_alloc_pd(ia->ri_device); + ia->ri_pd = ib_alloc_pd(ia->ri_device, 0); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); @@ -454,11 +479,12 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { + struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; struct ib_cq *sendcq, *recvcq; unsigned int max_qp_wr; int rc; - if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { + if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) { dprintk("RPC: %s: insufficient sge's available\n", __func__); return -ENOMEM; @@ -487,7 +513,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ - ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; + ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -536,9 +562,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Initialize cma parameters */ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); - /* RPC/RDMA does not use private data */ - ep->rep_remote_cma.private_data = NULL; - ep->rep_remote_cma.private_data_len = 0; + /* Prepare RDMA-CM private message */ + pmsg->cp_magic = rpcrdma_cmp_magic; + pmsg->cp_version = RPCRDMA_CMP_VERSION; + pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok; + pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); + ep->rep_remote_cma.private_data = pmsg; + ep->rep_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; @@ -849,6 +880,10 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; INIT_LIST_HEAD(&req->rl_registered); + req->rl_send_wr.next = NULL; + req->rl_send_wr.wr_cqe = &req->rl_cqe; + req->rl_send_wr.sg_list = req->rl_send_sge; + req->rl_send_wr.opcode = IB_WR_SEND; return req; } @@ -865,17 +900,21 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize, - GFP_KERNEL); + rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize, + DMA_FROM_DEVICE, GFP_KERNEL); if (IS_ERR(rep->rr_rdmabuf)) { rc = PTR_ERR(rep->rr_rdmabuf); goto out_free; } rep->rr_device = ia->ri_device; - rep->rr_cqe.done = rpcrdma_receive_wc; + rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; - INIT_WORK(&rep->rr_work, rpcrdma_receive_worker); + INIT_WORK(&rep->rr_work, rpcrdma_reply_handler); + rep->rr_recv_wr.next = NULL; + rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; + rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; + rep->rr_recv_wr.num_sge = 1; return rep; out_free: @@ -966,17 +1005,18 @@ rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf) } static void -rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) +rpcrdma_destroy_rep(struct rpcrdma_rep *rep) { - rpcrdma_free_regbuf(ia, rep->rr_rdmabuf); + rpcrdma_free_regbuf(rep->rr_rdmabuf); kfree(rep); } void -rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +rpcrdma_destroy_req(struct rpcrdma_req *req) { - rpcrdma_free_regbuf(ia, req->rl_sendbuf); - rpcrdma_free_regbuf(ia, req->rl_rdmabuf); + rpcrdma_free_regbuf(req->rl_recvbuf); + rpcrdma_free_regbuf(req->rl_sendbuf); + rpcrdma_free_regbuf(req->rl_rdmabuf); kfree(req); } @@ -1009,15 +1049,13 @@ rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - struct rpcrdma_ia *ia = rdmab_to_ia(buf); - cancel_delayed_work_sync(&buf->rb_recovery_worker); while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; rep = rpcrdma_buffer_get_rep_locked(buf); - rpcrdma_destroy_rep(ia, rep); + rpcrdma_destroy_rep(rep); } buf->rb_send_count = 0; @@ -1030,7 +1068,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) list_del(&req->rl_all); spin_unlock(&buf->rb_reqslock); - rpcrdma_destroy_req(ia, req); + rpcrdma_destroy_req(req); spin_lock(&buf->rb_reqslock); } spin_unlock(&buf->rb_reqslock); @@ -1129,7 +1167,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_rep *rep = req->rl_reply; - req->rl_niovs = 0; + req->rl_send_wr.num_sge = 0; req->rl_reply = NULL; spin_lock(&buffers->rb_lock); @@ -1171,70 +1209,81 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) spin_unlock(&buffers->rb_lock); } -/* - * Wrappers for internal-use kmalloc memory registration, used by buffer code. - */ - /** - * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers - * @ia: controlling rpcrdma_ia + * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers * @size: size of buffer to be allocated, in bytes + * @direction: direction of data movement * @flags: GFP flags * - * Returns pointer to private header of an area of internally - * registered memory, or an ERR_PTR. The registered buffer follows - * the end of the private header. + * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that + * can be persistently DMA-mapped for I/O. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for - * receiving the payload of RDMA RECV operations. regbufs are not - * used for RDMA READ/WRITE operations, thus are registered only for - * LOCAL access. + * receiving the payload of RDMA RECV operations. During Long Calls + * or Replies they may be registered externally via ro_map. */ struct rpcrdma_regbuf * -rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) +rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, + gfp_t flags) { struct rpcrdma_regbuf *rb; - struct ib_sge *iov; rb = kmalloc(sizeof(*rb) + size, flags); if (rb == NULL) - goto out; + return ERR_PTR(-ENOMEM); - iov = &rb->rg_iov; - iov->addr = ib_dma_map_single(ia->ri_device, - (void *)rb->rg_base, size, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(ia->ri_device, iov->addr)) - goto out_free; + rb->rg_device = NULL; + rb->rg_direction = direction; + rb->rg_iov.length = size; - iov->length = size; - iov->lkey = ia->ri_pd->local_dma_lkey; - rb->rg_size = size; - rb->rg_owner = NULL; return rb; +} -out_free: - kfree(rb); -out: - return ERR_PTR(-ENOMEM); +/** + * __rpcrdma_map_regbuf - DMA-map a regbuf + * @ia: controlling rpcrdma_ia + * @rb: regbuf to be mapped + */ +bool +__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + if (rb->rg_direction == DMA_NONE) + return false; + + rb->rg_iov.addr = ib_dma_map_single(ia->ri_device, + (void *)rb->rg_base, + rdmab_length(rb), + rb->rg_direction); + if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb))) + return false; + + rb->rg_device = ia->ri_device; + rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; + return true; +} + +static void +rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb) +{ + if (!rpcrdma_regbuf_is_mapped(rb)) + return; + + ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), + rdmab_length(rb), rb->rg_direction); + rb->rg_device = NULL; } /** * rpcrdma_free_regbuf - deregister and free registered buffer - * @ia: controlling rpcrdma_ia * @rb: regbuf to be deregistered and freed */ void -rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) { - struct ib_sge *iov; - if (!rb) return; - iov = &rb->rg_iov; - ib_dma_unmap_single(ia->ri_device, - iov->addr, iov->length, DMA_BIDIRECTIONAL); + rpcrdma_dma_unmap_regbuf(rb); kfree(rb); } @@ -1248,39 +1297,28 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { - struct ib_device *device = ia->ri_device; - struct ib_send_wr send_wr, *send_wr_fail; - struct rpcrdma_rep *rep = req->rl_reply; - struct ib_sge *iov = req->rl_send_iov; - int i, rc; + struct ib_send_wr *send_wr = &req->rl_send_wr; + struct ib_send_wr *send_wr_fail; + int rc; - if (rep) { - rc = rpcrdma_ep_post_recv(ia, ep, rep); + if (req->rl_reply) { + rc = rpcrdma_ep_post_recv(ia, req->rl_reply); if (rc) return rc; req->rl_reply = NULL; } - send_wr.next = NULL; - send_wr.wr_cqe = &req->rl_cqe; - send_wr.sg_list = iov; - send_wr.num_sge = req->rl_niovs; - send_wr.opcode = IB_WR_SEND; - - for (i = 0; i < send_wr.num_sge; i++) - ib_dma_sync_single_for_device(device, iov[i].addr, - iov[i].length, DMA_TO_DEVICE); dprintk("RPC: %s: posting %d s/g entries\n", - __func__, send_wr.num_sge); + __func__, send_wr->num_sge); if (DECR_CQCOUNT(ep) > 0) - send_wr.send_flags = 0; + send_wr->send_flags = 0; else { /* Provider must take a send completion every now and then */ INIT_CQCOUNT(ep); - send_wr.send_flags = IB_SEND_SIGNALED; + send_wr->send_flags = IB_SEND_SIGNALED; } - rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); + rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); if (rc) goto out_postsend_err; return 0; @@ -1290,32 +1328,24 @@ out_postsend_err: return -ENOTCONN; } -/* - * (Re)post a receive buffer. - */ int rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, - struct rpcrdma_ep *ep, struct rpcrdma_rep *rep) { - struct ib_recv_wr recv_wr, *recv_wr_fail; + struct ib_recv_wr *recv_wr_fail; int rc; - recv_wr.next = NULL; - recv_wr.wr_cqe = &rep->rr_cqe; - recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; - recv_wr.num_sge = 1; - - ib_dma_sync_single_for_cpu(ia->ri_device, - rdmab_addr(rep->rr_rdmabuf), - rdmab_length(rep->rr_rdmabuf), - DMA_BIDIRECTIONAL); - - rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); + if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf)) + goto out_map; + rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail); if (rc) goto out_postrecv; return 0; +out_map: + pr_err("rpcrdma: failed to DMA map the Receive buffer\n"); + return -EIO; + out_postrecv: pr_err("rpcrdma: ib_post_recv returned %i\n", rc); return -ENOTCONN; @@ -1333,7 +1363,6 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) { struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_rep *rep; int rc; @@ -1344,7 +1373,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) rep = rpcrdma_buffer_get_rep_locked(buffers); spin_unlock(&buffers->rb_lock); - rc = rpcrdma_ep_post_recv(ia, ep, rep); + rc = rpcrdma_ep_post_recv(ia, rep); if (rc) goto out_rc; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index a71b0f5897d8..0d35b761c883 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -70,9 +70,11 @@ struct rpcrdma_ia { struct ib_pd *ri_pd; struct completion ri_done; int ri_async_rc; + unsigned int ri_max_segs; unsigned int ri_max_frmr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; + bool ri_reminv_expected; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -87,6 +89,7 @@ struct rpcrdma_ep { int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; + struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; @@ -112,9 +115,9 @@ struct rpcrdma_ep { */ struct rpcrdma_regbuf { - size_t rg_size; - struct rpcrdma_req *rg_owner; struct ib_sge rg_iov; + struct ib_device *rg_device; + enum dma_data_direction rg_direction; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -162,7 +165,10 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * The smallest inline threshold is 1024 bytes, ensuring that * at least 750 bytes are available for RPC messages. */ -#define RPCRDMA_MAX_HDR_SEGS (8) +enum { + RPCRDMA_MAX_HDR_SEGS = 8, + RPCRDMA_HDRBUF_SIZE = 256, +}; /* * struct rpcrdma_rep -- this structure encapsulates state required to recv @@ -182,10 +188,13 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) struct rpcrdma_rep { struct ib_cqe rr_cqe; unsigned int rr_len; + int rr_wc_flags; + u32 rr_inv_rkey; struct ib_device *rr_device; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; struct list_head rr_list; + struct ib_recv_wr rr_recv_wr; struct rpcrdma_regbuf *rr_rdmabuf; }; @@ -276,19 +285,30 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; -#define RPCRDMA_MAX_IOVS (2) +/* Reserve enough Send SGEs to send a maximum size inline request: + * - RPC-over-RDMA header + * - xdr_buf head iovec + * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages + * - xdr_buf tail iovec + */ +enum { + RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1, + RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1, + RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, +}; struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_free; - unsigned int rl_niovs; + unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; - struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; - struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ - struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; - struct rpcrdma_regbuf *rl_rdmabuf; - struct rpcrdma_regbuf *rl_sendbuf; + struct rpcrdma_rep *rl_reply; + struct ib_send_wr rl_send_wr; + struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; + struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ + struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ + struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ struct ib_cqe rl_cqe; struct list_head rl_all; @@ -298,14 +318,16 @@ struct rpcrdma_req { struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; +static inline void +rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) +{ + rqst->rq_xprtdata = req; +} + static inline struct rpcrdma_req * rpcr_to_rdmar(struct rpc_rqst *rqst) { - void *buffer = rqst->rq_buffer; - struct rpcrdma_regbuf *rb; - - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base); - return rb->rg_owner; + return rqst->rq_xprtdata; } /* @@ -356,15 +378,6 @@ struct rpcrdma_create_data_internal { unsigned int padding; /* non-rdma write header padding */ }; -#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \ - (rpcx_to_rdmad(rq->rq_xprt).inline_rsize) - -#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\ - (rpcx_to_rdmad(rq->rq_xprt).inline_wsize) - -#define RPCRDMA_INLINE_PAD_VALUE(rq)\ - rpcx_to_rdmad(rq->rq_xprt).padding - /* * Statistics for RPCRDMA */ @@ -386,6 +399,7 @@ struct rpcrdma_stats { unsigned long mrs_recovered; unsigned long mrs_orphaned; unsigned long mrs_allocated; + unsigned long local_inv_needed; }; /* @@ -409,6 +423,7 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mw *); void (*ro_release_mr)(struct rpcrdma_mw *); const char *ro_displayname; + const int ro_send_w_inv_ok; }; extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; @@ -461,15 +476,14 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); -int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, - struct rpcrdma_rep *); +int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *); /* * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); -void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *); +void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); @@ -482,10 +496,24 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); -struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, - size_t, gfp_t); -void rpcrdma_free_regbuf(struct rpcrdma_ia *, - struct rpcrdma_regbuf *); +struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, + gfp_t); +bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +void rpcrdma_free_regbuf(struct rpcrdma_regbuf *); + +static inline bool +rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb) +{ + return rb->rg_device != NULL; +} + +static inline bool +rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + if (likely(rpcrdma_regbuf_is_mapped(rb))) + return true; + return __rpcrdma_dma_map_regbuf(ia, rb); +} int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); @@ -507,15 +535,25 @@ rpcrdma_data_dir(bool writing) */ void rpcrdma_connect_worker(struct work_struct *); void rpcrdma_conn_func(struct rpcrdma_ep *); -void rpcrdma_reply_handler(struct rpcrdma_rep *); +void rpcrdma_reply_handler(struct work_struct *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ + +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + +bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, + u32, struct xdr_buf *, enum rpcrdma_chunktype); +void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); int rpcrdma_marshal_req(struct rpc_rqst *); -void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *, - struct rpcrdma_create_data_internal *, - unsigned int); +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); /* RPC/RDMA module init - xprtrdma/transport.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bf168838a029..0137af1c0916 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -473,7 +473,16 @@ static int xs_nospace(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); /* Race breaker in case memory is freed before above code is called */ - sk->sk_write_space(sk); + if (ret == -EAGAIN) { + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags); + rcu_read_unlock(); + + sk->sk_write_space(sk); + } return ret; } @@ -2533,35 +2542,38 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) * we allocate pages instead doing a kmalloc like rpc_malloc is because we want * to use the server side send routines. */ -static void *bc_malloc(struct rpc_task *task, size_t size) +static int bc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize; struct page *page; struct rpc_buffer *buf; - WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer)); - if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) - return NULL; + if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) { + WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n", + size); + return -EINVAL; + } page = alloc_page(GFP_KERNEL); if (!page) - return NULL; + return -ENOMEM; buf = page_address(page); buf->len = PAGE_SIZE; - return buf->data; + rqst->rq_buffer = buf->data; + return 0; } /* * Free the space allocated in the bc_alloc routine */ -static void bc_free(void *buffer) +static void bc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); free_page((unsigned long)buf); } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index d80cd3f7503f..78cab9c5a445 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -407,6 +407,7 @@ static int __tipc_nl_add_udp_addr(struct sk_buff *skb, if (ntohs(addr->proto) == ETH_P_IP) { struct sockaddr_in ip4; + memset(&ip4, 0, sizeof(ip4)); ip4.sin_family = AF_INET; ip4.sin_port = addr->port; ip4.sin_addr.s_addr = addr->ipv4.s_addr; @@ -417,6 +418,7 @@ static int __tipc_nl_add_udp_addr(struct sk_buff *skb, } else if (ntohs(addr->proto) == ETH_P_IPV6) { struct sockaddr_in6 ip6; + memset(&ip6, 0, sizeof(ip6)); ip6.sin6_family = AF_INET6; ip6.sin6_port = addr->port; memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr)); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8309687a56b0..145082e2ba36 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2475,28 +2475,13 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, return unix_stream_read_generic(&state); } -static ssize_t skb_unix_socket_splice(struct sock *sk, - struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) -{ - int ret; - struct unix_sock *u = unix_sk(sk); - - mutex_unlock(&u->iolock); - ret = splice_to_pipe(pipe, spd); - mutex_lock(&u->iolock); - - return ret; -} - static int unix_stream_splice_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { return skb_splice_bits(skb, state->socket->sk, UNIXCB(skb).consumed + skip, - state->pipe, chunk, state->splice_flags, - skb_unix_socket_splice); + state->pipe, chunk, state->splice_flags); } static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, |