diff options
Diffstat (limited to 'net')
-rw-r--r-- | net/ceph/Makefile | 1 | ||||
-rw-r--r-- | net/ceph/ceph_common.c | 8 | ||||
-rw-r--r-- | net/ceph/crypto.c | 6 | ||||
-rw-r--r-- | net/ceph/debugfs.c | 17 | ||||
-rw-r--r-- | net/ceph/messenger.c | 188 | ||||
-rw-r--r-- | net/ceph/mon_client.c | 2 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 67 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 71 | ||||
-rw-r--r-- | net/ceph/striper.c | 261 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 6 | ||||
-rw-r--r-- | net/l2tp/l2tp_core.c | 225 | ||||
-rw-r--r-- | net/l2tp/l2tp_core.h | 4 | ||||
-rw-r--r-- | net/l2tp/l2tp_netlink.c | 22 | ||||
-rw-r--r-- | net/l2tp/l2tp_ppp.c | 9 | ||||
-rw-r--r-- | net/rds/send.c | 15 | ||||
-rw-r--r-- | net/sunrpc/clnt.c | 8 | ||||
-rw-r--r-- | net/sunrpc/sched.c | 10 | ||||
-rw-r--r-- | net/sunrpc/stats.c | 16 | ||||
-rw-r--r-- | net/sunrpc/sunrpc.h | 6 | ||||
-rw-r--r-- | net/sunrpc/xdr.c | 82 | ||||
-rw-r--r-- | net/sunrpc/xprt.c | 34 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/backchannel.c | 7 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/fmr_ops.c | 13 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/frwr_ops.c | 53 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 32 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 43 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 44 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 4 | ||||
-rw-r--r-- | net/sunrpc/xprtsock.c | 4 |
29 files changed, 832 insertions, 426 deletions
diff --git a/net/ceph/Makefile b/net/ceph/Makefile index b4bded4b5396..12bf49772d24 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -8,6 +8,7 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ mon_client.o \ cls_lock_client.o \ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + striper.o \ debugfs.o \ auth.o auth_none.o \ crypto.o armor.o \ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4adf07826f4a..584fdbef2088 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -72,6 +72,7 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_MON_GET_VERSION: return "mon_get_version"; case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply"; case CEPH_MSG_MDS_MAP: return "mds_map"; + case CEPH_MSG_FS_MAP_USER: return "fs_map_user"; case CEPH_MSG_CLIENT_SESSION: return "client_session"; case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; case CEPH_MSG_CLIENT_REQUEST: return "client_request"; @@ -79,8 +80,13 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_CLIENT_REPLY: return "client_reply"; case CEPH_MSG_CLIENT_CAPS: return "client_caps"; case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; + case CEPH_MSG_CLIENT_QUOTA: return "client_quota"; case CEPH_MSG_CLIENT_SNAP: return "client_snap"; case CEPH_MSG_CLIENT_LEASE: return "client_lease"; + case CEPH_MSG_POOLOP_REPLY: return "poolop_reply"; + case CEPH_MSG_POOLOP: return "poolop"; + case CEPH_MSG_MON_COMMAND: return "mon_command"; + case CEPH_MSG_MON_COMMAND_ACK: return "mon_command_ack"; case CEPH_MSG_OSD_MAP: return "osd_map"; case CEPH_MSG_OSD_OP: return "osd_op"; case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; @@ -217,7 +223,7 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid) if (i == 16) err = 0; - dout("parse_fsid ret %d got fsid %pU", err, fsid); + dout("parse_fsid ret %d got fsid %pU\n", err, fsid); return err; } diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 
bf9d079cbafd..02172c408ff2 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -347,10 +347,12 @@ struct key_type key_type_ceph = { .destroy = ceph_key_destroy, }; -int ceph_crypto_init(void) { +int __init ceph_crypto_init(void) +{ return register_key_type(&key_type_ceph); } -void ceph_crypto_shutdown(void) { +void ceph_crypto_shutdown(void) +{ unregister_key_type(&key_type_ceph); } diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 1eef6806aa1a..02952605d121 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -389,7 +389,7 @@ CEPH_DEFINE_SHOW_FUNC(monc_show) CEPH_DEFINE_SHOW_FUNC(osdc_show) CEPH_DEFINE_SHOW_FUNC(client_options_show) -int ceph_debugfs_init(void) +int __init ceph_debugfs_init(void) { ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); if (!ceph_debugfs_dir) @@ -418,7 +418,7 @@ int ceph_debugfs_client_init(struct ceph_client *client) goto out; client->monc.debugfs_file = debugfs_create_file("monc", - 0600, + 0400, client->debugfs_dir, client, &monc_show_fops); @@ -426,7 +426,7 @@ int ceph_debugfs_client_init(struct ceph_client *client) goto out; client->osdc.debugfs_file = debugfs_create_file("osdc", - 0600, + 0400, client->debugfs_dir, client, &osdc_show_fops); @@ -434,7 +434,7 @@ int ceph_debugfs_client_init(struct ceph_client *client) goto out; client->debugfs_monmap = debugfs_create_file("monmap", - 0600, + 0400, client->debugfs_dir, client, &monmap_show_fops); @@ -442,7 +442,7 @@ int ceph_debugfs_client_init(struct ceph_client *client) goto out; client->debugfs_osdmap = debugfs_create_file("osdmap", - 0600, + 0400, client->debugfs_dir, client, &osdmap_show_fops); @@ -450,7 +450,7 @@ int ceph_debugfs_client_init(struct ceph_client *client) goto out; client->debugfs_options = debugfs_create_file("client_options", - 0600, + 0400, client->debugfs_dir, client, &client_options_show_fops); @@ -477,7 +477,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) #else /* CONFIG_DEBUG_FS */ -int ceph_debugfs_init(void) +int 
__init ceph_debugfs_init(void) { return 0; } @@ -496,6 +496,3 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) } #endif /* CONFIG_DEBUG_FS */ - -EXPORT_SYMBOL(ceph_debugfs_init); -EXPORT_SYMBOL(ceph_debugfs_cleanup); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 8a4d3758030b..fcb40c12b1f8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -277,7 +277,7 @@ static void _ceph_msgr_exit(void) ceph_msgr_slab_exit(); } -int ceph_msgr_init(void) +int __init ceph_msgr_init(void) { if (ceph_msgr_slab_init()) return -ENOMEM; @@ -299,7 +299,6 @@ int ceph_msgr_init(void) return -ENOMEM; } -EXPORT_SYMBOL(ceph_msgr_init); void ceph_msgr_exit(void) { @@ -307,7 +306,6 @@ void ceph_msgr_exit(void) _ceph_msgr_exit(); } -EXPORT_SYMBOL(ceph_msgr_exit); void ceph_msgr_flush(void) { @@ -839,93 +837,112 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; - struct bio *bio; + struct ceph_bio_iter *it = &cursor->bio_iter; - BUG_ON(data->type != CEPH_MSG_DATA_BIO); + cursor->resid = min_t(size_t, length, data->bio_length); + *it = data->bio_pos; + if (cursor->resid < it->iter.bi_size) + it->iter.bi_size = cursor->resid; - bio = data->bio; - BUG_ON(!bio); - - cursor->resid = min(length, data->bio_length); - cursor->bio = bio; - cursor->bvec_iter = bio->bi_iter; - cursor->last_piece = - cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); + BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { - struct ceph_msg_data *data = cursor->data; - struct bio *bio; - struct bio_vec bio_vec; - - BUG_ON(data->type != CEPH_MSG_DATA_BIO); - - bio = cursor->bio; - BUG_ON(!bio); - - bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); - - *page_offset = (size_t) bio_vec.bv_offset; - 
BUG_ON(*page_offset >= PAGE_SIZE); - if (cursor->last_piece) /* pagelist offset is always 0 */ - *length = cursor->resid; - else - *length = (size_t) bio_vec.bv_len; - BUG_ON(*length > cursor->resid); - BUG_ON(*page_offset + *length > PAGE_SIZE); + struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, + cursor->bio_iter.iter); - return bio_vec.bv_page; + *page_offset = bv.bv_offset; + *length = bv.bv_len; + return bv.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { - struct bio *bio; - struct bio_vec bio_vec; + struct ceph_bio_iter *it = &cursor->bio_iter; - BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); + BUG_ON(bytes > cursor->resid); + BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); + cursor->resid -= bytes; + bio_advance_iter(it->bio, &it->iter, bytes); - bio = cursor->bio; - BUG_ON(!bio); + if (!cursor->resid) { + BUG_ON(!cursor->last_piece); + return false; /* no more data */ + } - bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); + if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done)) + return false; /* more bytes to process in this segment */ - /* Advance the cursor offset */ + if (!it->iter.bi_size) { + it->bio = it->bio->bi_next; + it->iter = it->bio->bi_iter; + if (cursor->resid < it->iter.bi_size) + it->iter.bi_size = cursor->resid; + } - BUG_ON(cursor->resid < bytes); - cursor->resid -= bytes; + BUG_ON(cursor->last_piece); + BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); + return true; +} +#endif /* CONFIG_BLOCK */ - bio_advance_iter(bio, &cursor->bvec_iter, bytes); +static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + struct bio_vec *bvecs = data->bvec_pos.bvecs; - if (bytes < bio_vec.bv_len) - return false; /* more bytes to process in this segment */ + cursor->resid = min_t(size_t, length, 
data->bvec_pos.iter.bi_size); + cursor->bvec_iter = data->bvec_pos.iter; + cursor->bvec_iter.bi_size = cursor->resid; - /* Move on to the next segment, and possibly the next bio */ + BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); + cursor->last_piece = + cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); +} - if (!cursor->bvec_iter.bi_size) { - bio = bio->bi_next; - cursor->bio = bio; - if (bio) - cursor->bvec_iter = bio->bi_iter; - else - memset(&cursor->bvec_iter, 0, - sizeof(cursor->bvec_iter)); - } +static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, + size_t *length) +{ + struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, + cursor->bvec_iter); + + *page_offset = bv.bv_offset; + *length = bv.bv_len; + return bv.bv_page; +} + +static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; + + BUG_ON(bytes > cursor->resid); + BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); + cursor->resid -= bytes; + bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); - if (!cursor->last_piece) { - BUG_ON(!cursor->resid); - BUG_ON(!bio); - /* A short read is OK, so use <= rather than == */ - if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) - cursor->last_piece = true; + if (!cursor->resid) { + BUG_ON(!cursor->last_piece); + return false; /* no more data */ } + if (!bytes || cursor->bvec_iter.bi_bvec_done) + return false; /* more bytes to process in this segment */ + + BUG_ON(cursor->last_piece); + BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); + cursor->last_piece = + cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); return true; } -#endif /* CONFIG_BLOCK */ /* * For a page array, a piece comes from the first page in the array @@ -1110,6 +1127,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) 
ceph_msg_data_bio_cursor_init(cursor, length); break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_BVECS: + ceph_msg_data_bvecs_cursor_init(cursor, length); + break; case CEPH_MSG_DATA_NONE: default: /* BUG(); */ @@ -1158,14 +1178,19 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, page = ceph_msg_data_bio_next(cursor, page_offset, length); break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_BVECS: + page = ceph_msg_data_bvecs_next(cursor, page_offset, length); + break; case CEPH_MSG_DATA_NONE: default: page = NULL; break; } + BUG_ON(!page); BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); + BUG_ON(*length > cursor->resid); if (last_piece) *last_piece = cursor->last_piece; @@ -1194,6 +1219,9 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, new_piece = ceph_msg_data_bio_advance(cursor, bytes); break; #endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_BVECS: + new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); + break; case CEPH_MSG_DATA_NONE: default: BUG(); @@ -1575,13 +1603,18 @@ static int write_partial_message_data(struct ceph_connection *con) * been revoked, so use the zero page. */ crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; - while (cursor->resid) { + while (cursor->total_resid) { struct page *page; size_t page_offset; size_t length; bool last_piece; int ret; + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + page = ceph_msg_data_next(cursor, &page_offset, &length, &last_piece); ret = ceph_tcp_sendpage(con->sock, page, page_offset, @@ -2297,7 +2330,12 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = con->in_data_crc; - while (cursor->resid) { + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) { @@ -3262,16 +3300,14 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg, EXPORT_SYMBOL(ceph_msg_data_add_pagelist); #ifdef CONFIG_BLOCK -void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, - size_t length) +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, + u32 length) { struct ceph_msg_data *data; - BUG_ON(!bio); - data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); BUG_ON(!data); - data->bio = bio; + data->bio_pos = *bio_pos; data->bio_length = length; list_add_tail(&data->links, &msg->data); @@ -3280,6 +3316,20 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, EXPORT_SYMBOL(ceph_msg_data_add_bio); #endif /* CONFIG_BLOCK */ +void ceph_msg_data_add_bvecs(struct ceph_msg *msg, + struct ceph_bvec_iter *bvec_pos) +{ + struct ceph_msg_data *data; + + data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS); + BUG_ON(!data); + data->bvec_pos = *bvec_pos; + + list_add_tail(&data->links, &msg->data); + msg->data_length += bvec_pos->iter.bi_size; +} +EXPORT_SYMBOL(ceph_msg_data_add_bvecs); + /* * construct a new message with given type, size * the new msg has a ref count of 1. 
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 1547107f4854..b3dac24412d3 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -60,7 +60,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) num_mon = ceph_decode_32(&p); ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); - if (num_mon >= CEPH_MAX_MON) + if (num_mon > CEPH_MAX_MON) goto bad; m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); if (m == NULL) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2814dba5902d..ea2a6c9fb7ce 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -20,6 +20,7 @@ #include <linux/ceph/decode.h> #include <linux/ceph/auth.h> #include <linux/ceph/pagelist.h> +#include <linux/ceph/striper.h> #define OSD_OPREPLY_FRONT_LEN 512 @@ -103,13 +104,12 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *objnum, u64 *objoff, u64 *objlen) { u64 orig_len = *plen; - int r; + u32 xlen; /* object extent? 
*/ - r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum, - objoff, objlen); - if (r < 0) - return r; + ceph_calc_file_object_mapping(layout, off, orig_len, objnum, + objoff, &xlen); + *objlen = xlen; if (*objlen < orig_len) { *plen = *objlen; dout(" skipping last %llu, final file extent %llu~%llu\n", @@ -117,7 +117,6 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, } dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); - return 0; } @@ -148,14 +147,22 @@ static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, #ifdef CONFIG_BLOCK static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, - struct bio *bio, size_t bio_length) + struct ceph_bio_iter *bio_pos, + u32 bio_length) { osd_data->type = CEPH_OSD_DATA_TYPE_BIO; - osd_data->bio = bio; + osd_data->bio_pos = *bio_pos; osd_data->bio_length = bio_length; } #endif /* CONFIG_BLOCK */ +static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, + struct ceph_bvec_iter *bvec_pos) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_BVECS; + osd_data->bvec_pos = *bvec_pos; +} + #define osd_req_op_data(oreq, whch, typ, fld) \ ({ \ struct ceph_osd_request *__oreq = (oreq); \ @@ -218,16 +225,29 @@ EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, - unsigned int which, struct bio *bio, size_t bio_length) + unsigned int which, + struct ceph_bio_iter *bio_pos, + u32 bio_length) { struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, extent, osd_data); - ceph_osd_data_bio_init(osd_data, bio, bio_length); + ceph_osd_data_bio_init(osd_data, bio_pos, bio_length); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); #endif /* CONFIG_BLOCK */ +void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, + unsigned int which, + struct ceph_bvec_iter *bvec_pos) +{ + struct ceph_osd_data *osd_data; + + osd_data = 
osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_bvecs_init(osd_data, bvec_pos); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); + static void osd_req_op_cls_request_info_pagelist( struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist) @@ -265,6 +285,23 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); +void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, + unsigned int which, + struct bio_vec *bvecs, u32 bytes) +{ + struct ceph_osd_data *osd_data; + struct ceph_bvec_iter it = { + .bvecs = bvecs, + .iter = { .bi_size = bytes }, + }; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_bvecs_init(osd_data, &it); + osd_req->r_ops[which].cls.indata_len += bytes; + osd_req->r_ops[which].indata_len += bytes; +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs); + void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) @@ -290,6 +327,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) case CEPH_OSD_DATA_TYPE_BIO: return (u64)osd_data->bio_length; #endif /* CONFIG_BLOCK */ + case CEPH_OSD_DATA_TYPE_BVECS: + return osd_data->bvec_pos.iter.bi_size; default: WARN(true, "unrecognized data type %d\n", (int)osd_data->type); return 0; @@ -828,8 +867,10 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, ceph_msg_data_add_pagelist(msg, osd_data->pagelist); #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { - ceph_msg_data_add_bio(msg, osd_data->bio, length); + ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length); #endif + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) { + ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos); } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } @@ -5065,7 +5106,7 @@ int 
ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } EXPORT_SYMBOL(ceph_osdc_writepages); -int ceph_osdc_setup(void) +int __init ceph_osdc_setup(void) { size_t size = sizeof(struct ceph_osd_request) + CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); @@ -5076,7 +5117,6 @@ int ceph_osdc_setup(void) return ceph_osd_request_cache ? 0 : -ENOMEM; } -EXPORT_SYMBOL(ceph_osdc_setup); void ceph_osdc_cleanup(void) { @@ -5084,7 +5124,6 @@ void ceph_osdc_cleanup(void) kmem_cache_destroy(ceph_osd_request_cache); ceph_osd_request_cache = NULL; } -EXPORT_SYMBOL(ceph_osdc_cleanup); /* * handle incoming message diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0da27c66349a..9645ffd6acfb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -4,7 +4,6 @@ #include <linux/module.h> #include <linux/slab.h> -#include <asm/div64.h> #include <linux/ceph/libceph.h> #include <linux/ceph/osdmap.h> @@ -2141,76 +2140,6 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting, } /* - * calculate file layout from given offset, length. - * fill in correct oid, logical length, and object extent - * offset, length. - * - * for now, we write only a single su, until we can - * pass a stride back to the caller. 
- */ -int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 len, - u64 *ono, - u64 *oxoff, u64 *oxlen) -{ - u32 osize = layout->object_size; - u32 su = layout->stripe_unit; - u32 sc = layout->stripe_count; - u32 bl, stripeno, stripepos, objsetno; - u32 su_per_object; - u64 t, su_offset; - - dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, - osize, su); - if (su == 0 || sc == 0) - goto invalid; - su_per_object = osize / su; - if (su_per_object == 0) - goto invalid; - dout("osize %u / su %u = su_per_object %u\n", osize, su, - su_per_object); - - if ((su & ~PAGE_MASK) != 0) - goto invalid; - - /* bl = *off / su; */ - t = off; - do_div(t, su); - bl = t; - dout("off %llu / su %u = bl %u\n", off, su, bl); - - stripeno = bl / sc; - stripepos = bl % sc; - objsetno = stripeno / su_per_object; - - *ono = objsetno * sc + stripepos; - dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono); - - /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ - t = off; - su_offset = do_div(t, su); - *oxoff = su_offset + (stripeno % su_per_object) * su; - - /* - * Calculate the length of the extent being written to the selected - * object. This is the minimum of the full length requested (len) or - * the remainder of the current stripe being written to. - */ - *oxlen = min_t(u64, len, su - su_offset); - - dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); - return 0; - -invalid: - dout(" invalid layout\n"); - *ono = 0; - *oxoff = 0; - *oxlen = 0; - return -EINVAL; -} -EXPORT_SYMBOL(ceph_calc_file_object_mapping); - -/* * Map an object into a PG. 
* * Should only be called with target_oid and target_oloc (as opposed to diff --git a/net/ceph/striper.c b/net/ceph/striper.c new file mode 100644 index 000000000000..c36462dc86b7 --- /dev/null +++ b/net/ceph/striper.c @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/ceph/ceph_debug.h> + +#include <linux/math64.h> +#include <linux/slab.h> + +#include <linux/ceph/striper.h> +#include <linux/ceph/types.h> + +/* + * Map a file extent to a stripe unit within an object. + * Fill in objno, offset into object, and object extent length (i.e. the + * number of bytes mapped, less than or equal to @l->stripe_unit). + * + * Example for stripe_count = 3, stripes_per_object = 4: + * + * blockno | 0 3 6 9 | 1 4 7 10 | 2 5 8 11 | 12 15 18 21 | 13 16 19 + * stripeno | 0 1 2 3 | 0 1 2 3 | 0 1 2 3 | 4 5 6 7 | 4 5 6 + * stripepos | 0 | 1 | 2 | 0 | 1 + * objno | 0 | 1 | 2 | 3 | 4 + * objsetno | 0 | 1 + */ +void ceph_calc_file_object_mapping(struct ceph_file_layout *l, + u64 off, u64 len, + u64 *objno, u64 *objoff, u32 *xlen) +{ + u32 stripes_per_object = l->object_size / l->stripe_unit; + u64 blockno; /* which su in the file (i.e. globally) */ + u32 blockoff; /* offset into su */ + u64 stripeno; /* which stripe */ + u32 stripepos; /* which su in the stripe, + which object in the object set */ + u64 objsetno; /* which object set */ + u32 objsetpos; /* which stripe in the object set */ + + blockno = div_u64_rem(off, l->stripe_unit, &blockoff); + stripeno = div_u64_rem(blockno, l->stripe_count, &stripepos); + objsetno = div_u64_rem(stripeno, stripes_per_object, &objsetpos); + + *objno = objsetno * l->stripe_count + stripepos; + *objoff = objsetpos * l->stripe_unit + blockoff; + *xlen = min_t(u64, len, l->stripe_unit - blockoff); +} +EXPORT_SYMBOL(ceph_calc_file_object_mapping); + +/* + * Return the last extent with given objno (@object_extents is sorted + * by objno). 
If not found, return NULL and set @add_pos so that the + * new extent can be added with list_add(add_pos, new_ex). + */ +static struct ceph_object_extent * +lookup_last(struct list_head *object_extents, u64 objno, + struct list_head **add_pos) +{ + struct list_head *pos; + + list_for_each_prev(pos, object_extents) { + struct ceph_object_extent *ex = + list_entry(pos, typeof(*ex), oe_item); + + if (ex->oe_objno == objno) + return ex; + + if (ex->oe_objno < objno) + break; + } + + *add_pos = pos; + return NULL; +} + +static struct ceph_object_extent * +lookup_containing(struct list_head *object_extents, u64 objno, + u64 objoff, u32 xlen) +{ + struct ceph_object_extent *ex; + + list_for_each_entry(ex, object_extents, oe_item) { + if (ex->oe_objno == objno && + ex->oe_off <= objoff && + ex->oe_off + ex->oe_len >= objoff + xlen) /* paranoia */ + return ex; + + if (ex->oe_objno > objno) + break; + } + + return NULL; +} + +/* + * Map a file extent to a sorted list of object extents. + * + * We want only one (or as few as possible) object extents per object. + * Adjacent object extents will be merged together, each returned object + * extent may reverse map to multiple different file extents. + * + * Call @alloc_fn for each new object extent and @action_fn for each + * mapped stripe unit, whether it was merged into an already allocated + * object extent or started a new object extent. + * + * Newly allocated object extents are added to @object_extents. + * To keep @object_extents sorted, successive calls to this function + * must map successive file extents (i.e. the list of file extents that + * are mapped using the same @object_extents must be sorted). + * + * The caller is responsible for @object_extents. 
+ */ +int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len, + struct list_head *object_extents, + struct ceph_object_extent *alloc_fn(void *arg), + void *alloc_arg, + ceph_object_extent_fn_t action_fn, + void *action_arg) +{ + struct ceph_object_extent *last_ex, *ex; + + while (len) { + struct list_head *add_pos = NULL; + u64 objno, objoff; + u32 xlen; + + ceph_calc_file_object_mapping(l, off, len, &objno, &objoff, + &xlen); + + last_ex = lookup_last(object_extents, objno, &add_pos); + if (!last_ex || last_ex->oe_off + last_ex->oe_len != objoff) { + ex = alloc_fn(alloc_arg); + if (!ex) + return -ENOMEM; + + ex->oe_objno = objno; + ex->oe_off = objoff; + ex->oe_len = xlen; + if (action_fn) + action_fn(ex, xlen, action_arg); + + if (!last_ex) + list_add(&ex->oe_item, add_pos); + else + list_add(&ex->oe_item, &last_ex->oe_item); + } else { + last_ex->oe_len += xlen; + if (action_fn) + action_fn(last_ex, xlen, action_arg); + } + + off += xlen; + len -= xlen; + } + + for (last_ex = list_first_entry(object_extents, typeof(*ex), oe_item), + ex = list_next_entry(last_ex, oe_item); + &ex->oe_item != object_extents; + last_ex = ex, ex = list_next_entry(ex, oe_item)) { + if (last_ex->oe_objno > ex->oe_objno || + (last_ex->oe_objno == ex->oe_objno && + last_ex->oe_off + last_ex->oe_len >= ex->oe_off)) { + WARN(1, "%s: object_extents list not sorted!\n", + __func__); + return -EINVAL; + } + } + + return 0; +} +EXPORT_SYMBOL(ceph_file_to_extents); + +/* + * A stripped down, non-allocating version of ceph_file_to_extents(), + * for when @object_extents is already populated. 
+ */ +int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len, + struct list_head *object_extents, + ceph_object_extent_fn_t action_fn, + void *action_arg) +{ + while (len) { + struct ceph_object_extent *ex; + u64 objno, objoff; + u32 xlen; + + ceph_calc_file_object_mapping(l, off, len, &objno, &objoff, + &xlen); + + ex = lookup_containing(object_extents, objno, objoff, xlen); + if (!ex) { + WARN(1, "%s: objno %llu %llu~%u not found!\n", + __func__, objno, objoff, xlen); + return -EINVAL; + } + + action_fn(ex, xlen, action_arg); + + off += xlen; + len -= xlen; + } + + return 0; +} +EXPORT_SYMBOL(ceph_iterate_extents); + +/* + * Reverse map an object extent to a sorted list of file extents. + * + * On success, the caller is responsible for: + * + * kfree(file_extents) + */ +int ceph_extent_to_file(struct ceph_file_layout *l, + u64 objno, u64 objoff, u64 objlen, + struct ceph_file_extent **file_extents, + u32 *num_file_extents) +{ + u32 stripes_per_object = l->object_size / l->stripe_unit; + u64 blockno; /* which su */ + u32 blockoff; /* offset into su */ + u64 stripeno; /* which stripe */ + u32 stripepos; /* which su in the stripe, + which object in the object set */ + u64 objsetno; /* which object set */ + u32 i = 0; + + if (!objlen) { + *file_extents = NULL; + *num_file_extents = 0; + return 0; + } + + *num_file_extents = DIV_ROUND_UP_ULL(objoff + objlen, l->stripe_unit) - + DIV_ROUND_DOWN_ULL(objoff, l->stripe_unit); + *file_extents = kmalloc_array(*num_file_extents, sizeof(**file_extents), + GFP_NOIO); + if (!*file_extents) + return -ENOMEM; + + div_u64_rem(objoff, l->stripe_unit, &blockoff); + while (objlen) { + u64 off, len; + + objsetno = div_u64_rem(objno, l->stripe_count, &stripepos); + stripeno = div_u64(objoff, l->stripe_unit) + + objsetno * stripes_per_object; + blockno = stripeno * l->stripe_count + stripepos; + off = blockno * l->stripe_unit + blockoff; + len = min_t(u64, objlen, l->stripe_unit - blockoff); + + (*file_extents)[i].fe_off = 
off; + (*file_extents)[i].fe_len = len; + + blockoff = 0; + objoff += len; + objlen -= len; + i++; + } + + BUG_ON(i != *num_file_extents); + return 0; +} +EXPORT_SYMBOL(ceph_extent_to_file); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a8772a978224..9c169bb2444d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -781,8 +781,14 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) tunnel->encap.type == TUNNEL_ENCAP_NONE) { dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } else { + dev->features &= ~NETIF_F_GSO_SOFTWARE; + dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } dev->features |= NETIF_F_LLTX; + } else { + dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; + dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE); } } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 14b67dfacc4b..0fbd3ee26165 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -335,26 +335,6 @@ err_tlock: } EXPORT_SYMBOL_GPL(l2tp_session_register); -/* Lookup a tunnel by id - */ -struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id) -{ - struct l2tp_tunnel *tunnel; - struct l2tp_net *pn = l2tp_pernet(net); - - rcu_read_lock_bh(); - list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (tunnel->tunnel_id == tunnel_id) { - rcu_read_unlock_bh(); - return tunnel; - } - } - rcu_read_unlock_bh(); - - return NULL; -} -EXPORT_SYMBOL_GPL(l2tp_tunnel_find); - struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth) { struct l2tp_net *pn = l2tp_pernet(net); @@ -1436,74 +1416,11 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 { struct l2tp_tunnel *tunnel = NULL; int err; - struct socket *sock = NULL; - struct sock *sk = NULL; - struct l2tp_net *pn; enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP; - /* Get the tunnel socket from the fd, which was opened by - * the userspace L2TP daemon. If not specified, create a - * kernel socket. 
- */ - if (fd < 0) { - err = l2tp_tunnel_sock_create(net, tunnel_id, peer_tunnel_id, - cfg, &sock); - if (err < 0) - goto err; - } else { - sock = sockfd_lookup(fd, &err); - if (!sock) { - pr_err("tunl %u: sockfd_lookup(fd=%d) returned %d\n", - tunnel_id, fd, err); - err = -EBADF; - goto err; - } - - /* Reject namespace mismatches */ - if (!net_eq(sock_net(sock->sk), net)) { - pr_err("tunl %u: netns mismatch\n", tunnel_id); - err = -EINVAL; - goto err; - } - } - - sk = sock->sk; - if (cfg != NULL) encap = cfg->encap; - /* Quick sanity checks */ - err = -EPROTONOSUPPORT; - if (sk->sk_type != SOCK_DGRAM) { - pr_debug("tunl %hu: fd %d wrong socket type\n", - tunnel_id, fd); - goto err; - } - switch (encap) { - case L2TP_ENCAPTYPE_UDP: - if (sk->sk_protocol != IPPROTO_UDP) { - pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n", - tunnel_id, fd, sk->sk_protocol, IPPROTO_UDP); - goto err; - } - break; - case L2TP_ENCAPTYPE_IP: - if (sk->sk_protocol != IPPROTO_L2TP) { - pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n", - tunnel_id, fd, sk->sk_protocol, IPPROTO_L2TP); - goto err; - } - break; - } - - /* Check if this socket has already been prepped */ - tunnel = l2tp_tunnel(sk); - if (tunnel != NULL) { - /* This socket has already been prepped */ - err = -EBUSY; - goto err; - } - tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL); if (tunnel == NULL) { err = -ENOMEM; @@ -1520,72 +1437,126 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 rwlock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; - /* The net we belong to */ - tunnel->l2tp_net = net; - pn = l2tp_pernet(net); - if (cfg != NULL) tunnel->debug = cfg->debug; - /* Mark socket as an encapsulation socket. 
See net/ipv4/udp.c */ tunnel->encap = encap; - if (encap == L2TP_ENCAPTYPE_UDP) { - struct udp_tunnel_sock_cfg udp_cfg = { }; - - udp_cfg.sk_user_data = tunnel; - udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP; - udp_cfg.encap_rcv = l2tp_udp_encap_recv; - udp_cfg.encap_destroy = l2tp_udp_encap_destroy; - - setup_udp_tunnel_sock(net, sock, &udp_cfg); - } else { - sk->sk_user_data = tunnel; - } - /* Bump the reference count. The tunnel context is deleted - * only when this drops to zero. A reference is also held on - * the tunnel socket to ensure that it is not released while - * the tunnel is extant. Must be done before sk_destruct is - * set. - */ refcount_set(&tunnel->ref_count, 1); - sock_hold(sk); - tunnel->sock = sk; tunnel->fd = fd; - /* Hook on the tunnel socket destructor so that we can cleanup - * if the tunnel socket goes away. - */ - tunnel->old_sk_destruct = sk->sk_destruct; - sk->sk_destruct = &l2tp_tunnel_destruct; - lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock"); - - sk->sk_allocation = GFP_ATOMIC; - /* Init delete workqueue struct */ INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work); - /* Add tunnel to our list */ INIT_LIST_HEAD(&tunnel->list); - spin_lock_bh(&pn->l2tp_tunnel_list_lock); - list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); - spin_unlock_bh(&pn->l2tp_tunnel_list_lock); err = 0; err: if (tunnelp) *tunnelp = tunnel; - /* If tunnel's socket was created by the kernel, it doesn't - * have a file. 
- */ - if (sock && sock->file) - sockfd_put(sock); - return err; } EXPORT_SYMBOL_GPL(l2tp_tunnel_create); +static int l2tp_validate_socket(const struct sock *sk, const struct net *net, + enum l2tp_encap_type encap) +{ + if (!net_eq(sock_net(sk), net)) + return -EINVAL; + + if (sk->sk_type != SOCK_DGRAM) + return -EPROTONOSUPPORT; + + if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) || + (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP)) + return -EPROTONOSUPPORT; + + if (sk->sk_user_data) + return -EBUSY; + + return 0; +} + +int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, + struct l2tp_tunnel_cfg *cfg) +{ + struct l2tp_tunnel *tunnel_walk; + struct l2tp_net *pn; + struct socket *sock; + struct sock *sk; + int ret; + + if (tunnel->fd < 0) { + ret = l2tp_tunnel_sock_create(net, tunnel->tunnel_id, + tunnel->peer_tunnel_id, cfg, + &sock); + if (ret < 0) + goto err; + } else { + sock = sockfd_lookup(tunnel->fd, &ret); + if (!sock) + goto err; + + ret = l2tp_validate_socket(sock->sk, net, tunnel->encap); + if (ret < 0) + goto err_sock; + } + + sk = sock->sk; + + sock_hold(sk); + tunnel->sock = sk; + tunnel->l2tp_net = net; + + pn = l2tp_pernet(net); + + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_for_each_entry(tunnel_walk, &pn->l2tp_tunnel_list, list) { + if (tunnel_walk->tunnel_id == tunnel->tunnel_id) { + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + + ret = -EEXIST; + goto err_sock; + } + } + list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { + struct udp_tunnel_sock_cfg udp_cfg = { + .sk_user_data = tunnel, + .encap_type = UDP_ENCAP_L2TPINUDP, + .encap_rcv = l2tp_udp_encap_recv, + .encap_destroy = l2tp_udp_encap_destroy, + }; + + setup_udp_tunnel_sock(net, sock, &udp_cfg); + } else { + sk->sk_user_data = tunnel; + } + + tunnel->old_sk_destruct = sk->sk_destruct; + sk->sk_destruct = &l2tp_tunnel_destruct; + 
lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, + "l2tp_sock"); + sk->sk_allocation = GFP_ATOMIC; + + if (tunnel->fd >= 0) + sockfd_put(sock); + + return 0; + +err_sock: + if (tunnel->fd < 0) + sock_release(sock); + else + sockfd_put(sock); +err: + return ret; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_register); + /* This function is used by the netlink TUNNEL_DELETE command. */ void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 2718d0b284d0..ba33cbec71eb 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -220,12 +220,14 @@ struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); -struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth); int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); +int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, + struct l2tp_tunnel_cfg *cfg); + void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index e7ea9c4b89ff..b05dbd9ffcb2 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -236,12 +236,6 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info if (info->attrs[L2TP_ATTR_DEBUG]) cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel != NULL) { - ret = -EEXIST; - goto out; - } - ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: @@ -251,9 +245,19 @@ static int 
l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info break; } - if (ret >= 0) - ret = l2tp_tunnel_notify(&l2tp_nl_family, info, - tunnel, L2TP_CMD_TUNNEL_CREATE); + if (ret < 0) + goto out; + + l2tp_tunnel_inc_refcount(tunnel); + ret = l2tp_tunnel_register(tunnel, net, &cfg); + if (ret < 0) { + kfree(tunnel); + goto out; + } + ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, + L2TP_CMD_TUNNEL_CREATE); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index d6deca11da19..896bbca9bdaa 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -698,6 +698,15 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); if (error < 0) goto end; + + l2tp_tunnel_inc_refcount(tunnel); + error = l2tp_tunnel_register(tunnel, sock_net(sk), + &tcfg); + if (error < 0) { + kfree(tunnel); + goto end; + } + drop_tunnel = true; } } else { /* Error if we can't find the tunnel */ diff --git a/net/rds/send.c b/net/rds/send.c index acad04243b41..94c7f74909be 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -1017,10 +1017,15 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) if (conn->c_npaths == 0 && hash != 0) { rds_send_ping(conn, 0); - if (conn->c_npaths == 0) { - wait_event_interruptible(conn->c_hs_waitq, - (conn->c_npaths != 0)); - } + /* The underlying connection is not up yet. Need to wait + * until it is up to be sure that the non-zero c_path can be + * used. 
But if we are interrupted, we have to use the zero + * c_path in case the connection ends up being non-MP capable. + */ + if (conn->c_npaths == 0) + if (wait_event_interruptible(conn->c_hs_waitq, + conn->c_npaths != 0)) + hash = 0; if (conn->c_npaths == 1) hash = 0; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 806395687bb6..c2266f387213 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1887,7 +1887,7 @@ call_connect_status(struct rpc_task *task) dprint_status(task); - trace_rpc_connect_status(task, status); + trace_rpc_connect_status(task); task->tk_status = 0; switch (status) { case -ECONNREFUSED: @@ -2014,6 +2014,9 @@ call_transmit_status(struct rpc_task *task) case -EPERM: if (RPC_IS_SOFTCONN(task)) { xprt_end_transmit(task); + if (!task->tk_msg.rpc_proc->p_proc) + trace_xprt_ping(task->tk_xprt, + task->tk_status); rpc_exit(task, task->tk_status); break; } @@ -2112,6 +2115,9 @@ call_status(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; int status; + if (!task->tk_msg.rpc_proc->p_proc) + trace_xprt_ping(task->tk_xprt, task->tk_status); + if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) task->tk_status = req->rq_reply_bytes_recvd; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index d9db2eab3a8d..3fe5d60ab0e2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,7 +276,7 @@ static void rpc_set_active(struct rpc_task *task) { rpc_task_set_debuginfo(task); set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); - trace_rpc_task_begin(task->tk_client, task, NULL); + trace_rpc_task_begin(task, NULL); } /* @@ -291,7 +291,7 @@ static int rpc_complete_task(struct rpc_task *task) unsigned long flags; int ret; - trace_rpc_task_complete(task->tk_client, task, NULL); + trace_rpc_task_complete(task, NULL); spin_lock_irqsave(&wq->lock, flags); clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); @@ -358,7 +358,7 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", 
task->tk_pid, rpc_qname(q), jiffies); - trace_rpc_task_sleep(task->tk_client, task, q); + trace_rpc_task_sleep(task, q); __rpc_add_wait_queue(q, task, queue_priority); @@ -428,7 +428,7 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, return; } - trace_rpc_task_wakeup(task->tk_client, task, queue); + trace_rpc_task_wakeup(task, queue); __rpc_remove_wait_queue(queue, task); @@ -780,7 +780,7 @@ static void __rpc_execute(struct rpc_task *task) } if (!do_action) break; - trace_rpc_task_run_action(task->tk_client, task, do_action); + trace_rpc_task_run_action(task, do_action); do_action(task); /* diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 1e671333c3d5..f68aa46c9dd7 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -24,6 +24,8 @@ #include <linux/sunrpc/metrics.h> #include <linux/rcupdate.h> +#include <trace/events/sunrpc.h> + #include "netns.h" #define RPCDBG_FACILITY RPCDBG_MISC @@ -148,7 +150,7 @@ void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; - ktime_t delta, now; + ktime_t backlog, execute, now; if (!op_metrics || !req) return; @@ -164,16 +166,20 @@ void rpc_count_iostats_metrics(const struct rpc_task *task, op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; + backlog = 0; if (ktime_to_ns(req->rq_xtime)) { - delta = ktime_sub(req->rq_xtime, task->tk_start); - op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); + backlog = ktime_sub(req->rq_xtime, task->tk_start); + op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog); } + op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); - delta = ktime_sub(now, task->tk_start); - op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta); + execute = ktime_sub(now, task->tk_start); + op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); spin_unlock(&op_metrics->om_lock); + + 
trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute); } EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index f2b7cb540e61..09a0315ea77b 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -37,12 +37,6 @@ struct rpc_buffer { char data[]; }; -static inline int rpc_reply_expected(struct rpc_task *task) -{ - return (task->tk_msg.rpc_proc != NULL) && - (task->tk_msg.rpc_proc->p_decode != NULL); -} - static inline int sock_is_loopback(struct sock *sk) { struct dst_entry *dst; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e34f4ee7f2b6..30afbd236656 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1519,6 +1519,88 @@ out: EXPORT_SYMBOL_GPL(xdr_process_buf); /** + * xdr_stream_decode_opaque - Decode variable length opaque + * @xdr: pointer to xdr_stream + * @ptr: location to store opaque data + * @size: size of storage buffer @ptr + * + * Return values: + * On success, returns size of object stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE on overflow of storage buffer @ptr + */ +ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size) +{ + ssize_t ret; + void *p; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, size); + if (ret <= 0) + return ret; + memcpy(ptr, p, ret); + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque); + +/** + * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque + * @xdr: pointer to xdr_stream + * @ptr: location to store pointer to opaque data + * @maxlen: maximum acceptable object size + * @gfp_flags: GFP mask to use + * + * Return values: + * On success, returns size of object stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the object would exceed @maxlen + * %-ENOMEM on memory allocation failure + */ +ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, + size_t maxlen, gfp_t gfp_flags) +{ + ssize_t ret; + void *p; + + ret 
= xdr_stream_decode_opaque_inline(xdr, &p, maxlen); + if (ret > 0) { + *ptr = kmemdup(p, ret, gfp_flags); + if (*ptr != NULL) + return ret; + ret = -ENOMEM; + } + *ptr = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup); + +/** + * xdr_stream_decode_string - Decode variable length string + * @xdr: pointer to xdr_stream + * @str: location to store string + * @size: size of storage buffer @str + * + * Return values: + * On success, returns length of NUL-terminated string stored in *@str + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE on overflow of storage buffer @str + */ +ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size) +{ + ssize_t ret; + void *p; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, size); + if (ret > 0) { + memcpy(str, p, ret); + str[ret] = '\0'; + return strlen(str); + } + *str = '\0'; + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_string); + +/** * xdr_stream_decode_string_dup - Decode and duplicate variable length string * @xdr: pointer to xdr_stream * @str: location to store pointer to string diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8f0ad4f268da..70f005044f06 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -826,6 +826,7 @@ static void xprt_connect_status(struct rpc_task *task) * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * + * Caller holds xprt->recv_lock. 
*/ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { @@ -834,6 +835,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) list_for_each_entry(entry, &xprt->recv, rq_list) if (entry->rq_xid == xid) { trace_xprt_lookup_rqst(xprt, xid, 0); + entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); return entry; } @@ -889,7 +891,13 @@ __must_hold(&req->rq_xprt->recv_lock) } } -static void xprt_update_rtt(struct rpc_task *task) +/** + * xprt_update_rtt - Update RPC RTT statistics + * @task: RPC request that recently completed + * + * Caller holds xprt->recv_lock. + */ +void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_rtt *rtt = task->tk_client->cl_rtt; @@ -902,13 +910,14 @@ static void xprt_update_rtt(struct rpc_task *task) rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); } } +EXPORT_SYMBOL_GPL(xprt_update_rtt); /** * xprt_complete_rqst - called when reply processing is complete * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * - * Caller holds transport lock. + * Caller holds xprt->recv_lock. 
*/ void xprt_complete_rqst(struct rpc_task *task, int copied) { @@ -920,9 +929,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) trace_xprt_complete_rqst(xprt, req->rq_xid, copied); xprt->stat.recvs++; - req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime); - if (xprt->ops->timer != NULL) - xprt_update_rtt(task); list_del_init(&req->rq_list); req->rq_private_buf.len = copied; @@ -1003,7 +1009,7 @@ void xprt_transmit(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; unsigned int connect_cookie; - int status, numreqs; + int status; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); @@ -1027,7 +1033,6 @@ void xprt_transmit(struct rpc_task *task) return; connect_cookie = xprt->connect_cookie; - req->rq_xtime = ktime_get(); status = xprt->ops->send_request(task); trace_xprt_transmit(xprt, req->rq_xid, status); if (status != 0) { @@ -1042,9 +1047,6 @@ void xprt_transmit(struct rpc_task *task) xprt->ops->set_retrans_timeout(task); - numreqs = atomic_read(&xprt->num_reqs); - if (numreqs > xprt->stat.max_slots) - xprt->stat.max_slots = numreqs; xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; @@ -1106,14 +1108,15 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) { struct rpc_rqst *req = ERR_PTR(-EAGAIN); - if (!atomic_add_unless(&xprt->num_reqs, 1, xprt->max_reqs)) + if (xprt->num_reqs >= xprt->max_reqs) goto out; + ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; - atomic_dec(&xprt->num_reqs); + --xprt->num_reqs; req = ERR_PTR(-ENOMEM); out: return req; @@ -1121,7 +1124,8 @@ out: static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { - if (atomic_add_unless(&xprt->num_reqs, -1, xprt->min_reqs)) { + if (xprt->num_reqs > xprt->min_reqs) { + --xprt->num_reqs; 
kfree(req); return true; } @@ -1157,6 +1161,8 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock(&xprt->reserve_lock); return; out_init_req: + xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots, + xprt->num_reqs); task->tk_status = 0; task->tk_rqstp = req; xprt_request_init(task, xprt); @@ -1224,7 +1230,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, else xprt->max_reqs = num_prealloc; xprt->min_reqs = num_prealloc; - atomic_set(&xprt->num_reqs, num_prealloc); + xprt->num_reqs = num_prealloc; return xprt; diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index ed1a4a3065ee..47ebac949769 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -44,13 +44,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, if (IS_ERR(req)) return PTR_ERR(req); - rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, - DMA_TO_DEVICE, GFP_KERNEL); - if (IS_ERR(rb)) - goto out_fail; - req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); - size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index d5f95bb39300..5cc68a824f45 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -191,7 +191,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = rpcrdma_mr_get(r_xprt); if (!mr) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-EAGAIN); pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -251,6 +251,16 @@ out_maperr: return ERR_PTR(-EIO); } +/* Post Send WR containing the RPC Call message. 
+ */ +static int +fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *bad_wr; + + return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr); +} + /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the @@ -305,6 +315,7 @@ out_reset: const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 90f688f19783..c5743a0960be 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -357,8 +357,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_mr *mr; struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; - struct ib_send_wr *bad_wr; - int rc, i, n; + int i, n; u8 key; mr = NULL; @@ -367,7 +366,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, rpcrdma_mr_defer_recovery(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-EAGAIN); } while (mr->frwr.fr_state != FRWR_IS_INVALID); frwr = &mr->frwr; frwr->fr_state = FRWR_IS_VALID; @@ -407,22 +406,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, ib_update_fast_reg_key(ibmr, ++key); reg_wr = &frwr->fr_regwr; - reg_wr->wr.next = NULL; - reg_wr->wr.opcode = IB_WR_REG_MR; - frwr->fr_cqe.done = frwr_wc_fastreg; - reg_wr->wr.wr_cqe = &frwr->fr_cqe; - reg_wr->wr.num_sge = 0; - reg_wr->wr.send_flags = 0; reg_wr->mr = ibmr; reg_wr->key = ibmr->rkey; reg_wr->access = writing ? 
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; - rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr); - if (rc) - goto out_senderr; - mr->mr_handle = ibmr->rkey; mr->mr_length = ibmr->length; mr->mr_offset = ibmr->iova; @@ -442,11 +431,40 @@ out_mapmr_err: frwr->fr_mr, n, mr->mr_nents); rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-EIO); +} -out_senderr: - pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc); - rpcrdma_mr_defer_recovery(mr); - return ERR_PTR(-ENOTCONN); +/* Post Send WR containing the RPC Call message. + * + * For FRMR, chain any FastReg WRs to the Send WR. Only a + * single ib_post_send call is needed to register memory + * and then post the Send WR. + */ +static int +frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *post_wr, *bad_wr; + struct rpcrdma_mr *mr; + + post_wr = &req->rl_sendctx->sc_wr; + list_for_each_entry(mr, &req->rl_registered, mr_list) { + struct rpcrdma_frwr *frwr; + + frwr = &mr->frwr; + + frwr->fr_cqe.done = frwr_wc_fastreg; + frwr->fr_regwr.wr.next = post_wr; + frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; + frwr->fr_regwr.wr.num_sge = 0; + frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; + frwr->fr_regwr.wr.send_flags = 0; + + post_wr = &frwr->fr_regwr.wr; + } + + /* If ib_post_send fails, the next ->send_request for + * @req will queue these MWs for recovery.
+ */ + return ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); } /* Handle a remotely invalidated mr on the @mrs list @@ -561,6 +579,7 @@ reset_mrs: const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, .ro_recover_mr = frwr_op_recover_mr, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f0855a959a27..e8adad33d0bb 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -365,7 +365,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) @@ -377,6 +377,11 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } while (nsegs); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /* Register and XDR encode the Write list. Supports encoding a list @@ -423,7 +428,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) @@ -440,6 +445,11 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, *segcount = cpu_to_be32(nchunks); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /* Register and XDR encode the Reply chunk. 
Supports encoding an array @@ -481,7 +491,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) @@ -498,6 +508,11 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, *segcount = cpu_to_be32(nchunks); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /** @@ -724,8 +739,8 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, * Returns: * %0 if the RPC was sent successfully, * %-ENOTCONN if the connection was lost, - * %-EAGAIN if not enough pages are available for on-demand reply buffer, - * %-ENOBUFS if no MRs are available to register chunks, + * %-EAGAIN if the caller should call again with the same arguments, + * %-ENOBUFS if the caller should call again after a delay, * %-EMSGSIZE if the transport header is too small, * %-EIO if a permanent problem occurred while marshaling. 
*/ @@ -868,10 +883,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) return 0; out_err: - if (ret != -ENOBUFS) { - pr_err("rpcrdma: header marshaling failed (%d)\n", ret); - r_xprt->rx_stats.failed_marshal_count++; - } + r_xprt->rx_stats.failed_marshal_count++; return ret; } @@ -1366,7 +1378,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); - queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); + queue_work(rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 4b1ecfe979cf..cc1aad325496 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -52,7 +52,6 @@ #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/sunrpc/addr.h> -#include <linux/smp.h> #include "xprt_rdma.h" @@ -237,8 +236,6 @@ rpcrdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->rx_xprt; spin_lock_bh(&xprt->transport_lock); - if (++xprt->connect_cookie == 0) /* maintain a reserved value */ - ++xprt->connect_cookie; if (ep->rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) xprt_wake_pending_tasks(xprt, 0); @@ -540,29 +537,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -/* Allocate a fixed-size buffer in which to construct and send the - * RPC-over-RDMA header for this request. 
- */ -static bool -rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - gfp_t flags) -{ - size_t size = RPCRDMA_HDRBUF_SIZE; - struct rpcrdma_regbuf *rb; - - if (req->rl_rdmabuf) - return true; - - rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); - if (IS_ERR(rb)) - return false; - - r_xprt->rx_stats.hardway_register_count += size; - req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); - return true; -} - static bool rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, size_t size, gfp_t flags) @@ -644,15 +618,11 @@ xprt_rdma_allocate(struct rpc_task *task) if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; - if (!rpcrdma_get_rdmabuf(r_xprt, req, flags)) - goto out_fail; if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) goto out_fail; if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - req->rl_cpu = smp_processor_id(); - req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; @@ -694,7 +664,8 @@ xprt_rdma_free(struct rpc_task *task) * Returns: * %0 if the RPC message has been sent * %-ENOTCONN if the caller should reconnect and call again - * %-ENOBUFS if the caller should call again later + * %-EAGAIN if the caller should call again + * %-ENOBUFS if the caller should call again after a delay * %-EIO if a permanent error occurred and the request was not * sent. Do not try to send this message again. 
*/ @@ -723,9 +694,9 @@ xprt_rdma_send_request(struct rpc_task *task) rpcrdma_recv_buffer_get(req); /* Must suppress retransmit to maintain credits */ - if (req->rl_connect_cookie == xprt->connect_cookie) + if (rqst->rq_connect_cookie == xprt->connect_cookie) goto drop_connection; - req->rl_connect_cookie = xprt->connect_cookie; + rqst->rq_xtime = ktime_get(); __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) @@ -733,6 +704,12 @@ xprt_rdma_send_request(struct rpc_task *task) rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; rqst->rq_bytes_sent = 0; + + /* An RPC with no reply will throw off credit accounting, + * so drop the connection to reset the credit grant. + */ + if (!rpc_reply_expected(task)) + goto drop_connection; return 0; failed_marshal: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e6f84a6434a0..fe5eaca2d197 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -250,11 +250,11 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; - ia->ri_pd = NULL; ia->ri_device = NULL; /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: + ++xprt->rx_xprt.connect_cookie; connstate = 1; rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; @@ -273,6 +273,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -EAGAIN; goto connected; case RDMA_CM_EVENT_DISCONNECTED: + ++xprt->rx_xprt.connect_cookie; connstate = -ECONNABORTED; connected: xprt->rx_buf.rb_credits = 1; @@ -445,7 +446,9 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) ia->ri_id->qp = NULL; } ib_free_cq(ep->rep_attr.recv_cq); + ep->rep_attr.recv_cq = NULL; ib_free_cq(ep->rep_attr.send_cq); + ep->rep_attr.send_cq = NULL; /* The ULP is responsible for ensuring all DMA * mappings and MRs are gone. 
@@ -458,6 +461,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); } rpcrdma_mrs_destroy(buf); + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; /* Allow waiters to continue */ complete(&ia->ri_remove_done); @@ -589,11 +594,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ - ep->rep_remote_cma.responder_resources = 32; - else - ep->rep_remote_cma.responder_resources = - ia->ri_device->attrs.max_qp_rd_atom; + ep->rep_remote_cma.responder_resources = + min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing @@ -628,14 +630,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { cancel_delayed_work_sync(&ep->rep_connect_worker); - if (ia->ri_id->qp) { + if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } - ib_free_cq(ep->rep_attr.recv_cq); - ib_free_cq(ep->rep_attr.send_cq); + if (ep->rep_attr.recv_cq) + ib_free_cq(ep->rep_attr.recv_cq); + if (ep->rep_attr.send_cq) + ib_free_cq(ep->rep_attr.send_cq); } /* Re-establish a connection after a device removal event. 
@@ -1024,7 +1028,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) LIST_HEAD(free); LIST_HEAD(all); - for (count = 0; count < 32; count++) { + for (count = 0; count < 3; count++) { struct rpcrdma_mr *mr; int rc; @@ -1049,8 +1053,9 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; spin_unlock(&buf->rb_mrlock); - trace_xprtrdma_createmrs(r_xprt, count); + + xprt_write_space(&r_xprt->rx_xprt); } static void @@ -1068,17 +1073,27 @@ struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; + struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; req = kzalloc(sizeof(*req), GFP_KERNEL); if (req == NULL) return ERR_PTR(-ENOMEM); + rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, + DMA_TO_DEVICE, GFP_KERNEL); + if (IS_ERR(rb)) { + kfree(req); + return ERR_PTR(-ENOMEM); + } + req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); + req->rl_buffer = buffer; + INIT_LIST_HEAD(&req->rl_registered); + spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); - req->rl_buffer = &r_xprt->rx_buf; - INIT_LIST_HEAD(&req->rl_registered); return req; } @@ -1535,7 +1550,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_req *req) { struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; - struct ib_send_wr *send_wr_fail; int rc; if (req->rl_reply) { @@ -1554,7 +1568,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, --ep->rep_send_count; } - rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); + rc = ia->ri_ops->ro_send(ia, req); trace_xprtrdma_post_send(req, rc); if (rc) return -ENOTCONN; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 69883a960a3f..3d3b423fa9c1 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -334,8 +334,6 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct 
list_head rl_list; - int rl_cpu; - unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; @@ -474,6 +472,8 @@ struct rpcrdma_memreg_ops { (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mr **); + int (*ro_send)(struct rpcrdma_ia *ia, + struct rpcrdma_req *req); void (*ro_reminv)(struct rpcrdma_rep *rep, struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 956e29c1438d..c8902f11efdd 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -527,6 +527,7 @@ static int xs_local_send_request(struct rpc_task *task) xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, true, &sent); dprintk("RPC: %s(%u) = %d\n", @@ -589,6 +590,7 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, xdr, req->rq_bytes_sent, true, &sent); @@ -678,6 +680,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ + req->rq_xtime = ktime_get(); while (1) { sent = 0; status = xs_sendpages(transport->sock, NULL, 0, xdr, @@ -1060,6 +1063,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); + xprt_update_rtt(rovr->rq_task); spin_unlock(&xprt->recv_lock); task = rovr->rq_task; |