diff options
Diffstat (limited to 'net/ceph')
-rw-r--r-- | net/ceph/auth_x.c | 49 | ||||
-rw-r--r-- | net/ceph/auth_x.h | 2 | ||||
-rw-r--r-- | net/ceph/crush/mapper.c | 33 | ||||
-rw-r--r-- | net/ceph/messenger.c | 105 | ||||
-rw-r--r-- | net/ceph/mon_client.c | 4 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 10 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 19 |
7 files changed, 159 insertions, 63 deletions
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 10d87753ed87..9e43a315e662 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac, void *ticket_buf = NULL; void *tp, *tpend; void **ptp; - struct ceph_timespec new_validity; struct ceph_crypto_key new_session_key; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; @@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); + ceph_decode_timespec(&validity, dp); + dp += sizeof(struct ceph_timespec); new_expires = get_seconds() + validity.tv_sec; new_renew_after = new_expires - (validity.tv_sec / 4); dout(" expires=%lu renew_after=%lu\n", new_expires, @@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, ceph_buffer_put(th->ticket_blob); th->session_key = new_session_key; th->ticket_blob = new_ticket_blob; - th->validity = new_validity; th->secret_id = new_secret_id; th->expires = new_expires; th->renew_after = new_renew_after; + th->have_key = true; dout(" got ticket service %d (%s) secret_id %lld len %d\n", type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); @@ -384,6 +383,24 @@ bad: return -ERANGE; } +static bool need_key(struct ceph_x_ticket_handler *th) +{ + if (!th->have_key) + return true; + + return get_seconds() >= th->renew_after; +} + +static bool have_key(struct ceph_x_ticket_handler *th) +{ + if (th->have_key) { + if (get_seconds() >= th->expires) + th->have_key = false; + } + + return th->have_key; +} + static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) { int want = ac->want_keys; @@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) continue; th = get_ticket_handler(ac, service); - if (IS_ERR(th)) { *pneed |= service; continue; } - if (get_seconds() >= th->renew_after) + if (need_key(th)) *pneed |= service; - if (get_seconds() >= th->expires) + if (!have_key(th)) xi->have_keys &= ~service; } } - static int ceph_x_build_request(struct ceph_auth_client *ac, void *buf, void *end) { @@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) ac->private = NULL; } -static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, - int peer_type) +static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) { struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - memset(&th->validity, 0, sizeof(th->validity)); + th->have_key = false; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + /* + * We are to invalidate a service ticket in the hopes of + * getting a new, hopefully more valid, one. But, we won't get + * it unless our AUTH ticket is good, so invalidate AUTH ticket + * as well, just in case. + */ + invalidate_ticket(ac, peer_type); + invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH); } static int calcu_signature(struct ceph_x_authorizer *au, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index e8b7c6917d47..40b1a3cf7397 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -16,7 +16,7 @@ struct ceph_x_ticket_handler { unsigned int service; struct ceph_crypto_key session_key; - struct ceph_timespec validity; + bool have_key; u64 secret_id; struct ceph_buffer *ticket_blob; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 393bfb22d5bb..5fcfb98f309e 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @stable: stable mode starts rep=0 in the recursive call for all replicas * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) * @parent_r: r value passed from the parent @@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_fallback_retries, int recurse_to_leaf, unsigned int vary_r, + unsigned int stable, int *out2, int parent_r) { @@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, int collide, reject; int count = out_size; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep, tries, recurse_tries, local_retries, local_fallback_retries, - parent_r); + parent_r, stable); - for (rep = outpos; rep < numrep && count > 0 ; rep++) { + for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, - x, outpos+1, 0, + x, stable ? 1 : outpos+1, 0, out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, 0, vary_r, + stable, NULL, sub_r) <= outpos) /* didn't get leaf */ @@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, int choose_local_fallback_retries = map->choose_local_fallback_tries; int vary_r = map->chooseleaf_vary_r; + int stable = map->chooseleaf_stable; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map, case CRUSH_RULE_TAKE: if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) || - (-1-curstep->arg1 < map->max_buckets && + (-1-curstep->arg1 >= 0 && + -1-curstep->arg1 < map->max_buckets && map->buckets[-1-curstep->arg1])) { w[0] = curstep->arg1; wsize = 1; @@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, vary_r = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: + if (curstep->arg1 >= 0) + stable = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map, osize = 0; for (i = 0; i < wsize; i++) { + int bno; /* * see CRUSH_N, CRUSH_N_MINUS macros. * basically, numrep <= 0 means relative to @@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; + /* make sure bucket id is valid */ + bno = -1 - w[i]; + if (bno < 0 || bno >= map->max_buckets) { + /* w[i] is probably CRUSH_ITEM_NONE */ + dprintk(" bad w[i] %d\n", w[i]); + continue; + } if (firstn) { int recurse_tries; if (choose_leaf_tries) @@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, numrep, curstep->arg2, @@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries, recurse_to_leaf, vary_r, + stable, c+osize, 0); } else { @@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, out_size, numrep, curstep->arg2, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9981039ef4ff..9cfedf565f5b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -23,9 +23,6 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> -#define list_entry_next(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con) } con->in_seq = 0; con->in_seq_acked = 0; + + con->out_skip = 0; } /* @@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) static void con_out_kvec_reset(struct ceph_connection *con) { + BUG_ON(con->out_skip); + con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; @@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con) static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { - int index; + int index = con->out_kvec_left; - index = con->out_kvec_left; + BUG_ON(con->out_skip); BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); con->out_kvec[index].iov_len = size; @@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. + */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + #ifdef CONFIG_BLOCK /* @@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); - cursor->page = list_entry_next(cursor->page, lru); + cursor->page = list_next_entry(cursor->page, lru); cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; @@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); - cursor->data = list_entry_next(cursor->data, links); + cursor->data = list_next_entry(cursor->data, links); __ceph_msg_data_cursor_init(cursor); new_piece = true; } @@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con) m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) @@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_kvec_is_msg = true; con->out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same @@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); - /* fill in crc (except data pages), footer */ + /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = 0; + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); if (m->middle) { @@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con) dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; /* is there a data payload? */ con->out_msg->footer.data_crc = 0; @@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con) } } con->out_kvec_left = 0; - con->out_kvec_is_msg = false; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, @@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con) { int ret; + dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); @@ -2506,13 +2528,13 @@ more: more_kvec: /* kvec data queued? */ - if (con->out_skip) { - ret = write_partial_skip(con); + if (con->out_kvec_left) { + ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_kvec_left) { - ret = write_partial_kvec(con); + if (con->out_skip) { + ret = write_partial_skip(con); if (ret <= 0) goto out; } @@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con) static void con_fault_finish(struct ceph_connection *con) { + dout("%s %p\n", __func__, con); + /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); + if (con->auth_retry) { + dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->ops->invalidate_authorizer) + con->ops->invalidate_authorizer(con); + con->auth_retry = 0; } if (con->ops->fault) @@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg) ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("%s %p msg %p - was sending\n", __func__, con, msg); - con->out_msg = NULL; - if (con->out_kvec_is_msg) { - con->out_skip = con->out_kvec_bytes; - con->out_kvec_is_msg = false; + BUG_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + BUG_ON(!msg->data_length); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) + con->out_skip += sizeof(msg->footer); + else + con->out_skip += sizeof(msg->old_footer); } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s %p msg %p - was sending, will write %d skip %d\n", + __func__, con, msg, con->out_kvec_bytes, con->out_skip); msg->hdr.seq = 0; - + con->out_msg = NULL; ceph_msg_put(msg); } + mutex_unlock(&con->mutex); } @@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m) static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - LIST_HEAD(data); - struct list_head *links; - struct list_head *next; + struct ceph_msg_data *data, *next; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); @@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref) m->middle = NULL; } - list_splice_init(&m->data, &data); - list_for_each_safe(links, next, &data) { - struct ceph_msg_data *data; - - data = list_entry(links, struct ceph_msg_data, links); - list_del_init(links); + list_for_each_entry_safe(data, next, &m->data, links) { + list_del_init(&data->links); ceph_msg_data_destroy(data); } m->data_length = 0; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index edda01626a45..de85dddc3dc0 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc) return monc->client->have_fsid && monc->auth->global_id > 0; } -/* - * The monitor responds with mount ack indicate mount success. The - * included client ticket allows the client to talk to MDSs and OSDs. - */ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f8f235930d88..3534e12683d3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 osdmap_epoch; int already_completed; u32 bytes; + u8 decode_redir; unsigned int i; tid = le64_to_cpu(msg->hdr.tid); @@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) p += 8 + 4; /* skip replay_version */ p += 8; /* skip user_version */ + if (le16_to_cpu(msg->hdr.version) >= 7) + ceph_decode_8_safe(&p, end, decode_redir, bad_put); + else + decode_redir = 1; + } else { + decode_redir = 0; + } + + if (decode_redir) { err = ceph_redirect_decode(&p, end, &redir); if (err) goto bad_put; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d8f581d9f1f..243574c8cf33 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->choose_local_tries = ceph_decode_32(p); c->choose_local_fallback_tries = ceph_decode_32(p); c->choose_total_tries = ceph_decode_32(p); - dout("crush decode tunable choose_local_tries = %d", + dout("crush decode tunable choose_local_tries = %d\n", c->choose_local_tries); - dout("crush decode tunable choose_local_fallback_tries = %d", + dout("crush decode tunable choose_local_fallback_tries = %d\n", c->choose_local_fallback_tries); - dout("crush decode tunable choose_total_tries = %d", + dout("crush decode tunable choose_total_tries = %d\n", c->choose_total_tries); ceph_decode_need(p, end, sizeof(u32), done); c->chooseleaf_descend_once = ceph_decode_32(p); - dout("crush decode tunable chooseleaf_descend_once = %d", + dout("crush decode tunable chooseleaf_descend_once = %d\n", c->chooseleaf_descend_once); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_vary_r = ceph_decode_8(p); - dout("crush decode tunable chooseleaf_vary_r = %d", + dout("crush decode tunable chooseleaf_vary_r = %d\n", c->chooseleaf_vary_r); + /* skip straw_calc_version, allowed_bucket_algs */ + ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); + *p += sizeof(u8) + sizeof(u32); + + ceph_decode_need(p, end, sizeof(u8), done); + c->chooseleaf_stable = ceph_decode_8(p); + dout("crush decode tunable chooseleaf_stable = %d\n", + c->chooseleaf_stable); + done: dout("crush_decode success\n"); return c; |