From 0da5d70369e87f80adf794080cfff1ca15a34198 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 19 May 2011 11:21:05 -0700 Subject: libceph: handle connection reopen race with callbacks If a connection is closed and/or reopened (ceph_con_close, ceph_con_open) it can race with a callback. con_work does various state checks for closed or reopened sockets at the beginning, but drops con->mutex before making callbacks. We need to check for state bit changes after retaking the lock to ensure we restart con_work and execute those CLOSED/OPENING tests or else we may end up operating under stale assumptions. In Jim's case, this was causing 'bad tag' errors. There are four cases where we re-take the con->mutex inside con_work: catch them all and return EAGAIN from try_{read,write} so that we can restart con_work. Reported-by: Jim Schutt Tested-by: Jim Schutt Signed-off-by: Sage Weil --- net/ceph/messenger.c | 64 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e15a82ccc05f..b140dd3515de 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -598,7 +598,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) * Connection negotiation. */ -static void prepare_connect_authorizer(struct ceph_connection *con) +static int prepare_connect_authorizer(struct ceph_connection *con) { void *auth_buf; int auth_len = 0; @@ -612,6 +612,10 @@ static void prepare_connect_authorizer(struct ceph_connection *con) con->auth_retry); mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; + con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); @@ -619,6 +623,8 @@ static void prepare_connect_authorizer(struct ceph_connection *con) con->out_kvec[con->out_kvec_left].iov_len = auth_len; con->out_kvec_left++; con->out_kvec_bytes += auth_len; + + return 0; } /* @@ -640,9 +646,9 @@ static void prepare_write_banner(struct ceph_messenger *msgr, set_bit(WRITE_PENDING, &con->state); } -static void prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con, - int after_banner) +static int prepare_write_connect(struct ceph_messenger *msgr, + struct ceph_connection *con, + int after_banner) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -683,7 +689,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, con->out_more = 0; set_bit(WRITE_PENDING, &con->state); - prepare_connect_authorizer(con); + return prepare_connect_authorizer(con); } @@ -1216,6 +1222,7 @@ static int process_connect(struct ceph_connection *con) u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; u64 server_feat = le64_to_cpu(con->in_reply.features); + int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1250,7 +1257,9 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; - prepare_write_connect(con->msgr, con, 0); + ret = prepare_write_connect(con->msgr, con, 0); + if (ret < 0) + return ret; prepare_read_connect(con); break; @@ -1277,6 +1286,9 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; break; case CEPH_MSGR_TAG_RETRY_SESSION: @@ -1810,6 +1822,17 @@ static int try_read(struct ceph_connection *con) more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); + + /* + * process_connect and process_message drop and re-take + * con->mutex. make sure we handle a racing close or reopen. + */ + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) { + ret = -EAGAIN; + goto out; + } + if (test_bit(CONNECTING, &con->state)) { if (!test_bit(NEGOTIATING, &con->state)) { dout("try_read connecting\n"); @@ -1938,8 +1961,10 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); + int ret; mutex_lock(&con->mutex); +restart: if (test_and_clear_bit(BACKOFF, &con->state)) { dout("con_work %p backing off\n", con); if (queue_delayed_work(ceph_msgr_wq, &con->work, @@ -1969,18 +1994,31 @@ static void con_work(struct work_struct *work) con_close_socket(con); } - if (test_and_clear_bit(SOCK_CLOSED, &con->state) || - try_read(con) < 0 || - try_write(con) < 0) { - mutex_unlock(&con->mutex); - ceph_fault(con); /* error/fault path */ - goto done_unlocked; - } + if (test_and_clear_bit(SOCK_CLOSED, &con->state)) + goto fault; + + ret = try_read(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; + + ret = try_write(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; done: mutex_unlock(&con->mutex); done_unlocked: con->ops->put(con); + return; + +fault: + mutex_unlock(&con->mutex); + ceph_fault(con); /* error/fault path */ + goto done_unlocked; } -- cgit v1.2.1 From e8f54ce169125a2e59330fac25ad3c9ac0ce22a5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 14:18:42 -0700 Subject: libceph: fix uninitialized value when no get_authorizer method is set If there is no get_authorizer method we set the out_kvec to a bogus pointer. The length is also zero in that case, so it doesn't much matter, but it's better not to add the empty item in the first place. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b140dd3515de..ce326c806237 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -619,11 +619,12 @@ static int prepare_connect_authorizer(struct ceph_connection *con) con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); - con->out_kvec[con->out_kvec_left].iov_base = auth_buf; - con->out_kvec[con->out_kvec_left].iov_len = auth_len; - con->out_kvec_left++; - con->out_kvec_bytes += auth_len; - + if (auth_len) { + con->out_kvec[con->out_kvec_left].iov_base = auth_buf; + con->out_kvec[con->out_kvec_left].iov_len = auth_len; + con->out_kvec_left++; + con->out_kvec_bytes += auth_len; + } return 0; } -- cgit v1.2.1 From 2dab036b8c349d747f447ac98c8eb40c769727a8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 14:29:51 -0700 Subject: libceph: use snprintf for formatting object name Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6b5dda1cb5df..9adbb01d23ad 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -124,7 +124,7 @@ static void calc_layout(struct ceph_osd_client *osdc, ceph_calc_raw_layout(osdc, layout, vino.snap, off, plen, &bno, req, op); - sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); } -- cgit v1.2.1 From 12a2f643b0e6e791ba61485430d0003eeb3e373c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 14:34:04 -0700 Subject: libceph: use snprintf for unknown addrs Signed-off-by: Sage Weil --- net/ceph/messenger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ce326c806237..3cdbb8853cd7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -76,7 +76,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) break; default: - sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); + snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", + (int)ss->ss_family); } return s; -- cgit v1.2.1 From 31456665a02148353a83fec84d3182700e356588 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:18:43 -0700 Subject: libceph: fix osdmap timestamp assignment Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 71603ac3dff5..94f068dda433 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -765,7 +765,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } map->epoch++; - map->modified = map->modified; + map->modified = modified; if (newcrush) { if (map->crush) crush_destroy(map->crush); -- cgit v1.2.1 From 04177882265bc5014300a631e7384f8fe6b6aa0f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:33:17 -0700 Subject: libceph: fix TAG_WAIT case If we get a WAIT as a client something went wrong; error out. And don't fall through to an unrelated case. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3cdbb8853cd7..35c0000a658e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1355,7 +1355,9 @@ static int process_connect(struct ceph_connection *con) * to WAIT. This shouldn't happen if we are the * client. */ - pr_err("process_connect peer connecting WAIT\n"); + pr_err("process_connect got WAIT as client\n"); + con->error_msg = "protocol error, got WAIT as client"; + return -1; default: pr_err("connect protocol error, will retry\n"); -- cgit v1.2.1 From a2a79609c044d3ddb540671d5029a41c90c57251 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:34:24 -0700 Subject: libceph: add missing breaks in addr_set_port Signed-off-by: Sage Weil --- net/ceph/messenger.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 35c0000a658e..78b55f49de7c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1073,8 +1073,10 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) switch (ss->ss_family) { case AF_INET: ((struct sockaddr_in *)ss)->sin_port = htons(p); + break; case AF_INET6: ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + break; } } -- cgit v1.2.1 From 9d6fcb081a4770c3772c51c59c7251c22716d7bb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 15:48:16 -0700 Subject: ceph: check return value for start_request in writepages Since we pass the nofail arg, we should never get an error; BUG if we do. (And fix the function to not return an error if __map_request fails.) Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9adbb01d23ad..caa092eb0009 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1677,8 +1677,14 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, */ if (req->r_sent == 0) { rc = __map_request(osdc, req); - if (rc < 0) + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } goto out_unlock; + } if (req->r_osd == NULL) { dout("send_request %p no up osds in pg\n", req); ceph_monc_request_next_osdmap(&osdc->client->monc); -- cgit v1.2.1 From 7662d8ff57d2b00ce8f7fe0b60a85efbb2c05652 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 May 2011 12:52:05 -0700 Subject: libceph: handle new osdmap down/state change encoding Old incrementals encode a 0 value (nearly always) when an osd goes down. Change that to allow any state bit(s) to be flipped. Special case 0 to mean flip the CEPH_OSD_UP bit to mimic the old behavior. Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 94f068dda433..e97c3588c3ec 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -830,15 +830,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, map->osd_addr[osd] = addr; } - /* new_down */ + /* new_state */ ceph_decode_32_safe(p, end, len, bad); while (len--) { u32 osd; + u8 xorstate; ceph_decode_32_safe(p, end, osd, bad); + xorstate = **(u8 **)p; (*p)++; /* clean flag */ - pr_info("osd%d down\n", osd); + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + if (xorstate & CEPH_OSD_UP) + pr_info("osd%d down\n", osd); if (osd < map->max_osd) - map->osd_state[osd] &= ~CEPH_OSD_UP; + map->osd_state[osd] ^= xorstate; } /* new_weight */ -- cgit v1.2.1 From cd634fb6eec72ef8e6dd677546b8d0ffdd2501eb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 09:29:18 -0700 Subject: libceph: subscribe to osdmap when cluster is full When the cluster is marked full, subscribe to subsequent map updates to ensure we find out promptly when it is no longer full. This will prevent us from spewing ENOSPC for (much) longer than necessary. Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index caa092eb0009..6ea2b892f44b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1421,6 +1421,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) done: downgrade_write(&osdc->map_sem); ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + + /* + * subscribe to subsequent osdmap updates if full to ensure + * we find out when we are no longer full and stop returning + * ENOSPC. + */ + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + ceph_monc_request_next_osdmap(&osdc->client->monc); + send_queued(osdc); up_read(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); -- cgit v1.2.1