summaryrefslogtreecommitdiffstats
path: root/drivers/block/drbd/drbd_receiver.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/drbd/drbd_receiver.c')
-rw-r--r--drivers/block/drbd/drbd_receiver.c196
1 files changed, 138 insertions, 58 deletions
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 68e3992e8838..b6c8aaf4931b 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -46,9 +46,10 @@
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"
-
#include "drbd_vli.h"
+#define PRO_FEATURES (FF_TRIM)
+
struct packet_info {
enum drbd_packet cmd;
unsigned int size;
@@ -65,7 +66,7 @@ enum finish_epoch {
static int drbd_do_features(struct drbd_connection *connection);
static int drbd_do_auth(struct drbd_connection *connection);
static int drbd_disconnected(struct drbd_peer_device *);
-
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
static int e_end_block(struct drbd_work *, int);
@@ -234,9 +235,17 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
* @retry: whether to retry, if not enough pages are available right now
*
* Tries to allocate number pages, first from our own page pool, then from
- * the kernel, unless this allocation would exceed the max_buffers setting.
+ * the kernel.
* Possibly retry until DRBD frees sufficient pages somewhere else.
*
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
* Returns a page chain linked via page->private.
*/
struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
@@ -246,10 +255,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
struct page *page = NULL;
struct net_conf *nc;
DEFINE_WAIT(wait);
- int mxb;
+ unsigned int mxb;
- /* Yes, we may run up to @number over max_buffers. If we
- * follow it strictly, the admin will get it wrong anyways. */
rcu_read_lock();
nc = rcu_dereference(peer_device->connection->net_conf);
mxb = nc ? nc->max_buffers : 1000000;
@@ -277,7 +284,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
break;
}
- schedule();
+ if (schedule_timeout(HZ/10) == 0)
+ mxb = UINT_MAX;
}
finish_wait(&drbd_pp_wait, &wait);
@@ -331,7 +339,7 @@ You must not have the req_lock:
struct drbd_peer_request *
drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
- unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
+ unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
{
struct drbd_device *device = peer_device->device;
struct drbd_peer_request *peer_req;
@@ -348,7 +356,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return NULL;
}
- if (data_size) {
+ if (has_payload && data_size) {
page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
if (!page)
goto fail;
@@ -1026,24 +1034,27 @@ randomize:
if (drbd_send_protocol(connection) == -EOPNOTSUPP)
return -1;
+ /* Prevent a race between resync-handshake and
+ * being promoted to Primary.
+ *
+ * Grab and release the state mutex, so we know that any current
+ * drbd_set_role() is finished, and any incoming drbd_set_role
+ * will see the STATE_SENT flag, and wait for it to be cleared.
+ */
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ mutex_lock(peer_device->device->state_mutex);
+
set_bit(STATE_SENT, &connection->flags);
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ mutex_unlock(peer_device->device->state_mutex);
+
rcu_read_lock();
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
struct drbd_device *device = peer_device->device;
kref_get(&device->kref);
rcu_read_unlock();
- /* Prevent a race between resync-handshake and
- * being promoted to Primary.
- *
- * Grab and release the state mutex, so we know that any current
- * drbd_set_role() is finished, and any incoming drbd_set_role
- * will see the STATE_SENT flag, and wait for it to be cleared.
- */
- mutex_lock(device->state_mutex);
- mutex_unlock(device->state_mutex);
-
if (discard_my_data)
set_bit(DISCARD_MY_DATA, &device->flags);
else
@@ -1315,6 +1326,20 @@ int drbd_submit_peer_request(struct drbd_device *device,
unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
int err = -ENOMEM;
+ if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+ /* wait for all pending IO completions, before we start
+ * zeroing things out. */
+ conn_wait_active_ee_empty(first_peer_device(device)->connection);
+ if (blkdev_issue_zeroout(device->ldev->backing_bdev,
+ sector, ds >> 9, GFP_NOIO))
+ peer_req->flags |= EE_WAS_ERROR;
+ drbd_endio_write_sec_final(peer_req);
+ return 0;
+ }
+
+ if (peer_req->flags & EE_IS_TRIM)
+ nr_pages = 0; /* discards don't have any payload. */
+
/* In most cases, we will only need one bio. But in case the lower
* level restrictions happen to be different at this offset on this
* side than those of the sending peer, we may need to submit the
@@ -1326,7 +1351,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
next_bio:
bio = bio_alloc(GFP_NOIO, nr_pages);
if (!bio) {
- drbd_err(device, "submit_ee: Allocation of a bio failed\n");
+ drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
goto fail;
}
/* > peer_req->i.sector, unless this is the first bio */
@@ -1340,6 +1365,11 @@ next_bio:
bios = bio;
++n_bios;
+ if (rw & REQ_DISCARD) {
+ bio->bi_iter.bi_size = ds;
+ goto submit;
+ }
+
page_chain_for_each(page) {
unsigned len = min_t(unsigned, ds, PAGE_SIZE);
if (!bio_add_page(bio, page, len, 0)) {
@@ -1360,8 +1390,9 @@ next_bio:
sector += len >> 9;
--nr_pages;
}
- D_ASSERT(device, page == NULL);
D_ASSERT(device, ds == 0);
+submit:
+ D_ASSERT(device, page == NULL);
atomic_set(&peer_req->pending_bios, n_bios);
do {
@@ -1490,19 +1521,21 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
* and from receive_Data */
static struct drbd_peer_request *
read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
- int data_size) __must_hold(local)
+ struct packet_info *pi) __must_hold(local)
{
struct drbd_device *device = peer_device->device;
const sector_t capacity = drbd_get_capacity(device->this_bdev);
struct drbd_peer_request *peer_req;
struct page *page;
int dgs, ds, err;
+ int data_size = pi->size;
void *dig_in = peer_device->connection->int_dig_in;
void *dig_vv = peer_device->connection->int_dig_vv;
unsigned long *data;
+ struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
dgs = 0;
- if (peer_device->connection->peer_integrity_tfm) {
+ if (!trim && peer_device->connection->peer_integrity_tfm) {
dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
/*
* FIXME: Receive the incoming digest into the receive buffer
@@ -1514,9 +1547,15 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
data_size -= dgs;
}
+ if (trim) {
+ D_ASSERT(peer_device, data_size == 0);
+ data_size = be32_to_cpu(trim->size);
+ }
+
if (!expect(IS_ALIGNED(data_size, 512)))
return NULL;
- if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
+ /* prepare for larger trim requests. */
+ if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
return NULL;
/* even though we trust out peer,
@@ -1532,11 +1571,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO);
+ peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
if (!peer_req)
return NULL;
- if (!data_size)
+ if (trim)
return peer_req;
ds = data_size;
@@ -1676,12 +1715,12 @@ static int e_end_resync_block(struct drbd_work *w, int unused)
}
static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
- int data_size) __releases(local)
+ struct packet_info *pi) __releases(local)
{
struct drbd_device *device = peer_device->device;
struct drbd_peer_request *peer_req;
- peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size);
+ peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
if (!peer_req)
goto fail;
@@ -1697,7 +1736,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
list_add(&peer_req->w.list, &device->sync_ee);
spin_unlock_irq(&device->resource->req_lock);
- atomic_add(data_size >> 9, &device->rs_sect_ev);
+ atomic_add(pi->size >> 9, &device->rs_sect_ev);
if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
return 0;
@@ -1785,7 +1824,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet
/* data is submitted to disk within recv_resync_read.
* corresponding put_ldev done below on error,
* or in drbd_peer_request_endio. */
- err = recv_resync_read(peer_device, sector, pi->size);
+ err = recv_resync_read(peer_device, sector, pi);
} else {
if (__ratelimit(&drbd_ratelimit_state))
drbd_err(device, "Can not write resync data to local disk.\n");
@@ -2196,7 +2235,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
*/
sector = be64_to_cpu(p->sector);
- peer_req = read_in_block(peer_device, p->block_id, sector, pi->size);
+ peer_req = read_in_block(peer_device, p->block_id, sector, pi);
if (!peer_req) {
put_ldev(device);
return -EIO;
@@ -2206,7 +2245,15 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
dp_flags = be32_to_cpu(p->dp_flags);
rw |= wire_flags_to_bio(dp_flags);
- if (peer_req->pages == NULL) {
+ if (pi->cmd == P_TRIM) {
+ struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+ peer_req->flags |= EE_IS_TRIM;
+ if (!blk_queue_discard(q))
+ peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+ D_ASSERT(peer_device, peer_req->i.size > 0);
+ D_ASSERT(peer_device, rw & REQ_DISCARD);
+ D_ASSERT(peer_device, peer_req->pages == NULL);
+ } else if (peer_req->pages == NULL) {
D_ASSERT(device, peer_req->i.size == 0);
D_ASSERT(device, dp_flags & DP_FLUSH);
}
@@ -2242,7 +2289,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
update_peer_seq(peer_device, peer_seq);
spin_lock_irq(&device->resource->req_lock);
}
- list_add(&peer_req->w.list, &device->active_ee);
+ /* if we use the zeroout fallback code, we process synchronously
+ * and we wait for all pending requests, respectively wait for
+ * active_ee to become empty in drbd_submit_peer_request();
+ * better not add ourselves here. */
+ if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+ list_add(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
if (device->state.conn == C_SYNC_TARGET)
@@ -2313,39 +2365,45 @@ out_interrupted:
* The current sync rate used here uses only the most recent two step marks,
* to have a short time average so we can react faster.
*/
-int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
{
- struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
- unsigned long db, dt, dbdt;
struct lc_element *tmp;
- int curr_events;
- int throttle = 0;
- unsigned int c_min_rate;
-
- rcu_read_lock();
- c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
- rcu_read_unlock();
+ bool throttle = true;
- /* feature disabled? */
- if (c_min_rate == 0)
- return 0;
+ if (!drbd_rs_c_min_rate_throttle(device))
+ return false;
spin_lock_irq(&device->al_lock);
tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
if (tmp) {
struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
- if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
- spin_unlock_irq(&device->al_lock);
- return 0;
- }
+ if (test_bit(BME_PRIORITY, &bm_ext->flags))
+ throttle = false;
/* Do not slow down if app IO is already waiting for this extent */
}
spin_unlock_irq(&device->al_lock);
+ return throttle;
+}
+
+bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
+{
+ struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
+ unsigned long db, dt, dbdt;
+ unsigned int c_min_rate;
+ int curr_events;
+
+ rcu_read_lock();
+ c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
+ rcu_read_unlock();
+
+ /* feature disabled? */
+ if (c_min_rate == 0)
+ return false;
+
curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
(int)part_stat_read(&disk->part0, sectors[1]) -
atomic_read(&device->rs_sect_ev);
-
if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
unsigned long rs_left;
int i;
@@ -2368,12 +2426,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
dbdt = Bit2KB(db/dt);
if (dbdt > c_min_rate)
- throttle = 1;
+ return true;
}
- return throttle;
+ return false;
}
-
static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
{
struct drbd_peer_device *peer_device;
@@ -2436,7 +2493,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO);
+ peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+ true /* has real payload */, GFP_NOIO);
if (!peer_req) {
put_ldev(device);
return -ENOMEM;
@@ -3648,6 +3706,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
put_ldev(device);
}
+ device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+ drbd_reconsider_max_bio_size(device);
+ /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
+ In case we cleared the QUEUE_FLAG_DISCARD from our queue in
+ drbd_reconsider_max_bio_size(), we can be sure that after
+ drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
+
ddsf = be16_to_cpu(p->dds_flags);
if (get_ldev(device)) {
dd = drbd_determine_dev_size(device, ddsf, NULL);
@@ -3660,9 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
drbd_set_my_capacity(device, p_size);
}
- device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
- drbd_reconsider_max_bio_size(device);
-
if (get_ldev(device)) {
if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
@@ -4423,6 +4485,7 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
};
static void drbdd(struct drbd_connection *connection)
@@ -4630,6 +4693,7 @@ static int drbd_send_features(struct drbd_connection *connection)
memset(p, 0, sizeof(*p));
p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+ p->feature_flags = cpu_to_be32(PRO_FEATURES);
return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
}
@@ -4683,10 +4747,14 @@ static int drbd_do_features(struct drbd_connection *connection)
goto incompat;
connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+ connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
drbd_info(connection, "Handshake successful: "
"Agreed network protocol version %d\n", connection->agreed_pro_version);
+ drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
+ connection->agreed_features & FF_TRIM ? " " : " not ");
+
return 1;
incompat:
@@ -4778,6 +4846,12 @@ static int drbd_do_auth(struct drbd_connection *connection)
goto fail;
}
+ if (pi.size < CHALLENGE_LEN) {
+ drbd_err(connection, "AuthChallenge payload too small.\n");
+ rv = -1;
+ goto fail;
+ }
+
peers_ch = kmalloc(pi.size, GFP_NOIO);
if (peers_ch == NULL) {
drbd_err(connection, "kmalloc of peers_ch failed\n");
@@ -4791,6 +4865,12 @@ static int drbd_do_auth(struct drbd_connection *connection)
goto fail;
}
+ if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+ drbd_err(connection, "Peer presented the same challenge!\n");
+ rv = -1;
+ goto fail;
+ }
+
resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
response = kmalloc(resp_size, GFP_NOIO);
if (response == NULL) {
OpenPOWER on IntegriCloud