diff options
author | Philipp Reisner <philipp.reisner@linbit.com> | 2010-07-06 11:14:00 +0200 |
---|---|---|
committer | Philipp Reisner <philipp.reisner@linbit.com> | 2010-10-14 18:38:14 +0200 |
commit | 778f271dfe7a7173c0bae2d6cde8d9bd1533e668 (patch) | |
tree | 1c057622152bd652102749b488653bff8be24c2a /drivers/block | |
parent | 8e26f9ccb9be00fdb33551a34c8f6029e89ab79f (diff) | |
download | blackbird-op-linux-778f271dfe7a7173c0bae2d6cde8d9bd1533e668.tar.gz blackbird-op-linux-778f271dfe7a7173c0bae2d6cde8d9bd1533e668.zip |
drbd: The new, smarter resync speed controller
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 11 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 22 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 20 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 98 |
5 files changed, 151 insertions, 1 deletions
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index fd2cdd45f155..facb72ccc56b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -928,6 +928,12 @@ enum write_ordering_e { WO_bio_barrier }; +struct fifo_buffer { + int *values; + unsigned int head_index; + unsigned int size; +}; + struct drbd_conf { /* things that are stored as / read from meta data on disk */ unsigned long flags; @@ -1068,6 +1074,11 @@ struct drbd_conf { u64 ed_uuid; /* UUID of the exposed data */ struct mutex state_mutex; char congestion_reason; /* Why we where congested... */ + atomic_t rs_sect_in; /* counter to measure the incoming resync data rate */ + int c_sync_rate; /* current resync rate after delay_probe magic */ + struct fifo_buffer rs_plan_s; /* correction values of resync planer */ + int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ + int rs_planed; /* resync sectors already planed */ }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index bff4f598d38f..ed09a840d838 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2734,6 +2734,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); + atomic_set(&mdev->rs_sect_in, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 7d384fd39c16..295b8d593708 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1587,6 +1587,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n struct crypto_hash *csums_tfm = NULL; struct syncer_conf sc; cpumask_var_t new_cpu_mask; + int *rs_plan_s = NULL; + int fifo_size; if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { retcode = ERR_NOMEM; @@ -1687,6 +1689,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n if (retcode != NO_ERROR) goto fail; + fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { + rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); + if (!rs_plan_s) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); + retcode = ERR_NOMEM; + goto fail; + } + } + /* ok, assign the rest of it as well. * lock against receive_SyncParam() */ spin_lock(&mdev->peer_seq_lock); @@ -1703,6 +1715,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n mdev->verify_tfm = verify_tfm; verify_tfm = NULL; } + + if (fifo_size != mdev->rs_plan_s.size) { + kfree(mdev->rs_plan_s.values); + mdev->rs_plan_s.values = rs_plan_s; + mdev->rs_plan_s.size = fifo_size; + mdev->rs_planed = 0; + rs_plan_s = NULL; + } + spin_unlock(&mdev->peer_seq_lock); if (get_ldev(mdev)) { @@ -1734,6 +1755,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); fail: + kfree(rs_plan_s); free_cpumask_var(new_cpu_mask); crypto_free_hash(csums_tfm); crypto_free_hash(verify_tfm); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 34bea972f734..5f80b22e711d 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1640,6 +1640,8 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) drbd_send_ack_dp(mdev, P_NEG_ACK, p); } + atomic_add(data_size >> 9, &mdev->rs_sect_in); + return ok; } @@ -2810,6 +2812,8 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) struct crypto_hash *verify_tfm = NULL; struct crypto_hash *csums_tfm = NULL; const int apv = mdev->agreed_pro_version; + int *rs_plan_s = NULL; + int fifo_size = 0; exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? sizeof(struct p_rs_param) @@ -2904,6 +2908,15 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target); mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target); mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate); + + fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { + rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); + if (!rs_plan_s) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); + goto disconnect; + } + } } spin_lock(&mdev->peer_seq_lock); @@ -2922,6 +2935,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) mdev->csums_tfm = csums_tfm; dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); } + if (fifo_size != mdev->rs_plan_s.size) { + kfree(mdev->rs_plan_s.values); + mdev->rs_plan_s.values = rs_plan_s; + mdev->rs_plan_s.size = fifo_size; + mdev->rs_planed = 0; + } spin_unlock(&mdev->peer_seq_lock); } @@ -4202,6 +4221,7 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); dec_rs_pending(mdev); + atomic_add(blksize >> 9, &mdev->rs_sect_in); return TRUE; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d94720f4bd07..fd3e1e9561cb 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -422,6 +422,89 @@ void resync_timer_fn(unsigned long data) drbd_queue_work(&mdev->data.work, &mdev->resync_work); } +static void fifo_set(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +static int fifo_push(struct fifo_buffer *fb, int value) +{ + int ov; + + ov = fb->values[fb->head_index]; + fb->values[fb->head_index++] = value; + + if (fb->head_index >= fb->size) + fb->head_index = 0; + + return ov; +} + +static void fifo_add_val(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +int drbd_rs_controller(struct drbd_conf *mdev) +{ + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + unsigned int want; /* The number of sectors we want in the proxy */ + int req_sect; /* Number of sectors to request in this turn */ + int correction; /* Number of sectors more we need in the proxy*/ + int cps; /* correction per invocation of drbd_rs_controller() */ + int steps; /* Number of time steps to plan ahead */ + int curr_corr; + int max_sect; + + sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ + mdev->rs_in_flight -= sect_in; + + spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */ + + steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ + + if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ + want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps; + } else { /* normal path */ + want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target : + sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10); + } + + correction = want - mdev->rs_in_flight - mdev->rs_planed; + + /* Plan ahead */ + cps = correction / steps; + fifo_add_val(&mdev->rs_plan_s, cps); + mdev->rs_planed += cps * steps; + + /* What we do in this step */ + curr_corr = fifo_push(&mdev->rs_plan_s, 0); + spin_unlock(&mdev->peer_seq_lock); + mdev->rs_planed -= curr_corr; + + req_sect = sect_in + curr_corr; + if (req_sect < 0) + req_sect = 0; + + max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ; + if (req_sect > max_sect) + req_sect = max_sect; + + /* + dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n", + sect_in, mdev->rs_in_flight, want, correction, + steps, cps, mdev->rs_planed, curr_corr, req_sect); + */ + + return req_sect; +} + int w_make_resync_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { @@ -459,7 +542,13 @@ int w_make_resync_request(struct drbd_conf *mdev, max_segment_size = mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE; - number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ); + if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ + number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); + mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; + } else { + mdev->c_sync_rate = mdev->sync_conf.rate; + number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); + } pe = atomic_read(&mdev->rs_pending_cnt); mutex_lock(&mdev->data.mutex); @@ -593,6 +682,7 @@ next_sector: } requeue: + mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); put_ldev(mdev); return 1; @@ -1419,6 +1509,12 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) drbd_resync_finished(mdev); } + atomic_set(&mdev->rs_sect_in, 0); + mdev->rs_in_flight = 0; + mdev->rs_planed = 0; + spin_lock(&mdev->peer_seq_lock); + fifo_set(&mdev->rs_plan_s, 0); + spin_unlock(&mdev->peer_seq_lock); /* ns.conn may already be != mdev->state.conn, * we may have been paused in between, or become paused until * the timer triggers. |