summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorPhilipp Reisner <philipp.reisner@linbit.com>2009-09-25 16:07:19 -0700
committerJens Axboe <jens.axboe@oracle.com>2009-10-01 21:17:49 +0200
commitb411b3637fa71fce9cf2acf0639009500f5892fe (patch)
tree6b88e5202e0f137fef50e95b0441bcafdbf91990 /include
parent1a35e0f6443f4266dad4c569c55c57a9032596fa (diff)
downloadtalos-obmc-linux-b411b3637fa71fce9cf2acf0639009500f5892fe.tar.gz
talos-obmc-linux-b411b3637fa71fce9cf2acf0639009500f5892fe.zip
The DRBD driver
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'include')
-rw-r--r--include/linux/drbd.h349
-rw-r--r--include/linux/drbd_limits.h137
-rw-r--r--include/linux/drbd_nl.h137
-rw-r--r--include/linux/drbd_tag_magic.h83
-rw-r--r--include/linux/lru_cache.h294
5 files changed, 1000 insertions, 0 deletions
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
new file mode 100644
index 000000000000..69dc711f37b3
--- /dev/null
+++ b/include/linux/drbd.h
@@ -0,0 +1,349 @@
+/*
+ drbd.h
+ Kernel module for 2.6.x Kernels
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+#ifndef DRBD_H
+#define DRBD_H
+#include <linux/connector.h>
+#include <asm/types.h>
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#else
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <limits.h>
+
+/* Altough the Linux source code makes a difference between
+ generic endianness and the bitfields' endianness, there is no
+ architecture as of Linux-2.6.24-rc4 where the bitfileds' endianness
+ does not match the generic endianness. */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN_BITFIELD
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define __BIG_ENDIAN_BITFIELD
+#else
+# error "sorry, weird endianness on this box"
+#endif
+
+#endif
+
+
+extern const char *drbd_buildtag(void);
+#define REL_VERSION "8.3.3rc2"
+#define API_VERSION 88
+#define PRO_VERSION_MIN 86
+#define PRO_VERSION_MAX 91
+
+
+enum drbd_io_error_p {
+ EP_PASS_ON, /* FIXME should the better be named "Ignore"? */
+ EP_CALL_HELPER,
+ EP_DETACH
+};
+
+enum drbd_fencing_p {
+ FP_DONT_CARE,
+ FP_RESOURCE,
+ FP_STONITH
+};
+
+enum drbd_disconnect_p {
+ DP_RECONNECT,
+ DP_DROP_NET_CONF,
+ DP_FREEZE_IO
+};
+
+enum drbd_after_sb_p {
+ ASB_DISCONNECT,
+ ASB_DISCARD_YOUNGER_PRI,
+ ASB_DISCARD_OLDER_PRI,
+ ASB_DISCARD_ZERO_CHG,
+ ASB_DISCARD_LEAST_CHG,
+ ASB_DISCARD_LOCAL,
+ ASB_DISCARD_REMOTE,
+ ASB_CONSENSUS,
+ ASB_DISCARD_SECONDARY,
+ ASB_CALL_HELPER,
+ ASB_VIOLENTLY
+};
+
+/* KEEP the order, do not delete or insert. Only append. */
+enum drbd_ret_codes {
+ ERR_CODE_BASE = 100,
+ NO_ERROR = 101,
+ ERR_LOCAL_ADDR = 102,
+ ERR_PEER_ADDR = 103,
+ ERR_OPEN_DISK = 104,
+ ERR_OPEN_MD_DISK = 105,
+ ERR_DISK_NOT_BDEV = 107,
+ ERR_MD_NOT_BDEV = 108,
+ ERR_DISK_TO_SMALL = 111,
+ ERR_MD_DISK_TO_SMALL = 112,
+ ERR_BDCLAIM_DISK = 114,
+ ERR_BDCLAIM_MD_DISK = 115,
+ ERR_MD_IDX_INVALID = 116,
+ ERR_IO_MD_DISK = 118,
+ ERR_MD_INVALID = 119,
+ ERR_AUTH_ALG = 120,
+ ERR_AUTH_ALG_ND = 121,
+ ERR_NOMEM = 122,
+ ERR_DISCARD = 123,
+ ERR_DISK_CONFIGURED = 124,
+ ERR_NET_CONFIGURED = 125,
+ ERR_MANDATORY_TAG = 126,
+ ERR_MINOR_INVALID = 127,
+ ERR_INTR = 129, /* EINTR */
+ ERR_RESIZE_RESYNC = 130,
+ ERR_NO_PRIMARY = 131,
+ ERR_SYNC_AFTER = 132,
+ ERR_SYNC_AFTER_CYCLE = 133,
+ ERR_PAUSE_IS_SET = 134,
+ ERR_PAUSE_IS_CLEAR = 135,
+ ERR_PACKET_NR = 137,
+ ERR_NO_DISK = 138,
+ ERR_NOT_PROTO_C = 139,
+ ERR_NOMEM_BITMAP = 140,
+ ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */
+ ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */
+ ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */
+ ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */
+ ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */
+ ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */
+ ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */
+ ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
+ ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */
+ ERR_DATA_NOT_CURRENT = 150,
+ ERR_CONNECTED = 151, /* DRBD 8.3 only */
+
+ /* insert new ones above this line */
+ AFTER_LAST_ERR_CODE
+};
+
+#define DRBD_PROT_A 1
+#define DRBD_PROT_B 2
+#define DRBD_PROT_C 3
+
+enum drbd_role {
+ R_UNKNOWN = 0,
+ R_PRIMARY = 1, /* role */
+ R_SECONDARY = 2, /* role */
+ R_MASK = 3,
+};
+
+/* The order of these constants is important.
+ * The lower ones (<C_WF_REPORT_PARAMS) indicate
+ * that there is no socket!
+ * >=C_WF_REPORT_PARAMS ==> There is a socket
+ */
+enum drbd_conns {
+ C_STANDALONE,
+ C_DISCONNECTING, /* Temporal state on the way to StandAlone. */
+ C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */
+
+ /* These temporal states are all used on the way
+ * from >= C_CONNECTED to Unconnected.
+ * The 'disconnect reason' states
+ * I do not allow to change beween them. */
+ C_TIMEOUT,
+ C_BROKEN_PIPE,
+ C_NETWORK_FAILURE,
+ C_PROTOCOL_ERROR,
+ C_TEAR_DOWN,
+
+ C_WF_CONNECTION,
+ C_WF_REPORT_PARAMS, /* we have a socket */
+ C_CONNECTED, /* we have introduced each other */
+ C_STARTING_SYNC_S, /* starting full sync by admin request. */
+ C_STARTING_SYNC_T, /* stariing full sync by admin request. */
+ C_WF_BITMAP_S,
+ C_WF_BITMAP_T,
+ C_WF_SYNC_UUID,
+
+ /* All SyncStates are tested with this comparison
+ * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
+ C_SYNC_SOURCE,
+ C_SYNC_TARGET,
+ C_VERIFY_S,
+ C_VERIFY_T,
+ C_PAUSED_SYNC_S,
+ C_PAUSED_SYNC_T,
+ C_MASK = 31
+};
+
+enum drbd_disk_state {
+ D_DISKLESS,
+ D_ATTACHING, /* In the process of reading the meta-data */
+ D_FAILED, /* Becomes D_DISKLESS as soon as we told it the peer */
+ /* when >= D_FAILED it is legal to access mdev->bc */
+ D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */
+ D_INCONSISTENT,
+ D_OUTDATED,
+ D_UNKNOWN, /* Only used for the peer, never for myself */
+ D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
+ D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */
+ D_MASK = 15
+};
+
+union drbd_state {
+/* According to gcc's docs is the ...
+ * The order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1).
+ * Determined by ABI.
+ * pointed out by Maxim Uvarov q<muvarov@ru.mvista.com>
+ * even though we transmit as "cpu_to_be32(state)",
+ * the offsets of the bitfields still need to be swapped
+ * on different endianess.
+ */
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned susp:1 ; /* 2/2 IO suspended no/yes */
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned peer_isp:1 ;
+ unsigned user_isp:1 ;
+ unsigned _pad:11; /* 0 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned _pad:11; /* 0 unused */
+ unsigned user_isp:1 ;
+ unsigned peer_isp:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned susp:1 ; /* 2/2 IO suspended no/yes */
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+#else
+# error "this endianess is not supported"
+#endif
+ };
+ unsigned int i;
+};
+
+enum drbd_state_ret_codes {
+ SS_CW_NO_NEED = 4,
+ SS_CW_SUCCESS = 3,
+ SS_NOTHING_TO_DO = 2,
+ SS_SUCCESS = 1,
+ SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
+ SS_TWO_PRIMARIES = -1,
+ SS_NO_UP_TO_DATE_DISK = -2,
+ SS_NO_LOCAL_DISK = -4,
+ SS_NO_REMOTE_DISK = -5,
+ SS_CONNECTED_OUTDATES = -6,
+ SS_PRIMARY_NOP = -7,
+ SS_RESYNC_RUNNING = -8,
+ SS_ALREADY_STANDALONE = -9,
+ SS_CW_FAILED_BY_PEER = -10,
+ SS_IS_DISKLESS = -11,
+ SS_DEVICE_IN_USE = -12,
+ SS_NO_NET_CONFIG = -13,
+ SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */
+ SS_NEED_CONNECTION = -15, /* drbd-8.2 only */
+ SS_LOWER_THAN_OUTDATED = -16,
+ SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
+ SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
+ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
+ SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
+};
+
+/* from drbd_strings.c */
+extern const char *drbd_conn_str(enum drbd_conns);
+extern const char *drbd_role_str(enum drbd_role);
+extern const char *drbd_disk_str(enum drbd_disk_state);
+extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
+
+#define SHARED_SECRET_MAX 64
+
+#define MDF_CONSISTENT (1 << 0)
+#define MDF_PRIMARY_IND (1 << 1)
+#define MDF_CONNECTED_IND (1 << 2)
+#define MDF_FULL_SYNC (1 << 3)
+#define MDF_WAS_UP_TO_DATE (1 << 4)
+#define MDF_PEER_OUT_DATED (1 << 5)
+#define MDF_CRASHED_PRIMARY (1 << 6)
+
+enum drbd_uuid_index {
+ UI_CURRENT,
+ UI_BITMAP,
+ UI_HISTORY_START,
+ UI_HISTORY_END,
+ UI_SIZE, /* nl-packet: number of dirty bits */
+ UI_FLAGS, /* nl-packet: flags */
+ UI_EXTENDED_SIZE /* Everything. */
+};
+
+enum drbd_timeout_flag {
+ UT_DEFAULT = 0,
+ UT_DEGRADED = 1,
+ UT_PEER_OUTDATED = 2,
+};
+
+#define UUID_JUST_CREATED ((__u64)4)
+
+#define DRBD_MAGIC 0x83740267
+#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
+
+/* these are of type "int" */
+#define DRBD_MD_INDEX_INTERNAL -1
+#define DRBD_MD_INDEX_FLEX_EXT -2
+#define DRBD_MD_INDEX_FLEX_INT -3
+
+/* Start of the new netlink/connector stuff */
+
+#define DRBD_NL_CREATE_DEVICE 0x01
+#define DRBD_NL_SET_DEFAULTS 0x02
+
+/* The following line should be moved over to linux/connector.h
+ * when the time comes */
+#ifndef CN_IDX_DRBD
+# define CN_IDX_DRBD 0x4
+/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */
+#endif
+#define CN_VAL_DRBD 0x1
+
+/* For searching a vacant cn_idx value */
+#define CN_IDX_STEP 6977
+
+struct drbd_nl_cfg_req {
+ int packet_type;
+ unsigned int drbd_minor;
+ int flags;
+ unsigned short tag_list[];
+};
+
+struct drbd_nl_cfg_reply {
+ int packet_type;
+ unsigned int minor;
+ int ret_code; /* enum ret_code or set_st_err_t */
+ unsigned short tag_list[]; /* only used with get_* calls */
+};
+
+#endif
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
new file mode 100644
index 000000000000..9d067ce46960
--- /dev/null
+++ b/include/linux/drbd_limits.h
@@ -0,0 +1,137 @@
+/*
+ drbd_limits.h
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+*/
+
+/*
+ * Our current limitations.
+ * Some of them are hard limits,
+ * some of them are arbitrary range limits, that make it easier to provide
+ * feedback about nonsense settings for certain configurable values.
+ */
+
+#ifndef DRBD_LIMITS_H
+#define DRBD_LIMITS_H 1
+
+#define DEBUG_RANGE_CHECK 0
+
+#define DRBD_MINOR_COUNT_MIN 1
+#define DRBD_MINOR_COUNT_MAX 255
+
+#define DRBD_DIALOG_REFRESH_MIN 0
+#define DRBD_DIALOG_REFRESH_MAX 600
+
+/* valid port number */
+#define DRBD_PORT_MIN 1
+#define DRBD_PORT_MAX 0xffff
+
+/* startup { */
+ /* if you want more than 3.4 days, disable */
+#define DRBD_WFC_TIMEOUT_MIN 0
+#define DRBD_WFC_TIMEOUT_MAX 300000
+#define DRBD_WFC_TIMEOUT_DEF 0
+
+#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
+#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
+#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
+
+#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
+#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
+#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
+/* }*/
+
+/* net { */
+ /* timeout, unit centi seconds
+ * more than one minute timeout is not usefull */
+#define DRBD_TIMEOUT_MIN 1
+#define DRBD_TIMEOUT_MAX 600
+#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
+
+ /* active connection retries when C_WF_CONNECTION */
+#define DRBD_CONNECT_INT_MIN 1
+#define DRBD_CONNECT_INT_MAX 120
+#define DRBD_CONNECT_INT_DEF 10 /* seconds */
+
+ /* keep-alive probes when idle */
+#define DRBD_PING_INT_MIN 1
+#define DRBD_PING_INT_MAX 120
+#define DRBD_PING_INT_DEF 10
+
+ /* timeout for the ping packets.*/
+#define DRBD_PING_TIMEO_MIN 1
+#define DRBD_PING_TIMEO_MAX 100
+#define DRBD_PING_TIMEO_DEF 5
+
+ /* max number of write requests between write barriers */
+#define DRBD_MAX_EPOCH_SIZE_MIN 1
+#define DRBD_MAX_EPOCH_SIZE_MAX 20000
+#define DRBD_MAX_EPOCH_SIZE_DEF 2048
+
+ /* I don't think that a tcp send buffer of more than 10M is usefull */
+#define DRBD_SNDBUF_SIZE_MIN 0
+#define DRBD_SNDBUF_SIZE_MAX (10<<20)
+#define DRBD_SNDBUF_SIZE_DEF (2*65535)
+
+#define DRBD_RCVBUF_SIZE_MIN 0
+#define DRBD_RCVBUF_SIZE_MAX (10<<20)
+#define DRBD_RCVBUF_SIZE_DEF (2*65535)
+
+ /* @4k PageSize -> 128kB - 512MB */
+#define DRBD_MAX_BUFFERS_MIN 32
+#define DRBD_MAX_BUFFERS_MAX 131072
+#define DRBD_MAX_BUFFERS_DEF 2048
+
+ /* @4k PageSize -> 4kB - 512MB */
+#define DRBD_UNPLUG_WATERMARK_MIN 1
+#define DRBD_UNPLUG_WATERMARK_MAX 131072
+#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
+
+ /* 0 is disabled.
+ * 200 should be more than enough even for very short timeouts */
+#define DRBD_KO_COUNT_MIN 0
+#define DRBD_KO_COUNT_MAX 200
+#define DRBD_KO_COUNT_DEF 0
+/* } */
+
+/* syncer { */
+ /* FIXME allow rate to be zero? */
+#define DRBD_RATE_MIN 1
+/* channel bonding 10 GbE, or other hardware */
+#define DRBD_RATE_MAX (4 << 20)
+#define DRBD_RATE_DEF 250 /* kb/second */
+
+ /* less than 7 would hit performance unneccessarily.
+ * 3833 is the largest prime that still does fit
+ * into 64 sectors of activity log */
+#define DRBD_AL_EXTENTS_MIN 7
+#define DRBD_AL_EXTENTS_MAX 3833
+#define DRBD_AL_EXTENTS_DEF 127
+
+#define DRBD_AFTER_MIN -1
+#define DRBD_AFTER_MAX 255
+#define DRBD_AFTER_DEF -1
+
+/* } */
+
+/* drbdsetup XY resize -d Z
+ * you are free to reduce the device size to nothing, if you want to.
+ * the upper limit with 64bit kernel, enough ram and flexible meta data
+ * is 16 TB, currently. */
+/* DRBD_MAX_SECTORS */
+#define DRBD_DISK_SIZE_SECT_MIN 0
+#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30))
+#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
+
+#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
+#define DRBD_FENCING_DEF FP_DONT_CARE
+#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
+#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
+#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
+#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
+
+#define DRBD_MAX_BIO_BVECS_MIN 0
+#define DRBD_MAX_BIO_BVECS_MAX 128
+#define DRBD_MAX_BIO_BVECS_DEF 0
+
+#undef RANGE
+#endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
new file mode 100644
index 000000000000..db5721ad50d1
--- /dev/null
+++ b/include/linux/drbd_nl.h
@@ -0,0 +1,137 @@
+/*
+ PAKET( name,
+ TYPE ( pn, pr, member )
+ ...
+ )
+
+ You may never reissue one of the pn arguments
+*/
+
+#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
+#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
+#endif
+
+NL_PACKET(primary, 1,
+ NL_BIT( 1, T_MAY_IGNORE, overwrite_peer)
+)
+
+NL_PACKET(secondary, 2, )
+
+NL_PACKET(disk_conf, 3,
+ NL_INT64( 2, T_MAY_IGNORE, disk_size)
+ NL_STRING( 3, T_MANDATORY, backing_dev, 128)
+ NL_STRING( 4, T_MANDATORY, meta_dev, 128)
+ NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
+ NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
+ NL_INTEGER( 7, T_MAY_IGNORE, fencing)
+ NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
+ NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
+ NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
+ /* 55 max_bio_size was available in 8.2.6rc2 */
+ NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
+ NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
+ NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
+)
+
+NL_PACKET(detach, 4, )
+
+NL_PACKET(net_conf, 5,
+ NL_STRING( 8, T_MANDATORY, my_addr, 128)
+ NL_STRING( 9, T_MANDATORY, peer_addr, 128)
+ NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
+ NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
+ NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
+ NL_INTEGER( 14, T_MAY_IGNORE, timeout)
+ NL_INTEGER( 15, T_MANDATORY, wire_protocol)
+ NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
+ NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
+ NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
+ NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
+ NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
+ NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
+ NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
+ NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
+ NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
+ NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
+ NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
+ NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
+ NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
+ /* 59 addr_family was available in GIT, never released */
+ NL_BIT( 60, T_MANDATORY, mind_af)
+ NL_BIT( 27, T_MAY_IGNORE, want_lose)
+ NL_BIT( 28, T_MAY_IGNORE, two_primaries)
+ NL_BIT( 41, T_MAY_IGNORE, always_asbp)
+ NL_BIT( 61, T_MAY_IGNORE, no_cork)
+ NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
+)
+
+NL_PACKET(disconnect, 6, )
+
+NL_PACKET(resize, 7,
+ NL_INT64( 29, T_MAY_IGNORE, resize_size)
+)
+
+NL_PACKET(syncer_conf, 8,
+ NL_INTEGER( 30, T_MAY_IGNORE, rate)
+ NL_INTEGER( 31, T_MAY_IGNORE, after)
+ NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
+ NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
+ NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
+ NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
+ NL_BIT( 65, T_MAY_IGNORE, use_rle)
+)
+
+NL_PACKET(invalidate, 9, )
+NL_PACKET(invalidate_peer, 10, )
+NL_PACKET(pause_sync, 11, )
+NL_PACKET(resume_sync, 12, )
+NL_PACKET(suspend_io, 13, )
+NL_PACKET(resume_io, 14, )
+NL_PACKET(outdate, 15, )
+NL_PACKET(get_config, 16, )
+NL_PACKET(get_state, 17,
+ NL_INTEGER( 33, T_MAY_IGNORE, state_i)
+)
+
+NL_PACKET(get_uuids, 18,
+ NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
+ NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
+)
+
+NL_PACKET(get_timeout_flag, 19,
+ NL_BIT( 36, T_MAY_IGNORE, use_degraded)
+)
+
+NL_PACKET(call_helper, 20,
+ NL_STRING( 38, T_MAY_IGNORE, helper, 32)
+)
+
+/* Tag nr 42 already allocated in drbd-8.1 development. */
+
+NL_PACKET(sync_progress, 23,
+ NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
+)
+
+NL_PACKET(dump_ee, 24,
+ NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
+ NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
+ NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
+ NL_INT64( 48, T_MAY_IGNORE, ee_sector)
+ NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
+ NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
+)
+
+NL_PACKET(start_ov, 25,
+ NL_INT64( 66, T_MAY_IGNORE, start_sector)
+)
+
+NL_PACKET(new_c_uuid, 26,
+ NL_BIT( 63, T_MANDATORY, clear_bm)
+)
+
+#undef NL_PACKET
+#undef NL_INTEGER
+#undef NL_INT64
+#undef NL_BIT
+#undef NL_STRING
+
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h
new file mode 100644
index 000000000000..fcdff8410e99
--- /dev/null
+++ b/include/linux/drbd_tag_magic.h
@@ -0,0 +1,83 @@
+#ifndef DRBD_TAG_MAGIC_H
+#define DRBD_TAG_MAGIC_H
+
+#define TT_END 0
+#define TT_REMOVED 0xE000
+
+/* declare packet_type enums */
+enum packet_types {
+#define NL_PACKET(name, number, fields) P_ ## name = number,
+#define NL_INTEGER(pn, pr, member)
+#define NL_INT64(pn, pr, member)
+#define NL_BIT(pn, pr, member)
+#define NL_STRING(pn, pr, member, len)
+#include "drbd_nl.h"
+ P_nl_after_last_packet,
+};
+
+/* These struct are used to deduce the size of the tag lists: */
+#define NL_PACKET(name, number, fields) \
+ struct name ## _tag_len_struct { fields };
+#define NL_INTEGER(pn, pr, member) \
+ int member; int tag_and_len ## member;
+#define NL_INT64(pn, pr, member) \
+ __u64 member; int tag_and_len ## member;
+#define NL_BIT(pn, pr, member) \
+ unsigned char member:1; int tag_and_len ## member;
+#define NL_STRING(pn, pr, member, len) \
+ unsigned char member[len]; int member ## _len; \
+ int tag_and_len ## member;
+#include "linux/drbd_nl.h"
+
+/* declate tag-list-sizes */
+static const int tag_list_sizes[] = {
+#define NL_PACKET(name, number, fields) 2 fields ,
+#define NL_INTEGER(pn, pr, member) + 4 + 4
+#define NL_INT64(pn, pr, member) + 4 + 8
+#define NL_BIT(pn, pr, member) + 4 + 1
+#define NL_STRING(pn, pr, member, len) + 4 + (len)
+#include "drbd_nl.h"
+};
+
+/* The two highest bits are used for the tag type */
+#define TT_MASK 0xC000
+#define TT_INTEGER 0x0000
+#define TT_INT64 0x4000
+#define TT_BIT 0x8000
+#define TT_STRING 0xC000
+/* The next bit indicates if processing of the tag is mandatory */
+#define T_MANDATORY 0x2000
+#define T_MAY_IGNORE 0x0000
+#define TN_MASK 0x1fff
+/* The remaining 13 bits are used to enumerate the tags */
+
+#define tag_type(T) ((T) & TT_MASK)
+#define tag_number(T) ((T) & TN_MASK)
+
+/* declare tag enums */
+#define NL_PACKET(name, number, fields) fields
+enum drbd_tags {
+#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
+#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
+#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
+#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
+#include "drbd_nl.h"
+};
+
+struct tag {
+ const char *name;
+ int type_n_flags;
+ int max_len;
+};
+
+/* declare tag names */
+#define NL_PACKET(name, number, fields) fields
+static const struct tag tag_descriptions[] = {
+#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
+#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
+#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
+#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
+#include "drbd_nl.h"
+};
+
+#endif
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
new file mode 100644
index 000000000000..3a2b2d9b0472
--- /dev/null
+++ b/include/linux/lru_cache.h
@@ -0,0 +1,294 @@
+/*
+ lru_cache.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#ifndef LRU_CACHE_H
+#define LRU_CACHE_H
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/string.h> /* for memset */
+#include <linux/seq_file.h>
+
+/*
+This header file (and its .c file; kernel-doc of functions see there)
+ define a helper framework to easily keep track of index:label associations,
+ and changes to an "active set" of objects, as well as pending transactions,
+ to persistently record those changes.
+
+ We use an LRU policy if it is necessary to "cool down" a region currently in
+ the active set before we can "heat" a previously unused region.
+
+ Because of this later property, it is called "lru_cache".
+ As it actually Tracks Objects in an Active SeT, we could also call it
+ toast (incidentally that is what may happen to the data on the
+ backend storage uppon next resync, if we don't get it right).
+
+What for?
+
+We replicate IO (more or less synchronously) to local and remote disk.
+
+For crash recovery after replication node failure,
+ we need to resync all regions that have been target of in-flight WRITE IO
+ (in use, or "hot", regions), as we don't know wether or not those WRITEs have
+ made it to stable storage.
+
+ To avoid a "full resync", we need to persistently track these regions.
+
+ This is known as "write intent log", and can be implemented as on-disk
+ (coarse or fine grained) bitmap, or other meta data.
+
+ To avoid the overhead of frequent extra writes to this meta data area,
+ usually the condition is softened to regions that _may_ have been target of
+ in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
+ bitmap, trading frequency of meta data transactions against amount of
+ (possibly unneccessary) resync traffic.
+
+ If we set a hard limit on the area that may be "hot" at any given time, we
+ limit the amount of resync traffic needed for crash recovery.
+
+For recovery after replication link failure,
+ we need to resync all blocks that have been changed on the other replica
+ in the mean time, or, if both replica have been changed independently [*],
+ all blocks that have been changed on either replica in the mean time.
+ [*] usually as a result of a cluster split-brain and insufficient protection.
+ but there are valid use cases to do this on purpose.
+
+ Tracking those blocks can be implemented as "dirty bitmap".
+ Having it fine-grained reduces the amount of resync traffic.
+ It should also be persistent, to allow for reboots (or crashes)
+ while the replication link is down.
+
+There are various possible implementations for persistently storing
+write intent log information, three of which are mentioned here.
+
+"Chunk dirtying"
+ The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
+ To reduce the frequency of bitmap updates for write-intent log purposes,
+ one could dirty "chunks" (of some size) at a time of the (fine grained)
+ on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as
+ possible, flushing it to disk again when a previously "hot" (and on-disk
+ dirtied as full chunk) area "cools down" again (no IO in flight anymore,
+ and none expected in the near future either).
+
+"Explicit (coarse) write intent bitmap"
+ An other implementation could chose a (probably coarse) explicit bitmap,
+ for write-intent log purposes, additionally to the fine grained dirty bitmap.
+
+"Activity log"
+ Yet an other implementation may keep track of the hot regions, by starting
+ with an empty set, and writing down a journal of region numbers that have
+ become "hot", or have "cooled down" again.
+
+ To be able to use a ring buffer for this journal of changes to the active
+ set, we not only record the actual changes to that set, but also record the
+ not changing members of the set in a round robin fashion. To do so, we use a
+ fixed (but configurable) number of slots which we can identify by index, and
+ associate region numbers (labels) with these indices.
+ For each transaction recording a change to the active set, we record the
+ change itself (index: -old_label, +new_label), and which index is associated
+ with which label (index: current_label) within a certain sliding window that
+ is moved further over the available indices with each such transaction.
+
+ Thus, for crash recovery, if the ringbuffer is sufficiently large, we can
+ accurately reconstruct the active set.
+
+ Sufficiently large depends only on maximum number of active objects, and the
+ size of the sliding window recording "index: current_label" associations within
+ each transaction.
+
+ This is what we call the "activity log".
+
+ Currently we need one activity log transaction per single label change, which
+ does not give much benefit over the "dirty chunks of bitmap" approach, other
+ than potentially less seeks.
+
+ We plan to change the transaction format to support multiple changes per
+ transaction, which then would reduce several (disjoint, "random") updates to
+ the bitmap into one transaction to the activity log ring buffer.
+*/
+
+/* this defines an element in a tracked set
+ * .colision is for hash table lookup.
+ * When we process a new IO request, we know its sector, thus can deduce the
+ * region number (label) easily. To do the label -> object lookup without a
+ * full list walk, we use a simple hash table.
+ *
+ * .list is on one of three lists:
+ * in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
+ * lru: unused but ready to be reused or recycled
+ * (ts_refcnt == 0, lc_number != LC_FREE),
+ * free: unused but ready to be recycled
+ * (ts_refcnt == 0, lc_number == LC_FREE),
+ *
+ * an element is said to be "in the active set",
+ * if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
+ *
+ * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
+ * (total memory usage 2 pages), and up to 3833 elements on the act_log
+ * lru_cache, totalling ~215 kB for 64bit architechture, ~53 pages.
+ *
+ * We usually do not actually free these objects again, but only "recycle"
+ * them, as the change "index: -old_label, +LC_FREE" would need a transaction
+ * as well. Which also means that using a kmem_cache to allocate the objects
+ * from wastes some resources.
+ * But it avoids high order page allocations in kmalloc.
+ */
+struct lc_element {
+ struct hlist_node colision;
+ struct list_head list; /* LRU list or free list */
+ unsigned refcnt;
+ /* back "pointer" into ts_cache->element[index],
+ * for paranoia, and for "ts_element_to_index" */
+ unsigned lc_index;
+ /* if we want to track a larger set of objects,
+ * it needs to become arch independend u64 */
+ unsigned lc_number;
+
+ /* special label when on free list */
+#define LC_FREE (~0U)
+};
+
+struct lru_cache {
+ /* the least recently used item is kept at lru->prev */
+ struct list_head lru;
+ struct list_head free;
+ struct list_head in_use;
+
+ /* the pre-created kmem cache to allocate the objects from */
+ struct kmem_cache *lc_cache;
+
+ /* size of tracked objects, used to memset(,0,) them in lc_reset */
+ size_t element_size;
+ /* offset of struct lc_element member in the tracked object */
+ size_t element_off;
+
+ /* number of elements (indices) */
+ unsigned int nr_elements;
+ /* Arbitrary limit on maximum tracked objects. Practical limit is much
+ * lower due to allocation failures, probably. For typical use cases,
+ * nr_elements should be a few thousand at most.
+ * This also limits the maximum value of ts_element.ts_index, allowing the
+ * 8 high bits of .ts_index to be overloaded with flags in the future. */
+#define LC_MAX_ACTIVE (1<<24)
+
+ /* statistics */
+ unsigned used; /* number of lelements currently on in_use list */
+ unsigned long hits, misses, starving, dirty, changed;
+
+ /* see below: flag-bits for lru_cache */
+ unsigned long flags;
+
+ /* when changing the label of an index element */
+ unsigned int new_number;
+
+ /* for paranoia when changing the label of an index element */
+ struct lc_element *changing_element;
+
+ void *lc_private;
+ const char *name;
+
+ /* nr_elements there */
+ struct hlist_head *lc_slot;
+ struct lc_element **lc_element;
+};
+
+
+/* flag-bits for lru_cache */
+enum {
+ /* debugging aid, to catch concurrent access early.
+ * user needs to guarantee exclusive access by proper locking! */
+ __LC_PARANOIA,
+ /* if we need to change the set, but currently there is a changing
+ * transaction pending, we are "dirty", and must deferr further
+ * changing requests */
+ __LC_DIRTY,
+ /* if we need to change the set, but currently there is no free nor
+ * unused element available, we are "starving", and must not give out
+ * further references, to guarantee that eventually some refcnt will
+ * drop to zero and we will be able to make progress again, changing
+ * the set, writing the transaction.
+ * if the statistics say we are frequently starving,
+ * nr_elements is too small. */
+ __LC_STARVING,
+};
+#define LC_PARANOIA (1<<__LC_PARANOIA)
+#define LC_DIRTY (1<<__LC_DIRTY)
+#define LC_STARVING (1<<__LC_STARVING)
+
+extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
+ unsigned e_count, size_t e_size, size_t e_off);
+extern void lc_reset(struct lru_cache *lc);
+extern void lc_destroy(struct lru_cache *lc);
+extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
+extern void lc_del(struct lru_cache *lc, struct lc_element *element);
+
+extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
+extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
+extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
+extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
+extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
+
+struct seq_file;
+extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
+
+extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
+ void (*detail) (struct seq_file *, struct lc_element *));
+
+/**
+ * lc_try_lock - can be used to stop lc_get() from changing the tracked set
+ * @lc: the lru cache to operate on
+ *
+ * Note that the reference counts and order on the active and lru lists may
+ * still change. Returns true if we aquired the lock.
+ */
+static inline int lc_try_lock(struct lru_cache *lc)
+{
+ return !test_and_set_bit(__LC_DIRTY, &lc->flags);
+}
+
+/**
+ * lc_unlock - unlock @lc, allow lc_get() to change the set again
+ * @lc: the lru cache to operate on
+ */
+static inline void lc_unlock(struct lru_cache *lc)
+{
+ clear_bit(__LC_DIRTY, &lc->flags);
+ smp_mb__after_clear_bit();
+}
+
+static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
+{
+ struct lc_element *e = lc_find(lc, enr);
+ return e && e->refcnt;
+}
+
+#define lc_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
+extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
+
+#endif
OpenPOWER on IntegriCloud