From d29adb34a94715174c88ca93e8aba955850c9bde Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 2 Dec 2013 19:11:48 -0800 Subject: libceph: block I/O when PAUSE or FULL osd map flags are set The PAUSEWR and PAUSERD flags are meant to stop the cluster from processing writes and reads, respectively. The FULL flag is set when the cluster determines that it is out of space, and will no longer process writes. PAUSEWR and PAUSERD are purely client-side settings already implemented in userspace clients. The osd does nothing special with these flags. When the FULL flag is set, however, the osd responds to all writes with -ENOSPC. For cephfs, this makes sense, but for rbd the block layer translates this into EIO. If a cluster goes from full to non-full quickly, a filesystem on top of rbd will not behave well, since some writes succeed while others get EIO. Fix this by blocking any writes when the FULL flag is set in the osd client. This is the same strategy used by userspace, so apply it by default. A follow-on patch makes this configurable. __map_request() is called to re-target osd requests in case the available osds changed. Add a paused field to a ceph_osd_request, and set it whenever an appropriate osd map flag is set. Avoid queueing paused requests in __map_request(), but force them to be resent if they become unpaused. Also subscribe to the next osd map from the monitor if any of these flags are set, so paused requests can be unblocked as soon as possible. Fixes: http://tracker.ceph.com/issues/6079 Reviewed-by: Sage Weil Signed-off-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8f47625a0661..4fb6a8938957 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -138,6 +138,7 @@ struct ceph_osd_request { __le64 *r_request_pool; void *r_request_pgid; __le32 *r_request_attempts; + bool r_paused; struct ceph_eversion *r_request_reassert_version; int r_result; -- cgit v1.2.1 From 12b4629a9fb80fecaebadc217b13b8776ed8dbef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:23 +0200 Subject: libceph: all features fields must be u64 In preparation for ceph_features.h update, change all features fields from unsigned int/u32 to u64. (ceph.git has ~40 feature bits at this point.) Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/libceph.h | 8 ++++---- include/linux/ceph/messenger.h | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 2e3024881a5e..7d704db60cbb 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -122,8 +122,8 @@ struct ceph_client { int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; struct ceph_messenger msgr; /* messenger instance */ struct ceph_mon_client monc; @@ -192,8 +192,8 @@ extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); extern struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - unsigned supported_features, - unsigned required_features); + u64 supported_features, + u64 required_features); extern u64 ceph_client_id(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 7c1420bb1dce..c1d3f5a65273 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -60,8 +60,8 @@ struct ceph_messenger { u32 global_seq; spinlock_t global_seq_lock; - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; }; enum ceph_msg_data_type { @@ -192,7 +192,7 @@ struct ceph_connection { struct ceph_entity_name peer_name; /* peer name */ - unsigned peer_features; + u64 peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@ -256,8 +256,8 @@ extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc); extern void ceph_con_init(struct ceph_connection *con, void *private, -- cgit v1.2.1 From 2b3e0c905af43cfe402a2ef3f800be5dc1684005 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: libceph: update ceph_features.h This updates ceph_features.h so that it has all feature bits defined in ceph.git. In the interim since the last update, ceph.git crossed the "32 feature bits" point, and, the addition of the 33rd bit wasn't handled correctly. The work-around is squashed into this commit and reflects ceph.git commit 053659d05e0349053ef703b414f44965f368b9f0. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 96 +++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 32 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 4c42080347af..003ea719be65 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -4,42 +4,73 @@ /* * feature bits */ -#define CEPH_FEATURE_UID (1<<0) -#define CEPH_FEATURE_NOSRCADDR (1<<1) -#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) -#define CEPH_FEATURE_FLOCK (1<<3) -#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) -#define CEPH_FEATURE_MONNAMES (1<<5) -#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) -#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) -#define CEPH_FEATURE_OBJECTLOCATOR (1<<8) -#define CEPH_FEATURE_PGID64 (1<<9) -#define CEPH_FEATURE_INCSUBOSDMAP (1<<10) -#define CEPH_FEATURE_PGPOOL3 (1<<11) -#define CEPH_FEATURE_OSDREPLYMUX (1<<12) -#define CEPH_FEATURE_OSDENC (1<<13) -#define CEPH_FEATURE_OMAP (1<<14) -#define CEPH_FEATURE_MONENC (1<<15) -#define CEPH_FEATURE_QUERY_T (1<<16) -#define CEPH_FEATURE_INDEP_PG_MAP (1<<17) -#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) -#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) -#define CEPH_FEATURE_MON_NULLROUTE (1<<20) -#define CEPH_FEATURE_MON_GV (1<<21) -#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) -#define CEPH_FEATURE_MSG_AUTH (1<<23) -#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) -#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) -#define CEPH_FEATURE_CREATEPOOLID (1<<26) -#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) -#define CEPH_FEATURE_OSD_HBMSGS (1<<28) -#define CEPH_FEATURE_MDSENC (1<<29) -#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) +#define CEPH_FEATURE_UID (1ULL<<0) +#define CEPH_FEATURE_NOSRCADDR (1ULL<<1) +#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2) +#define CEPH_FEATURE_FLOCK (1ULL<<3) +#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) +#define CEPH_FEATURE_MONNAMES (1ULL<<5) +#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7) +#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) +#define CEPH_FEATURE_PGID64 (1ULL<<9) +#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) +#define CEPH_FEATURE_PGPOOL3 (1ULL<<11) +#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) +#define CEPH_FEATURE_OSDENC (1ULL<<13) +#define CEPH_FEATURE_OMAP (1ULL<<14) +#define CEPH_FEATURE_MONENC (1ULL<<15) +#define CEPH_FEATURE_QUERY_T (1ULL<<16) +#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17) +#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18) +#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19) +#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20) +#define CEPH_FEATURE_MON_GV (1ULL<<21) +#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22) +#define CEPH_FEATURE_MSG_AUTH (1ULL<<23) +#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24) +#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25) +#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26) +#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27) +#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28) +#define CEPH_FEATURE_MDSENC (1ULL<<29) +#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30) +#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31) +#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32) +#define CEPH_FEATURE_MON_SCRUB (1ULL<<33) +#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34) +#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35) +#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ +#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) +#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) + +/* + * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature + * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 + * to mean 33 bit ~0, and introduce a helper below to do the + * translation. + * + * This was introduced by ceph.git commit + * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8 + * and fixed by ceph.git commit + * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c + */ +#define CEPH_FEATURE_RESERVED (1ULL<<63) + +static inline u64 ceph_sanitize_features(u64 features) +{ + if (features & CEPH_FEATURE_RESERVED) { + /* everything through OSD_SNAPMAPPER */ + return 0x1ffffffffull; + } else { + return features; + } +} /* * Features supported. */ -#define CEPH_FEATURES_SUPPORTED_DEFAULT \ +#define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ @@ -56,4 +87,5 @@ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC) + #endif -- cgit v1.2.1 From cdff49918c8286ac18593e742ead25242c76c81d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:27 +0200 Subject: crush: support new indep mode and SET_* steps (crush v2) by default Add CRUSH_V2 feature (new indep mode and SET_* steps) to a set of features supported by default. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 003ea719be65..5f42e4412909 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -79,7 +79,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ - CEPH_FEATURE_OSDHASHPSPOOL) + CEPH_FEATURE_OSDHASHPSPOOL | \ + CEPH_FEATURE_CRUSH_V2) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.1 From 3cea4c3071d4e55e9d7356efe9d0ebf92f0c2204 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Jan 2014 20:08:21 +0200 Subject: libceph: rename ceph_msg::front_max to front_alloc_len Rename front_max field of struct ceph_msg to front_alloc_len to make its purpose more clear. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c1d3f5a65273..861138f7c161 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -157,7 +157,7 @@ struct ceph_msg { bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; - int front_max; + int front_alloc_len; unsigned long ack_stamp; /* tx: when we were acked */ struct ceph_msgpool *pool; -- cgit v1.2.1 From 186e4f7a4b1883f3f46aa15366c0bcebc28fdda7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 22 Nov 2013 14:48:37 +0800 Subject: ceph: handle session flush message Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 2ad7b860f062..26bb587deb78 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -282,6 +282,8 @@ enum { CEPH_SESSION_RENEWCAPS, CEPH_SESSION_STALE, CEPH_SESSION_RECALL_STATE, + CEPH_SESSION_FLUSHMSG, + CEPH_SESSION_FLUSHMSG_ACK, }; extern const char *ceph_session_op_name(int op); -- cgit v1.2.1 From 4ee6a914edbbd2543884f0ad7d58ea471136be32 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 24 Nov 2013 14:43:46 +0800 Subject: ceph: remove exported caps when handling cap import message Version 3 cap import message includes the ID of the exported caps. It allow us to remove the exported caps if we still haven't received the corresponding cap export message. We remove the exported caps because they are stale, keeping them can compromise consistence. Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_fs.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 26bb587deb78..0a37b989b52f 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -459,7 +459,8 @@ struct ceph_mds_reply_cap { __u8 flags; /* CEPH_CAP_FLAG_* */ } __attribute__ ((packed)); -#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */ /* inode record, for bundling with mds reply */ struct ceph_mds_reply_inode { @@ -660,6 +661,14 @@ struct ceph_mds_caps { __le32 time_warp_seq; } __attribute__ ((packed)); +struct ceph_mds_cap_peer { + __le64 cap_id; + __le32 seq; + __le32 mseq; + __le32 mds; + __u8 flags; +} __attribute__ ((packed)); + /* cap release msg head */ struct ceph_mds_cap_release { __le32 num; /* number of cap_items that follow */ -- cgit v1.2.1 From 80213a84a96c3040f5824bce646a184d5dd3dd2b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 21 Jan 2014 11:07:16 +0800 Subject: libceph: support CEPH_FEATURE_EXPORT_PEER Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_features.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 5f42e4412909..d21b34d94436 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -80,7 +80,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ CEPH_FEATURE_OSDHASHPSPOOL | \ - CEPH_FEATURE_CRUSH_V2) + CEPH_FEATURE_CRUSH_V2 | \ + CEPH_FEATURE_EXPORT_PEER) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.1 From eeb0bed5572b1282009dfc2635604df5a35d1a02 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Jan 2014 20:08:21 +0200 Subject: libceph: add ceph_kv{malloc,free}() and switch to them Encapsulate kmalloc vs vmalloc memory allocation and freeing logic into two helpers, ceph_kvmalloc() and ceph_kvfree(), and switch to them. ceph_kvmalloc() kmalloc()'s a maximum of 8 pages, anything bigger is vmalloc()'ed with __GFP_HIGHMEM set. This changes the existing behaviour: - for buffers (ceph_buffer_new()), from trying to kmalloc() everything and using vmalloc() just as a fallback - for messages (ceph_msg_new()), from going to vmalloc() for anything bigger than a page - for messages (ceph_msg_new()), from disallowing vmalloc() to use high memory Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/buffer.h | 1 - include/linux/ceph/libceph.h | 11 +++++++---- include/linux/ceph/messenger.h | 1 - 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h index 58d19014068f..07ad423cc37f 100644 --- a/include/linux/ceph/buffer.h +++ b/include/linux/ceph/buffer.h @@ -17,7 +17,6 @@ struct ceph_buffer { struct kref kref; struct kvec vec; size_t alloc_len; - bool is_vmalloc; }; extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 7d704db60cbb..2f49aa4c4f7f 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -173,15 +173,18 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_CACHE_SHIFT); } +extern struct kmem_cache *ceph_inode_cachep; +extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_dentry_cachep; +extern struct kmem_cache *ceph_file_cachep; + /* ceph_common.c */ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern struct kmem_cache *ceph_inode_cachep; -extern struct kmem_cache *ceph_cap_cachep; -extern struct kmem_cache *ceph_dentry_cachep; -extern struct kmem_cache *ceph_file_cachep; +extern void *ceph_kvmalloc(size_t size, gfp_t flags); +extern void ceph_kvfree(const void *ptr); extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 861138f7c161..20ee8b63a968 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -154,7 +154,6 @@ struct ceph_msg { struct list_head list_head; /* links for connection lists */ struct kref kref; - bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; int front_alloc_len; -- cgit v1.2.1 From 22116525baec1d63f4878eaa92f0b57946a78819 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: start using oloc abstraction Instead of relying on pool fields in ceph_file_layout (for mapping) and ceph_pg (for enconding), start using ceph_object_locator (oloc) abstraction. Note that userspace oloc currently consists of pool, key, nspace and hash fields, while this one contains only a pool. This is OK, because at this point we only send (i.e. encode) olocs and never have to receive (i.e. decode) them. This makes keeping a copy of ceph_file_layout in every osd request unnecessary, so ceph_osd_request::r_file_layout field is nuked. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 3 ++- include/linux/ceph/osdmap.h | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4fb6a8938957..5b8555109789 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -159,12 +159,13 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ + struct ceph_object_locator r_oloc; + char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ - struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ }; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index d05cc4451af6..256134af4ad4 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -40,8 +40,7 @@ struct ceph_pg_pool_info { }; struct ceph_object_locator { - uint64_t pool; - char *key; + s64 pool; }; struct ceph_pg_mapping { -- cgit v1.2.1 From e8221464fc2bc8c9f7b0c2115abbd75ba23f210a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: move ceph_file_layout helpers to ceph_fs.h Move ceph_file_layout helper macros and inline functions to ceph_fs.h. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_fs.h | 23 +++++++++++++++++++++++ include/linux/ceph/osdmap.h | 27 --------------------------- 2 files changed, 23 insertions(+), 27 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 0a37b989b52f..2623cffc73a1 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -53,6 +53,29 @@ struct ceph_file_layout { __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ } __attribute__ ((packed)); +#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) +#define ceph_file_layout_stripe_count(l) \ + ((__s32)le32_to_cpu((l).fl_stripe_count)) +#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) +#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) +#define ceph_file_layout_object_su(l) \ + ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) +#define ceph_file_layout_pg_pool(l) \ + ((__s32)le32_to_cpu((l).fl_pg_pool)) + +static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) +{ + return le32_to_cpu(l->fl_stripe_unit) * + le32_to_cpu(l->fl_stripe_count); +} + +/* "period" == bytes before i start on a new set of objects */ +static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) +{ + return le32_to_cpu(l->fl_object_size) * + le32_to_cpu(l->fl_stripe_count); +} + #define CEPH_MIN_STRIPE_UNIT 65536 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 256134af4ad4..f2679c384625 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -72,33 +72,6 @@ struct ceph_osdmap { struct crush_map *crush; }; -/* - * file layout helpers - */ -#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) -#define ceph_file_layout_stripe_count(l) \ - ((__s32)le32_to_cpu((l).fl_stripe_count)) -#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) -#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) -#define ceph_file_layout_object_su(l) \ - ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) -#define ceph_file_layout_pg_pool(l) \ - ((__s32)le32_to_cpu((l).fl_pg_pool)) - -static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_stripe_unit) * - le32_to_cpu(l->fl_stripe_count); -} - -/* "period" == bytes before i start on a new set of objects */ -static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_object_size) * - le32_to_cpu(l->fl_stripe_count); -} - - static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); -- cgit v1.2.1 From 2d0ebc5d591f49131bf8f93b54c5424162c3fb7f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN In preparation for adding oid abstraction, rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 5b8555109789..b42f15848cae 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -16,7 +16,7 @@ * Maximum object name size * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) */ -#define MAX_OBJ_NAME_SIZE 100 +#define CEPH_MAX_OID_NAME_LEN 100 struct ceph_msg; struct ceph_snap_context; @@ -161,7 +161,7 @@ struct ceph_osd_request { struct ceph_object_locator r_oloc; - char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ + char r_oid[CEPH_MAX_OID_NAME_LEN]; /* object name */ int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ -- cgit v1.2.1 From 4295f2217a5aa8ef2738e3a368db3c1ceab41212 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: introduce and start using oid abstraction In preparation for tiering support, which would require having two (base and target) object names for each osd request and also copying those names around, introduce struct ceph_object_id (oid) and a couple helpers to facilitate those copies and encapsulate the fact that object name is not necessarily a NUL-terminated string. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 9 +-------- include/linux/ceph/osdmap.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index b42f15848cae..8d8bb53994b1 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -12,12 +12,6 @@ #include #include -/* - * Maximum object name size - * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) - */ -#define CEPH_MAX_OID_NAME_LEN 100 - struct ceph_msg; struct ceph_snap_context; struct ceph_osd_request; @@ -160,9 +154,8 @@ struct ceph_osd_request { void *r_priv; /* ditto */ struct ceph_object_locator r_oloc; + struct ceph_object_id r_oid; - char r_oid[CEPH_MAX_OID_NAME_LEN]; /* object name */ - int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index f2679c384625..c85f7d43b861 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -43,6 +43,18 @@ struct ceph_object_locator { s64 pool; }; +/* + * Maximum supported by kernel client object name length + * + * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) + */ +#define CEPH_MAX_OID_NAME_LEN 100 + +struct ceph_object_id { + char name[CEPH_MAX_OID_NAME_LEN]; + int name_len; +}; + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -72,6 +84,30 @@ struct ceph_osdmap { struct crush_map *crush; }; +static inline void ceph_oid_set_name(struct ceph_object_id *oid, + const char *name) +{ + int len; + + len = strlen(name); + if (len > sizeof(oid->name)) { + WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", + name, len, sizeof(oid->name)); + len = sizeof(oid->name); + } + + memcpy(oid->name, name, len); + oid->name_len = len; +} + +static inline void ceph_oid_copy(struct ceph_object_id *dest, + struct ceph_object_id *src) +{ + BUG_ON(src->name_len > sizeof(dest->name)); + memcpy(dest->name, src->name, src->name_len); + dest->name_len = src->name_len; +} + static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); -- cgit v1.2.1 From 7c13cb64352230deac24d3cb058387a6c0676f83 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: replace ceph_calc_ceph_pg() with ceph_oloc_oid_to_pg() Switch ceph_calc_ceph_pg() to new oloc and oid abstractions and rename it to ceph_oloc_oid_to_pg() to make its purpose more clear. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osdmap.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index c85f7d43b861..ebb8ec285de6 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -163,8 +163,11 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool); +extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out); + extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *acting); -- cgit v1.2.1 From 1b3f2ab51095a7aab684bf9f5c14235126188dbc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: CEPH_OSD_FLAG_* enum update Update CEPH_OSD_FLAG_* enum. (We need CEPH_OSD_FLAG_IGNORE_OVERLAY to support tiering). Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/rados.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 68c96a508ac2..96292df4041b 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -344,6 +344,10 @@ enum { CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ + CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */ + CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ + CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ + CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ }; enum { -- cgit v1.2.1 From ce7f6a2790464047199f54b66420243d433142bd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: add ceph_pg_pool_by_id() "Lookup pool info by ID" function is hidden in osdmap.c. Expose it to the rest of libceph. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osdmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ebb8ec285de6..7f894a64c6c7 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -174,6 +174,9 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); +extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, + u64 id); + extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); -- cgit v1.2.1 From 17a13e4028e6ad7ded079cf32370c47bd0e0fc07 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: follow {read,write}_tier fields on osd request submission Overwrite ceph_osd_request::r_oloc.pool with read_tier for read ops and write_tier for write and read+write ops (aka basic tiering support). {read,write}_tier are part of pg_pool_t since v9. This commit bumps our pg_pool_t decode compat version from v7 to v9, all new fields except for {read,write}_tier are ignored. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osdmap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 7f894a64c6c7..49ff69f0746b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -35,6 +35,8 @@ struct ceph_pg_pool_info { u8 object_hash; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; + s64 read_tier; + s64 write_tier; /* wins for read+write ops */ u64 flags; char *name; }; -- cgit v1.2.1 From 3c972c95c68f455d80ff185aa440857be046bbe0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:20 +0200 Subject: libceph: rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid} Rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid} before introducing r_target_{oloc,oid} needed for redirects. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8d8bb53994b1..3170ca6d98b2 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -153,8 +153,8 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - struct ceph_object_locator r_oloc; - struct ceph_object_id r_oid; + struct ceph_object_locator r_base_oloc; + struct ceph_object_id r_base_oid; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ -- cgit v1.2.1 From 205ee1187a671c3b067d7f1e974903b44036f270 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:20 +0200 Subject: libceph: follow redirect replies from osds Follow redirect replies from osds, for details see ceph.git commit fbbe3ad1220799b7bb00ea30fce581c5eadaf034. v1 (current) version of redirect reply consists of oloc and oid, which expands to pool, key, nspace, hash and oid. However, server-side code that would populate anything other than pool doesn't exist yet, and hence this commit adds support for pool redirects only. To make sure that future server-side updates don't break us, we decode all fields and, if any of key, nspace, hash or oid have a non-default value, error out with "corrupt osd_op_reply ..." message. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3170ca6d98b2..fd47e872ebcc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -155,6 +155,8 @@ struct ceph_osd_request { struct ceph_object_locator r_base_oloc; struct ceph_object_id r_base_oid; + struct ceph_object_locator r_target_oloc; + struct ceph_object_id r_target_oid; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ @@ -162,6 +164,10 @@ struct ceph_osd_request { struct ceph_snap_context *r_snapc; /* snap context for writes */ }; +struct ceph_request_redirect { + struct ceph_object_locator oloc; +}; + struct ceph_osd_event { u64 cookie; int one_shot; -- cgit v1.2.1 From 80e163a58c0c69ef1a0ba3500d9932b14d67bf64 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:20 +0200 Subject: libceph: support CEPH_FEATURE_OSD_CACHEPOOL feature Announce our (limited, see previous commit) support for CACHEPOOL feature. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index d21b34d94436..138448f766b4 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -80,6 +80,7 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ CEPH_FEATURE_OSDHASHPSPOOL | \ + CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER) -- cgit v1.2.1