summaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/addr.c103
-rw-r--r--fs/ceph/caps.c102
-rw-r--r--fs/ceph/dir.c99
-rw-r--r--fs/ceph/file.c15
-rw-r--r--fs/ceph/inode.c18
-rw-r--r--fs/ceph/locks.c75
-rw-r--r--fs/ceph/mds_client.c73
-rw-r--r--fs/ceph/mdsmap.c42
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/super.h13
-rw-r--r--fs/ceph/xattr.c9
11 files changed, 298 insertions, 253 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac101040..5318a3b704f6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
* dirty page counters appropriately. Only called if there is private
* data on the page.
*/
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode;
struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
if (!PageDirty(page))
pr_err("%p invalidatepage %p page not dirty\n", inode, page);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
ci = ceph_inode(inode);
- if (offset == 0) {
- dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
- inode, page, page->index, offset);
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
+ dout("%p invalidatepage %p idx %lu full dirty page\n",
+ inode, page, page->index);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc);
page->private = 0;
ClearPagePrivate(page);
} else {
- dout("%p invalidatepage %p idx %lu partial dirty page\n",
- inode, page, page->index);
+ dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+ inode, page, page->index, offset, length);
}
}
@@ -438,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_inode_info *ci;
struct ceph_fs_client *fsc;
struct ceph_osd_client *osdc;
- loff_t page_off = page_offset(page);
- int len = PAGE_CACHE_SIZE;
- loff_t i_size;
- int err = 0;
struct ceph_snap_context *snapc, *oldest;
- u64 snap_size = 0;
+ loff_t page_off = page_offset(page);
long writeback_stat;
+ u64 truncate_size, snap_size = 0;
+ u32 truncate_seq;
+ int err = 0, len = PAGE_CACHE_SIZE;
dout("writepage %p idx %lu\n", page, page->index);
@@ -474,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
ceph_put_snap_context(oldest);
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
/* is this a partial page at end of file? */
- if (snap_size)
- i_size = snap_size;
- else
- i_size = i_size_read(inode);
- if (i_size < page_off + len)
- len = i_size - page_off;
+ if (page_off >= snap_size) {
+ dout("%p page eof %llu\n", page, snap_size);
+ goto out;
+ }
+ if (snap_size < page_off + len)
+ len = snap_size - page_off;
dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
inode, page, page->index, page_off, len, snapc);
@@ -494,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
page_off, len,
- ci->i_truncate_seq, ci->i_truncate_size,
+ truncate_seq, truncate_size,
&inode->i_mtime, &page, 1);
if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page);
@@ -631,25 +638,6 @@ static void writepages_finish(struct ceph_osd_request *req,
ceph_osdc_put_request(req);
}
-static struct ceph_osd_request *
-ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
- struct ceph_snap_context *snapc, int num_ops)
-{
- struct ceph_fs_client *fsc;
- struct ceph_inode_info *ci;
- struct ceph_vino vino;
-
- fsc = ceph_inode_to_client(inode);
- ci = ceph_inode(inode);
- vino = ceph_vino(inode);
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
-
- return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
- snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
-}
-
/*
* initiate async writeback
*/
@@ -658,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start, end;
int range_whole = 0;
int should_loop = 1;
@@ -670,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
int do_sync;
- u64 snap_size;
+ u64 truncate_size, snap_size;
+ u32 truncate_seq;
/*
* Include a 'sync' in the OSD request if this is a data
* integrity write (e.g., O_SYNC write or fsync()), or if our
* cap is being revoked.
*/
- do_sync = wbc->sync_mode == WB_SYNC_ALL;
- if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+ if ((wbc->sync_mode == WB_SYNC_ALL) ||
+ ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
do_sync = 1;
dout("writepages_start %p dosync=%d (mode=%s)\n",
inode, do_sync,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- fsc = ceph_inode_to_client(inode);
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
pr_warning("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
@@ -728,6 +717,14 @@ retry:
snap_size = i_size_read(inode);
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
+
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
if (last_snapc && snapc != last_snapc) {
/* if we switched to a newer snapc, restart our scan at the
* start of the original file range. */
@@ -739,7 +736,6 @@ retry:
while (!done && index <= end) {
int num_ops = do_sync ? 2 : 1;
- struct ceph_vino vino;
unsigned i;
int first;
pgoff_t next;
@@ -833,17 +829,18 @@ get_more_pages:
* that it will use.
*/
if (locked_pages == 0) {
- size_t size;
-
BUG_ON(pages);
-
/* prepare async write request */
offset = (u64)page_offset(page);
len = wsize;
- req = ceph_writepages_osd_request(inode,
- offset, &len, snapc,
- num_ops);
-
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, num_ops,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, true);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
unlock_page(page);
@@ -854,8 +851,8 @@ get_more_pages:
req->r_inode = inode;
max_pages = calc_pages_for(0, (u64)len);
- size = max_pages * sizeof (*pages);
- pages = kmalloc(size, GFP_NOFS);
+ pages = kmalloc(max_pages * sizeof (*pages),
+ GFP_NOFS);
if (!pages) {
pool = fsc->wb_pagevec_pool;
pages = mempool_alloc(pool, GFP_NOFS);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da0f9b8a3bcb..25442b40c25a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
spin_unlock(&mdsc->caps_list_lock);
}
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
int have;
int alloc = 0;
LIST_HEAD(newcaps);
- int ret = 0;
dout("reserve caps ctx=%p need=%d\n", ctx, need);
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
for (i = have; i < need; i++) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap) {
- ret = -ENOMEM;
- goto out_alloc_count;
- }
+ if (!cap)
+ break;
list_add(&cap->caps_item, &newcaps);
alloc++;
}
- BUG_ON(have + alloc != need);
+ /* we didn't manage to reserve as much as we needed */
+ if (have + alloc != need)
+ pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have + alloc);
spin_lock(&mdsc->caps_list_lock);
mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
ctx, mdsc->caps_total_count, mdsc->caps_use_count,
mdsc->caps_reserve_count, mdsc->caps_avail_count);
- return 0;
-
-out_alloc_count:
- /* we didn't manage to reserve as much as we needed */
- pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have);
- return ret;
}
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -612,9 +605,11 @@ retry:
__cap_delay_requeue(mdsc, ci);
}
- if (flags & CEPH_CAP_FLAG_AUTH)
- ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap) {
+ if (flags & CEPH_CAP_FLAG_AUTH) {
+ if (ci->i_auth_cap == NULL ||
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ci->i_auth_cap = cap;
+ } else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&ci->i_dirty_item)) {
@@ -695,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (implemented)
*implemented |= cap->implemented;
}
+ /*
+ * exclude caps issued by non-auth MDS, but are been revoking
+ * by the auth MDS. The non-auth MDS should be revoking/exporting
+ * these caps, but the message is delayed.
+ */
+ if (ci->i_auth_cap) {
+ cap = ci->i_auth_cap;
+ have &= ~cap->implemented | cap->issued;
+ }
return have;
}
@@ -802,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/*
* Return true if mask caps are currently being revoked by an MDS.
*/
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask)
{
- struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct rb_node *p;
- int ret = 0;
- spin_lock(&ci->i_ceph_lock);
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (__cap_is_valid(cap) &&
- (cap->implemented & ~cap->issued & mask)) {
- ret = 1;
- break;
- }
+ if (cap != ocap && __cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask))
+ return 1;
}
+ return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_caps_revoking_other(ci, NULL, mask);
spin_unlock(&ci->i_ceph_lock);
dout("ceph_caps_revoking %p %s = %d\n", inode,
ceph_cap_string(mask), ret);
@@ -1980,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
__ceph_flush_snaps(ci, &session, 1);
+
if (ci->i_flushing_caps) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &cap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
@@ -2055,7 +2072,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
- __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR));
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
}
@@ -2473,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
} else {
dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
ceph_cap_string(newcaps));
+ /* non-auth MDS is revoking the newly grant caps ? */
+ if (cap == ci->i_auth_cap &&
+ __ceph_caps_revoking_other(ci, cap, newcaps))
+ check_caps = 2;
+
cap->issued = newcaps;
cap->implemented |= newcaps; /* add bits only, to
* avoid stepping on a
@@ -3042,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
(cap->issued & unless) == 0)) {
if ((cap->issued & drop) &&
(cap->issued & unless) == 0) {
- dout("encode_inode_release %p cap %p %s -> "
- "%s\n", inode, cap,
+ int wanted = __ceph_caps_wanted(ci);
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+ wanted |= cap->mds_wanted;
+ dout("encode_inode_release %p cap %p "
+ "%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop));
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
+
cap->issued &= ~drop;
cap->implemented &= ~drop;
- if (ci->i_ceph_flags & CEPH_I_NODELAY) {
- int wanted = __ceph_caps_wanted(ci);
- dout(" wanted %s -> %s (act %s)\n",
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(cap->mds_wanted &
- ~wanted),
- ceph_cap_string(wanted));
- cap->mds_wanted &= wanted;
- }
+ cap->mds_wanted = wanted;
} else {
dout("encode_inode_release %p cap %p %s"
" (force)\n", inode, cap,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f02d82b7933e..a40ceda47a32 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -111,11 +111,10 @@ static unsigned fpos_off(loff_t p)
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = filp->private_data;
- struct dentry *parent = filp->f_dentry;
+ struct ceph_file_info *fi = file->private_data;
+ struct dentry *parent = file->f_dentry;
struct inode *dir = parent->d_inode;
struct list_head *p;
struct dentry *dentry, *last;
@@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp,
last = fi->dentry;
fi->dentry = NULL;
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+ dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
last);
spin_lock(&parent->d_lock);
/* start at beginning? */
- if (filp->f_pos == 2 || last == NULL ||
- filp->f_pos < ceph_dentry(last)->offset) {
+ if (ctx->pos == 2 || last == NULL ||
+ ctx->pos < ceph_dentry(last)->offset) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -157,11 +156,11 @@ more:
if (!d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- filp->f_pos <= di->offset)
+ ctx->pos <= di->offset)
break;
dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
dentry->d_name.len, dentry->d_name.name, di->offset,
- filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+ ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
!dentry->d_inode ? " null" : "");
spin_unlock(&dentry->d_lock);
p = p->prev;
@@ -173,29 +172,27 @@ more:
spin_unlock(&dentry->d_lock);
spin_unlock(&parent->d_lock);
- dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+ dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- filp->f_pos = di->offset;
- err = filldir(dirent, dentry->d_name.name,
- dentry->d_name.len, di->offset,
+ ctx->pos = di->offset;
+ if (!dir_emit(ctx, dentry->d_name.name,
+ dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
- dentry->d_inode->i_mode >> 12);
-
- if (last) {
- if (err < 0) {
+ dentry->d_inode->i_mode >> 12)) {
+ if (last) {
/* remember our position */
fi->dentry = last;
fi->next_offset = di->offset;
- } else {
- dput(last);
}
+ dput(dentry);
+ return 0;
}
- last = dentry;
- if (err < 0)
- goto out;
+ if (last)
+ dput(last);
+ last = dentry;
- filp->f_pos++;
+ ctx->pos++;
/* make sure a dentry wasn't dropped while we didn't have parent lock */
if (!ceph_dir_is_complete(dir)) {
@@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
return 0;
}
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = filp->private_data;
- struct inode *inode = file_inode(filp);
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned frag = fpos_frag(filp->f_pos);
- int off = fpos_off(filp->f_pos);
+ unsigned frag = fpos_frag(ctx->pos);
+ int off = fpos_off(ctx->pos);
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
const int max_entries = fsc->mount_options->max_readdir;
const int max_bytes = fsc->mount_options->max_readdir_bytes;
- dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+ dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
if (fi->flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
- if (filp->f_pos == 0) {
+ if (ctx->pos == 0) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
fi->dir_release_count = atomic_read(&ci->i_release_count);
dout("readdir off 0 -> '.'\n");
- if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+ if (!dir_emit(ctx, ".", 1,
ceph_translate_ino(inode->i_sb, inode->i_ino),
- inode->i_mode >> 12) < 0)
+ inode->i_mode >> 12))
return 0;
- filp->f_pos = 1;
+ ctx->pos = 1;
off = 1;
}
- if (filp->f_pos == 1) {
- ino_t ino = parent_ino(filp->f_dentry);
+ if (ctx->pos == 1) {
+ ino_t ino = parent_ino(file->f_dentry);
dout("readdir off 1 -> '..'\n");
- if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+ if (!dir_emit(ctx, "..", 2,
ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12) < 0)
+ inode->i_mode >> 12))
return 0;
- filp->f_pos = 2;
+ ctx->pos = 2;
off = 2;
}
/* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
- if ((filp->f_pos == 2 || fi->dentry) &&
+ if ((ctx->pos == 2 || fi->dentry) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(filp, dirent, filldir);
+ err = __dcache_readdir(file, ctx);
if (err != -EAGAIN)
return err;
} else {
@@ -327,7 +324,7 @@ more:
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
- req->r_dentry = dget(filp->f_dentry);
+ req->r_dentry = dget(file->f_dentry);
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
req->r_direct_hash = ceph_frag_value(frag);
@@ -379,15 +376,16 @@ more:
rinfo = &fi->last_readdir->r_reply_info;
dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
rinfo->dir_nr, off, fi->offset);
+
+ ctx->pos = ceph_make_fpos(frag, off);
while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
- u64 pos = ceph_make_fpos(frag, off);
struct ceph_mds_reply_inode *in =
rinfo->dir_in[off - fi->offset].in;
struct ceph_vino vino;
ino_t ino;
dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, pos,
+ off, off - fi->offset, rinfo->dir_nr, ctx->pos,
rinfo->dir_dname_len[off - fi->offset],
rinfo->dir_dname[off - fi->offset], in);
BUG_ON(!in);
@@ -395,16 +393,15 @@ more:
vino.ino = le64_to_cpu(in->ino);
vino.snap = le64_to_cpu(in->snapid);
ino = ceph_vino_to_ino(vino);
- if (filldir(dirent,
+ if (!dir_emit(ctx,
rinfo->dir_dname[off - fi->offset],
rinfo->dir_dname_len[off - fi->offset],
- pos,
- ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
+ ceph_translate_ino(inode->i_sb, ino), ftype)) {
dout("filldir stopping us...\n");
return 0;
}
off++;
- filp->f_pos = pos + 1;
+ ctx->pos++;
}
if (fi->last_name) {
@@ -417,7 +414,7 @@ more:
if (!ceph_frag_is_rightmost(frag)) {
frag = ceph_frag_next(frag);
off = 0;
- filp->f_pos = ceph_make_fpos(frag, off);
+ ctx->pos = ceph_make_fpos(frag, off);
dout("readdir next frag is %x\n", frag);
goto more;
}
@@ -432,11 +429,11 @@ more:
if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
dout(" marking %p complete\n", inode);
__ceph_dir_set_complete(ci, fi->dir_release_count);
- ci->i_max_offset = filp->f_pos;
+ ci->i_max_offset = ctx->pos;
}
spin_unlock(&ci->i_ceph_lock);
- dout("readdir %p filp %p done.\n", inode, filp);
+ dout("readdir %p file %p done.\n", inode, file);
return 0;
}
@@ -1268,7 +1265,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
const struct file_operations ceph_dir_fops = {
.read = ceph_read_dir,
- .readdir = ceph_readdir,
+ .iterate = ceph_readdir,
.llseek = ceph_dir_llseek,
.open = ceph_open,
.release = ceph_release,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e16907430..2ddf061c1c4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
hold_mutex = true;
@@ -809,7 +808,6 @@ retry_snap:
out:
if (hold_mutex)
mutex_unlock(&inode->i_mutex);
- sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return written ? written : err;
@@ -824,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
int ret;
mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
@@ -866,16 +864,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
break;
}
- if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
- offset = -EINVAL;
- goto out;
- }
-
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
mutex_unlock(&inode->i_mutex);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be0f7e20d62e..f3a2abf28a77 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -903,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
"inode %p ino %llx.%llx\n",
- dn, dn->d_count,
- realdn, realdn->d_count,
+ dn, d_count(dn),
+ realdn, d_count(realdn),
realdn->d_inode, ceph_vinop(realdn->d_inode));
dput(dn);
dn = realdn;
@@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work)
struct inode *inode = &ci->vfs_inode;
dout("vmtruncate_work %p\n", inode);
- __ceph_do_pending_vmtruncate(inode, true);
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ mutex_unlock(&inode->i_mutex);
iput(inode);
}
@@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
* Make sure any pending truncation is applied before doing anything
* that may depend on it.
*/
-void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock)
+void __ceph_do_pending_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
@@ -1525,11 +1527,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
- if (needlock)
- mutex_lock(&inode->i_mutex);
truncate_inode_pages(inode->i_mapping, to);
- if (needlock)
- mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
if (to == ci->i_truncate_size) {
@@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
err = inode_change_ok(inode, attr);
if (err != 0)
@@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
return err;
out:
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 202dd3d68be0..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
}
/**
- * Must be called with BKL already held. Fills in the passed
+ * Must be called with lock_flocks() already held. Fills in the passed
* counter variables, so you can prepare pagelist metadata before calling
* ceph_encode_locks.
*/
@@ -191,27 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
}
/**
- * Encode the flock and fcntl locks for the given inode into the pagelist.
- * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
- * sequential flock locks.
- * Must be called with lock_flocks() already held.
- * If we encounter more of a specific lock type than expected,
- * we return the value 1.
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with inode->i_lock already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
*/
-int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
- int num_fcntl_locks, int num_flock_locks)
+int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
- struct ceph_filelock cephlock;
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
+ int l = 0;
dout("encoding %d flock and %d fcntl locks", num_flock_locks,
num_fcntl_locks);
- err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
- if (err)
- goto fail;
+
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_POSIX) {
++seen_fcntl;
@@ -219,19 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &cephlock);
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
+ ++l;
}
- if (err)
- goto fail;
}
-
- err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
- if (err)
- goto fail;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_FLOCK) {
++seen_flock;
@@ -239,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &cephlock);
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
+ ++l;
}
- if (err)
- goto fail;
}
fail:
return err;
}
+/**
+ * Copy the encoded flock and fcntl locks into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Returns zero on success.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks)
+{
+ int err = 0;
+ __le32 nlocks;
+
+ nlocks = cpu_to_le32(num_fcntl_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ err = ceph_pagelist_append(pagelist, flocks,
+ num_fcntl_locks * sizeof(*flocks));
+ if (err)
+ goto out_fail;
+
+ nlocks = cpu_to_le32(num_flock_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ err = ceph_pagelist_append(pagelist,
+ &flocks[num_fcntl_locks],
+ num_flock_locks * sizeof(*flocks));
+out_fail:
+ return err;
+}
+
/*
* Given a pointer to a lock, convert it to a ceph filelock
*/
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4f22671a5bd4..187bf214444d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
num = le32_to_cpu(head->num);
dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
session->s_num_cap_releases += num;
/* requeue completed messages */
@@ -1553,7 +1554,7 @@ retry:
*base = ceph_ino(temp->d_inode);
*plen = len;
dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, dentry->d_count, *base, len, path);
+ dentry, d_count(dentry), *base, len, path);
return path;
}
@@ -2454,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
+ cap->mseq = 0; /* and migrate_seq */
if (recon_state->flock) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2478,39 +2480,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->flock) {
int num_fcntl_locks, num_flock_locks;
- struct ceph_pagelist_cursor trunc_point;
-
- ceph_pagelist_set_cursor(pagelist, &trunc_point);
- do {
- lock_flocks();
- ceph_count_locks(inode, &num_fcntl_locks,
- &num_flock_locks);
- rec.v2.flock_len = (2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- unlock_flocks();
-
- /* pre-alloc pagelist */
- ceph_pagelist_truncate(pagelist, &trunc_point);
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_pagelist_reserve(pagelist,
- rec.v2.flock_len);
-
- /* encode locks */
- if (!err) {
- lock_flocks();
- err = ceph_encode_locks(inode,
- pagelist,
- num_fcntl_locks,
- num_flock_locks);
- unlock_flocks();
- }
- } while (err == -ENOSPC);
+ struct ceph_filelock *flocks;
+
+encode_again:
+ spin_lock(&inode->i_lock);
+ ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+ spin_unlock(&inode->i_lock);
+ flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock), GFP_NOFS);
+ if (!flocks) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ spin_lock(&inode->i_lock);
+ err = ceph_encode_locks_to_buffer(inode, flocks,
+ num_fcntl_locks,
+ num_flock_locks);
+ spin_unlock(&inode->i_lock);
+ if (err) {
+ kfree(flocks);
+ if (err == -ENOSPC)
+ goto encode_again;
+ goto out_free;
+ }
+ /*
+ * number of encoded locks is stable, so copy to pagelist
+ */
+ rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
+ (num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock));
+ err = ceph_pagelist_append(pagelist, &rec, reclen);
+ if (!err)
+ err = ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ kfree(flocks);
} else {
err = ceph_pagelist_append(pagelist, &rec, reclen);
}
-
out_free:
kfree(path);
out_dput:
@@ -3035,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- if (mdsc->mdsmap == NULL)
+ if (mdsc->mdsmap == NULL) {
+ kfree(mdsc);
return -ENOMEM;
+ }
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 9278dec9e940..132b64eeecd4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
+ struct ceph_mds_info *info;
ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
global_id = ceph_decode_64(p);
@@ -126,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
- if (mds >= 0 && mds < m->m_max_mds && state > 0) {
- m->m_info[mds].global_id = global_id;
- m->m_info[mds].state = state;
- m->m_info[mds].addr = addr;
- m->m_info[mds].laggy =
- (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
- m->m_info[mds].num_export_targets = num_export_targets;
- if (num_export_targets) {
- m->m_info[mds].export_targets =
- kcalloc(num_export_targets, sizeof(u32),
- GFP_NOFS);
- for (j = 0; j < num_export_targets; j++)
- m->m_info[mds].export_targets[j] =
- ceph_decode_32(&pexport_targets);
- } else {
- m->m_info[mds].export_targets = NULL;
- }
+
+ if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ continue;
+
+ info = &m->m_info[mds];
+ info->global_id = global_id;
+ info->state = state;
+ info->addr = addr;
+ info->laggy = (laggy_since.tv_sec != 0 ||
+ laggy_since.tv_nsec != 0);
+ info->num_export_targets = num_export_targets;
+ if (num_export_targets) {
+ info->export_targets = kcalloc(num_export_targets,
+ sizeof(u32), GFP_NOFS);
+ if (info->export_targets == NULL)
+ goto badmem;
+ for (j = 0; j < num_export_targets; j++)
+ info->export_targets[j] =
+ ceph_decode_32(&pexport_targets);
+ } else {
+ info->export_targets = NULL;
}
}
@@ -170,7 +174,7 @@ bad:
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
ceph_mdsmap_destroy(m);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(err);
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7d377c9a5e35..6627b26a800c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
}
err = -EINVAL;
dev_name_end--; /* back up to ':' separator */
- if (*dev_name_end != ':') {
+ if (dev_name_end < dev_name || *dev_name_end != ':') {
pr_err("device name is missing path (no : separator in %s)\n",
dev_name);
goto out;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 8696be2ff679..cbded572345e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
@@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
extern int ceph_inode_holds_cap(struct inode *inode, int mask);
extern int ceph_inode_set_size(struct inode *inode, loff_t size);
-extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
extern void ceph_queue_vmtruncate(struct inode *inode);
extern void ceph_queue_invalidate(struct inode *inode);
@@ -822,8 +822,13 @@ extern const struct export_operations ceph_export_ops;
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
-extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
- int p_locks, int f_locks);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks,
+ int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks);
extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
/* debugfs.c */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9b6b2b6dd164..be661d8f532a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
- spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
err = vxattr->getxattr_cb(ci, value, size);
- goto out;
+ return err;
}
+ spin_lock(&ci->i_ceph_lock);
+ dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
OpenPOWER on IntegriCloud