summary | refs | log | tree | commit | diff | stats
path: root/fs/ceph/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/file.c')
-rw-r--r-- fs/ceph/file.c | 322
1 files changed, 209 insertions, 113 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b67eec3532a1..cf0e45b10121 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -30,6 +30,8 @@ static __le32 ceph_flags_sys2wire(u32 flags)
break;
}
+ flags &= ~O_ACCMODE;
+
#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
ceph_sys2wire(O_CREAT);
@@ -41,7 +43,7 @@ static __le32 ceph_flags_sys2wire(u32 flags)
#undef ceph_sys2wire
if (flags)
- dout("unused open flags: %x", flags);
+ dout("unused open flags: %x\n", flags);
return cpu_to_le32(wire_flags);
}
@@ -68,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags)
*/
/*
- * Calculate the length sum of direct io vectors that can
- * be combined into one page vector.
+ * How many pages to get in one call to iov_iter_get_pages(). This
+ * determines the size of the on-stack array used as a buffer.
*/
-static size_t dio_get_pagev_size(const struct iov_iter *it)
+#define ITER_GET_BVECS_PAGES 64
+
+static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
+ struct bio_vec *bvecs)
{
- const struct iovec *iov = it->iov;
- const struct iovec *iovend = iov + it->nr_segs;
- size_t size;
-
- size = iov->iov_len - it->iov_offset;
- /*
- * An iov can be page vectored when both the current tail
- * and the next base are page aligned.
- */
- while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
- (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
- size += iov->iov_len;
- }
- dout("dio_get_pagevlen len = %zu\n", size);
- return size;
+ size_t size = 0;
+ int bvec_idx = 0;
+
+ if (maxsize > iov_iter_count(iter))
+ maxsize = iov_iter_count(iter);
+
+ while (size < maxsize) {
+ struct page *pages[ITER_GET_BVECS_PAGES];
+ ssize_t bytes;
+ size_t start;
+ int idx = 0;
+
+ bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+ ITER_GET_BVECS_PAGES, &start);
+ if (bytes < 0)
+ return size ?: bytes;
+
+ iov_iter_advance(iter, bytes);
+ size += bytes;
+
+ for ( ; bytes; idx++, bvec_idx++) {
+ struct bio_vec bv = {
+ .bv_page = pages[idx],
+ .bv_len = min_t(int, bytes, PAGE_SIZE - start),
+ .bv_offset = start,
+ };
+
+ bvecs[bvec_idx] = bv;
+ bytes -= bv.bv_len;
+ start = 0;
+ }
+ }
+
+ return size;
}
/*
- * Allocate a page vector based on (@it, @nbytes).
- * The return value is the tuple describing a page vector,
- * that is (@pages, @page_align, @num_pages).
+ * iov_iter_get_pages() only considers one iov_iter segment, no matter
+ * what maxsize or maxpages are given. For ITER_BVEC that is a single
+ * page.
+ *
+ * Attempt to get up to @maxsize bytes worth of pages from @iter.
+ * Return the number of bytes in the created bio_vec array, or an error.
*/
-static struct page **
-dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
- size_t *page_align, int *num_pages)
+static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
+ struct bio_vec **bvecs, int *num_bvecs)
{
- struct iov_iter tmp_it = *it;
- size_t align;
- struct page **pages;
- int ret = 0, idx, npages;
+ struct bio_vec *bv;
+ size_t orig_count = iov_iter_count(iter);
+ ssize_t bytes;
+ int npages;
- align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
- (PAGE_SIZE - 1);
- npages = calc_pages_for(align, nbytes);
- pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL);
- if (!pages)
- return ERR_PTR(-ENOMEM);
+ iov_iter_truncate(iter, maxsize);
+ npages = iov_iter_npages(iter, INT_MAX);
+ iov_iter_reexpand(iter, orig_count);
- for (idx = 0; idx < npages; ) {
- size_t start;
- ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
- npages - idx, &start);
- if (ret < 0)
- goto fail;
+ /*
+ * __iter_get_bvecs() may populate only part of the array -- zero it
+ * out.
+ */
+ bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
+ if (!bv)
+ return -ENOMEM;
- iov_iter_advance(&tmp_it, ret);
- nbytes -= ret;
- idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ bytes = __iter_get_bvecs(iter, maxsize, bv);
+ if (bytes < 0) {
+ /*
+ * No pages were pinned -- just free the array.
+ */
+ kvfree(bv);
+ return bytes;
}
- BUG_ON(nbytes != 0);
- *num_pages = npages;
- *page_align = align;
- dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
- return pages;
-fail:
- ceph_put_page_vector(pages, idx, false);
- return ERR_PTR(ret);
+ *bvecs = bv;
+ *num_bvecs = npages;
+ return bytes;
+}
+
+static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
+{
+ int i;
+
+ for (i = 0; i < num_bvecs; i++) {
+ if (bvecs[i].bv_page) {
+ if (should_dirty)
+ set_page_dirty_lock(bvecs[i].bv_page);
+ put_page(bvecs[i].bv_page);
+ }
+ }
+ kvfree(bvecs);
}
/*
@@ -159,13 +196,50 @@ out:
return req;
}
+static int ceph_init_file_info(struct inode *inode, struct file *file,
+ int fmode, bool isdir)
+{
+ struct ceph_file_info *fi;
+
+ dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
+ inode->i_mode, isdir ? "dir" : "regular");
+ BUG_ON(inode->i_fop->release != ceph_release);
+
+ if (isdir) {
+ struct ceph_dir_file_info *dfi =
+ kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
+ if (!dfi) {
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ return -ENOMEM;
+ }
+
+ file->private_data = dfi;
+ fi = &dfi->file_info;
+ dfi->next_offset = 2;
+ dfi->readdir_cache_idx = -1;
+ } else {
+ fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
+ if (!fi) {
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ return -ENOMEM;
+ }
+
+ file->private_data = fi;
+ }
+
+ fi->fmode = fmode;
+ spin_lock_init(&fi->rw_contexts_lock);
+ INIT_LIST_HEAD(&fi->rw_contexts);
+
+ return 0;
+}
+
/*
* initialize private struct file data.
* if we fail, clean up by dropping fmode reference on the ceph_inode
*/
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
- struct ceph_file_info *cf;
int ret = 0;
switch (inode->i_mode & S_IFMT) {
@@ -173,22 +247,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
ceph_fscache_register_inode_cookie(inode);
ceph_fscache_file_set_cookie(inode, file);
case S_IFDIR:
- dout("init_file %p %p 0%o (regular)\n", inode, file,
- inode->i_mode);
- cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
- if (!cf) {
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
- return -ENOMEM;
- }
- cf->fmode = fmode;
-
- spin_lock_init(&cf->rw_contexts_lock);
- INIT_LIST_HEAD(&cf->rw_contexts);
-
- cf->next_offset = 2;
- cf->readdir_cache_idx = -1;
- file->private_data = cf;
- BUG_ON(inode->i_fop->release != ceph_release);
+ ret = ceph_init_file_info(inode, file, fmode,
+ S_ISDIR(inode->i_mode));
+ if (ret)
+ return ret;
break;
case S_IFLNK:
@@ -278,11 +340,11 @@ int ceph_open(struct inode *inode, struct file *file)
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
- struct ceph_file_info *cf = file->private_data;
+ struct ceph_file_info *fi = file->private_data;
int err;
int flags, fmode, wanted;
- if (cf) {
+ if (fi) {
dout("open file %p is already opened\n", file);
return 0;
}
@@ -375,7 +437,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acls_info acls = {};
- int mask;
+ int mask;
int err;
dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
@@ -386,6 +448,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
return -ENAMETOOLONG;
if (flags & O_CREAT) {
+ if (ceph_quota_is_max_files_exceeded(dir))
+ return -EDQUOT;
err = ceph_pre_init_acls(dir, &mode, &acls);
if (err < 0)
return err;
@@ -460,16 +524,27 @@ out_acl:
int ceph_release(struct inode *inode, struct file *file)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_file_info *cf = file->private_data;
- dout("release inode %p file %p\n", inode, file);
- ceph_put_fmode(ci, cf->fmode);
- if (cf->last_readdir)
- ceph_mdsc_put_request(cf->last_readdir);
- kfree(cf->last_name);
- kfree(cf->dir_info);
- WARN_ON(!list_empty(&cf->rw_contexts));
- kmem_cache_free(ceph_file_cachep, cf);
+ if (S_ISDIR(inode->i_mode)) {
+ struct ceph_dir_file_info *dfi = file->private_data;
+ dout("release inode %p dir file %p\n", inode, file);
+ WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
+
+ ceph_put_fmode(ci, dfi->file_info.fmode);
+
+ if (dfi->last_readdir)
+ ceph_mdsc_put_request(dfi->last_readdir);
+ kfree(dfi->last_name);
+ kfree(dfi->dir_info);
+ kmem_cache_free(ceph_dir_file_cachep, dfi);
+ } else {
+ struct ceph_file_info *fi = file->private_data;
+ dout("release inode %p regular file %p\n", inode, file);
+ WARN_ON(!list_empty(&fi->rw_contexts));
+
+ ceph_put_fmode(ci, fi->fmode);
+ kmem_cache_free(ceph_file_cachep, fi);
+ }
/* wake up anyone waiting for caps on this inode */
wake_up_all(&ci->i_cap_wq);
@@ -706,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
- int num_pages = calc_pages_for((u64)osd_data->alignment,
- osd_data->length);
- dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
- inode, rc, osd_data->length);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
+ BUG_ON(!osd_data->num_bvecs);
+
+ dout("ceph_aio_complete_req %p rc %d bytes %u\n",
+ inode, rc, osd_data->bvec_pos.iter.bi_size);
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
@@ -728,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
} else if (!aio_req->write) {
if (rc == -ENOENT)
rc = 0;
- if (rc >= 0 && osd_data->length > rc) {
- int zoff = osd_data->alignment + rc;
- int zlen = osd_data->length - rc;
+ if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
+ struct iov_iter i;
+ int zlen = osd_data->bvec_pos.iter.bi_size - rc;
+
/*
* If read is satisfied by single OSD request,
* it can pass EOF. Otherwise read is within
@@ -745,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
aio_req->total_len = rc + zlen;
}
- if (zlen > 0)
- ceph_zero_page_vector_range(zoff, zlen,
- osd_data->pages);
+ iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
+ osd_data->num_bvecs,
+ osd_data->bvec_pos.iter.bi_size);
+ iov_iter_advance(&i, rc);
+ iov_iter_zero(zlen, &i);
}
}
- ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty);
+ put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
+ aio_req->should_dirty);
ceph_osdc_put_request(req);
if (rc < 0)
@@ -839,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_vino vino;
struct ceph_osd_request *req;
- struct page **pages;
+ struct bio_vec *bvecs;
struct ceph_aio_request *aio_req = NULL;
int num_pages = 0;
int flags;
@@ -874,10 +954,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
}
while (iov_iter_count(iter) > 0) {
- u64 size = dio_get_pagev_size(iter);
- size_t start = 0;
+ u64 size = iov_iter_count(iter);
ssize_t len;
+ if (write)
+ size = min_t(u64, size, fsc->mount_options->wsize);
+ else
+ size = min_t(u64, size, fsc->mount_options->rsize);
+
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &size, 0,
@@ -893,18 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
break;
}
- if (write)
- size = min_t(u64, size, fsc->mount_options->wsize);
- else
- size = min_t(u64, size, fsc->mount_options->rsize);
-
- len = size;
- pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
- if (IS_ERR(pages)) {
+ len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
+ if (len < 0) {
ceph_osdc_put_request(req);
- ret = PTR_ERR(pages);
+ ret = len;
break;
}
+ if (len != size)
+ osd_req_op_extent_update(req, 0, len);
/*
* To simplify error handling, allow AIO when IO within i_size
@@ -937,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
req->r_mtime = mtime;
}
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
- false, false);
+ osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
if (aio_req) {
aio_req->total_len += len;
@@ -951,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
pos += len;
- iov_iter_advance(iter, len);
continue;
}
@@ -964,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (ret == -ENOENT)
ret = 0;
if (ret >= 0 && ret < len && pos + ret < size) {
+ struct iov_iter i;
int zlen = min_t(size_t, len - ret,
size - pos - ret);
- ceph_zero_page_vector_range(start + ret, zlen,
- pages);
+
+ iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
+ len);
+ iov_iter_advance(&i, ret);
+ iov_iter_zero(zlen, &i);
ret += zlen;
}
if (ret >= 0)
len = ret;
}
- ceph_put_page_vector(pages, num_pages, should_dirty);
-
+ put_bvecs(bvecs, num_pages, should_dirty);
ceph_osdc_put_request(req);
if (ret < 0)
break;
pos += len;
- iov_iter_advance(iter, len);
-
if (!write && pos >= size)
break;
@@ -1338,6 +1417,11 @@ retry_snap:
pos = iocb->ki_pos;
count = iov_iter_count(from);
+ if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
err = file_remove_privs(file);
if (err)
goto out;
@@ -1419,6 +1503,7 @@ retry_snap:
if (written >= 0) {
int dirty;
+
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
@@ -1426,6 +1511,8 @@ retry_snap:
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
+ if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1668,6 +1755,12 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
+ if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
+ ceph_quota_is_max_bytes_exceeded(inode, offset + length)) {
+ ret = -EDQUOT;
+ goto unlock;
+ }
+
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) &&
!(mode & FALLOC_FL_PUNCH_HOLE)) {
ret = -ENOSPC;
@@ -1716,6 +1809,9 @@ static long ceph_fallocate(struct file *file, int mode,
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
+ if ((endoff > size) &&
+ ceph_quota_is_max_bytes_approaching(inode, endoff))
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
}
ceph_put_cap_refs(ci, got);
OpenPOWER on IntegriCloud