diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-26 15:53:16 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-26 15:53:16 -0700 |
commit | d5a38f6e4668b3110a66cd96ce2096184bf66def (patch) | |
tree | bd2209de25a343e7b01d143abce7bf774122227f /fs/ceph/addr.c | |
parent | 698f415cf5756e320623bdb015a600945743377c (diff) | |
parent | 5ee61e95b6b33c82f6fa1382585faed66aa01245 (diff) | |
download | talos-op-linux-d5a38f6e4668b3110a66cd96ce2096184bf66def.tar.gz talos-op-linux-d5a38f6e4668b3110a66cd96ce2096184bf66def.zip |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
"There is quite a bit here, including some overdue refactoring and
cleanup on the mon_client and osd_client code from Ilya, scattered
writeback support for CephFS and a pile of bug fixes from Zheng, and a
few random cleanups and fixes from others"
[ I already decided not to pull this because of it having been rebased
recently, but ended up changing my mind after all. Next time I'll
really hold people to it. Oh well. - Linus ]
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (34 commits)
libceph: use KMEM_CACHE macro
ceph: use kmem_cache_zalloc
rbd: use KMEM_CACHE macro
ceph: use lookup request to revalidate dentry
ceph: kill ceph_get_dentry_parent_inode()
ceph: fix security xattr deadlock
ceph: don't request vxattrs from MDS
ceph: fix mounting same fs multiple times
ceph: remove unnecessary NULL check
ceph: avoid updating directory inode's i_size accidentally
ceph: fix race during filling readdir cache
libceph: use sizeof_footer() more
ceph: kill ceph_empty_snapc
ceph: fix a wrong comparison
ceph: replace CURRENT_TIME by current_fs_time()
ceph: scattered page writeback
libceph: add helper that duplicates last extent operation
libceph: enable large, variable-sized OSD requests
libceph: osdc->req_mempool should be backed by a slab pool
libceph: make r_request msg_size calculation clearer
...
Diffstat (limited to 'fs/ceph/addr.c')
-rw-r--r-- | fs/ceph/addr.c | 324 |
1 files changed, 204 insertions, 120 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 19adeb0ef82a..fc5cae2a0db2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -175,8 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, static int ceph_releasepage(struct page *page, gfp_t g) { - struct inode *inode = page->mapping ? page->mapping->host : NULL; - dout("%p releasepage %p idx %lu\n", inode, page, page->index); + dout("%p releasepage %p idx %lu\n", page->mapping->host, + page, page->index); WARN_ON(PageDirty(page)); /* Can we release the page from the cache? */ @@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; - if (rc < 0 && rc != ENOENT) + if (rc < 0 && rc != -ENOENT) goto unlock; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req, struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_data *osd_data; - unsigned wrote; struct page *page; - int num_pages; - int i; + int num_pages, total_pages = 0; + int i, j; + int rc = req->r_result; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - int rc = req->r_result; - u64 bytes = req->r_ops[0].extent.length; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - long writeback_stat; - unsigned issued = ceph_caps_issued(ci); + bool remove_page; - osd_data = osd_req_op_extent_osd_data(req, 0); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); - num_pages = calc_pages_for((u64)osd_data->alignment, - (u64)osd_data->length); - if (rc >= 0) { - /* - * Assume we wrote the pages we originally sent. The - * osd might reply with fewer pages if our writeback - * raced with a truncation and was adjusted at the osd, - * so don't believe the reply. - */ - wrote = num_pages; - } else { - wrote = 0; + + dout("writepages_finish %p rc %d\n", inode, rc); + if (rc < 0) mapping_set_error(mapping, rc); - } - dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", - inode, rc, bytes, wrote); - /* clean all pages */ - for (i = 0; i < num_pages; i++) { - page = osd_data->pages[i]; - BUG_ON(!page); - WARN_ON(!PageUptodate(page)); + /* + * We lost the cache cap, need to truncate the page before + * it is unlocked, otherwise we'd truncate it later in the + * page truncation thread, possibly losing some data that + * raced its way in + */ + remove_page = !(ceph_caps_issued(ci) & + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); - writeback_stat = - atomic_long_dec_return(&fsc->writeback_count); - if (writeback_stat < - CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) - clear_bdi_congested(&fsc->backing_dev_info, - BLK_RW_ASYNC); + /* clean all pages */ + for (i = 0; i < req->r_num_ops; i++) { + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) + break; - ceph_put_snap_context(page_snap_context(page)); - page->private = 0; - ClearPagePrivate(page); - dout("unlocking %d %p\n", i, page); - end_page_writeback(page); + osd_data = osd_req_op_extent_osd_data(req, i); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + total_pages += num_pages; + for (j = 0; j < num_pages; j++) { + page = osd_data->pages[j]; + BUG_ON(!page); + WARN_ON(!PageUptodate(page)); + + if (atomic_long_dec_return(&fsc->writeback_count) < + CONGESTION_OFF_THRESH( + fsc->mount_options->congestion_kb)) + clear_bdi_congested(&fsc->backing_dev_info, + BLK_RW_ASYNC); + + ceph_put_snap_context(page_snap_context(page)); + page->private = 0; + ClearPagePrivate(page); + dout("unlocking %p\n", page); + end_page_writeback(page); + + if (remove_page) + generic_error_remove_page(inode->i_mapping, + page); - /* - * We lost the cache cap, need to truncate the page before - * it is unlocked, otherwise we'd truncate it later in the - * page truncation thread, possibly losing some data that - * raced its way in - */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) - generic_error_remove_page(inode->i_mapping, page); + unlock_page(page); + } + dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", + inode, osd_data->length, rc >= 0 ? num_pages : 0); - unlock_page(page); + ceph_release_pages(osd_data->pages, num_pages); } - dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(osd_data->pages, num_pages); + ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); + + osd_data = osd_req_op_extent_osd_data(req, 0); if (osd_data->pages_from_pool) mempool_free(osd_data->pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); @@ -778,17 +778,15 @@ retry: while (!done && index <= end) { unsigned i; int first; - pgoff_t next; - int pvec_pages, locked_pages; - struct page **pages = NULL; + pgoff_t strip_unit_end = 0; + int num_ops = 0, op_idx; + int pvec_pages, locked_pages = 0; + struct page **pages = NULL, **data_pages; mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; int want; - u64 offset, len; - long writeback_stat; + u64 offset = 0, len = 0; - next = 0; - locked_pages = 0; max_pages = max_pages_ever; get_more_pages: @@ -824,8 +822,8 @@ get_more_pages: unlock_page(page); break; } - if (next && (page->index != next)) { - dout("not consecutive %p\n", page); + if (strip_unit_end && (page->index > strip_unit_end)) { + dout("end of strip unit %p\n", page); unlock_page(page); break; } @@ -867,36 +865,31 @@ get_more_pages: /* * We have something to write. If this is * the first locked page this time through, - * allocate an osd request and a page array - * that it will use. + * calculate max possinle write size and + * allocate a page array */ if (locked_pages == 0) { - BUG_ON(pages); + u64 objnum; + u64 objoff; + /* prepare async write request */ offset = (u64)page_offset(page); len = wsize; - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, - do_sync ? 2 : 1, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, - snapc, truncate_seq, - truncate_size, true); - if (IS_ERR(req)) { - rc = PTR_ERR(req); + + rc = ceph_calc_file_object_mapping(&ci->i_layout, + offset, len, + &objnum, &objoff, + &len); + if (rc < 0) { unlock_page(page); break; } - if (do_sync) - osd_req_op_init(req, 1, - CEPH_OSD_OP_STARTSYNC, 0); - - req->r_callback = writepages_finish; - req->r_inode = inode; + num_ops = 1 + do_sync; + strip_unit_end = page->index + + ((len - 1) >> PAGE_CACHE_SHIFT); + BUG_ON(pages); max_pages = calc_pages_for(0, (u64)len); pages = kmalloc(max_pages * sizeof (*pages), GFP_NOFS); @@ -905,6 +898,20 @@ get_more_pages: pages = mempool_alloc(pool, GFP_NOFS); BUG_ON(!pages); } + + len = 0; + } else if (page->index != + (offset + len) >> PAGE_CACHE_SHIFT) { + if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS : + CEPH_OSD_MAX_OPS)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + break; + } + + num_ops++; + offset = (u64)page_offset(page); + len = 0; } /* note position of first page in pvec */ @@ -913,18 +920,16 @@ get_more_pages: dout("%p will write page %p idx %lu\n", inode, page, page->index); - writeback_stat = - atomic_long_inc_return(&fsc->writeback_count); - if (writeback_stat > CONGESTION_ON_THRESH( + if (atomic_long_inc_return(&fsc->writeback_count) > + CONGESTION_ON_THRESH( fsc->mount_options->congestion_kb)) { set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); } - set_page_writeback(page); pages[locked_pages] = page; locked_pages++; - next = page->index + 1; + len += PAGE_CACHE_SIZE; } /* did we get anything? */ @@ -944,38 +949,119 @@ get_more_pages: /* shift unused pages over in the pvec... we * will need to release them below. */ for (j = i; j < pvec_pages; j++) { - dout(" pvec leftover page %p\n", - pvec.pages[j]); + dout(" pvec leftover page %p\n", pvec.pages[j]); pvec.pages[j-i+first] = pvec.pages[j]; } pvec.nr -= i-first; } - /* Format the osd request message and submit the write */ +new_request: offset = page_offset(pages[0]); - len = (u64)locked_pages << PAGE_CACHE_SHIFT; - if (snap_size == -1) { - len = min(len, (u64)i_size_read(inode) - offset); - /* writepages_finish() clears writeback pages - * according to the data length, so make sure - * data length covers all locked pages */ - len = max(len, 1 + - ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); - } else { - len = min(len, snap_size - offset); + len = wsize; + + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, num_ops, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, false); + if (IS_ERR(req)) { + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, + min(num_ops, + CEPH_OSD_SLAB_OPS), + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, true); + BUG_ON(IS_ERR(req)); } - dout("writepages got %d pages at %llu~%llu\n", - locked_pages, offset, len); + BUG_ON(len < page_offset(pages[locked_pages - 1]) + + PAGE_CACHE_SIZE - offset); + + req->r_callback = writepages_finish; + req->r_inode = inode; - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + /* Format the osd request message and submit the write */ + len = 0; + data_pages = pages; + op_idx = 0; + for (i = 0; i < locked_pages; i++) { + u64 cur_offset = page_offset(pages[i]); + if (offset + len != cur_offset) { + if (op_idx + do_sync + 1 == req->r_num_ops) + break; + osd_req_op_extent_dup_last(req, op_idx, + cur_offset - offset); + dout("writepages got pages at %llu~%llu\n", + offset, len); + osd_req_op_extent_osd_data_pages(req, op_idx, + data_pages, len, 0, !!pool, false); + osd_req_op_extent_update(req, op_idx, len); - pages = NULL; /* request message now owns the pages array */ - pool = NULL; + len = 0; + offset = cur_offset; + data_pages = pages + i; + op_idx++; + } - /* Update the write op length in case we changed it */ + set_page_writeback(pages[i]); + len += PAGE_CACHE_SIZE; + } + + if (snap_size != -1) { + len = min(len, snap_size - offset); + } else if (i == locked_pages) { + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + u64 min_len = len + 1 - PAGE_CACHE_SIZE; + len = min(len, (u64)i_size_read(inode) - offset); + len = max(len, min_len); + } + dout("writepages got pages at %llu~%llu\n", offset, len); + + osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, + 0, !!pool, false); + osd_req_op_extent_update(req, op_idx, len); - osd_req_op_extent_update(req, 0, len); + if (do_sync) { + op_idx++; + osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); + } + BUG_ON(op_idx + 1 != req->r_num_ops); + + pool = NULL; + if (i < locked_pages) { + BUG_ON(num_ops <= req->r_num_ops); + num_ops -= req->r_num_ops; + num_ops += do_sync; + locked_pages -= i; + + /* allocate new pages array for next request */ + data_pages = pages; + pages = kmalloc(locked_pages * sizeof (*pages), + GFP_NOFS); + if (!pages) { + pool = fsc->wb_pagevec_pool; + pages = mempool_alloc(pool, GFP_NOFS); + BUG_ON(!pages); + } + memcpy(pages, data_pages + i, + locked_pages * sizeof(*pages)); + memset(data_pages + i, 0, + locked_pages * sizeof(*pages)); + } else { + BUG_ON(num_ops != req->r_num_ops); + index = pages[i - 1]->index + 1; + /* request message now owns the pages array */ + pages = NULL; + } vino = ceph_vino(inode); ceph_osdc_build_request(req, offset, snapc, vino.snap, @@ -985,9 +1071,10 @@ get_more_pages: BUG_ON(rc); req = NULL; - /* continue? */ - index = next; - wbc->nr_to_write -= locked_pages; + wbc->nr_to_write -= i; + if (pages) + goto new_request; + if (wbc->nr_to_write <= 0) done = 1; @@ -1522,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ceph_empty_snapc, 0, 0, false); + NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -1540,9 +1627,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 1, 3, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ceph_empty_snapc, - ci->i_truncate_seq, ci->i_truncate_size, - false); + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -1663,8 +1749,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out; } - rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, - ceph_empty_snapc, + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!rd_req) { err = -ENOMEM; @@ -1678,8 +1763,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) "%llx.00000000", ci->i_vino.ino); rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); - wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, - ceph_empty_snapc, + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!wr_req) { err = -ENOMEM; |