summaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_reflink.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_reflink.c')
-rw-r--r--fs/xfs/xfs_reflink.c233
1 files changed, 180 insertions, 53 deletions
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 5289e22cb081..8eaeec9d58ed 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -182,8 +182,7 @@ int
xfs_reflink_trim_around_shared(
struct xfs_inode *ip,
struct xfs_bmbt_irec *irec,
- bool *shared,
- bool *trimmed)
+ bool *shared)
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -209,7 +208,7 @@ xfs_reflink_trim_around_shared(
if (error)
return error;
- *shared = *trimmed = false;
+ *shared = false;
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
@@ -222,8 +221,6 @@ xfs_reflink_trim_around_shared(
*/
irec->br_blockcount = flen;
*shared = true;
- if (flen != aglen)
- *trimmed = true;
return 0;
} else {
/*
@@ -233,7 +230,6 @@ xfs_reflink_trim_around_shared(
* start of the shared region.
*/
irec->br_blockcount = fbno - agbno;
- *trimmed = true;
return 0;
}
}
@@ -241,7 +237,7 @@ xfs_reflink_trim_around_shared(
/*
* Trim the passed in imap to the next shared/unshared extent boundary, and
* if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork. In this case *shared is set to true, else to false.
+ * COW fork.
*
* Note that imap will always contain the block numbers for the existing blocks
* in the data fork, as the upper layers need them for read-modify-write
@@ -250,14 +246,14 @@ xfs_reflink_trim_around_shared(
int
xfs_reflink_reserve_cow(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- bool *shared)
+ struct xfs_bmbt_irec *imap)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got;
int error = 0;
- bool eof = false, trimmed;
+ bool eof = false;
struct xfs_iext_cursor icur;
+ bool shared;
/*
* Search the COW fork extent list first. This serves two purposes:
@@ -273,18 +269,16 @@ xfs_reflink_reserve_cow(
if (!eof && got.br_startoff <= imap->br_startoff) {
trace_xfs_reflink_cow_found(ip, imap);
xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-
- *shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, imap, &shared);
if (error)
return error;
/* Not shared? Just report the (potentially capped) extent. */
- if (!*shared)
+ if (!shared)
return 0;
/*
@@ -368,7 +362,6 @@ xfs_find_trim_cow_extent(
xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got;
- bool trimmed;
*found = false;
@@ -376,9 +369,13 @@ xfs_find_trim_cow_extent(
* If we don't find an overlapping extent, trim the range we need to
* allocate to fit the hole we found.
*/
- if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
- got.br_startoff > offset_fsb)
- return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ got.br_startoff = offset_fsb + count_fsb;
+ if (got.br_startoff > offset_fsb) {
+ xfs_trim_extent(imap, imap->br_startoff,
+ got.br_startoff - imap->br_startoff);
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
+ }
*shared = true;
if (isnullstartblock(got.br_startblock)) {
@@ -1220,35 +1217,92 @@ retry:
return 0;
}
+/* Unlock both inodes after they've been prepped for a range clone. */
+STATIC void
+xfs_reflink_remap_unlock(
+ struct file *file_in,
+ struct file *file_out)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ bool same_inode = (inode_in == inode_out);
+
+ xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+ if (!same_inode)
+ xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
+ inode_unlock(inode_out);
+ if (!same_inode)
+ inode_unlock_shared(inode_in);
+}
+
/*
- * Link a range of blocks from one file to another.
+ * If we're reflinking to a point past the destination file's EOF, we must
+ * zero any speculative post-EOF preallocations that sit between the old EOF
+ * and the destination file offset.
*/
-int
-xfs_reflink_remap_range(
+static int
+xfs_reflink_zero_posteof(
+ struct xfs_inode *ip,
+ loff_t pos)
+{
+ loff_t isize = i_size_read(VFS_I(ip));
+
+ if (pos <= isize)
+ return 0;
+
+ trace_xfs_zero_eof(ip, isize, pos - isize);
+ return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
+ &xfs_iomap_ops);
+}
+
+/*
+ * Prepare two files for range cloning. Upon a successful return both inodes
+ * will have the iolock and mmaplock held, the page cache of the out file will
+ * be truncated, and any leases on the out file will have been broken. This
+ * function borrows heavily from xfs_file_aio_write_checks.
+ *
+ * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
+ * checked that the bytes beyond EOF physically match. Hence we cannot use the
+ * EOF block in the source dedupe range because it's not a complete block match,
+ * hence can introduce a corruption into the file that has it's block replaced.
+ *
+ * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
+ * "block aligned" for the purposes of cloning entire files. However, if the
+ * source file range includes the EOF block and it lands within the existing EOF
+ * of the destination file, then we can expose stale data from beyond the source
+ * file EOF in the destination file.
+ *
+ * XFS doesn't support partial block sharing, so in both cases we have check
+ * these cases ourselves. For dedupe, we can simply round the length to dedupe
+ * down to the previous whole block and ignore the partial EOF block. While this
+ * means we can't dedupe the last block of a file, this is an acceptible
+ * tradeoff for simplicity on implementation.
+ *
+ * For cloning, we want to share the partial EOF block if it is also the new EOF
+ * block of the destination file. If the partial EOF block lies inside the
+ * existing destination EOF, then we have to abort the clone to avoid exposing
+ * stale data in the destination file. Hence we reject these clone attempts with
+ * -EINVAL in this case.
+ */
+STATIC int
+xfs_reflink_remap_prep(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
- u64 len,
+ u64 *len,
bool is_dedupe)
{
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
- struct xfs_mount *mp = src->i_mount;
bool same_inode = (inode_in == inode_out);
- xfs_fileoff_t sfsbno, dfsbno;
- xfs_filblks_t fsblen;
- xfs_extlen_t cowextsize;
+ u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return -EOPNOTSUPP;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/* Lock both files against IO */
ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
if (ret)
@@ -1270,33 +1324,115 @@ xfs_reflink_remap_range(
goto out_unlock;
ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
- &len, is_dedupe);
+ len, is_dedupe);
if (ret <= 0)
goto out_unlock;
+ /*
+ * If the dedupe data matches, chop off the partial EOF block
+ * from the source file so we don't try to dedupe the partial
+ * EOF block.
+ */
+ if (is_dedupe) {
+ *len &= ~blkmask;
+ } else if (*len & blkmask) {
+ /*
+ * The user is attempting to share a partial EOF block,
+ * if it's inside the destination EOF then reject it.
+ */
+ if (pos_out + *len < i_size_read(inode_out)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)
goto out_unlock;
- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
/*
- * Clear out post-eof preallocations because we don't have page cache
- * backing the delayed allocations and they'll never get freed on
- * their own.
+ * Zero existing post-eof speculative preallocations in the destination
+ * file.
*/
- if (xfs_can_free_eofblocks(dest, true)) {
- ret = xfs_free_eofblocks(dest);
- if (ret)
- goto out_unlock;
- }
+ ret = xfs_reflink_zero_posteof(dest, pos_out);
+ if (ret)
+ goto out_unlock;
/* Set flags and remap blocks. */
ret = xfs_reflink_set_inode_flag(src, dest);
if (ret)
goto out_unlock;
+ /* Zap any page cache for the destination file's range. */
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + *len) - 1);
+
+ /* If we're altering the file contents... */
+ if (!is_dedupe) {
+ /*
+ * ...update the timestamps (which will grab the ilock again
+ * from xfs_fs_dirty_inode, so we have to call it before we
+ * take the ilock).
+ */
+ if (!(file_out->f_mode & FMODE_NOCMTIME)) {
+ ret = file_update_time(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /*
+ * ...clear the security bits if the process is not being run
+ * by root. This keeps people from modifying setuid and setgid
+ * binaries.
+ */
+ ret = file_remove_privs(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
+ return 1;
+out_unlock:
+ xfs_reflink_remap_unlock(file_in, file_out);
+ return ret;
+}
+
+/*
+ * Link a range of blocks from one file to another.
+ */
+int
+xfs_reflink_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t sfsbno, dfsbno;
+ xfs_filblks_t fsblen;
+ xfs_extlen_t cowextsize;
+ ssize_t ret;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /* Prepare and then clone file data. */
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+ &len, is_dedupe);
+ if (ret <= 0)
+ return ret;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
@@ -1305,10 +1441,6 @@ xfs_reflink_remap_range(
if (ret)
goto out_unlock;
- /* Zap any page cache for the destination file's range. */
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + len) - 1);
-
/*
* Carry the cowextsize hint from src to dest if we're sharing the
* entire source file to the entire destination file, the source file
@@ -1325,12 +1457,7 @@ xfs_reflink_remap_range(
is_dedupe);
out_unlock:
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
- inode_unlock(inode_out);
- if (!same_inode)
- inode_unlock_shared(inode_in);
+ xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return ret;
OpenPOWER on IntegriCloud