Diffstat (limited to 'fs/xfs')
89 files changed, 11933 insertions, 1067 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index fc593c869493..26ef1958b65b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -52,8 +52,11 @@ xfs-y				+= $(addprefix libxfs/, \  				   xfs_inode_fork.o \  				   xfs_inode_buf.o \  				   xfs_log_rlimit.o \ +				   xfs_ag_resv.o \  				   xfs_rmap.o \  				   xfs_rmap_btree.o \ +				   xfs_refcount.o \ +				   xfs_refcount_btree.o \  				   xfs_sb.o \  				   xfs_symlink_remote.o \  				   xfs_trans_resv.o \ @@ -87,6 +90,7 @@ xfs-y				+= xfs_aops.o \  				   xfs_message.o \  				   xfs_mount.o \  				   xfs_mru_cache.o \ +				   xfs_reflink.o \  				   xfs_stats.o \  				   xfs_super.o \  				   xfs_symlink.o \ @@ -99,16 +103,20 @@ xfs-y				+= xfs_aops.o \  # low-level transaction/log code  xfs-y				+= xfs_log.o \  				   xfs_log_cil.o \ +				   xfs_bmap_item.o \  				   xfs_buf_item.o \  				   xfs_extfree_item.o \  				   xfs_icreate_item.o \  				   xfs_inode_item.o \ +				   xfs_refcount_item.o \  				   xfs_rmap_item.o \  				   xfs_log_recover.o \  				   xfs_trans_ail.o \ +				   xfs_trans_bmap.o \  				   xfs_trans_buf.o \  				   xfs_trans_extfree.o \  				   xfs_trans_inode.o \ +				   xfs_trans_refcount.o \  				   xfs_trans_rmap.o \  # optional features diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c new file mode 100644 index 000000000000..e5ebc3770460 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ag_resv.h" +#include "xfs_trans_space.h" +#include "xfs_rmap_btree.h" +#include "xfs_btree.h" +#include "xfs_refcount_btree.h" + +/* + * Per-AG Block Reservations + * + * For some kinds of allocation group metadata structures, it is advantageous + * to reserve a small number of blocks in each AG so that future expansions of + * that data structure do not encounter ENOSPC because errors during a btree + * split cause the filesystem to go offline. + * + * Prior to the introduction of reflink, this wasn't an issue because the free + * space btrees maintain a reserve of space (the AGFL) to handle any expansion + * that may be necessary; and allocations of other metadata (inodes, BMBT, + * dir/attr) aren't restricted to a single AG.  
However, with reflink it is + * possible to allocate all the space in an AG, have subsequent reflink/CoW + * activity expand the refcount btree, and discover that there's no space left + * to handle that expansion.  Since we can calculate the maximum size of the + * refcount btree, we can reserve space for it and avoid ENOSPC. + * + * Handling per-AG reservations consists of three changes to the allocator's + * behavior:  First, because these reservations are always needed, we decrease + * the ag_max_usable counter to reflect the size of the AG after the reserved + * blocks are taken.  Second, the reservations must be reflected in the + * fdblocks count to maintain proper accounting.  Third, each AG must maintain + * its own reserved block counter so that we can calculate the amount of space + * that must remain free to maintain the reservations.  Fourth, the "remaining + * reserved blocks" count must be used when calculating the length of the + * longest free extent in an AG and to clamp maxlen in the per-AG allocation + * functions.  In other words, we maintain a virtual allocation via in-core + * accounting tricks so that we don't have to clean up after a crash. :) + * + * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type + * values via struct xfs_alloc_arg or directly to the xfs_free_extent + * function.  It might seem a little funny to maintain a reservoir of blocks + * to feed another reservoir, but the AGFL only holds enough blocks to get + * through the next transaction.  The per-AG reservation is to ensure (we + * hope) that each AG never runs out of blocks.  Each data structure wanting + * to use the reservation system should update ask/used in xfs_ag_resv_init. + */ + +/* + * Are we critically low on blocks?  For now we'll define that as the number + * of blocks we can get our hands on being less than 10% of what we reserved + * or less than some arbitrary number (maximum btree height). + */ +bool +xfs_ag_resv_critical( +	struct xfs_perag		*pag, +	enum xfs_ag_resv_type		type) +{ +	xfs_extlen_t			avail; +	xfs_extlen_t			orig; + +	switch (type) { +	case XFS_AG_RESV_METADATA: +		avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved; +		orig = pag->pag_meta_resv.ar_asked; +		break; +	case XFS_AG_RESV_AGFL: +		avail = pag->pagf_freeblks + pag->pagf_flcount - +			pag->pag_meta_resv.ar_reserved; +		orig = pag->pag_agfl_resv.ar_asked; +		break; +	default: +		ASSERT(0); +		return false; +	} + +	trace_xfs_ag_resv_critical(pag, type, avail); + +	/* Critically low if less than 10% or max btree height remains. */ +	return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, +			pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL, +			XFS_RANDOM_AG_RESV_CRITICAL); +} + +/* + * How many blocks are reserved but not used, and therefore must not be + * allocated away? 
+ */ +xfs_extlen_t +xfs_ag_resv_needed( +	struct xfs_perag		*pag, +	enum xfs_ag_resv_type		type) +{ +	xfs_extlen_t			len; + +	len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved; +	switch (type) { +	case XFS_AG_RESV_METADATA: +	case XFS_AG_RESV_AGFL: +		len -= xfs_perag_resv(pag, type)->ar_reserved; +		break; +	case XFS_AG_RESV_NONE: +		/* empty */ +		break; +	default: +		ASSERT(0); +	} + +	trace_xfs_ag_resv_needed(pag, type, len); + +	return len; +} + +/* Clean out a reservation */ +static int +__xfs_ag_resv_free( +	struct xfs_perag		*pag, +	enum xfs_ag_resv_type		type) +{ +	struct xfs_ag_resv		*resv; +	xfs_extlen_t			oldresv; +	int				error; + +	trace_xfs_ag_resv_free(pag, type, 0); + +	resv = xfs_perag_resv(pag, type); +	pag->pag_mount->m_ag_max_usable += resv->ar_asked; +	/* +	 * AGFL blocks are always considered "free", so whatever +	 * was reserved at mount time must be given back at umount. +	 */ +	if (type == XFS_AG_RESV_AGFL) +		oldresv = resv->ar_orig_reserved; +	else +		oldresv = resv->ar_reserved; +	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); +	resv->ar_reserved = 0; +	resv->ar_asked = 0; + +	if (error) +		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, +				error, _RET_IP_); +	return error; +} + +/* Free a per-AG reservation. */ +int +xfs_ag_resv_free( +	struct xfs_perag		*pag) +{ +	int				error; +	int				err2; + +	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL); +	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); +	if (err2 && !error) +		error = err2; +	return error; +} + +static int +__xfs_ag_resv_init( +	struct xfs_perag		*pag, +	enum xfs_ag_resv_type		type, +	xfs_extlen_t			ask, +	xfs_extlen_t			used) +{ +	struct xfs_mount		*mp = pag->pag_mount; +	struct xfs_ag_resv		*resv; +	int				error; + +	resv = xfs_perag_resv(pag, type); +	if (used > ask) +		ask = used; +	resv->ar_asked = ask; +	resv->ar_reserved = resv->ar_orig_reserved = ask - used; +	mp->m_ag_max_usable -= ask; + +	trace_xfs_ag_resv_init(pag, type, ask); + +	error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); +	if (error) +		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, +				error, _RET_IP_); + +	return error; +} + +/* Create a per-AG block reservation. */ +int +xfs_ag_resv_init( +	struct xfs_perag		*pag) +{ +	xfs_extlen_t			ask; +	xfs_extlen_t			used; +	int				error = 0; + +	/* Create the metadata reservation. */ +	if (pag->pag_meta_resv.ar_asked == 0) { +		ask = used = 0; + +		error = xfs_refcountbt_calc_reserves(pag->pag_mount, +				pag->pag_agno, &ask, &used); +		if (error) +			goto out; + +		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, +				ask, used); +		if (error) +			goto out; +	} + +	/* Create the AGFL metadata reservation */ +	if (pag->pag_agfl_resv.ar_asked == 0) { +		ask = used = 0; + +		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, +				&ask, &used); +		if (error) +			goto out; + +		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); +		if (error) +			goto out; +	} + +out: +	return error; +} + +/* Allocate a block from the reservation. 
*/ +void +xfs_ag_resv_alloc_extent( +	struct xfs_perag		*pag, +	enum xfs_ag_resv_type		type, +	struct xfs_alloc_arg		*args) +{ +	struct xfs_ag_resv		*resv; +	xfs_extlen_t			len; +	uint				field; + +	trace_xfs_ag_resv_alloc_extent(pag, type, args->len); + +	switch (type) { +	case XFS_AG_RESV_METADATA: +	case XFS_AG_RESV_AGFL: +		resv = xfs_perag_resv(pag, type); +		break; +	default: +		ASSERT(0); +		/* fall through */ +	case XFS_AG_RESV_NONE: +		field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : +				       XFS_TRANS_SB_FDBLOCKS; +		xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len); +		return; +	} + +	len = min_t(xfs_extlen_t, args->len, resv->ar_reserved); +	resv->ar_reserved -= len; +	if (type == XFS_AG_RESV_AGFL) +		return; +	/* Allocations of reserved blocks only need on-disk sb updates... */ +	xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len); +	/* ...but non-reserved blocks need in-core and on-disk updates. */ +	if (args->len > len) +		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, +				-((int64_t)args->len - len)); +} + +/* Free a block to the reservation. */ +void +xfs_ag_resv_free_extent( +	struct xfs_perag		*pag, +	enum xfs_ag_resv_type		type, +	struct xfs_trans		*tp, +	xfs_extlen_t			len) +{ +	xfs_extlen_t			leftover; +	struct xfs_ag_resv		*resv; + +	trace_xfs_ag_resv_free_extent(pag, type, len); + +	switch (type) { +	case XFS_AG_RESV_METADATA: +	case XFS_AG_RESV_AGFL: +		resv = xfs_perag_resv(pag, type); +		break; +	default: +		ASSERT(0); +		/* fall through */ +	case XFS_AG_RESV_NONE: +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); +		return; +	} + +	leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved); +	resv->ar_reserved += leftover; +	if (type == XFS_AG_RESV_AGFL) +		return; +	/* Freeing into the reserved pool only requires on-disk update... */ +	xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len); +	/* ...but freeing beyond that requires in-core and on-disk update. */ +	if (len > leftover) +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover); +} diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h new file mode 100644 index 000000000000..8d6c687deef3 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. 
+ */ +#ifndef __XFS_AG_RESV_H__ +#define	__XFS_AG_RESV_H__ + +int xfs_ag_resv_free(struct xfs_perag *pag); +int xfs_ag_resv_init(struct xfs_perag *pag); + +bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type); +xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag, +		enum xfs_ag_resv_type type); + +void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, +		struct xfs_alloc_arg *args); +void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, +		struct xfs_trans *tp, xfs_extlen_t len); + +#endif	/* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 05b5243d89f6..effb64cf714f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -37,6 +37,7 @@  #include "xfs_trans.h"  #include "xfs_buf_item.h"  #include "xfs_log.h" +#include "xfs_ag_resv.h"  struct workqueue_struct *xfs_alloc_wq; @@ -51,10 +52,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);  STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,  		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +unsigned int +xfs_refc_block( +	struct xfs_mount	*mp) +{ +	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) +		return XFS_RMAP_BLOCK(mp) + 1; +	if (xfs_sb_version_hasfinobt(&mp->m_sb)) +		return XFS_FIBT_BLOCK(mp) + 1; +	return XFS_IBT_BLOCK(mp) + 1; +} +  xfs_extlen_t  xfs_prealloc_blocks(  	struct xfs_mount	*mp)  { +	if (xfs_sb_version_hasreflink(&mp->m_sb)) +		return xfs_refc_block(mp) + 1;  	if (xfs_sb_version_hasrmapbt(&mp->m_sb))  		return XFS_RMAP_BLOCK(mp) + 1;  	if (xfs_sb_version_hasfinobt(&mp->m_sb)) @@ -74,14 +88,8 @@ xfs_prealloc_blocks(   * extents need to be actually allocated. To get around this, we explicitly set   * aside a few blocks which will not be reserved in delayed allocation.   * - * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist - * and 4 more to handle a potential split of the file's bmap btree. - * - * When rmap is enabled, we must also be able to handle two rmap btree inserts - * to record both the file data extent and a new bmbt block.  The bmbt block - * might not be in the same AG as the file data extent.  In the worst case - * the bmap btree splits multiple levels and all the new blocks come from - * different AGs, so set aside enough to handle rmap btree splits in all AGs. + * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a + * potential split of the file's bmap btree.   */  unsigned int  xfs_alloc_set_aside( @@ -90,8 +98,6 @@ xfs_alloc_set_aside(  	unsigned int		blocks;  	blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); -	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) -		blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;  	return blocks;  } @@ -122,6 +128,8 @@ xfs_alloc_ag_max_usable(  		blocks++;		/* finobt root block */  	if (xfs_sb_version_hasrmapbt(&mp->m_sb))  		blocks++; 		/* rmap root block */ +	if (xfs_sb_version_hasreflink(&mp->m_sb)) +		blocks++;		/* refcount root block */  	return mp->m_sb.sb_agblocks - blocks;  } @@ -265,7 +273,7 @@ xfs_alloc_compute_diff(  	xfs_agblock_t	wantbno,	/* target starting block */  	xfs_extlen_t	wantlen,	/* target length */  	xfs_extlen_t	alignment,	/* target alignment */ -	char		userdata,	/* are we allocating data? */ +	int		datatype,	/* are we allocating data? 
*/  	xfs_agblock_t	freebno,	/* freespace's starting block */  	xfs_extlen_t	freelen,	/* freespace's length */  	xfs_agblock_t	*newbnop)	/* result: best start block from free */ @@ -276,6 +284,7 @@ xfs_alloc_compute_diff(  	xfs_extlen_t	newlen1=0;	/* length with newbno1 */  	xfs_extlen_t	newlen2=0;	/* length with newbno2 */  	xfs_agblock_t	wantend;	/* end of target extent */ +	bool		userdata = xfs_alloc_is_userdata(datatype);  	ASSERT(freelen >= wantlen);  	freeend = freebno + freelen; @@ -680,12 +689,29 @@ xfs_alloc_ag_vextent(  	xfs_alloc_arg_t	*args)	/* argument structure for allocation */  {  	int		error=0; +	xfs_extlen_t	reservation; +	xfs_extlen_t	oldmax;  	ASSERT(args->minlen > 0);  	ASSERT(args->maxlen > 0);  	ASSERT(args->minlen <= args->maxlen);  	ASSERT(args->mod < args->prod);  	ASSERT(args->alignment > 0); + +	/* +	 * Clamp maxlen to the amount of free space minus any reservations +	 * that have been made. +	 */ +	oldmax = args->maxlen; +	reservation = xfs_ag_resv_needed(args->pag, args->resv); +	if (args->maxlen > args->pag->pagf_freeblks - reservation) +		args->maxlen = args->pag->pagf_freeblks - reservation; +	if (args->maxlen == 0) { +		args->agbno = NULLAGBLOCK; +		args->maxlen = oldmax; +		return 0; +	} +  	/*  	 * Branch to correct routine based on the type.  	 */ @@ -705,12 +731,14 @@ xfs_alloc_ag_vextent(  		/* NOTREACHED */  	} +	args->maxlen = oldmax; +  	if (error || args->agbno == NULLAGBLOCK)  		return error;  	ASSERT(args->len >= args->minlen);  	ASSERT(args->len <= args->maxlen); -	ASSERT(!args->wasfromfl || !args->isfl); +	ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);  	ASSERT(args->agbno % args->alignment == 0);  	/* if not file data, insert new block into the reverse map btree */ @@ -732,12 +760,7 @@ xfs_alloc_ag_vextent(  					      args->agbno, args->len));  	} -	if (!args->isfl) { -		xfs_trans_mod_sb(args->tp, args->wasdel ? 
-				 XFS_TRANS_SB_RES_FDBLOCKS : -				 XFS_TRANS_SB_FDBLOCKS, -				 -((long)(args->len))); -	} +	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);  	XFS_STATS_INC(args->mp, xs_allocx);  	XFS_STATS_ADD(args->mp, xs_allocb, args->len); @@ -917,7 +940,7 @@ xfs_alloc_find_best_extent(  			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,  						       args->alignment, -						       args->userdata, *sbnoa, +						       args->datatype, *sbnoa,  						       *slena, &new);  			/* @@ -1101,7 +1124,7 @@ restart:  			if (args->len < blen)  				continue;  			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, -				args->alignment, args->userdata, ltbnoa, +				args->alignment, args->datatype, ltbnoa,  				ltlena, <new);  			if (ltnew != NULLAGBLOCK &&  			    (args->len > blen || ltdiff < bdiff)) { @@ -1254,7 +1277,7 @@ restart:  			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);  			xfs_alloc_fix_len(args);  			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, -				args->alignment, args->userdata, ltbnoa, +				args->alignment, args->datatype, ltbnoa,  				ltlena, <new);  			error = xfs_alloc_find_best_extent(args, @@ -1271,7 +1294,7 @@ restart:  			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);  			xfs_alloc_fix_len(args);  			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, -				args->alignment, args->userdata, gtbnoa, +				args->alignment, args->datatype, gtbnoa,  				gtlena, >new);  			error = xfs_alloc_find_best_extent(args, @@ -1331,7 +1354,7 @@ restart:  	}  	rlen = args->len;  	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, -				     args->userdata, ltbnoa, ltlena, <new); +				     args->datatype, ltbnoa, ltlena, <new);  	ASSERT(ltnew >= ltbno);  	ASSERT(ltnew + rlen <= ltbnoa + ltlena);  	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); @@ -1583,6 +1606,7 @@ xfs_alloc_ag_vextent_small(  	int		*stat)	/* status: 0-freelist, 1-normal/none */  {  	struct xfs_owner_info	oinfo; +	struct xfs_perag	*pag;  	int		error;  	xfs_agblock_t	fbno;  	xfs_extlen_t	flen; @@ -1600,7 +1624,8 @@ xfs_alloc_ag_vextent_small(  	 * to respect minleft even when pulling from the  	 * freelist.  	 */ -	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && +	else if (args->minlen == 1 && args->alignment == 1 && +		 args->resv != XFS_AG_RESV_AGFL &&  		 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)  		  > args->minleft)) {  		error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1608,9 +1633,9 @@ xfs_alloc_ag_vextent_small(  			goto error0;  		if (fbno != NULLAGBLOCK) {  			xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, -					     args->userdata); +			      xfs_alloc_allow_busy_reuse(args->datatype)); -			if (args->userdata) { +			if (xfs_alloc_is_userdata(args->datatype)) {  				xfs_buf_t	*bp;  				bp = xfs_btree_get_bufs(args->mp, args->tp, @@ -1629,13 +1654,18 @@ xfs_alloc_ag_vextent_small(  			/*  			 * If we're feeding an AGFL block to something that  			 * doesn't live in the free space, we need to clear -			 * out the OWN_AG rmap. +			 * out the OWN_AG rmap and add the block back to +			 * the AGFL per-AG reservation.  			 
*/  			xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);  			error = xfs_rmap_free(args->tp, args->agbp, args->agno,  					fbno, 1, &oinfo);  			if (error)  				goto error0; +			pag = xfs_perag_get(args->mp, args->agno); +			xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL, +					args->tp, 1); +			xfs_perag_put(pag);  			*stat = 0;  			return 0; @@ -1683,7 +1713,7 @@ xfs_free_ag_extent(  	xfs_agblock_t		bno,  	xfs_extlen_t		len,  	struct xfs_owner_info	*oinfo, -	int			isfl) +	enum xfs_ag_resv_type	type)  {  	xfs_btree_cur_t	*bno_cur;	/* cursor for by-block btree */  	xfs_btree_cur_t	*cnt_cur;	/* cursor for by-size btree */ @@ -1911,21 +1941,22 @@ xfs_free_ag_extent(  	 */  	pag = xfs_perag_get(mp, agno);  	error = xfs_alloc_update_counters(tp, pag, agbp, len); +	xfs_ag_resv_free_extent(pag, type, tp, len);  	xfs_perag_put(pag);  	if (error)  		goto error0; -	if (!isfl) -		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);  	XFS_STATS_INC(mp, xs_freex);  	XFS_STATS_ADD(mp, xs_freeb, len); -	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); +	trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, +			haveleft, haveright);  	return 0;   error0: -	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); +	trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, +			-1, -1);  	if (bno_cur)  		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);  	if (cnt_cur) @@ -1950,21 +1981,43 @@ xfs_alloc_compute_maxlevels(  }  /* - * Find the length of the longest extent in an AG. + * Find the length of the longest extent in an AG.  The 'need' parameter + * specifies how much space we're going to need for the AGFL and the + * 'reserved' parameter tells us how many blocks in this AG are reserved for + * other callers.   */  xfs_extlen_t  xfs_alloc_longest_free_extent(  	struct xfs_mount	*mp,  	struct xfs_perag	*pag, -	xfs_extlen_t		need) +	xfs_extlen_t		need, +	xfs_extlen_t		reserved)  {  	xfs_extlen_t		delta = 0; +	/* +	 * If the AGFL needs a recharge, we'll have to subtract that from the +	 * longest extent. +	 */  	if (need > pag->pagf_flcount)  		delta = need - pag->pagf_flcount; +	/* +	 * If we cannot maintain others' reservations with space from the +	 * not-longest freesp extents, we'll have to subtract /that/ from +	 * the longest extent too. +	 */ +	if (pag->pagf_freeblks - pag->pagf_longest < reserved) +		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); + +	/* +	 * If the longest extent is long enough to satisfy all the +	 * reservations and AGFL rules in place, we can return this extent. +	 */  	if (pag->pagf_longest > delta)  		return pag->pagf_longest - delta; + +	/* Otherwise, let the caller try for 1 block if there's space. */  	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;  } @@ -2004,20 +2057,24 @@ xfs_alloc_space_available(  {  	struct xfs_perag	*pag = args->pag;  	xfs_extlen_t		longest; +	xfs_extlen_t		reservation; /* blocks that are still reserved */  	int			available;  	if (flags & XFS_ALLOC_FLAG_FREEING)  		return true; +	reservation = xfs_ag_resv_needed(pag, args->resv); +  	/* do we have enough contiguous free space for the allocation? */ -	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); +	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, +			reservation);  	if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)  		return false; -	/* do have enough free space remaining for the allocation? */ +	/* do we have enough free space remaining for the allocation? 
*/  	available = (int)(pag->pagf_freeblks + pag->pagf_flcount - -			  min_free - args->total); -	if (available < (int)args->minleft) +			  reservation - min_free - args->total); +	if (available < (int)args->minleft || available <= 0)  		return false;  	return true; @@ -2058,7 +2115,7 @@ xfs_alloc_fix_freelist(  	 * somewhere else if we are not being asked to try harder at this  	 * point  	 */ -	if (pag->pagf_metadata && args->userdata && +	if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&  	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {  		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));  		goto out_agbp_relse; @@ -2124,7 +2181,7 @@ xfs_alloc_fix_freelist(  		if (error)  			goto out_agbp_relse;  		error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, -					   &targs.oinfo, 1); +					   &targs.oinfo, XFS_AG_RESV_AGFL);  		if (error)  			goto out_agbp_relse;  		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); @@ -2135,7 +2192,7 @@ xfs_alloc_fix_freelist(  	targs.mp = mp;  	targs.agbp = agbp;  	targs.agno = args->agno; -	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; +	targs.alignment = targs.minlen = targs.prod = 1;  	targs.type = XFS_ALLOCTYPE_THIS_AG;  	targs.pag = pag;  	error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); @@ -2146,6 +2203,7 @@ xfs_alloc_fix_freelist(  	while (pag->pagf_flcount < need) {  		targs.agbno = 0;  		targs.maxlen = need - pag->pagf_flcount; +		targs.resv = XFS_AG_RESV_AGFL;  		/* Allocate as many blocks as possible at once. */  		error = xfs_alloc_ag_vextent(&targs); @@ -2278,6 +2336,9 @@ xfs_alloc_log_agf(  		offsetof(xfs_agf_t, agf_btreeblks),  		offsetof(xfs_agf_t, agf_uuid),  		offsetof(xfs_agf_t, agf_rmap_blocks), +		offsetof(xfs_agf_t, agf_refcount_blocks), +		offsetof(xfs_agf_t, agf_refcount_root), +		offsetof(xfs_agf_t, agf_refcount_level),  		/* needed so that we don't log the whole rest of the structure: */  		offsetof(xfs_agf_t, agf_spare64),  		sizeof(xfs_agf_t) @@ -2415,6 +2476,10 @@ xfs_agf_verify(  	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))  		return false; +	if (xfs_sb_version_hasreflink(&mp->m_sb) && +	    be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) +		return false; +  	return true;;  } @@ -2535,6 +2600,7 @@ xfs_alloc_read_agf(  			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);  		pag->pagf_levels[XFS_BTNUM_RMAPi] =  			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); +		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);  		spin_lock_init(&pag->pagb_lock);  		pag->pagb_count = 0;  		pag->pagb_tree = RB_ROOT; @@ -2633,7 +2699,7 @@ xfs_alloc_vextent(  		 * Try near allocation first, then anywhere-in-ag after  		 * the first a.g. fails.  		 
*/ -		if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) && +		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&  		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {  			args->fsbno = XFS_AGB_TO_FSB(mp,  					((mp->m_agfrotor / rotorstep) % @@ -2766,7 +2832,7 @@ xfs_alloc_vextent(  #endif  		/* Zero the extent if we were asked to do so */ -		if (args->userdata & XFS_ALLOC_USERDATA_ZERO) { +		if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {  			error = xfs_zero_extent(args->ip, args->fsbno, args->len);  			if (error)  				goto error0; @@ -2825,7 +2891,8 @@ xfs_free_extent(  	struct xfs_trans	*tp,	/* transaction pointer */  	xfs_fsblock_t		bno,	/* starting block number of extent */  	xfs_extlen_t		len,	/* length of extent */ -	struct xfs_owner_info	*oinfo)	/* extent owner */ +	struct xfs_owner_info	*oinfo,	/* extent owner */ +	enum xfs_ag_resv_type	type)	/* block reservation type */  {  	struct xfs_mount	*mp = tp->t_mountp;  	struct xfs_buf		*agbp; @@ -2834,6 +2901,7 @@ xfs_free_extent(  	int			error;  	ASSERT(len != 0); +	ASSERT(type != XFS_AG_RESV_AGFL);  	if (XFS_TEST_ERROR(false, mp,  			XFS_ERRTAG_FREE_EXTENT, @@ -2851,7 +2919,7 @@ xfs_free_extent(  		agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),  				err); -	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); +	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);  	if (error)  		goto err; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6fe2d6b7cfe9..7c404a6b0ae3 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg {  	xfs_extlen_t	len;		/* output: actual size of extent */  	xfs_alloctype_t	type;		/* allocation type XFS_ALLOCTYPE_... */  	xfs_alloctype_t	otype;		/* original allocation type */ +	int		datatype;	/* mask defining data type treatment */  	char		wasdel;		/* set if allocation was prev delayed */  	char		wasfromfl;	/* set if allocation is from freelist */ -	char		isfl;		/* set if is freelist blocks - !acctg */ -	char		userdata;	/* mask defining userdata treatment */  	xfs_fsblock_t	firstblock;	/* io first block allocated */  	struct xfs_owner_info	oinfo;	/* owner of blocks being allocated */ +	enum xfs_ag_resv_type	resv;	/* block reservation to use */  } xfs_alloc_arg_t;  /* - * Defines for userdata + * Defines for datatype   */  #define XFS_ALLOC_USERDATA		(1 << 0)/* allocation is for user data*/  #define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)/* special case start of file */  #define XFS_ALLOC_USERDATA_ZERO		(1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY		(1 << 3)/* Busy extents not allowed */ + +static inline bool +xfs_alloc_is_userdata(int datatype) +{ +	return (datatype & ~XFS_ALLOC_NOBUSY) != 0; +} + +static inline bool +xfs_alloc_allow_busy_reuse(int datatype) +{ +	return (datatype & XFS_ALLOC_NOBUSY) == 0; +}  /* freespace limit calculations */  #define XFS_ALLOC_AGFL_RESERVE	4 @@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);  unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);  xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, -		struct xfs_perag *pag, xfs_extlen_t need); +		struct xfs_perag *pag, xfs_extlen_t need, +		xfs_extlen_t reserved);  unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,  		struct xfs_perag *pag); @@ -184,7 +198,8 @@ xfs_free_extent(  	struct xfs_trans	*tp,	/* transaction pointer */  	xfs_fsblock_t		bno,	/* starting block number of extent */  	xfs_extlen_t		len,	/* length of extent */ -	
struct xfs_owner_info	*oinfo);/* extent owner */ +	struct xfs_owner_info	*oinfo,	/* extent owner */ +	enum xfs_ag_resv_type	type);	/* block reservation type */  int				/* error */  xfs_alloc_lookup_ge( diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b060bca93402..c27344cf38e1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -47,6 +47,8 @@  #include "xfs_attr_leaf.h"  #include "xfs_filestream.h"  #include "xfs_rmap.h" +#include "xfs_ag_resv.h" +#include "xfs_refcount.h"  kmem_zone_t		*xfs_bmap_free_item_zone; @@ -139,7 +141,8 @@ xfs_bmbt_lookup_ge(   */  static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)  { -	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && +	return whichfork != XFS_COW_FORK && +		XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&  		XFS_IFORK_NEXTENTS(ip, whichfork) >  			XFS_IFORK_MAXEXT(ip, whichfork);  } @@ -149,7 +152,8 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)   */  static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)  { -	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && +	return whichfork != XFS_COW_FORK && +		XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&  		XFS_IFORK_NEXTENTS(ip, whichfork) <=  			XFS_IFORK_MAXEXT(ip, whichfork);  } @@ -639,6 +643,7 @@ xfs_bmap_btree_to_extents(  	mp = ip->i_mount;  	ifp = XFS_IFORK_PTR(ip, whichfork); +	ASSERT(whichfork != XFS_COW_FORK);  	ASSERT(ifp->if_flags & XFS_IFEXTENTS);  	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);  	rblock = ifp->if_broot; @@ -705,6 +710,7 @@ xfs_bmap_extents_to_btree(  	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */  	mp = ip->i_mount; +	ASSERT(whichfork != XFS_COW_FORK);  	ifp = XFS_IFORK_PTR(ip, whichfork);  	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); @@ -747,6 +753,7 @@ xfs_bmap_extents_to_btree(  		args.type = XFS_ALLOCTYPE_START_BNO;  		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);  	} else if (dfops->dop_low) { +try_another_ag:  		args.type = XFS_ALLOCTYPE_START_BNO;  		args.fsbno = *firstblock;  	} else { @@ -761,6 +768,21 @@ xfs_bmap_extents_to_btree(  		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);  		return error;  	} + +	/* +	 * During a CoW operation, the allocation and bmbt updates occur in +	 * different transactions.  The mapping code tries to put new bmbt +	 * blocks near extents being mapped, but the only way to guarantee this +	 * is if the alloc and the mapping happen in a single transaction that +	 * has a block reservation.  That isn't the case here, so if we run out +	 * of space we'll try again with another AG. +	 */ +	if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && +	    args.fsbno == NULLFSBLOCK && +	    args.type == XFS_ALLOCTYPE_NEAR_BNO) { +		dfops->dop_low = true; +		goto try_another_ag; +	}  	/*  	 * Allocation can't fail, the space was reserved.  	 */ @@ -836,6 +858,7 @@ xfs_bmap_local_to_extents_empty(  {  	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork); +	ASSERT(whichfork != XFS_COW_FORK);  	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);  	ASSERT(ifp->if_bytes == 0);  	ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); @@ -895,6 +918,7 @@ xfs_bmap_local_to_extents(  	 * file currently fits in an inode.  	 
*/  	if (*firstblock == NULLFSBLOCK) { +try_another_ag:  		args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);  		args.type = XFS_ALLOCTYPE_START_BNO;  	} else { @@ -907,6 +931,19 @@ xfs_bmap_local_to_extents(  	if (error)  		goto done; +	/* +	 * During a CoW operation, the allocation and bmbt updates occur in +	 * different transactions.  The mapping code tries to put new bmbt +	 * blocks near extents being mapped, but the only way to guarantee this +	 * is if the alloc and the mapping happen in a single transaction that +	 * has a block reservation.  That isn't the case here, so if we run out +	 * of space we'll try again with another AG. +	 */ +	if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) && +	    args.fsbno == NULLFSBLOCK && +	    args.type == XFS_ALLOCTYPE_NEAR_BNO) { +		goto try_another_ag; +	}  	/* Can't fail, the space was reserved. */  	ASSERT(args.fsbno != NULLFSBLOCK);  	ASSERT(args.len == 1); @@ -1388,7 +1425,7 @@ xfs_bmap_search_multi_extents(   * Else, *lastxp will be set to the index of the found   * entry; *gotp will contain the entry.   */ -STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */ +xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */  xfs_bmap_search_extents(  	xfs_inode_t     *ip,            /* incore inode pointer */  	xfs_fileoff_t   bno,            /* block number searched for */ @@ -1669,7 +1706,8 @@ xfs_bmap_one_block(   */  STATIC int				/* error */  xfs_bmap_add_extent_delay_real( -	struct xfs_bmalloca	*bma) +	struct xfs_bmalloca	*bma, +	int			whichfork)  {  	struct xfs_bmbt_irec	*new = &bma->got;  	int			diff;	/* temp value */ @@ -1687,11 +1725,14 @@ xfs_bmap_add_extent_delay_real(  	xfs_filblks_t		temp=0;	/* value for da_new calculations */  	xfs_filblks_t		temp2=0;/* value for da_new calculations */  	int			tmp_rval;	/* partial logging flags */ -	int			whichfork = XFS_DATA_FORK;  	struct xfs_mount	*mp; +	xfs_extnum_t		*nextents;  	mp = bma->ip->i_mount;  	ifp = XFS_IFORK_PTR(bma->ip, whichfork); +	ASSERT(whichfork != XFS_ATTR_FORK); +	nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : +						&bma->ip->i_d.di_nextents);  	ASSERT(bma->idx >= 0);  	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1705,6 +1746,9 @@ xfs_bmap_add_extent_delay_real(  #define	RIGHT		r[1]  #define	PREV		r[2] +	if (whichfork == XFS_COW_FORK) +		state |= BMAP_COWFORK; +  	/*  	 * Set up a bunch of variables to make the tests simpler.  	 
*/ @@ -1791,7 +1835,7 @@ xfs_bmap_add_extent_delay_real(  		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);  		xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); -		bma->ip->i_d.di_nextents--; +		(*nextents)--;  		if (bma->cur == NULL)  			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;  		else { @@ -1893,7 +1937,7 @@ xfs_bmap_add_extent_delay_real(  		xfs_bmbt_set_startblock(ep, new->br_startblock);  		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); -		bma->ip->i_d.di_nextents++; +		(*nextents)++;  		if (bma->cur == NULL)  			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;  		else { @@ -1963,7 +2007,7 @@ xfs_bmap_add_extent_delay_real(  		temp = PREV.br_blockcount - new->br_blockcount;  		xfs_bmbt_set_blockcount(ep, temp);  		xfs_iext_insert(bma->ip, bma->idx, 1, new, state); -		bma->ip->i_d.di_nextents++; +		(*nextents)++;  		if (bma->cur == NULL)  			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;  		else { @@ -2047,7 +2091,7 @@ xfs_bmap_add_extent_delay_real(  		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);  		xfs_bmbt_set_blockcount(ep, temp);  		xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); -		bma->ip->i_d.di_nextents++; +		(*nextents)++;  		if (bma->cur == NULL)  			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;  		else { @@ -2116,7 +2160,7 @@ xfs_bmap_add_extent_delay_real(  		RIGHT.br_blockcount = temp2;  		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */  		xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); -		bma->ip->i_d.di_nextents++; +		(*nextents)++;  		if (bma->cur == NULL)  			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;  		else { @@ -2214,7 +2258,8 @@ xfs_bmap_add_extent_delay_real(  	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);  done: -	bma->logflags |= rval; +	if (whichfork != XFS_COW_FORK) +		bma->logflags |= rval;  	return error;  #undef	LEFT  #undef	RIGHT @@ -2758,6 +2803,7 @@ done:  STATIC void  xfs_bmap_add_extent_hole_delay(  	xfs_inode_t		*ip,	/* incore inode pointer */ +	int			whichfork,  	xfs_extnum_t		*idx,	/* extent number to update/insert */  	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */  { @@ -2769,8 +2815,10 @@ xfs_bmap_add_extent_hole_delay(  	int			state;  /* state bits, accessed thru macros */  	xfs_filblks_t		temp=0;	/* temp for indirect calculations */ -	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); +	ifp = XFS_IFORK_PTR(ip, whichfork);  	state = 0; +	if (whichfork == XFS_COW_FORK) +		state |= BMAP_COWFORK;  	ASSERT(isnullstartblock(new->br_startblock));  	/* @@ -2788,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay(  	 * Check and set flags if the current (right) segment exists.  	 * If it doesn't exist, we're converting the hole at end-of-file.  	 */ -	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { +	if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {  		state |= BMAP_RIGHT_VALID;  		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2922,6 +2970,7 @@ xfs_bmap_add_extent_hole_real(  	ASSERT(!isnullstartblock(new->br_startblock));  	ASSERT(!bma->cur ||  	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); +	ASSERT(whichfork != XFS_COW_FORK);  	XFS_STATS_INC(mp, xs_add_exlist); @@ -3347,7 +3396,8 @@ xfs_bmap_adjacent(  	mp = ap->ip->i_mount;  	nullfb = *ap->firstblock == NULLFSBLOCK; -	rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; +	rt = XFS_IS_REALTIME_INODE(ap->ip) && +		xfs_alloc_is_userdata(ap->datatype);  	fb_agno = nullfb ? 
NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);  	/*  	 * If allocating at eof, and there's a previous real block, @@ -3501,7 +3551,8 @@ xfs_bmap_longest_free_extent(  	}  	longest = xfs_alloc_longest_free_extent(mp, pag, -					xfs_alloc_min_freelist(mp, pag)); +				xfs_alloc_min_freelist(mp, pag), +				xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));  	if (*blen < longest)  		*blen = longest; @@ -3622,7 +3673,7 @@ xfs_bmap_btalloc(  {  	xfs_mount_t	*mp;		/* mount point structure */  	xfs_alloctype_t	atype = 0;	/* type for allocation routines */ -	xfs_extlen_t	align;		/* minimum allocation alignment */ +	xfs_extlen_t	align = 0;	/* minimum allocation alignment */  	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */  	xfs_agnumber_t	ag;  	xfs_alloc_arg_t	args; @@ -3645,7 +3696,10 @@ xfs_bmap_btalloc(  	else if (mp->m_dalign)  		stripe_align = mp->m_dalign; -	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; +	if (ap->flags & XFS_BMAPI_COWFORK) +		align = xfs_get_cowextsz_hint(ap->ip); +	else if (xfs_alloc_is_userdata(ap->datatype)) +		align = xfs_get_extsz_hint(ap->ip);  	if (unlikely(align)) {  		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,  						align, 0, ap->eof, 0, ap->conv, @@ -3658,7 +3712,8 @@ xfs_bmap_btalloc(  	nullfb = *ap->firstblock == NULLFSBLOCK;  	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);  	if (nullfb) { -		if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { +		if (xfs_alloc_is_userdata(ap->datatype) && +		    xfs_inode_is_filestream(ap->ip)) {  			ag = xfs_filestream_lookup_ag(ap->ip);  			ag = (ag != NULLAGNUMBER) ? ag : 0;  			ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); @@ -3698,7 +3753,8 @@ xfs_bmap_btalloc(  		 * enough for the request.  If one isn't found, then adjust  		 * the minimum allocation size to the largest space found.  		 */ -		if (ap->userdata && xfs_inode_is_filestream(ap->ip)) +		if (xfs_alloc_is_userdata(ap->datatype) && +		    xfs_inode_is_filestream(ap->ip))  			error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);  		else  			error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); @@ -3781,9 +3837,9 @@ xfs_bmap_btalloc(  	}  	args.minleft = ap->minleft;  	args.wasdel = ap->wasdel; -	args.isfl = 0; -	args.userdata = ap->userdata; -	if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) +	args.resv = XFS_AG_RESV_NONE; +	args.datatype = ap->datatype; +	if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)  		args.ip = ap->ip;  	error = xfs_alloc_vextent(&args); @@ -3850,7 +3906,8 @@ xfs_bmap_btalloc(  		ASSERT(nullfb || fb_agno == args.agno ||  		       (ap->dfops->dop_low && fb_agno < args.agno));  		ap->length = args.len; -		ap->ip->i_d.di_nblocks += args.len; +		if (!(ap->flags & XFS_BMAPI_COWFORK)) +			ap->ip->i_d.di_nblocks += args.len;  		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);  		if (ap->wasdel)  			ap->ip->i_delayed_blks -= args.len; @@ -3870,6 +3927,63 @@ xfs_bmap_btalloc(  }  /* + * For a remap operation, just "allocate" an extent at the address that the + * caller passed in, and ensure that the AGFL is the right size.  The caller + * will then map the "allocated" extent into the file somewhere. + */ +STATIC int +xfs_bmap_remap_alloc( +	struct xfs_bmalloca	*ap) +{ +	struct xfs_trans	*tp = ap->tp; +	struct xfs_mount	*mp = tp->t_mountp; +	xfs_agblock_t		bno; +	struct xfs_alloc_arg	args; +	int			error; + +	/* +	 * validate that the block number is legal - the enables us to detect +	 * and handle a silent filesystem corruption rather than crashing. 
+	 */ +	memset(&args, 0, sizeof(struct xfs_alloc_arg)); +	args.tp = ap->tp; +	args.mp = ap->tp->t_mountp; +	bno = *ap->firstblock; +	args.agno = XFS_FSB_TO_AGNO(mp, bno); +	args.agbno = XFS_FSB_TO_AGBNO(mp, bno); +	if (args.agno >= mp->m_sb.sb_agcount || +	    args.agbno >= mp->m_sb.sb_agblocks) +		return -EFSCORRUPTED; + +	/* "Allocate" the extent from the range we passed in. */ +	trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length); +	ap->blkno = bno; +	ap->ip->i_d.di_nblocks += ap->length; +	xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + +	/* Fix the freelist, like a real allocator does. */ +	args.datatype = ap->datatype; +	args.pag = xfs_perag_get(args.mp, args.agno); +	ASSERT(args.pag); + +	/* +	 * The freelist fixing code will decline the allocation if +	 * the size and shape of the free space doesn't allow for +	 * allocating the extent and updating all the metadata that +	 * happens during an allocation.  We're remapping, not +	 * allocating, so skip that check by pretending to be freeing. +	 */ +	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); +	if (error) +		goto error0; +error0: +	xfs_perag_put(args.pag); +	if (error) +		trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); +	return error; +} + +/*   * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.   * It figures out where to ask the underlying allocator to put the new extent.   */ @@ -3877,7 +3991,10 @@ STATIC int  xfs_bmap_alloc(  	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */  { -	if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) +	if (ap->flags & XFS_BMAPI_REMAP) +		return xfs_bmap_remap_alloc(ap); +	if (XFS_IS_REALTIME_INODE(ap->ip) && +	    xfs_alloc_is_userdata(ap->datatype))  		return xfs_bmap_rtalloc(ap);  	return xfs_bmap_btalloc(ap);  } @@ -4005,12 +4122,11 @@ xfs_bmapi_read(  	int			error;  	int			eof;  	int			n = 0; -	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ? -						XFS_ATTR_FORK : XFS_DATA_FORK; +	int			whichfork = xfs_bmapi_whichfork(flags);  	ASSERT(*nmap >= 1);  	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| -			   XFS_BMAPI_IGSTATE))); +			   XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK)));  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));  	if (unlikely(XFS_TEST_ERROR( @@ -4028,6 +4144,16 @@ xfs_bmapi_read(  	ifp = XFS_IFORK_PTR(ip, whichfork); +	/* No CoW fork?  Return a hole. 
*/ +	if (whichfork == XFS_COW_FORK && !ifp) { +		mval->br_startoff = bno; +		mval->br_startblock = HOLESTARTBLOCK; +		mval->br_blockcount = len; +		mval->br_state = XFS_EXT_NORM; +		*nmap = 1; +		return 0; +	} +  	if (!(ifp->if_flags & XFS_IFEXTENTS)) {  		error = xfs_iread_extents(NULL, ip, whichfork);  		if (error) @@ -4074,9 +4200,10 @@ xfs_bmapi_read(  	return 0;  } -STATIC int +int  xfs_bmapi_reserve_delalloc(  	struct xfs_inode	*ip, +	int			whichfork,  	xfs_fileoff_t		aoff,  	xfs_filblks_t		len,  	struct xfs_bmbt_irec	*got, @@ -4085,7 +4212,7 @@ xfs_bmapi_reserve_delalloc(  	int			eof)  {  	struct xfs_mount	*mp = ip->i_mount; -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);  	xfs_extlen_t		alen;  	xfs_extlen_t		indlen;  	char			rt = XFS_IS_REALTIME_INODE(ip); @@ -4097,7 +4224,10 @@ xfs_bmapi_reserve_delalloc(  		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);  	/* Figure out the extent size, adjust alen */ -	extsz = xfs_get_extsz_hint(ip); +	if (whichfork == XFS_COW_FORK) +		extsz = xfs_get_cowextsz_hint(ip); +	else +		extsz = xfs_get_extsz_hint(ip);  	if (extsz) {  		error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,  					       1, 0, &aoff, &alen); @@ -4144,7 +4274,7 @@ xfs_bmapi_reserve_delalloc(  	got->br_startblock = nullstartblock(indlen);  	got->br_blockcount = alen;  	got->br_state = XFS_EXT_NORM; -	xfs_bmap_add_extent_hole_delay(ip, lastx, got); +	xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);  	/*  	 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay @@ -4170,98 +4300,12 @@ out_unreserve_quota:  	return error;  } -/* - * Map file blocks to filesystem blocks, adding delayed allocations as needed. - */ -int -xfs_bmapi_delay( -	struct xfs_inode	*ip,	/* incore inode */ -	xfs_fileoff_t		bno,	/* starting file offs. mapped */ -	xfs_filblks_t		len,	/* length to map in file */ -	struct xfs_bmbt_irec	*mval,	/* output: map values */ -	int			*nmap,	/* i/o: mval size/count */ -	int			flags)	/* XFS_BMAPI_... 
*/ -{ -	struct xfs_mount	*mp = ip->i_mount; -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); -	struct xfs_bmbt_irec	got;	/* current file extent record */ -	struct xfs_bmbt_irec	prev;	/* previous file extent record */ -	xfs_fileoff_t		obno;	/* old block number (offset) */ -	xfs_fileoff_t		end;	/* end of mapped file region */ -	xfs_extnum_t		lastx;	/* last useful extent number */ -	int			eof;	/* we've hit the end of extents */ -	int			n = 0;	/* current extent index */ -	int			error = 0; - -	ASSERT(*nmap >= 1); -	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); -	ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); -	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - -	if (unlikely(XFS_TEST_ERROR( -	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && -	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), -	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { -		XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); -		return -EFSCORRUPTED; -	} - -	if (XFS_FORCED_SHUTDOWN(mp)) -		return -EIO; - -	XFS_STATS_INC(mp, xs_blk_mapw); - -	if (!(ifp->if_flags & XFS_IFEXTENTS)) { -		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); -		if (error) -			return error; -	} - -	xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); -	end = bno + len; -	obno = bno; - -	while (bno < end && n < *nmap) { -		if (eof || got.br_startoff > bno) { -			error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, -							   &prev, &lastx, eof); -			if (error) { -				if (n == 0) { -					*nmap = 0; -					return error; -				} -				break; -			} -		} - -		/* set up the extent map to return. */ -		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); -		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); - -		/* If we're done, stop now. */ -		if (bno >= end || n >= *nmap) -			break; - -		/* Else go on to the next record. */ -		prev = got; -		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) -			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); -		else -			eof = 1; -	} - -	*nmap = n; -	return 0; -} - -  static int  xfs_bmapi_allocate(  	struct xfs_bmalloca	*bma)  {  	struct xfs_mount	*mp = bma->ip->i_mount; -	int			whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? -						XFS_ATTR_FORK : XFS_DATA_FORK; +	int			whichfork = xfs_bmapi_whichfork(bma->flags);  	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);  	int			tmp_logflags = 0;  	int			error; @@ -4287,15 +4331,21 @@ xfs_bmapi_allocate(  	}  	/* -	 * Indicate if this is the first user data in the file, or just any -	 * user data. And if it is userdata, indicate whether it needs to -	 * be initialised to zero during allocation. +	 * Set the data type being allocated. For the data fork, the first data +	 * in the file is treated differently to all other allocations. For the +	 * attribute fork, we only need to ensure the allocated range is not on +	 * the busy list.  	 */  	if (!(bma->flags & XFS_BMAPI_METADATA)) { -		bma->userdata = (bma->offset == 0) ? -			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; +		bma->datatype = XFS_ALLOC_NOBUSY; +		if (whichfork == XFS_DATA_FORK) { +			if (bma->offset == 0) +				bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; +			else +				bma->datatype |= XFS_ALLOC_USERDATA; +		}  		if (bma->flags & XFS_BMAPI_ZERO) -			bma->userdata |= XFS_ALLOC_USERDATA_ZERO; +			bma->datatype |= XFS_ALLOC_USERDATA_ZERO;  	}  	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? 
bma->length : 1; @@ -4350,7 +4400,7 @@ xfs_bmapi_allocate(  		bma->got.br_state = XFS_EXT_UNWRITTEN;  	if (bma->wasdel) -		error = xfs_bmap_add_extent_delay_real(bma); +		error = xfs_bmap_add_extent_delay_real(bma, whichfork);  	else  		error = xfs_bmap_add_extent_hole_real(bma, whichfork); @@ -4380,8 +4430,7 @@ xfs_bmapi_convert_unwritten(  	xfs_filblks_t		len,  	int			flags)  { -	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ? -						XFS_ATTR_FORK : XFS_DATA_FORK; +	int			whichfork = xfs_bmapi_whichfork(flags);  	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);  	int			tmp_logflags = 0;  	int			error; @@ -4397,6 +4446,8 @@ xfs_bmapi_convert_unwritten(  			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))  		return 0; +	ASSERT(whichfork != XFS_COW_FORK); +  	/*  	 * Modify (by adding) the state flag, if writing.  	 */ @@ -4503,8 +4554,7 @@ xfs_bmapi_write(  	orig_mval = mval;  	orig_nmap = *nmap;  #endif -	whichfork = (flags & XFS_BMAPI_ATTRFORK) ? -		XFS_ATTR_FORK : XFS_DATA_FORK; +	whichfork = xfs_bmapi_whichfork(flags);  	ASSERT(*nmap >= 1);  	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); @@ -4513,6 +4563,11 @@ xfs_bmapi_write(  	ASSERT(len > 0);  	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); +	ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); +	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); +	ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); +	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); +	ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);  	/* zeroing is for currently only for data extents, not metadata */  	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4565,7 +4620,7 @@ xfs_bmapi_write(  	bma.tp = tp;  	bma.ip = ip;  	bma.total = total; -	bma.userdata = 0; +	bma.datatype = 0;  	bma.dfops = dfops;  	bma.firstblock = firstblock; @@ -4574,6 +4629,14 @@ xfs_bmapi_write(  		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);  		/* +		 * Make sure we only reflink into a hole. +		 */ +		if (flags & XFS_BMAPI_REMAP) +			ASSERT(inhole); +		if (flags & XFS_BMAPI_COWFORK) +			ASSERT(!inhole); + +		/*  		 * First, deal with the hole before the allocated space  		 * that we found, if any.  		 */ @@ -4603,6 +4666,17 @@ xfs_bmapi_write(  				goto error0;  			if (bma.blkno == NULLFSBLOCK)  				break; + +			/* +			 * If this is a CoW allocation, record the data in +			 * the refcount btree for orphan recovery. +			 */ +			if (whichfork == XFS_COW_FORK) { +				error = xfs_refcount_alloc_cow_extent(mp, dfops, +						bma.blkno, bma.length); +				if (error) +					goto error0; +			}  		}  		/* Deal with the allocated space we found.  */ @@ -4768,7 +4842,8 @@ xfs_bmap_del_extent(  	xfs_btree_cur_t		*cur,	/* if null, not a btree */  	xfs_bmbt_irec_t		*del,	/* data to remove from extents */  	int			*logflagsp, /* inode logging flags */ -	int			whichfork) /* data or attr fork */ +	int			whichfork, /* data or attr fork */ +	int			bflags)	/* bmapi flags */  {  	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */  	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */ @@ -4797,6 +4872,8 @@ xfs_bmap_del_extent(  	if (whichfork == XFS_ATTR_FORK)  		state |= BMAP_ATTRFORK; +	else if (whichfork == XFS_COW_FORK) +		state |= BMAP_COWFORK;  	ifp = XFS_IFORK_PTR(ip, whichfork);  	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / @@ -4877,6 +4954,7 @@ xfs_bmap_del_extent(  		/*  		 * Matches the whole extent.  Delete the entry.  		 
*/ +		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);  		xfs_iext_remove(ip, *idx, 1,  				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);  		--*idx; @@ -5060,9 +5138,16 @@ xfs_bmap_del_extent(  	/*  	 * If we need to, add to list of extents to delete.  	 */ -	if (do_fx) -		xfs_bmap_add_free(mp, dfops, del->br_startblock, -				del->br_blockcount, NULL); +	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { +		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { +			error = xfs_refcount_decrease_extent(mp, dfops, del); +			if (error) +				goto done; +		} else +			xfs_bmap_add_free(mp, dfops, del->br_startblock, +					del->br_blockcount, NULL); +	} +  	/*  	 * Adjust inode # blocks in the file.  	 */ @@ -5071,7 +5156,7 @@ xfs_bmap_del_extent(  	/*  	 * Adjust quota data.  	 */ -	if (qfield) +	if (qfield && !(bflags & XFS_BMAPI_REMAP))  		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);  	/* @@ -5086,6 +5171,175 @@ done:  	return error;  } +/* Remove an extent from the CoW fork.  Similar to xfs_bmap_del_extent. */ +int +xfs_bunmapi_cow( +	struct xfs_inode		*ip, +	struct xfs_bmbt_irec		*del) +{ +	xfs_filblks_t			da_new; +	xfs_filblks_t			da_old; +	xfs_fsblock_t			del_endblock = 0; +	xfs_fileoff_t			del_endoff; +	int				delay; +	struct xfs_bmbt_rec_host	*ep; +	int				error; +	struct xfs_bmbt_irec		got; +	xfs_fileoff_t			got_endoff; +	struct xfs_ifork		*ifp; +	struct xfs_mount		*mp; +	xfs_filblks_t			nblks; +	struct xfs_bmbt_irec		new; +	/* REFERENCED */ +	uint				qfield; +	xfs_filblks_t			temp; +	xfs_filblks_t			temp2; +	int				state = BMAP_COWFORK; +	int				eof; +	xfs_extnum_t			eidx; + +	mp = ip->i_mount; +	XFS_STATS_INC(mp, xs_del_exlist); + +	ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof, +			&eidx, &got, &new); + +	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp; +	ASSERT((eidx >= 0) && (eidx < ifp->if_bytes / +		(uint)sizeof(xfs_bmbt_rec_t))); +	ASSERT(del->br_blockcount > 0); +	ASSERT(got.br_startoff <= del->br_startoff); +	del_endoff = del->br_startoff + del->br_blockcount; +	got_endoff = got.br_startoff + got.br_blockcount; +	ASSERT(got_endoff >= del_endoff); +	delay = isnullstartblock(got.br_startblock); +	ASSERT(isnullstartblock(del->br_startblock) == delay); +	qfield = 0; +	error = 0; +	/* +	 * If deleting a real allocation, must free up the disk space. +	 */ +	if (!delay) { +		nblks = del->br_blockcount; +		qfield = XFS_TRANS_DQ_BCOUNT; +		/* +		 * Set up del_endblock and cur for later. +		 */ +		del_endblock = del->br_startblock + del->br_blockcount; +		da_old = da_new = 0; +	} else { +		da_old = startblockval(got.br_startblock); +		da_new = 0; +		nblks = 0; +	} +	qfield = qfield; +	nblks = nblks; + +	/* +	 * Set flag value to use in switch statement. +	 * Left-contig is 2, right-contig is 1. +	 */ +	switch (((got.br_startoff == del->br_startoff) << 1) | +		(got_endoff == del_endoff)) { +	case 3: +		/* +		 * Matches the whole extent.  Delete the entry. +		 */ +		xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK); +		--eidx; +		break; + +	case 2: +		/* +		 * Deleting the first part of the extent. 
+		 */ +		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); +		xfs_bmbt_set_startoff(ep, del_endoff); +		temp = got.br_blockcount - del->br_blockcount; +		xfs_bmbt_set_blockcount(ep, temp); +		if (delay) { +			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), +				da_old); +			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); +			trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); +			da_new = temp; +			break; +		} +		xfs_bmbt_set_startblock(ep, del_endblock); +		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); +		break; + +	case 1: +		/* +		 * Deleting the last part of the extent. +		 */ +		temp = got.br_blockcount - del->br_blockcount; +		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); +		xfs_bmbt_set_blockcount(ep, temp); +		if (delay) { +			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), +				da_old); +			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); +			trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); +			da_new = temp; +			break; +		} +		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); +		break; + +	case 0: +		/* +		 * Deleting the middle of the extent. +		 */ +		temp = del->br_startoff - got.br_startoff; +		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); +		xfs_bmbt_set_blockcount(ep, temp); +		new.br_startoff = del_endoff; +		temp2 = got_endoff - del_endoff; +		new.br_blockcount = temp2; +		new.br_state = got.br_state; +		if (!delay) { +			new.br_startblock = del_endblock; +		} else { +			temp = xfs_bmap_worst_indlen(ip, temp); +			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); +			temp2 = xfs_bmap_worst_indlen(ip, temp2); +			new.br_startblock = nullstartblock((int)temp2); +			da_new = temp + temp2; +			while (da_new > da_old) { +				if (temp) { +					temp--; +					da_new--; +					xfs_bmbt_set_startblock(ep, +						nullstartblock((int)temp)); +				} +				if (da_new == da_old) +					break; +				if (temp2) { +					temp2--; +					da_new--; +					new.br_startblock = +						nullstartblock((int)temp2); +				} +			} +		} +		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); +		xfs_iext_insert(ip, eidx + 1, 1, &new, state); +		++eidx; +		break; +	} + +	/* +	 * Account for change in delayed indirect blocks. +	 * Nothing to do for disk quota accounting here. +	 */ +	ASSERT(da_old >= da_new); +	if (da_old > da_new) +		xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); + +	return error; +} +  /*   * Unmap (remove) blocks from a file.   * If nexts is nonzero then the number of extents to remove is limited to @@ -5093,17 +5347,16 @@ done:   * *done is set.   */  int						/* error */ -xfs_bunmapi( +__xfs_bunmapi(  	xfs_trans_t		*tp,		/* transaction pointer */  	struct xfs_inode	*ip,		/* incore inode */  	xfs_fileoff_t		bno,		/* starting offset to unmap */ -	xfs_filblks_t		len,		/* length to unmap in file */ +	xfs_filblks_t		*rlen,		/* i/o: amount remaining */  	int			flags,		/* misc flags */  	xfs_extnum_t		nexts,		/* number of extents max */  	xfs_fsblock_t		*firstblock,	/* first allocated block  						   controls a.g. 
for allocs */ -	struct xfs_defer_ops	*dfops,		/* i/o: list extents to free */ -	int			*done)		/* set if not done yet */ +	struct xfs_defer_ops	*dfops)		/* i/o: deferred updates */  {  	xfs_btree_cur_t		*cur;		/* bmap btree cursor */  	xfs_bmbt_irec_t		del;		/* extent being deleted */ @@ -5125,11 +5378,12 @@ xfs_bunmapi(  	int			wasdel;		/* was a delayed alloc extent */  	int			whichfork;	/* data or attribute fork */  	xfs_fsblock_t		sum; +	xfs_filblks_t		len = *rlen;	/* length to unmap in file */  	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); -	whichfork = (flags & XFS_BMAPI_ATTRFORK) ? -		XFS_ATTR_FORK : XFS_DATA_FORK; +	whichfork = xfs_bmapi_whichfork(flags); +	ASSERT(whichfork != XFS_COW_FORK);  	ifp = XFS_IFORK_PTR(ip, whichfork);  	if (unlikely(  	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5151,7 +5405,7 @@ xfs_bunmapi(  		return error;  	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);  	if (nextents == 0) { -		*done = 1; +		*rlen = 0;  		return 0;  	}  	XFS_STATS_INC(mp, xs_blk_unmap); @@ -5396,7 +5650,7 @@ xfs_bunmapi(  			cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;  		error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, -				&tmp_logflags, whichfork); +				&tmp_logflags, whichfork, flags);  		logflags |= tmp_logflags;  		if (error)  			goto error0; @@ -5422,7 +5676,10 @@ nodelete:  			extno++;  		}  	} -	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; +	if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0) +		*rlen = 0; +	else +		*rlen = bno - start + 1;  	/*  	 * Convert to a btree if necessary. @@ -5478,6 +5735,27 @@ error0:  	return error;  } +/* Unmap a range of a file. */ +int +xfs_bunmapi( +	xfs_trans_t		*tp, +	struct xfs_inode	*ip, +	xfs_fileoff_t		bno, +	xfs_filblks_t		len, +	int			flags, +	xfs_extnum_t		nexts, +	xfs_fsblock_t		*firstblock, +	struct xfs_defer_ops	*dfops, +	int			*done) +{ +	int			error; + +	error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock, +			dfops); +	*done = (len == 0); +	return error; +} +  /*   * Determine whether an extent shift can be accomplished by a merge with the   * extent that precedes the target hole of the shift. @@ -6057,3 +6335,146 @@ out:  	xfs_trans_cancel(tp);  	return error;  } + +/* Deferred mapping is only for real extents in the data fork. */ +static bool +xfs_bmap_is_update_needed( +	struct xfs_bmbt_irec	*bmap) +{ +	return  bmap->br_startblock != HOLESTARTBLOCK && +		bmap->br_startblock != DELAYSTARTBLOCK; +} + +/* Record a bmap intent. */ +static int +__xfs_bmap_add( +	struct xfs_mount		*mp, +	struct xfs_defer_ops		*dfops, +	enum xfs_bmap_intent_type	type, +	struct xfs_inode		*ip, +	int				whichfork, +	struct xfs_bmbt_irec		*bmap) +{ +	int				error; +	struct xfs_bmap_intent		*bi; + +	trace_xfs_bmap_defer(mp, +			XFS_FSB_TO_AGNO(mp, bmap->br_startblock), +			type, +			XFS_FSB_TO_AGBNO(mp, bmap->br_startblock), +			ip->i_ino, whichfork, +			bmap->br_startoff, +			bmap->br_blockcount, +			bmap->br_state); + +	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); +	INIT_LIST_HEAD(&bi->bi_list); +	bi->bi_type = type; +	bi->bi_owner = ip; +	bi->bi_whichfork = whichfork; +	bi->bi_bmap = *bmap; + +	error = xfs_defer_join(dfops, bi->bi_owner); +	if (error) { +		kmem_free(bi); +		return error; +	} + +	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); +	return 0; +} + +/* Map an extent into a file. 
*/ +int +xfs_bmap_map_extent( +	struct xfs_mount	*mp, +	struct xfs_defer_ops	*dfops, +	struct xfs_inode	*ip, +	struct xfs_bmbt_irec	*PREV) +{ +	if (!xfs_bmap_is_update_needed(PREV)) +		return 0; + +	return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip, +			XFS_DATA_FORK, PREV); +} + +/* Unmap an extent out of a file. */ +int +xfs_bmap_unmap_extent( +	struct xfs_mount	*mp, +	struct xfs_defer_ops	*dfops, +	struct xfs_inode	*ip, +	struct xfs_bmbt_irec	*PREV) +{ +	if (!xfs_bmap_is_update_needed(PREV)) +		return 0; + +	return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip, +			XFS_DATA_FORK, PREV); +} + +/* + * Process one of the deferred bmap operations.  We pass back the + * btree cursor to maintain our lock on the bmapbt between calls. + */ +int +xfs_bmap_finish_one( +	struct xfs_trans		*tp, +	struct xfs_defer_ops		*dfops, +	struct xfs_inode		*ip, +	enum xfs_bmap_intent_type	type, +	int				whichfork, +	xfs_fileoff_t			startoff, +	xfs_fsblock_t			startblock, +	xfs_filblks_t			blockcount, +	xfs_exntst_t			state) +{ +	struct xfs_bmbt_irec		bmap; +	int				nimaps = 1; +	xfs_fsblock_t			firstfsb; +	int				flags = XFS_BMAPI_REMAP; +	int				done; +	int				error = 0; + +	bmap.br_startblock = startblock; +	bmap.br_startoff = startoff; +	bmap.br_blockcount = blockcount; +	bmap.br_state = state; + +	trace_xfs_bmap_deferred(tp->t_mountp, +			XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, +			XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), +			ip->i_ino, whichfork, startoff, blockcount, state); + +	if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) +		return -EFSCORRUPTED; +	if (whichfork == XFS_ATTR_FORK) +		flags |= XFS_BMAPI_ATTRFORK; + +	if (XFS_TEST_ERROR(false, tp->t_mountp, +			XFS_ERRTAG_BMAP_FINISH_ONE, +			XFS_RANDOM_BMAP_FINISH_ONE)) +		return -EIO; + +	switch (type) { +	case XFS_BMAP_MAP: +		firstfsb = bmap.br_startblock; +		error = xfs_bmapi_write(tp, ip, bmap.br_startoff, +					bmap.br_blockcount, flags, &firstfsb, +					bmap.br_blockcount, &bmap, &nimaps, +					dfops); +		break; +	case XFS_BMAP_UNMAP: +		error = xfs_bunmapi(tp, ip, bmap.br_startoff, +				bmap.br_blockcount, flags, 1, &firstfsb, +				dfops, &done); +		ASSERT(done); +		break; +	default: +		ASSERT(0); +		error = -EFSCORRUPTED; +	} + +	return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 254034f96941..f97db7132564 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -54,7 +54,7 @@ struct xfs_bmalloca {  	bool			wasdel;	/* replacing a delayed allocation */  	bool			aeof;	/* allocated space at eof */  	bool			conv;	/* overwriting unwritten extents */ -	char			userdata;/* userdata mask */ +	int			datatype;/* data type being allocated */  	int			flags;  }; @@ -97,6 +97,19 @@ struct xfs_extent_free_item   */  #define XFS_BMAPI_ZERO		0x080 +/* + * Map the inode offset to the block given in ap->firstblock.  Primarily + * used for reflink.  The range must be in a hole, and this flag cannot be + * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork. + * + * For bunmapi, this flag unmaps the range without adjusting quota, reducing + * refcount, or freeing the blocks. + */ +#define XFS_BMAPI_REMAP		0x100 + +/* Map something in the CoW fork. 
*/ +#define XFS_BMAPI_COWFORK	0x200 +  #define XFS_BMAPI_FLAGS \  	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \  	{ XFS_BMAPI_METADATA,	"METADATA" }, \ @@ -105,12 +118,24 @@ struct xfs_extent_free_item  	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \  	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \  	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \ -	{ XFS_BMAPI_ZERO,	"ZERO" } +	{ XFS_BMAPI_ZERO,	"ZERO" }, \ +	{ XFS_BMAPI_REMAP,	"REMAP" }, \ +	{ XFS_BMAPI_COWFORK,	"COWFORK" }  static inline int xfs_bmapi_aflag(int w)  { -	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); +	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : +	       (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); +} + +static inline int xfs_bmapi_whichfork(int bmapi_flags) +{ +	if (bmapi_flags & XFS_BMAPI_COWFORK) +		return XFS_COW_FORK; +	else if (bmapi_flags & XFS_BMAPI_ATTRFORK) +		return XFS_ATTR_FORK; +	return XFS_DATA_FORK;  }  /* @@ -131,13 +156,15 @@ static inline int xfs_bmapi_aflag(int w)  #define BMAP_LEFT_VALID		(1 << 6)  #define BMAP_RIGHT_VALID	(1 << 7)  #define BMAP_ATTRFORK		(1 << 8) +#define BMAP_COWFORK		(1 << 9)  #define XFS_BMAP_EXT_FLAGS \  	{ BMAP_LEFT_CONTIG,	"LC" }, \  	{ BMAP_RIGHT_CONTIG,	"RC" }, \  	{ BMAP_LEFT_FILLING,	"LF" }, \  	{ BMAP_RIGHT_FILLING,	"RF" }, \ -	{ BMAP_ATTRFORK,	"ATTR" } +	{ BMAP_ATTRFORK,	"ATTR" }, \ +	{ BMAP_COWFORK,		"COW" }  /* @@ -181,18 +208,20 @@ int	xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,  int	xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,  		xfs_filblks_t len, struct xfs_bmbt_irec *mval,  		int *nmap, int flags); -int	xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, -		xfs_filblks_t len, struct xfs_bmbt_irec *mval, -		int *nmap, int flags);  int	xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,  		xfs_fileoff_t bno, xfs_filblks_t len, int flags,  		xfs_fsblock_t *firstblock, xfs_extlen_t total,  		struct xfs_bmbt_irec *mval, int *nmap,  		struct xfs_defer_ops *dfops); +int	__xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, +		xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, +		xfs_extnum_t nexts, xfs_fsblock_t *firstblock, +		struct xfs_defer_ops *dfops);  int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,  		xfs_fileoff_t bno, xfs_filblks_t len, int flags,  		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,  		struct xfs_defer_ops *dfops, int *done); +int	xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del);  int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,  		xfs_extnum_t num);  uint	xfs_default_attroffset(struct xfs_inode *ip); @@ -202,5 +231,35 @@ int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,  		struct xfs_defer_ops *dfops, enum shift_direction direction,  		int num_exts);  int	xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +struct xfs_bmbt_rec_host * +	xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, +		int fork, int *eofp, xfs_extnum_t *lastxp, +		struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); +int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, +		xfs_fileoff_t aoff, xfs_filblks_t len, +		struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, +		xfs_extnum_t *lastx, int eof); + +enum xfs_bmap_intent_type { +	XFS_BMAP_MAP = 1, +	XFS_BMAP_UNMAP, +}; + +struct xfs_bmap_intent { +	struct list_head			bi_list; +	enum xfs_bmap_intent_type		bi_type; +	struct xfs_inode			*bi_owner; +	int					bi_whichfork; +	struct xfs_bmbt_irec			bi_bmap; +}; + +int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, +		
struct xfs_inode *ip, enum xfs_bmap_intent_type type, +		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, +		xfs_filblks_t blockcount, xfs_exntst_t state); +int	xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +		struct xfs_inode *ip, struct xfs_bmbt_irec *imap); +int	xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +		struct xfs_inode *ip, struct xfs_bmbt_irec *imap);  #endif	/* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cd85274e810c..8007d2ba9aef 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -453,6 +453,7 @@ xfs_bmbt_alloc_block(  	if (args.fsbno == NULLFSBLOCK) {  		args.fsbno = be64_to_cpu(start->l); +try_another_ag:  		args.type = XFS_ALLOCTYPE_START_BNO;  		/*  		 * Make sure there is sufficient room left in the AG to @@ -482,6 +483,22 @@ xfs_bmbt_alloc_block(  	if (error)  		goto error0; +	/* +	 * During a CoW operation, the allocation and bmbt updates occur in +	 * different transactions.  The mapping code tries to put new bmbt +	 * blocks near extents being mapped, but the only way to guarantee this +	 * is if the alloc and the mapping happen in a single transaction that +	 * has a block reservation.  That isn't the case here, so if we run out +	 * of space we'll try again with another AG. +	 */ +	if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && +	    args.fsbno == NULLFSBLOCK && +	    args.type == XFS_ALLOCTYPE_NEAR_BNO) { +		cur->bc_private.b.dfops->dop_low = true; +		args.fsbno = cur->bc_private.b.firstblock; +		goto try_another_ag; +	} +  	if (args.fsbno == NULLFSBLOCK && args.minleft) {  		/*  		 * Could not find an AG with enough free space to satisfy @@ -777,6 +794,7 @@ xfs_bmbt_init_cursor(  {  	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);  	struct xfs_btree_cur	*cur; +	ASSERT(whichfork != XFS_COW_FORK);  	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 08569792fe20..5c8e6f2ce44f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -45,9 +45,10 @@ kmem_zone_t	*xfs_btree_cur_zone;   */  static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {  	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, -	  XFS_FIBT_MAGIC }, +	  XFS_FIBT_MAGIC, 0 },  	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, -	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } +	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, +	  XFS_REFC_CRC_MAGIC }  };  #define xfs_btree_magic(cur) \  	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] @@ -1216,6 +1217,9 @@ xfs_btree_set_refs(  	case XFS_BTNUM_RMAP:  		xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);  		break; +	case XFS_BTNUM_REFC: +		xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); +		break;  	default:  		ASSERT(0);  	} @@ -2070,7 +2074,7 @@ __xfs_btree_updkeys(  	struct xfs_buf		*bp0,  	bool			force_all)  { -	union xfs_btree_bigkey	key;	/* keys from current level */ +	union xfs_btree_key	key;	/* keys from current level */  	union xfs_btree_key	*lkey;	/* keys from the next level up */  	union xfs_btree_key	*hkey;  	union xfs_btree_key	*nlkey;	/* keys from the next level up */ @@ -2086,7 +2090,7 @@ __xfs_btree_updkeys(  	trace_xfs_btree_updkeys(cur, level, bp0); -	lkey = (union xfs_btree_key *)&key; +	lkey = &key;  	hkey = xfs_btree_high_key_from_key(cur, lkey);  	xfs_btree_get_keys(cur, block, lkey);  	for (level++; level < 
cur->bc_nlevels; level++) { @@ -3226,7 +3230,7 @@ xfs_btree_insrec(  	struct xfs_buf		*bp;	/* buffer for block */  	union xfs_btree_ptr	nptr;	/* new block ptr */  	struct xfs_btree_cur	*ncur;	/* new btree cursor */ -	union xfs_btree_bigkey	nkey;	/* new block key */ +	union xfs_btree_key	nkey;	/* new block key */  	union xfs_btree_key	*lkey;  	int			optr;	/* old key/record index */  	int			ptr;	/* key/record index */ @@ -3241,7 +3245,7 @@ xfs_btree_insrec(  	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);  	ncur = NULL; -	lkey = (union xfs_btree_key *)&nkey; +	lkey = &nkey;  	/*  	 * If we have an external root pointer, and we've made it to the @@ -3444,14 +3448,14 @@ xfs_btree_insert(  	union xfs_btree_ptr	nptr;	/* new block number (split result) */  	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */  	struct xfs_btree_cur	*pcur;	/* previous level's cursor */ -	union xfs_btree_bigkey	bkey;	/* key of block to insert */ +	union xfs_btree_key	bkey;	/* key of block to insert */  	union xfs_btree_key	*key;  	union xfs_btree_rec	rec;	/* record to insert */  	level = 0;  	ncur = NULL;  	pcur = cur; -	key = (union xfs_btree_key *)&bkey; +	key = &bkey;  	xfs_btree_set_ptr_null(cur, &nptr); @@ -4797,3 +4801,50 @@ xfs_btree_query_range(  	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,  			fn, priv);  } + +/* + * Calculate the number of blocks needed to store a given number of records + * in a short-format (per-AG metadata) btree. + */ +xfs_extlen_t +xfs_btree_calc_size( +	struct xfs_mount	*mp, +	uint			*limits, +	unsigned long long	len) +{ +	int			level; +	int			maxrecs; +	xfs_extlen_t		rval; + +	maxrecs = limits[0]; +	for (level = 0, rval = 0; len > 1; level++) { +		len += maxrecs - 1; +		do_div(len, maxrecs); +		maxrecs = limits[1]; +		rval += len; +	} +	return rval; +} + +int +xfs_btree_count_blocks_helper( +	struct xfs_btree_cur	*cur, +	int			level, +	void			*data) +{ +	xfs_extlen_t		*blocks = data; +	(*blocks)++; + +	return 0; +} + +/* Count the blocks in a btree and return the result in *blocks. */ +int +xfs_btree_count_blocks( +	struct xfs_btree_cur	*cur, +	xfs_extlen_t		*blocks) +{ +	*blocks = 0; +	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, +			blocks); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 04d0865e5e6d..c2b01d1c79ee 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -37,30 +37,19 @@ union xfs_btree_ptr {  	__be64			l;	/* long form ptr */  }; -union xfs_btree_key { -	struct xfs_bmbt_key		bmbt; -	xfs_bmdr_key_t			bmbr;	/* bmbt root block */ -	xfs_alloc_key_t			alloc; -	struct xfs_inobt_key		inobt; -	struct xfs_rmap_key		rmap; -}; -  /* - * In-core key that holds both low and high keys for overlapped btrees. - * The two keys are packed next to each other on disk, so do the same - * in memory.  Preserve the existing xfs_btree_key as a single key to - * avoid the mental model breakage that would happen if we passed a - * bigkey into a function that operates on a single key. + * The in-core btree key.  Overlapping btrees actually store two keys + * per pointer, so we reserve enough memory to hold both.  The __*bigkey + * items should never be accessed directly.   
*/ -union xfs_btree_bigkey { +union xfs_btree_key {  	struct xfs_bmbt_key		bmbt;  	xfs_bmdr_key_t			bmbr;	/* bmbt root block */  	xfs_alloc_key_t			alloc;  	struct xfs_inobt_key		inobt; -	struct { -		struct xfs_rmap_key	rmap; -		struct xfs_rmap_key	rmap_hi; -	}; +	struct xfs_rmap_key		rmap; +	struct xfs_rmap_key		__rmap_bigkey[2]; +	struct xfs_refcount_key		refc;  };  union xfs_btree_rec { @@ -69,6 +58,7 @@ union xfs_btree_rec {  	struct xfs_alloc_rec		alloc;  	struct xfs_inobt_rec		inobt;  	struct xfs_rmap_rec		rmap; +	struct xfs_refcount_rec		refc;  };  /* @@ -84,6 +74,7 @@ union xfs_btree_rec {  #define	XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)  #define	XFS_BTNUM_FINO	((xfs_btnum_t)XFS_BTNUM_FINOi)  #define	XFS_BTNUM_RMAP	((xfs_btnum_t)XFS_BTNUM_RMAPi) +#define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)  /*   * For logging record fields. @@ -117,6 +108,7 @@ do {    \  	case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \  	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \  	case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ +	case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \  	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;	\  	}       \  } while (0) @@ -139,6 +131,8 @@ do {    \  		__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \  	case XFS_BTNUM_RMAP:	\  		__XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ +	case XFS_BTNUM_REFC:	\ +		__XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \  	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \  	}       \  } while (0) @@ -229,6 +223,15 @@ union xfs_btree_irec {  	struct xfs_bmbt_irec		b;  	struct xfs_inobt_rec_incore	i;  	struct xfs_rmap_irec		r; +	struct xfs_refcount_irec	rc; +}; + +/* Per-AG btree private information. */ +union xfs_btree_cur_private { +	struct { +		unsigned long	nr_ops;		/* # record updates */ +		int		shape_changes;	/* # of extent splits */ +	} refc;  };  /* @@ -255,6 +258,7 @@ typedef struct xfs_btree_cur  			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */  			struct xfs_defer_ops *dfops;	/* deferred updates */  			xfs_agnumber_t	agno;	/* ag number */ +			union xfs_btree_cur_private	priv;  		} a;  		struct {			/* needed for BMAP */  			struct xfs_inode *ip;	/* pointer to our inode */ @@ -513,6 +517,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);  bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);  uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,  				 unsigned long len); +xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, +		unsigned long long len);  /* return codes */  #define XFS_BTREE_QUERY_RANGE_CONTINUE	0	/* keep iterating */ @@ -529,4 +535,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,  int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,  		xfs_btree_visit_blocks_fn fn, void *data); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +  #endif	/* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c221d0ecd52e..613c5cf19436 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -81,6 +81,10 @@   *   - For each work item attached to the log intent item,   *     * Perform the described action.   *     * Attach the work item to the log done item. + *     * If the result of doing the work was -EAGAIN, ->finish work + *       wants a new transaction.  
See the "Requesting a Fresh + *       Transaction while Finishing Deferred Work" section below for + *       details.   *   * The key here is that we must log an intent item for all pending   * work items every time we roll the transaction, and that we must log @@ -88,6 +92,34 @@   * we can perform complex remapping operations, chaining intent items   * as needed.   * + * Requesting a Fresh Transaction while Finishing Deferred Work + * + * If ->finish_item decides that it needs a fresh transaction to + * finish the work, it must ask its caller (xfs_defer_finish) for a + * continuation.  The most likely cause of this circumstance are the + * refcount adjust functions deciding that they've logged enough items + * to be at risk of exceeding the transaction reservation. + * + * To get a fresh transaction, we want to log the existing log done + * item to prevent the log intent item from replaying, immediately log + * a new log intent item with the unfinished work items, roll the + * transaction, and re-call ->finish_item wherever it left off.  The + * log done item and the new log intent item must be in the same + * transaction or atomicity cannot be guaranteed; defer_finish ensures + * that this happens. + * + * This requires some coordination between ->finish_item and + * defer_finish.  Upon deciding to request a new transaction, + * ->finish_item should update the current work item to reflect the + * unfinished work.  Next, it should reset the log done item's list + * count to the number of items finished, and return -EAGAIN. + * defer_finish sees the -EAGAIN, logs the new log intent item + * with the remaining work items, and leaves the xfs_defer_pending + * item at the head of the dop_work queue.  Then it rolls the + * transaction and picks up processing where it left off.  It is + * required that ->finish_item must be careful to leave enough + * transaction reservation to fit the new log intent item. 
+ *   * This is an example of remapping the extent (E, E+B) into file X at   * offset A and dealing with the extent (C, C+B) already being mapped   * there: @@ -104,21 +136,26 @@   * | Intent to add rmap (X, E, A, B)                 |   * +-------------------------------------------------+   * | Reduce refcount for extent (C, B)               | t2 - * | Done reducing refcount for extent (C, B)        | + * | Done reducing refcount for extent (C, 9)        | + * | Intent to reduce refcount for extent (C+9, B-9) | + * | (ran out of space after 9 refcount updates)     | + * +-------------------------------------------------+ + * | Reduce refcount for extent (C+9, B+9)           | t3 + * | Done reducing refcount for extent (C+9, B-9)    |   * | Increase refcount for extent (E, B)             |   * | Done increasing refcount for extent (E, B)      |   * | Intent to free extent (C, B)                    |   * | Intent to free extent (F, 1) (refcountbt block) |   * | Intent to remove rmap (F, 1, REFC)              |   * +-------------------------------------------------+ - * | Remove rmap (X, C, A, B)                        | t3 + * | Remove rmap (X, C, A, B)                        | t4   * | Done removing rmap (X, C, A, B)                 |   * | Add rmap (X, E, A, B)                           |   * | Done adding rmap (X, E, A, B)                   |   * | Remove rmap (F, 1, REFC)                        |   * | Done removing rmap (F, 1, REFC)                 |   * +-------------------------------------------------+ - * | Free extent (C, B)                              | t4 + * | Free extent (C, B)                              | t5   * | Done freeing extent (C, B)                      |   * | Free extent (D, 1)                              |   * | Done freeing extent (D, 1)                      | @@ -141,6 +178,9 @@   * - Intent to free extent (C, B)   * - Intent to free extent (F, 1) (refcountbt block)   * - Intent to remove rmap (F, 1, REFC) + * + * Note that the continuation requested between t2 and t3 is likely to + * reoccur.   */  static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; @@ -323,7 +363,16 @@ xfs_defer_finish(  			dfp->dfp_count--;  			error = dfp->dfp_type->finish_item(*tp, dop, li,  					dfp->dfp_done, &state); -			if (error) { +			if (error == -EAGAIN) { +				/* +				 * Caller wants a fresh transaction; +				 * put the work item back on the list +				 * and jump out. +				 */ +				list_add(li, &dfp->dfp_work); +				dfp->dfp_count++; +				break; +			} else if (error) {  				/*  				 * Clean up after ourselves and jump out.  				 * xfs_defer_cancel will take care of freeing @@ -335,9 +384,25 @@ xfs_defer_finish(  				goto out;  			}  		} -		/* Done with the dfp, free it. */ -		list_del(&dfp->dfp_list); -		kmem_free(dfp); +		if (error == -EAGAIN) { +			/* +			 * Caller wants a fresh transaction, so log a +			 * new log intent item to replace the old one +			 * and roll the transaction.  See "Requesting +			 * a Fresh Transaction while Finishing +			 * Deferred Work" above. +			 */ +			dfp->dfp_intent = dfp->dfp_type->create_intent(*tp, +					dfp->dfp_count); +			dfp->dfp_done = NULL; +			list_for_each(li, &dfp->dfp_work) +				dfp->dfp_type->log_item(*tp, dfp->dfp_intent, +						li); +		} else { +			/* Done with the dfp, free it. 
*/ +			list_del(&dfp->dfp_list); +			kmem_free(dfp); +		}  		if (cleanup_fn)  			cleanup_fn(*tp, state, error); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index e96533d178cf..f6e93ef0bffe 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -51,6 +51,8 @@ struct xfs_defer_pending {   * find all the space it needs.   */  enum xfs_defer_ops_type { +	XFS_DEFER_OPS_TYPE_BMAP, +	XFS_DEFER_OPS_TYPE_REFCOUNT,  	XFS_DEFER_OPS_TYPE_RMAP,  	XFS_DEFER_OPS_TYPE_FREE,  	XFS_DEFER_OPS_TYPE_MAX, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 270fb5cf4fa1..f6547fc5e016 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -456,9 +456,11 @@ xfs_sb_has_compat_feature(  #define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)		/* free inode btree */  #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */ +#define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */  #define XFS_SB_FEAT_RO_COMPAT_ALL \  		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \ -		 XFS_SB_FEAT_RO_COMPAT_RMAPBT) +		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ +		 XFS_SB_FEAT_RO_COMPAT_REFLINK)  #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL  static inline bool  xfs_sb_has_ro_compat_feature( @@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)  		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);  } +static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) +{ +	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && +		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); +} +  /*   * end of superblock version macros   */ @@ -641,14 +649,17 @@ typedef struct xfs_agf {  	uuid_t		agf_uuid;	/* uuid of filesystem */  	__be32		agf_rmap_blocks;	/* rmapbt blocks used */ -	__be32		agf_padding;		/* padding */ +	__be32		agf_refcount_blocks;	/* refcountbt blocks used */ + +	__be32		agf_refcount_root;	/* refcount tree root block */ +	__be32		agf_refcount_level;	/* refcount btree levels */  	/*  	 * reserve some contiguous space for future logged fields before we add  	 * the unlogged fields. This makes the range logging via flags and  	 * structure offsets much simpler.  	 */ -	__be64		agf_spare64[15]; +	__be64		agf_spare64[14];  	/* unlogged fields, written during buffer writeback. 
*/  	__be64		agf_lsn;	/* last write sequence */ @@ -674,8 +685,11 @@ typedef struct xfs_agf {  #define	XFS_AGF_BTREEBLKS	0x00000800  #define	XFS_AGF_UUID		0x00001000  #define	XFS_AGF_RMAP_BLOCKS	0x00002000 -#define	XFS_AGF_SPARE64		0x00004000 -#define	XFS_AGF_NUM_BITS	15 +#define	XFS_AGF_REFCOUNT_BLOCKS	0x00004000 +#define	XFS_AGF_REFCOUNT_ROOT	0x00008000 +#define	XFS_AGF_REFCOUNT_LEVEL	0x00010000 +#define	XFS_AGF_SPARE64		0x00020000 +#define	XFS_AGF_NUM_BITS	18  #define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)  #define XFS_AGF_FLAGS \ @@ -693,6 +707,9 @@ typedef struct xfs_agf {  	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }, \  	{ XFS_AGF_UUID,		"UUID" }, \  	{ XFS_AGF_RMAP_BLOCKS,	"RMAP_BLOCKS" }, \ +	{ XFS_AGF_REFCOUNT_BLOCKS,	"REFCOUNT_BLOCKS" }, \ +	{ XFS_AGF_REFCOUNT_ROOT,	"REFCOUNT_ROOT" }, \ +	{ XFS_AGF_REFCOUNT_LEVEL,	"REFCOUNT_LEVEL" }, \  	{ XFS_AGF_SPARE64,	"SPARE64" }  /* disk block (xfs_daddr_t) in the AG */ @@ -885,7 +902,8 @@ typedef struct xfs_dinode {  	__be64		di_changecount;	/* number of attribute changes */  	__be64		di_lsn;		/* flush sequence */  	__be64		di_flags2;	/* more random flags */ -	__u8		di_pad2[16];	/* more padding for future expansion */ +	__be32		di_cowextsize;	/* basic cow extent size for file */ +	__u8		di_pad2[12];	/* more padding for future expansion */  	/* fields only written to during inode creation */  	xfs_timestamp_t	di_crtime;	/* time created */ @@ -1041,9 +1059,14 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)   * 16 bits of the XFS_XFLAG_s range.   */  #define XFS_DIFLAG2_DAX_BIT	0	/* use DAX for this inode */ +#define XFS_DIFLAG2_REFLINK_BIT	1	/* file's blocks may be shared */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */  #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_ANY		(XFS_DIFLAG2_DAX) +#define XFS_DIFLAG2_ANY \ +	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)  /*   * Inode number format: @@ -1353,7 +1376,9 @@ struct xfs_owner_info {  #define XFS_RMAP_OWN_AG		(-5ULL)	/* AG freespace btree blocks */  #define XFS_RMAP_OWN_INOBT	(-6ULL)	/* Inode btree blocks */  #define XFS_RMAP_OWN_INODES	(-7ULL)	/* Inode chunk */ -#define XFS_RMAP_OWN_MIN	(-8ULL) /* guard */ +#define XFS_RMAP_OWN_REFC	(-8ULL) /* refcount tree */ +#define XFS_RMAP_OWN_COW	(-9ULL) /* cow allocations */ +#define XFS_RMAP_OWN_MIN	(-10ULL) /* guard */  #define XFS_RMAP_NON_INODE_OWNER(owner)	(!!((owner) & (1ULL << 63))) @@ -1434,6 +1459,62 @@ typedef __be32 xfs_rmap_ptr_t;  	 XFS_IBT_BLOCK(mp) + 1)  /* + * Reference Count Btree format definitions + * + */ +#define	XFS_REFC_CRC_MAGIC	0x52334643	/* 'R3FC' */ + +unsigned int xfs_refc_block(struct xfs_mount *mp); + +/* + * Data record/key structure + * + * Each record associates a range of physical blocks (starting at + * rc_startblock and ending rc_blockcount blocks later) with a reference + * count (rc_refcount).  Extents that are being used to stage a copy on + * write (CoW) operation are recorded in the refcount btree with a + * refcount of 1.  All other records must have a refcount > 1 and must + * track an extent mapped only by file data forks. + * + * Extents with a single owner (attributes, metadata, non-shared file + * data) are not tracked here.  Free space is also not tracked here. + * This is consistent with pre-reflink XFS. 
+ */ + +/* + * Extents that are being used to stage a copy on write are stored + * in the refcount btree with a refcount of 1 and the upper bit set + * on the startblock.  This speeds up mount time deletion of stale + * staging extents because they're all at the right side of the tree. + */ +#define XFS_REFC_COW_START		((xfs_agblock_t)(1U << 31)) +#define REFCNTBT_COWFLAG_BITLEN		1 +#define REFCNTBT_AGBLOCK_BITLEN		31 + +struct xfs_refcount_rec { +	__be32		rc_startblock;	/* starting block number */ +	__be32		rc_blockcount;	/* count of blocks */ +	__be32		rc_refcount;	/* number of inodes linked here */ +}; + +struct xfs_refcount_key { +	__be32		rc_startblock;	/* starting block number */ +}; + +struct xfs_refcount_irec { +	xfs_agblock_t	rc_startblock;	/* starting block number */ +	xfs_extlen_t	rc_blockcount;	/* count of free blocks */ +	xfs_nlink_t	rc_refcount;	/* number of inodes linked here */ +}; + +#define MAXREFCOUNT	((xfs_nlink_t)~0U) +#define MAXREFCEXTLEN	((xfs_extlen_t)~0U) + +/* btree pointer type */ +typedef __be32 xfs_refcount_ptr_t; + + +/*   * BMAP Btree format definitions   *   * This includes both the root block definition that sits inside an inode fork diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 79455058b752..b72dc821d78b 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -81,14 +81,16 @@ struct getbmapx {  #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */  #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */  #define BMV_IF_NO_HOLES		0x10	/* Do not return holes */ +#define BMV_IF_COWFORK		0x20	/* return CoW fork rather than data */  #define BMV_IF_VALID	\  	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\ -	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES) +	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK)  /*	bmv_oflags values - returned for each non-header segment */  #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */  #define BMV_OF_DELALLOC		0x2	/* segment = delayed allocation */  #define BMV_OF_LAST		0x4	/* segment is the last in the file */ +#define BMV_OF_SHARED		0x8	/* segment shared with another file */  /*   * Structure for XFS_IOC_FSSETDM. @@ -206,7 +208,8 @@ typedef struct xfs_fsop_resblks {  #define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */  #define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* free inode btree */  #define XFS_FSOP_GEOM_FLAGS_SPINODES	0x40000	/* sparse inode chunks	*/ -#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* Reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_REFLINK	0x100000 /* files can share blocks */  /*   * Minimum and maximum sizes need for growth checks. 
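The staging-extent encoding described above (a refcount of 1 plus the high bit set on the startblock) amounts to a simple masking scheme around XFS_REFC_COW_START. The two helpers below are hypothetical and exist only to spell that out; the patch itself open-codes the equivalent masking where refcount records are packed and unpacked.

/* Illustrative helpers only -- not part of the patch. */
static inline xfs_agblock_t
xfs_refc_encode_cow_start(
	xfs_agblock_t		agbno)
{
	/* CoW staging extents sort into the high half of the keyspace. */
	return agbno | XFS_REFC_COW_START;
}

static inline xfs_agblock_t
xfs_refc_decode_start(
	xfs_agblock_t		rc_startblock,
	bool			*is_cow_staging)
{
	*is_cow_staging = (rc_startblock & XFS_REFC_COW_START) != 0;
	return rc_startblock & ~XFS_REFC_COW_START;
}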
@@ -275,7 +278,8 @@ typedef struct xfs_bstat {  #define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/  	__u16		bs_forkoff;	/* inode fork offset in bytes	*/  	__u16		bs_projid_hi;	/* higher part of project id	*/ -	unsigned char	bs_pad[10];	/* pad space, unused		*/ +	unsigned char	bs_pad[6];	/* pad space, unused		*/ +	__u32		bs_cowextsize;	/* cow extent size		*/  	__u32		bs_dmevmask;	/* DMIG event mask		*/  	__u16		bs_dmstate;	/* DMIG state info		*/  	__u16		bs_aextents;	/* attribute number of extents	*/ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 31ca2208c03d..eab68ae2e011 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -132,7 +132,7 @@ xfs_inobt_free_block(  	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);  	return xfs_free_extent(cur->bc_tp,  			XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, -			&oinfo); +			&oinfo, XFS_AG_RESV_NONE);  }  STATIC int diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4b9769e23c83..8de9a3a29589 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -256,6 +256,7 @@ xfs_inode_from_disk(  		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);  		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);  		to->di_flags2 = be64_to_cpu(from->di_flags2); +		to->di_cowextsize = be32_to_cpu(from->di_cowextsize);  	}  } @@ -305,7 +306,7 @@ xfs_inode_to_disk(  		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);  		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);  		to->di_flags2 = cpu_to_be64(from->di_flags2); - +		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);  		to->di_ino = cpu_to_be64(ip->i_ino);  		to->di_lsn = cpu_to_be64(lsn);  		memset(to->di_pad2, 0, sizeof(to->di_pad2)); @@ -357,6 +358,7 @@ xfs_log_dinode_to_disk(  		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);  		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);  		to->di_flags2 = cpu_to_be64(from->di_flags2); +		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);  		to->di_ino = cpu_to_be64(from->di_ino);  		to->di_lsn = cpu_to_be64(from->di_lsn);  		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); @@ -373,6 +375,9 @@ xfs_dinode_verify(  	struct xfs_inode	*ip,  	struct xfs_dinode	*dip)  { +	uint16_t		flags; +	uint64_t		flags2; +  	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))  		return false; @@ -389,6 +394,23 @@ xfs_dinode_verify(  		return false;  	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))  		return false; + +	flags = be16_to_cpu(dip->di_flags); +	flags2 = be64_to_cpu(dip->di_flags2); + +	/* don't allow reflink/cowextsize if we don't have reflink */ +	if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && +            !xfs_sb_version_hasreflink(&mp->m_sb)) +		return false; + +	/* don't let reflink and realtime mix */ +	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) +		return false; + +	/* don't let reflink and dax mix */ +	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) +		return false; +  	return true;  } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 7c4dd321b215..62d9d4681c8c 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -47,6 +47,7 @@ struct xfs_icdinode {  	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... 
*/  	__uint64_t	di_flags2;	/* more random flags */ +	__uint32_t	di_cowextsize;	/* basic cow extent size for file */  	xfs_ictimestamp_t di_crtime;	/* time created */  }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bbcc8c7a44b3..5dd56d3dbb3a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -121,6 +121,26 @@ xfs_iformat_fork(  		return -EFSCORRUPTED;  	} +	if (unlikely(xfs_is_reflink_inode(ip) && +	    (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) { +		xfs_warn(ip->i_mount, +			"corrupt dinode %llu, wrong file type for reflink.", +			ip->i_ino); +		XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", +				     XFS_ERRLEVEL_LOW, ip->i_mount, dip); +		return -EFSCORRUPTED; +	} + +	if (unlikely(xfs_is_reflink_inode(ip) && +	    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) { +		xfs_warn(ip->i_mount, +			"corrupt dinode %llu, has reflink+realtime flag set.", +			ip->i_ino); +		XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", +				     XFS_ERRLEVEL_LOW, ip->i_mount, dip); +		return -EFSCORRUPTED; +	} +  	switch (VFS_I(ip)->i_mode & S_IFMT) {  	case S_IFIFO:  	case S_IFCHR: @@ -186,9 +206,14 @@ xfs_iformat_fork(  		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);  		return -EFSCORRUPTED;  	} -	if (error) { +	if (error)  		return error; + +	if (xfs_is_reflink_inode(ip)) { +		ASSERT(ip->i_cowfp == NULL); +		xfs_ifork_init_cow(ip);  	} +  	if (!XFS_DFORK_Q(dip))  		return 0; @@ -208,7 +233,8 @@ xfs_iformat_fork(  			XFS_CORRUPTION_ERROR("xfs_iformat(8)",  					     XFS_ERRLEVEL_LOW,  					     ip->i_mount, dip); -			return -EFSCORRUPTED; +			error = -EFSCORRUPTED; +			break;  		}  		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); @@ -226,6 +252,9 @@ xfs_iformat_fork(  	if (error) {  		kmem_zone_free(xfs_ifork_zone, ip->i_afp);  		ip->i_afp = NULL; +		if (ip->i_cowfp) +			kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); +		ip->i_cowfp = NULL;  		xfs_idestroy_fork(ip, XFS_DATA_FORK);  	}  	return error; @@ -740,6 +769,9 @@ xfs_idestroy_fork(  	if (whichfork == XFS_ATTR_FORK) {  		kmem_zone_free(xfs_ifork_zone, ip->i_afp);  		ip->i_afp = NULL; +	} else if (whichfork == XFS_COW_FORK) { +		kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); +		ip->i_cowfp = NULL;  	}  } @@ -927,6 +959,19 @@ xfs_iext_get_ext(  	}  } +/* Convert bmap state flags to an inode fork. */ +struct xfs_ifork * +xfs_iext_state_to_fork( +	struct xfs_inode	*ip, +	int			state) +{ +	if (state & BMAP_COWFORK) +		return ip->i_cowfp; +	else if (state & BMAP_ATTRFORK) +		return ip->i_afp; +	return &ip->i_df; +} +  /*   * Insert new item(s) into the extent records for incore inode   * fork 'ifp'.  'count' new items are inserted at index 'idx'. @@ -939,7 +984,7 @@ xfs_iext_insert(  	xfs_bmbt_irec_t	*new,		/* items to insert */  	int		state)		/* type of extent conversion */  { -	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; +	xfs_ifork_t	*ifp = xfs_iext_state_to_fork(ip, state);  	xfs_extnum_t	i;		/* extent record index */  	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); @@ -1189,7 +1234,7 @@ xfs_iext_remove(  	int		ext_diff,	/* number of extents to remove */  	int		state)		/* type of extent conversion */  { -	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? 
ip->i_afp : &ip->i_df; +	xfs_ifork_t	*ifp = xfs_iext_state_to_fork(ip, state);  	xfs_extnum_t	nextents;	/* number of extents in file */  	int		new_size;	/* size of extents after removal */ @@ -1934,3 +1979,20 @@ xfs_iext_irec_update_extoffs(  		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;  	}  } + +/* + * Initialize an inode's copy-on-write fork. + */ +void +xfs_ifork_init_cow( +	struct xfs_inode	*ip) +{ +	if (ip->i_cowfp) +		return; + +	ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, +				       KM_SLEEP | KM_NOFS); +	ip->i_cowfp->if_flags = XFS_IFEXTENTS; +	ip->i_cformat = XFS_DINODE_FMT_EXTENTS; +	ip->i_cnextents = 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index f95e072ae646..c9476f50e32d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -92,7 +92,9 @@ typedef struct xfs_ifork {  #define XFS_IFORK_PTR(ip,w)		\  	((w) == XFS_DATA_FORK ? \  		&(ip)->i_df : \ -		(ip)->i_afp) +		((w) == XFS_ATTR_FORK ? \ +			(ip)->i_afp : \ +			(ip)->i_cowfp))  #define XFS_IFORK_DSIZE(ip) \  	(XFS_IFORK_Q(ip) ? \  		XFS_IFORK_BOFF(ip) : \ @@ -105,26 +107,38 @@ typedef struct xfs_ifork {  #define XFS_IFORK_SIZE(ip,w) \  	((w) == XFS_DATA_FORK ? \  		XFS_IFORK_DSIZE(ip) : \ -		XFS_IFORK_ASIZE(ip)) +		((w) == XFS_ATTR_FORK ? \ +			XFS_IFORK_ASIZE(ip) : \ +			0))  #define XFS_IFORK_FORMAT(ip,w) \  	((w) == XFS_DATA_FORK ? \  		(ip)->i_d.di_format : \ -		(ip)->i_d.di_aformat) +		((w) == XFS_ATTR_FORK ? \ +			(ip)->i_d.di_aformat : \ +			(ip)->i_cformat))  #define XFS_IFORK_FMT_SET(ip,w,n) \  	((w) == XFS_DATA_FORK ? \  		((ip)->i_d.di_format = (n)) : \ -		((ip)->i_d.di_aformat = (n))) +		((w) == XFS_ATTR_FORK ? \ +			((ip)->i_d.di_aformat = (n)) : \ +			((ip)->i_cformat = (n))))  #define XFS_IFORK_NEXTENTS(ip,w) \  	((w) == XFS_DATA_FORK ? \  		(ip)->i_d.di_nextents : \ -		(ip)->i_d.di_anextents) +		((w) == XFS_ATTR_FORK ? \ +			(ip)->i_d.di_anextents : \ +			(ip)->i_cnextents))  #define XFS_IFORK_NEXT_SET(ip,w,n) \  	((w) == XFS_DATA_FORK ? \  		((ip)->i_d.di_nextents = (n)) : \ -		((ip)->i_d.di_anextents = (n))) +		((w) == XFS_ATTR_FORK ? 
\ +			((ip)->i_d.di_anextents = (n)) : \ +			((ip)->i_cnextents = (n))))  #define XFS_IFORK_MAXEXT(ip, w) \  	(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) +struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); +  int		xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);  void		xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,  				struct xfs_inode_log_item *, int); @@ -169,4 +183,6 @@ void		xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);  extern struct kmem_zone	*xfs_ifork_zone; +extern void xfs_ifork_init_cow(struct xfs_inode *ip); +  #endif	/* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a6eed43fa7cd..083cdd6d6c28 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -112,7 +112,11 @@ static inline uint xlog_get_cycle(char *ptr)  #define XLOG_REG_TYPE_ICREATE		20  #define XLOG_REG_TYPE_RUI_FORMAT	21  #define XLOG_REG_TYPE_RUD_FORMAT	22 -#define XLOG_REG_TYPE_MAX		22 +#define XLOG_REG_TYPE_CUI_FORMAT	23 +#define XLOG_REG_TYPE_CUD_FORMAT	24 +#define XLOG_REG_TYPE_BUI_FORMAT	25 +#define XLOG_REG_TYPE_BUD_FORMAT	26 +#define XLOG_REG_TYPE_MAX		26  /*   * Flags to log operation header @@ -231,6 +235,10 @@ typedef struct xfs_trans_header {  #define	XFS_LI_ICREATE		0x123f  #define	XFS_LI_RUI		0x1240	/* rmap update intent */  #define	XFS_LI_RUD		0x1241 +#define	XFS_LI_CUI		0x1242	/* refcount update intent */ +#define	XFS_LI_CUD		0x1243 +#define	XFS_LI_BUI		0x1244	/* bmbt update intent */ +#define	XFS_LI_BUD		0x1245  #define XFS_LI_TYPE_DESC \  	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \ @@ -242,7 +250,11 @@ typedef struct xfs_trans_header {  	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }, \  	{ XFS_LI_ICREATE,	"XFS_LI_ICREATE" }, \  	{ XFS_LI_RUI,		"XFS_LI_RUI" }, \ -	{ XFS_LI_RUD,		"XFS_LI_RUD" } +	{ XFS_LI_RUD,		"XFS_LI_RUD" }, \ +	{ XFS_LI_CUI,		"XFS_LI_CUI" }, \ +	{ XFS_LI_CUD,		"XFS_LI_CUD" }, \ +	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \ +	{ XFS_LI_BUD,		"XFS_LI_BUD" }  /*   * Inode Log Item Format definitions. 
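One practical consequence of the CoW fork plumbing added above (the three-way XFS_IFORK_* macros and xfs_ifork_init_cow()) is that the fork is created lazily, so callers must ensure i_cowfp exists before dereferencing it. The walker below is a rough, hypothetical example of how those pieces fit together with the existing in-core extent helpers; it is not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
static void
xfs_example_walk_cow_fork(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*ifp;
	struct xfs_bmbt_irec	irec;
	xfs_extnum_t		nextents;
	xfs_extnum_t		idx;

	xfs_ifork_init_cow(ip);		/* no-op if i_cowfp already exists */
	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);

	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	for (idx = 0; idx < nextents; idx++) {
		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &irec);
		/* inspect irec.br_startoff / br_startblock / br_blockcount */
	}
}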
@@ -411,7 +423,8 @@ struct xfs_log_dinode {  	__uint64_t	di_changecount;	/* number of attribute changes */  	xfs_lsn_t	di_lsn;		/* flush sequence */  	__uint64_t	di_flags2;	/* more random flags */ -	__uint8_t	di_pad2[16];	/* more padding for future expansion */ +	__uint32_t	di_cowextsize;	/* basic cow extent size for file */ +	__uint8_t	di_pad2[12];	/* more padding for future expansion */  	/* fields only written to during inode creation */  	xfs_ictimestamp_t di_crtime;	/* time created */ @@ -622,8 +635,11 @@ struct xfs_map_extent {  /* rmap me_flags: upper bits are flags, lower byte is type code */  #define XFS_RMAP_EXTENT_MAP		1 +#define XFS_RMAP_EXTENT_MAP_SHARED	2  #define XFS_RMAP_EXTENT_UNMAP		3 +#define XFS_RMAP_EXTENT_UNMAP_SHARED	4  #define XFS_RMAP_EXTENT_CONVERT		5 +#define XFS_RMAP_EXTENT_CONVERT_SHARED	6  #define XFS_RMAP_EXTENT_ALLOC		7  #define XFS_RMAP_EXTENT_FREE		8  #define XFS_RMAP_EXTENT_TYPE_MASK	0xFF @@ -647,9 +663,17 @@ struct xfs_rui_log_format {  	__uint16_t		rui_size;	/* size of this item */  	__uint32_t		rui_nextents;	/* # extents to free */  	__uint64_t		rui_id;		/* rui identifier */ -	struct xfs_map_extent	rui_extents[1];	/* array of extents to rmap */ +	struct xfs_map_extent	rui_extents[];	/* array of extents to rmap */  }; +static inline size_t +xfs_rui_log_format_sizeof( +	unsigned int		nr) +{ +	return sizeof(struct xfs_rui_log_format) + +			nr * sizeof(struct xfs_map_extent); +} +  /*   * This is the structure used to lay out an rud log item in the   * log.  The rud_extents array is a variable size array whose @@ -663,6 +687,102 @@ struct xfs_rud_log_format {  };  /* + * CUI/CUD (refcount update) log format definitions + */ +struct xfs_phys_extent { +	__uint64_t		pe_startblock; +	__uint32_t		pe_len; +	__uint32_t		pe_flags; +}; + +/* refcount pe_flags: upper bits are flags, lower byte is type code */ +/* Type codes are taken directly from enum xfs_refcount_intent_type. */ +#define XFS_REFCOUNT_EXTENT_TYPE_MASK	0xFF + +#define XFS_REFCOUNT_EXTENT_FLAGS	(XFS_REFCOUNT_EXTENT_TYPE_MASK) + +/* + * This is the structure used to lay out a cui log item in the + * log.  The cui_extents field is a variable size array whose + * size is given by cui_nextents. + */ +struct xfs_cui_log_format { +	__uint16_t		cui_type;	/* cui log item type */ +	__uint16_t		cui_size;	/* size of this item */ +	__uint32_t		cui_nextents;	/* # extents to free */ +	__uint64_t		cui_id;		/* cui identifier */ +	struct xfs_phys_extent	cui_extents[];	/* array of extents */ +}; + +static inline size_t +xfs_cui_log_format_sizeof( +	unsigned int		nr) +{ +	return sizeof(struct xfs_cui_log_format) + +			nr * sizeof(struct xfs_phys_extent); +} + +/* + * This is the structure used to lay out a cud log item in the + * log.  The cud_extents array is a variable size array whose + * size is given by cud_nextents; + */ +struct xfs_cud_log_format { +	__uint16_t		cud_type;	/* cud log item type */ +	__uint16_t		cud_size;	/* size of this item */ +	__uint32_t		__pad; +	__uint64_t		cud_cui_id;	/* id of corresponding cui */ +}; + +/* + * BUI/BUD (inode block mapping) log format definitions + */ + +/* bmbt me_flags: upper bits are flags, lower byte is type code */ +/* Type codes are taken directly from enum xfs_bmap_intent_type. 
*/ +#define XFS_BMAP_EXTENT_TYPE_MASK	0xFF + +#define XFS_BMAP_EXTENT_ATTR_FORK	(1U << 31) +#define XFS_BMAP_EXTENT_UNWRITTEN	(1U << 30) + +#define XFS_BMAP_EXTENT_FLAGS		(XFS_BMAP_EXTENT_TYPE_MASK | \ +					 XFS_BMAP_EXTENT_ATTR_FORK | \ +					 XFS_BMAP_EXTENT_UNWRITTEN) + +/* + * This is the structure used to lay out an bui log item in the + * log.  The bui_extents field is a variable size array whose + * size is given by bui_nextents. + */ +struct xfs_bui_log_format { +	__uint16_t		bui_type;	/* bui log item type */ +	__uint16_t		bui_size;	/* size of this item */ +	__uint32_t		bui_nextents;	/* # extents to free */ +	__uint64_t		bui_id;		/* bui identifier */ +	struct xfs_map_extent	bui_extents[];	/* array of extents to bmap */ +}; + +static inline size_t +xfs_bui_log_format_sizeof( +	unsigned int		nr) +{ +	return sizeof(struct xfs_bui_log_format) + +			nr * sizeof(struct xfs_map_extent); +} + +/* + * This is the structure used to lay out an bud log item in the + * log.  The bud_extents array is a variable size array whose + * size is given by bud_nextents; + */ +struct xfs_bud_log_format { +	__uint16_t		bud_type;	/* bud log item type */ +	__uint16_t		bud_size;	/* size of this item */ +	__uint32_t		__pad; +	__uint64_t		bud_bui_id;	/* id of corresponding bui */ +}; + +/*   * Dquot Log format definitions.   *   * The first two fields must be the type and size fitting into diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c new file mode 100644 index 000000000000..b177ef33cd4c --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -0,0 +1,1698 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_refcount.h" +#include "xfs_rmap.h" + +/* Allowable refcount adjustment amounts. */ +enum xfs_refc_adjust_op { +	XFS_REFCOUNT_ADJUST_INCREASE	= 1, +	XFS_REFCOUNT_ADJUST_DECREASE	= -1, +	XFS_REFCOUNT_ADJUST_COW_ALLOC	= 0, +	XFS_REFCOUNT_ADJUST_COW_FREE	= -1, +}; + +STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur, +		xfs_agblock_t agbno, xfs_extlen_t aglen, +		struct xfs_defer_ops *dfops); +STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, +		xfs_agblock_t agbno, xfs_extlen_t aglen, +		struct xfs_defer_ops *dfops); + +/* + * Look up the first record less than or equal to [bno, len] in the btree + * given by cur. 
+ */ +int +xfs_refcount_lookup_le( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		bno, +	int			*stat) +{ +	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, +			XFS_LOOKUP_LE); +	cur->bc_rec.rc.rc_startblock = bno; +	cur->bc_rec.rc.rc_blockcount = 0; +	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Look up the first record greater than or equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_ge( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		bno, +	int			*stat) +{ +	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, +			XFS_LOOKUP_GE); +	cur->bc_rec.rc.rc_startblock = bno; +	cur->bc_rec.rc.rc_blockcount = 0; +	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* Convert on-disk record to in-core format. */ +static inline void +xfs_refcount_btrec_to_irec( +	union xfs_btree_rec		*rec, +	struct xfs_refcount_irec	*irec) +{ +	irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); +	irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); +	irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); +} + +/* + * Get the data from the pointed-to record. + */ +int +xfs_refcount_get_rec( +	struct xfs_btree_cur		*cur, +	struct xfs_refcount_irec	*irec, +	int				*stat) +{ +	union xfs_btree_rec		*rec; +	int				error; + +	error = xfs_btree_get_rec(cur, &rec, stat); +	if (!error && *stat == 1) { +		xfs_refcount_btrec_to_irec(rec, irec); +		trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, +				irec); +	} +	return error; +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_update( +	struct xfs_btree_cur		*cur, +	struct xfs_refcount_irec	*irec) +{ +	union xfs_btree_rec	rec; +	int			error; + +	trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); +	rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); +	rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); +	rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); +	error = xfs_btree_update(cur, &rec); +	if (error) +		trace_xfs_refcount_update_error(cur->bc_mp, +				cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Insert the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_insert( +	struct xfs_btree_cur		*cur, +	struct xfs_refcount_irec	*irec, +	int				*i) +{ +	int				error; + +	trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); +	cur->bc_rec.rc.rc_startblock = irec->rc_startblock; +	cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; +	cur->bc_rec.rc.rc_refcount = irec->rc_refcount; +	error = xfs_btree_insert(cur, i); +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); +out_error: +	if (error) +		trace_xfs_refcount_insert_error(cur->bc_mp, +				cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Remove the record referred to by cur, then set the pointer to the spot + * where the record could be re-inserted, in case we want to increment or + * decrement the cursor. + * This either works (return 0) or gets an EFSCORRUPTED error. 
+ */ +STATIC int +xfs_refcount_delete( +	struct xfs_btree_cur	*cur, +	int			*i) +{ +	struct xfs_refcount_irec	irec; +	int			found_rec; +	int			error; + +	error = xfs_refcount_get_rec(cur, &irec, &found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); +	trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); +	error = xfs_btree_delete(cur, i); +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); +	if (error) +		goto out_error; +	error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); +out_error: +	if (error) +		trace_xfs_refcount_delete_error(cur->bc_mp, +				cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Adjusting the Reference Count + * + * As stated elsewhere, the reference count btree (refcbt) stores + * >1 reference counts for extents of physical blocks.  In this + * operation, we're either raising or lowering the reference count of + * some subrange stored in the tree: + * + *      <------ adjustment range ------> + * ----+   +---+-----+ +--+--------+--------- + *  2  |   | 3 |  4  | |17|   55   |   10 + * ----+   +---+-----+ +--+--------+--------- + * X axis is physical blocks number; + * reference counts are the numbers inside the rectangles + * + * The first thing we need to do is to ensure that there are no + * refcount extents crossing either boundary of the range to be + * adjusted.  For any extent that does cross a boundary, split it into + * two extents so that we can increment the refcount of one of the + * pieces later: + * + *      <------ adjustment range ------> + * ----+   +---+-----+ +--+--------+----+---- + *  2  |   | 3 |  2  | |17|   55   | 10 | 10 + * ----+   +---+-----+ +--+--------+----+---- + * + * For this next step, let's assume that all the physical blocks in + * the adjustment range are mapped to a file and are therefore in use + * at least once.  Therefore, we can infer that any gap in the + * refcount tree within the adjustment range represents a physical + * extent with refcount == 1: + * + *      <------ adjustment range ------> + * ----+---+---+-----+-+--+--------+----+---- + *  2  |"1"| 3 |  2  |1|17|   55   | 10 | 10 + * ----+---+---+-----+-+--+--------+----+---- + *      ^ + * + * For each extent that falls within the interval range, figure out + * which extent is to the left or the right of that extent.  Now we + * have a left, current, and right extent.  If the new reference count + * of the center extent enables us to merge left, center, and right + * into one record covering all three, do so.  If the center extent is + * at the left end of the range, abuts the left extent, and its new + * reference count matches the left extent's record, then merge them. + * If the center extent is at the right end of the range, abuts the + * right extent, and the reference counts match, merge those.  In the + * example, we can left merge (assuming an increment operation): + * + *      <------ adjustment range ------> + * --------+---+-----+-+--+--------+----+---- + *    2    | 3 |  2  |1|17|   55   | 10 | 10 + * --------+---+-----+-+--+--------+----+---- + *          ^ + * + * For all other extents within the range, adjust the reference count + * or delete it if the refcount falls below 2.  
If we were + * incrementing, the end result looks like this: + * + *      <------ adjustment range ------> + * --------+---+-----+-+--+--------+----+---- + *    2    | 4 |  3  |2|18|   56   | 11 | 10 + * --------+---+-----+-+--+--------+----+---- + * + * The result of a decrement operation looks as such: + * + *      <------ adjustment range ------> + * ----+   +---+       +--+--------+----+---- + *  2  |   | 2 |       |16|   54   |  9 | 10 + * ----+   +---+       +--+--------+----+---- + *      DDDD    111111DD + * + * The blocks marked "D" are freed; the blocks marked "1" are only + * referenced once and therefore the record is removed from the + * refcount btree. + */ + +/* Next block after this extent. */ +static inline xfs_agblock_t +xfs_refc_next( +	struct xfs_refcount_irec	*rc) +{ +	return rc->rc_startblock + rc->rc_blockcount; +} + +/* + * Split a refcount extent that crosses agbno. + */ +STATIC int +xfs_refcount_split_extent( +	struct xfs_btree_cur		*cur, +	xfs_agblock_t			agbno, +	bool				*shape_changed) +{ +	struct xfs_refcount_irec	rcext, tmp; +	int				found_rec; +	int				error; + +	*shape_changed = false; +	error = xfs_refcount_lookup_le(cur, agbno, &found_rec); +	if (error) +		goto out_error; +	if (!found_rec) +		return 0; + +	error = xfs_refcount_get_rec(cur, &rcext, &found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); +	if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) +		return 0; + +	*shape_changed = true; +	trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, +			&rcext, agbno); + +	/* Establish the right extent. */ +	tmp = rcext; +	tmp.rc_startblock = agbno; +	tmp.rc_blockcount -= (agbno - rcext.rc_startblock); +	error = xfs_refcount_update(cur, &tmp); +	if (error) +		goto out_error; + +	/* Insert the left extent. */ +	tmp = rcext; +	tmp.rc_blockcount = agbno - rcext.rc_startblock; +	error = xfs_refcount_insert(cur, &tmp, &found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); +	return error; + +out_error: +	trace_xfs_refcount_split_extent_error(cur->bc_mp, +			cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Merge the left, center, and right extents. + */ +STATIC int +xfs_refcount_merge_center_extents( +	struct xfs_btree_cur		*cur, +	struct xfs_refcount_irec	*left, +	struct xfs_refcount_irec	*center, +	struct xfs_refcount_irec	*right, +	unsigned long long		extlen, +	xfs_agblock_t			*agbno, +	xfs_extlen_t			*aglen) +{ +	int				error; +	int				found_rec; + +	trace_xfs_refcount_merge_center_extents(cur->bc_mp, +			cur->bc_private.a.agno, left, center, right); + +	/* +	 * Make sure the center and right extents are not in the btree. +	 * If the center extent was synthesized, the first delete call +	 * removes the right extent and we skip the second deletion. +	 * If center and right were in the btree, then the first delete +	 * call removes the center and the second one removes the right +	 * extent. 
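As an illustration of the merge rule just described, here is a minimal stand-alone sketch in ordinary C. The struct and helper names are invented for the example, it omits the MAXREFCEXTLEN overflow guard that the patch applies, and it is not part of the kernel code.

/* Minimal model of merging left, centre and right refcount records. */
#include <stdbool.h>
#include <stdio.h>

struct rec { unsigned int start, len, refcount; };

/*
 * Merging is allowed when the three records are physically contiguous and
 * the centre's post-adjustment refcount equals both neighbours'.  The merged
 * record keeps the left record's count, which already reflects the centre's
 * pending adjustment.
 */
static bool try_merge_center(const struct rec *l, const struct rec *c,
			     const struct rec *r, int adjust, struct rec *out)
{
	if (l->start + l->len != c->start || c->start + c->len != r->start)
		return false;
	if (l->refcount != c->refcount + adjust ||
	    r->refcount != c->refcount + adjust)
		return false;
	out->start = l->start;
	out->len = l->len + c->len + r->len;
	out->refcount = l->refcount;
	return true;
}

int main(void)
{
	struct rec l = { 0, 10, 3 }, c = { 10, 5, 2 }, r = { 15, 8, 3 }, m;

	if (try_merge_center(&l, &c, &r, 1, &m))	/* increment case */
		printf("merged: [%u, %u) refcount %u\n",
				m.start, m.start + m.len, m.refcount);
	return 0;
}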
+	 */ +	error = xfs_refcount_lookup_ge(cur, center->rc_startblock, +			&found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + +	error = xfs_refcount_delete(cur, &found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + +	if (center->rc_refcount > 1) { +		error = xfs_refcount_delete(cur, &found_rec); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, +				out_error); +	} + +	/* Enlarge the left extent. */ +	error = xfs_refcount_lookup_le(cur, left->rc_startblock, +			&found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + +	left->rc_blockcount = extlen; +	error = xfs_refcount_update(cur, left); +	if (error) +		goto out_error; + +	*aglen = 0; +	return error; + +out_error: +	trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, +			cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Merge with the left extent. + */ +STATIC int +xfs_refcount_merge_left_extent( +	struct xfs_btree_cur		*cur, +	struct xfs_refcount_irec	*left, +	struct xfs_refcount_irec	*cleft, +	xfs_agblock_t			*agbno, +	xfs_extlen_t			*aglen) +{ +	int				error; +	int				found_rec; + +	trace_xfs_refcount_merge_left_extent(cur->bc_mp, +			cur->bc_private.a.agno, left, cleft); + +	/* If the extent at agbno (cleft) wasn't synthesized, remove it. */ +	if (cleft->rc_refcount > 1) { +		error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, +				&found_rec); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, +				out_error); + +		error = xfs_refcount_delete(cur, &found_rec); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, +				out_error); +	} + +	/* Enlarge the left extent. */ +	error = xfs_refcount_lookup_le(cur, left->rc_startblock, +			&found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + +	left->rc_blockcount += cleft->rc_blockcount; +	error = xfs_refcount_update(cur, left); +	if (error) +		goto out_error; + +	*agbno += cleft->rc_blockcount; +	*aglen -= cleft->rc_blockcount; +	return error; + +out_error: +	trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, +			cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Merge with the right extent. + */ +STATIC int +xfs_refcount_merge_right_extent( +	struct xfs_btree_cur		*cur, +	struct xfs_refcount_irec	*right, +	struct xfs_refcount_irec	*cright, +	xfs_agblock_t			*agbno, +	xfs_extlen_t			*aglen) +{ +	int				error; +	int				found_rec; + +	trace_xfs_refcount_merge_right_extent(cur->bc_mp, +			cur->bc_private.a.agno, cright, right); + +	/* +	 * If the extent ending at agbno+aglen (cright) wasn't synthesized, +	 * remove it. +	 */ +	if (cright->rc_refcount > 1) { +		error = xfs_refcount_lookup_le(cur, cright->rc_startblock, +			&found_rec); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, +				out_error); + +		error = xfs_refcount_delete(cur, &found_rec); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, +				out_error); +	} + +	/* Enlarge the right extent. 
*/ +	error = xfs_refcount_lookup_le(cur, right->rc_startblock, +			&found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + +	right->rc_startblock -= cright->rc_blockcount; +	right->rc_blockcount += cright->rc_blockcount; +	error = xfs_refcount_update(cur, right); +	if (error) +		goto out_error; + +	*aglen -= cright->rc_blockcount; +	return error; + +out_error: +	trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, +			cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +#define XFS_FIND_RCEXT_SHARED	1 +#define XFS_FIND_RCEXT_COW	2 +/* + * Find the left extent and the one after it (cleft).  This function assumes + * that we've already split any extent crossing agbno. + */ +STATIC int +xfs_refcount_find_left_extents( +	struct xfs_btree_cur		*cur, +	struct xfs_refcount_irec	*left, +	struct xfs_refcount_irec	*cleft, +	xfs_agblock_t			agbno, +	xfs_extlen_t			aglen, +	int				flags) +{ +	struct xfs_refcount_irec	tmp; +	int				error; +	int				found_rec; + +	left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; +	error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); +	if (error) +		goto out_error; +	if (!found_rec) +		return 0; + +	error = xfs_refcount_get_rec(cur, &tmp, &found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + +	if (xfs_refc_next(&tmp) != agbno) +		return 0; +	if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) +		return 0; +	if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) +		return 0; +	/* We have a left extent; retrieve (or invent) the next right one */ +	*left = tmp; + +	error = xfs_btree_increment(cur, 0, &found_rec); +	if (error) +		goto out_error; +	if (found_rec) { +		error = xfs_refcount_get_rec(cur, &tmp, &found_rec); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, +				out_error); + +		/* if tmp starts at the end of our range, just use that */ +		if (tmp.rc_startblock == agbno) +			*cleft = tmp; +		else { +			/* +			 * There's a gap in the refcntbt at the start of the +			 * range we're interested in (refcount == 1) so +			 * synthesize the implied extent and pass it back. +			 * We assume here that the agbno/aglen range was +			 * passed in from a data fork extent mapping and +			 * therefore is allocated to exactly one owner. +			 */ +			cleft->rc_startblock = agbno; +			cleft->rc_blockcount = min(aglen, +					tmp.rc_startblock - agbno); +			cleft->rc_refcount = 1; +		} +	} else { +		/* +		 * No extents, so pretend that there's one covering the whole +		 * range. +		 */ +		cleft->rc_startblock = agbno; +		cleft->rc_blockcount = aglen; +		cleft->rc_refcount = 1; +	} +	trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, +			left, cleft, agbno); +	return error; + +out_error: +	trace_xfs_refcount_find_left_extent_error(cur->bc_mp, +			cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Find the right extent and the one before it (cright).  This function + * assumes that we've already split any extents crossing agbno + aglen. 
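The "invent the missing extent" rule used by these two helpers can be reduced to a few lines of stand-alone C. The names and numbers below are made up for illustration; only the min() logic mirrors the patch.

#include <stdio.h>

struct rec { unsigned int start, len, refcount; };

/*
 * If the tree has no record at agbno, the range is referenced exactly once,
 * so synthesize a refcount == 1 record covering the gap up to either the end
 * of the caller's range or the next real record, whichever comes first.
 * Assumes next->start >= agbno, as guaranteed by the preceding GE lookup.
 */
static struct rec synthesize_cleft(unsigned int agbno, unsigned int aglen,
				   const struct rec *next /* may be NULL */)
{
	struct rec c = { agbno, aglen, 1 };

	if (next && next->start - agbno < aglen)
		c.len = next->start - agbno;
	return c;
}

int main(void)
{
	struct rec next = { 120, 16, 5 };
	struct rec c = synthesize_cleft(100, 50, &next);

	printf("implied extent: [%u, %u) refcount %u\n",
			c.start, c.start + c.len, c.refcount);
	return 0;
}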
+ */ +STATIC int +xfs_refcount_find_right_extents( +	struct xfs_btree_cur		*cur, +	struct xfs_refcount_irec	*right, +	struct xfs_refcount_irec	*cright, +	xfs_agblock_t			agbno, +	xfs_extlen_t			aglen, +	int				flags) +{ +	struct xfs_refcount_irec	tmp; +	int				error; +	int				found_rec; + +	right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; +	error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); +	if (error) +		goto out_error; +	if (!found_rec) +		return 0; + +	error = xfs_refcount_get_rec(cur, &tmp, &found_rec); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + +	if (tmp.rc_startblock != agbno + aglen) +		return 0; +	if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) +		return 0; +	if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) +		return 0; +	/* We have a right extent; retrieve (or invent) the next left one */ +	*right = tmp; + +	error = xfs_btree_decrement(cur, 0, &found_rec); +	if (error) +		goto out_error; +	if (found_rec) { +		error = xfs_refcount_get_rec(cur, &tmp, &found_rec); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, +				out_error); + +		/* if tmp ends at the end of our range, just use that */ +		if (xfs_refc_next(&tmp) == agbno + aglen) +			*cright = tmp; +		else { +			/* +			 * There's a gap in the refcntbt at the end of the +			 * range we're interested in (refcount == 1) so +			 * create the implied extent and pass it back. +			 * We assume here that the agbno/aglen range was +			 * passed in from a data fork extent mapping and +			 * therefore is allocated to exactly one owner. +			 */ +			cright->rc_startblock = max(agbno, xfs_refc_next(&tmp)); +			cright->rc_blockcount = right->rc_startblock - +					cright->rc_startblock; +			cright->rc_refcount = 1; +		} +	} else { +		/* +		 * No extents, so pretend that there's one covering the whole +		 * range. +		 */ +		cright->rc_startblock = agbno; +		cright->rc_blockcount = aglen; +		cright->rc_refcount = 1; +	} +	trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, +			cright, right, agbno + aglen); +	return error; + +out_error: +	trace_xfs_refcount_find_right_extent_error(cur->bc_mp, +			cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* Is this extent valid? */ +static inline bool +xfs_refc_valid( +	struct xfs_refcount_irec	*rc) +{ +	return rc->rc_startblock != NULLAGBLOCK; +} + +/* + * Try to merge with any extents on the boundaries of the adjustment range. + */ +STATIC int +xfs_refcount_merge_extents( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		*agbno, +	xfs_extlen_t		*aglen, +	enum xfs_refc_adjust_op adjust, +	int			flags, +	bool			*shape_changed) +{ +	struct xfs_refcount_irec	left = {0}, cleft = {0}; +	struct xfs_refcount_irec	cright = {0}, right = {0}; +	int				error; +	unsigned long long		ulen; +	bool				cequal; + +	*shape_changed = false; +	/* +	 * Find the extent just below agbno [left], just above agbno [cleft], +	 * just below (agbno + aglen) [cright], and just above (agbno + aglen) +	 * [right]. +	 */ +	error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, +			*aglen, flags); +	if (error) +		return error; +	error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, +			*aglen, flags); +	if (error) +		return error; + +	/* No left or right extent to merge; exit. 
*/ +	if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right)) +		return 0; + +	cequal = (cleft.rc_startblock == cright.rc_startblock) && +		 (cleft.rc_blockcount == cright.rc_blockcount); + +	/* Try to merge left, cleft, and right.  cleft must == cright. */ +	ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + +			right.rc_blockcount; +	if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && +	    xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && +	    left.rc_refcount == cleft.rc_refcount + adjust && +	    right.rc_refcount == cleft.rc_refcount + adjust && +	    ulen < MAXREFCEXTLEN) { +		*shape_changed = true; +		return xfs_refcount_merge_center_extents(cur, &left, &cleft, +				&right, ulen, agbno, aglen); +	} + +	/* Try to merge left and cleft. */ +	ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; +	if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && +	    left.rc_refcount == cleft.rc_refcount + adjust && +	    ulen < MAXREFCEXTLEN) { +		*shape_changed = true; +		error = xfs_refcount_merge_left_extent(cur, &left, &cleft, +				agbno, aglen); +		if (error) +			return error; + +		/* +		 * If we just merged left + cleft and cleft == cright, +		 * we no longer have a cright to merge with right.  We're done. +		 */ +		if (cequal) +			return 0; +	} + +	/* Try to merge cright and right. */ +	ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; +	if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && +	    right.rc_refcount == cright.rc_refcount + adjust && +	    ulen < MAXREFCEXTLEN) { +		*shape_changed = true; +		return xfs_refcount_merge_right_extent(cur, &right, &cright, +				agbno, aglen); +	} + +	return error; +} + +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs.  Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 24 bytes + * per record.  We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. + * + * XXX: This is a pretty hand-wavy estimate.  The penalty for guessing + * true incorrectly is a shutdown FS; the penalty for guessing false + * incorrectly is more transaction rolls than might be necessary. + * Be conservative here. + */ +static bool +xfs_refcount_still_have_space( +	struct xfs_btree_cur		*cur) +{ +	unsigned long			overhead; + +	overhead = cur->bc_private.a.priv.refc.shape_changes * +			xfs_allocfree_log_count(cur->bc_mp, 1); +	overhead *= cur->bc_mp->m_sb.sb_blocksize; + +	/* +	 * Only allow 2 refcount extent updates per transaction if the +	 * refcount continue update "error" has been injected. +	 */ +	if (cur->bc_private.a.priv.refc.nr_ops > 2 && +	    XFS_TEST_ERROR(false, cur->bc_mp, +			XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE, +			XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE)) +		return false; + +	if (cur->bc_private.a.priv.refc.nr_ops == 0) +		return true; +	else if (overhead > cur->bc_tp->t_log_res) +		return false; +	return  cur->bc_tp->t_log_res - overhead > +		cur->bc_private.a.priv.refc.nr_ops * 32; +} + +/* + * Adjust the refcounts of middle extents.  At this point we should have + * split extents that crossed the adjustment range; merged with adjacent + * extents; and updated agbno/aglen to reflect the merges.  Therefore, + * all we have to do is update the extents inside [agbno, agbno + aglen]. 
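To make the transaction-space heuristic in xfs_refcount_still_have_space() above concrete, here is a stand-alone sketch with invented numbers. The block size, the per-split log count and the reservation figure are assumptions chosen for illustration only; the kernel derives all of them at run time, and the error-injection path is omitted.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in values; not real kernel figures. */
#define BLOCKSIZE		4096u
#define ALLOCFREE_LOG_COUNT	4u	/* rough guess at blocks logged per split */
#define BYTES_PER_REFC_UPDATE	32u	/* conservative per-record estimate */

static bool still_have_space(unsigned long log_res, unsigned int shape_changes,
			     unsigned int nr_ops)
{
	unsigned long overhead;

	/* Space consumed so far by btree shape changes (splits/joins). */
	overhead = (unsigned long)shape_changes * ALLOCFREE_LOG_COUNT * BLOCKSIZE;

	if (nr_ops == 0)
		return true;
	if (overhead > log_res)
		return false;
	/* Keep going while the remaining reservation covers the records dirtied. */
	return log_res - overhead > (unsigned long)nr_ops * BYTES_PER_REFC_UPDATE;
}

int main(void)
{
	/* e.g. a 200KiB reservation, one split so far, 12 records dirtied */
	printf("%s\n", still_have_space(204800, 1, 12) ? "keep going" : "roll");
	return 0;
}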
+ */ +STATIC int +xfs_refcount_adjust_extents( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		*agbno, +	xfs_extlen_t		*aglen, +	enum xfs_refc_adjust_op	adj, +	struct xfs_defer_ops	*dfops, +	struct xfs_owner_info	*oinfo) +{ +	struct xfs_refcount_irec	ext, tmp; +	int				error; +	int				found_rec, found_tmp; +	xfs_fsblock_t			fsbno; + +	/* Merging did all the work already. */ +	if (*aglen == 0) +		return 0; + +	error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); +	if (error) +		goto out_error; + +	while (*aglen > 0 && xfs_refcount_still_have_space(cur)) { +		error = xfs_refcount_get_rec(cur, &ext, &found_rec); +		if (error) +			goto out_error; +		if (!found_rec) { +			ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; +			ext.rc_blockcount = 0; +			ext.rc_refcount = 0; +		} + +		/* +		 * Deal with a hole in the refcount tree; if a file maps to +		 * these blocks and there's no refcountbt record, pretend that +		 * there is one with refcount == 1. +		 */ +		if (ext.rc_startblock != *agbno) { +			tmp.rc_startblock = *agbno; +			tmp.rc_blockcount = min(*aglen, +					ext.rc_startblock - *agbno); +			tmp.rc_refcount = 1 + adj; +			trace_xfs_refcount_modify_extent(cur->bc_mp, +					cur->bc_private.a.agno, &tmp); + +			/* +			 * Either cover the hole (increment) or +			 * delete the range (decrement). +			 */ +			if (tmp.rc_refcount) { +				error = xfs_refcount_insert(cur, &tmp, +						&found_tmp); +				if (error) +					goto out_error; +				XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, +						found_tmp == 1, out_error); +				cur->bc_private.a.priv.refc.nr_ops++; +			} else { +				fsbno = XFS_AGB_TO_FSB(cur->bc_mp, +						cur->bc_private.a.agno, +						tmp.rc_startblock); +				xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, +						tmp.rc_blockcount, oinfo); +			} + +			(*agbno) += tmp.rc_blockcount; +			(*aglen) -= tmp.rc_blockcount; + +			error = xfs_refcount_lookup_ge(cur, *agbno, +					&found_rec); +			if (error) +				goto out_error; +		} + +		/* Stop if there's nothing left to modify */ +		if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) +			break; + +		/* +		 * Adjust the reference count and either update the tree +		 * (incr) or free the blocks (decr). +		 */ +		if (ext.rc_refcount == MAXREFCOUNT) +			goto skip; +		ext.rc_refcount += adj; +		trace_xfs_refcount_modify_extent(cur->bc_mp, +				cur->bc_private.a.agno, &ext); +		if (ext.rc_refcount > 1) { +			error = xfs_refcount_update(cur, &ext); +			if (error) +				goto out_error; +			cur->bc_private.a.priv.refc.nr_ops++; +		} else if (ext.rc_refcount == 1) { +			error = xfs_refcount_delete(cur, &found_rec); +			if (error) +				goto out_error; +			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, +					found_rec == 1, out_error); +			cur->bc_private.a.priv.refc.nr_ops++; +			goto advloop; +		} else { +			fsbno = XFS_AGB_TO_FSB(cur->bc_mp, +					cur->bc_private.a.agno, +					ext.rc_startblock); +			xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, +					ext.rc_blockcount, oinfo); +		} + +skip: +		error = xfs_btree_increment(cur, 0, &found_rec); +		if (error) +			goto out_error; + +advloop: +		(*agbno) += ext.rc_blockcount; +		(*aglen) -= ext.rc_blockcount; +	} + +	return error; +out_error: +	trace_xfs_refcount_modify_extent_error(cur->bc_mp, +			cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* Adjust the reference count of a range of AG blocks. 
*/ +STATIC int +xfs_refcount_adjust( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		agbno, +	xfs_extlen_t		aglen, +	xfs_agblock_t		*new_agbno, +	xfs_extlen_t		*new_aglen, +	enum xfs_refc_adjust_op	adj, +	struct xfs_defer_ops	*dfops, +	struct xfs_owner_info	*oinfo) +{ +	bool			shape_changed; +	int			shape_changes = 0; +	int			error; + +	*new_agbno = agbno; +	*new_aglen = aglen; +	if (adj == XFS_REFCOUNT_ADJUST_INCREASE) +		trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, +				agbno, aglen); +	else +		trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, +				agbno, aglen); + +	/* +	 * Ensure that no rcextents cross the boundary of the adjustment range. +	 */ +	error = xfs_refcount_split_extent(cur, agbno, &shape_changed); +	if (error) +		goto out_error; +	if (shape_changed) +		shape_changes++; + +	error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); +	if (error) +		goto out_error; +	if (shape_changed) +		shape_changes++; + +	/* +	 * Try to merge with the left or right extents of the range. +	 */ +	error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, +			XFS_FIND_RCEXT_SHARED, &shape_changed); +	if (error) +		goto out_error; +	if (shape_changed) +		shape_changes++; +	if (shape_changes) +		cur->bc_private.a.priv.refc.shape_changes++; + +	/* Now that we've taken care of the ends, adjust the middle extents */ +	error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, +			adj, dfops, oinfo); +	if (error) +		goto out_error; + +	return 0; + +out_error: +	trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, +			error, _RET_IP_); +	return error; +} + +/* Clean up after calling xfs_refcount_finish_one. */ +void +xfs_refcount_finish_one_cleanup( +	struct xfs_trans	*tp, +	struct xfs_btree_cur	*rcur, +	int			error) +{ +	struct xfs_buf		*agbp; + +	if (rcur == NULL) +		return; +	agbp = rcur->bc_private.a.agbp; +	xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); +	if (error) +		xfs_trans_brelse(tp, agbp); +} + +/* + * Process one of the deferred refcount operations.  We pass back the + * btree cursor to maintain our lock on the btree between calls. + * This saves time and eliminates a buffer deadlock between the + * superblock and the AGF because we'll always grab them in the same + * order. + */ +int +xfs_refcount_finish_one( +	struct xfs_trans		*tp, +	struct xfs_defer_ops		*dfops, +	enum xfs_refcount_intent_type	type, +	xfs_fsblock_t			startblock, +	xfs_extlen_t			blockcount, +	xfs_fsblock_t			*new_fsb, +	xfs_extlen_t			*new_len, +	struct xfs_btree_cur		**pcur) +{ +	struct xfs_mount		*mp = tp->t_mountp; +	struct xfs_btree_cur		*rcur; +	struct xfs_buf			*agbp = NULL; +	int				error = 0; +	xfs_agnumber_t			agno; +	xfs_agblock_t			bno; +	xfs_agblock_t			new_agbno; +	unsigned long			nr_ops = 0; +	int				shape_changes = 0; + +	agno = XFS_FSB_TO_AGNO(mp, startblock); +	ASSERT(agno != NULLAGNUMBER); +	bno = XFS_FSB_TO_AGBNO(mp, startblock); + +	trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), +			type, XFS_FSB_TO_AGBNO(mp, startblock), +			blockcount); + +	if (XFS_TEST_ERROR(false, mp, +			XFS_ERRTAG_REFCOUNT_FINISH_ONE, +			XFS_RANDOM_REFCOUNT_FINISH_ONE)) +		return -EIO; + +	/* +	 * If we haven't gotten a cursor or the cursor AG doesn't match +	 * the startblock, get one now. 
+	 */ +	rcur = *pcur; +	if (rcur != NULL && rcur->bc_private.a.agno != agno) { +		nr_ops = rcur->bc_private.a.priv.refc.nr_ops; +		shape_changes = rcur->bc_private.a.priv.refc.shape_changes; +		xfs_refcount_finish_one_cleanup(tp, rcur, 0); +		rcur = NULL; +		*pcur = NULL; +	} +	if (rcur == NULL) { +		error = xfs_alloc_read_agf(tp->t_mountp, tp, agno, +				XFS_ALLOC_FLAG_FREEING, &agbp); +		if (error) +			return error; +		if (!agbp) +			return -EFSCORRUPTED; + +		rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops); +		if (!rcur) { +			error = -ENOMEM; +			goto out_cur; +		} +		rcur->bc_private.a.priv.refc.nr_ops = nr_ops; +		rcur->bc_private.a.priv.refc.shape_changes = shape_changes; +	} +	*pcur = rcur; + +	switch (type) { +	case XFS_REFCOUNT_INCREASE: +		error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, +			new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL); +		*new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); +		break; +	case XFS_REFCOUNT_DECREASE: +		error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, +			new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL); +		*new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); +		break; +	case XFS_REFCOUNT_ALLOC_COW: +		*new_fsb = startblock + blockcount; +		*new_len = 0; +		error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops); +		break; +	case XFS_REFCOUNT_FREE_COW: +		*new_fsb = startblock + blockcount; +		*new_len = 0; +		error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops); +		break; +	default: +		ASSERT(0); +		error = -EFSCORRUPTED; +	} +	if (!error && *new_len > 0) +		trace_xfs_refcount_finish_one_leftover(mp, agno, type, +				bno, blockcount, new_agbno, *new_len); +	return error; + +out_cur: +	xfs_trans_brelse(tp, agbp); + +	return error; +} + +/* + * Record a refcount intent for later processing. + */ +static int +__xfs_refcount_add( +	struct xfs_mount		*mp, +	struct xfs_defer_ops		*dfops, +	enum xfs_refcount_intent_type	type, +	xfs_fsblock_t			startblock, +	xfs_extlen_t			blockcount) +{ +	struct xfs_refcount_intent	*ri; + +	trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock), +			type, XFS_FSB_TO_AGBNO(mp, startblock), +			blockcount); + +	ri = kmem_alloc(sizeof(struct xfs_refcount_intent), +			KM_SLEEP | KM_NOFS); +	INIT_LIST_HEAD(&ri->ri_list); +	ri->ri_type = type; +	ri->ri_startblock = startblock; +	ri->ri_blockcount = blockcount; + +	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); +	return 0; +} + +/* + * Increase the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_increase_extent( +	struct xfs_mount		*mp, +	struct xfs_defer_ops		*dfops, +	struct xfs_bmbt_irec		*PREV) +{ +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return 0; + +	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE, +			PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Decrease the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_decrease_extent( +	struct xfs_mount		*mp, +	struct xfs_defer_ops		*dfops, +	struct xfs_bmbt_irec		*PREV) +{ +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return 0; + +	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE, +			PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen.  If + * find_end_of_shared is set, return the longest contiguous extent of + * shared blocks; if not, just return the first extent we find.  
If no + * shared blocks are found, fbno and flen will be set to NULLAGBLOCK + * and 0, respectively. + */ +int +xfs_refcount_find_shared( +	struct xfs_btree_cur		*cur, +	xfs_agblock_t			agbno, +	xfs_extlen_t			aglen, +	xfs_agblock_t			*fbno, +	xfs_extlen_t			*flen, +	bool				find_end_of_shared) +{ +	struct xfs_refcount_irec	tmp; +	int				i; +	int				have; +	int				error; + +	trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, +			agbno, aglen); + +	/* By default, skip the whole range */ +	*fbno = NULLAGBLOCK; +	*flen = 0; + +	/* Try to find a refcount extent that crosses the start */ +	error = xfs_refcount_lookup_le(cur, agbno, &have); +	if (error) +		goto out_error; +	if (!have) { +		/* No left extent, look at the next one */ +		error = xfs_btree_increment(cur, 0, &have); +		if (error) +			goto out_error; +		if (!have) +			goto done; +	} +	error = xfs_refcount_get_rec(cur, &tmp, &i); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + +	/* If the extent ends before the start, look at the next one */ +	if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { +		error = xfs_btree_increment(cur, 0, &have); +		if (error) +			goto out_error; +		if (!have) +			goto done; +		error = xfs_refcount_get_rec(cur, &tmp, &i); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); +	} + +	/* If the extent starts after the range we want, bail out */ +	if (tmp.rc_startblock >= agbno + aglen) +		goto done; + +	/* We found the start of a shared extent! */ +	if (tmp.rc_startblock < agbno) { +		tmp.rc_blockcount -= (agbno - tmp.rc_startblock); +		tmp.rc_startblock = agbno; +	} + +	*fbno = tmp.rc_startblock; +	*flen = min(tmp.rc_blockcount, agbno + aglen - *fbno); +	if (!find_end_of_shared) +		goto done; + +	/* Otherwise, find the end of this shared extent */ +	while (*fbno + *flen < agbno + aglen) { +		error = xfs_btree_increment(cur, 0, &have); +		if (error) +			goto out_error; +		if (!have) +			break; +		error = xfs_refcount_get_rec(cur, &tmp, &i); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); +		if (tmp.rc_startblock >= agbno + aglen || +		    tmp.rc_startblock != *fbno + *flen) +			break; +		*flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); +	} + +done: +	trace_xfs_refcount_find_shared_result(cur->bc_mp, +			cur->bc_private.a.agno, *fbno, *flen); + +out_error: +	if (error) +		trace_xfs_refcount_find_shared_error(cur->bc_mp, +				cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Recovering CoW Blocks After a Crash + * + * Due to the way that the copy on write mechanism works, there's a window of + * opportunity in which we can lose track of allocated blocks during a crash. + * Because CoW uses delayed allocation in the in-core CoW fork, writeback + * causes blocks to be allocated and stored in the CoW fork.  The blocks are + * no longer in the free space btree but are not otherwise recorded anywhere + * until the write completes and the blocks are mapped into the file.  A crash + * in between allocation and remapping results in the replacement blocks being + * lost.  This situation is exacerbated by the CoW extent size hint because + * allocations can hang around for long time. + * + * However, there is a place where we can record these allocations before they + * become mappings -- the reference count btree.  The btree does not record + * extents with refcount == 1, so we can record allocations with a refcount of + * 1.  
Blocks being used for CoW writeout cannot be shared, so there should be + * no conflict with shared block records.  These mappings should be created + * when we allocate blocks to the CoW fork and deleted when they're removed + * from the CoW fork. + * + * Minor nit: records for in-progress CoW allocations and records for shared + * extents must never be merged, to preserve the property that (except for CoW + * allocations) there are no refcount btree entries with refcount == 1.  The + * only time this could potentially happen is when unsharing a block that's + * adjacent to CoW allocations, so we must be careful to avoid this. + * + * At mount time we recover lost CoW allocations by searching the refcount + * btree for these refcount == 1 mappings.  These represent CoW allocations + * that were in progress at the time the filesystem went down, so we can free + * them to get the space back. + * + * This mechanism is superior to creating EFIs for unmapped CoW extents for + * several reasons -- first, EFIs pin the tail of the log and would have to be + * periodically relogged to avoid filling up the log.  Second, CoW completions + * will have to file an EFD and create new EFIs for whatever remains in the + * CoW fork; this partially takes care of (1) but extent-size reservations + * will have to periodically relog even if there's no writeout in progress. + * This can happen if the CoW extent size hint is set, which you really want. + * Third, EFIs cannot currently be automatically relogged into newer + * transactions to advance the log tail.  Fourth, stuffing the log full of + * EFIs places an upper bound on the number of CoW allocations that can be + * held filesystem-wide at any given time.  Recording them in the refcount + * btree doesn't require us to maintain any state in memory and doesn't pin + * the log. + */ +/* + * Adjust the refcounts of CoW allocations.  These allocations are "magic" + * in that they're not referenced anywhere else in the filesystem, so we + * stash them in the refcount btree with a refcount of 1 until either file + * remapping (or CoW cancellation) happens. + */ +STATIC int +xfs_refcount_adjust_cow_extents( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		agbno, +	xfs_extlen_t		aglen, +	enum xfs_refc_adjust_op	adj, +	struct xfs_defer_ops	*dfops, +	struct xfs_owner_info	*oinfo) +{ +	struct xfs_refcount_irec	ext, tmp; +	int				error; +	int				found_rec, found_tmp; + +	if (aglen == 0) +		return 0; + +	/* Find any overlapping refcount records */ +	error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); +	if (error) +		goto out_error; +	error = xfs_refcount_get_rec(cur, &ext, &found_rec); +	if (error) +		goto out_error; +	if (!found_rec) { +		ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + +				XFS_REFC_COW_START; +		ext.rc_blockcount = 0; +		ext.rc_refcount = 0; +	} + +	switch (adj) { +	case XFS_REFCOUNT_ADJUST_COW_ALLOC: +		/* Adding a CoW reservation, there should be nothing here. */ +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, +				ext.rc_startblock >= agbno + aglen, out_error); + +		tmp.rc_startblock = agbno; +		tmp.rc_blockcount = aglen; +		tmp.rc_refcount = 1; +		trace_xfs_refcount_modify_extent(cur->bc_mp, +				cur->bc_private.a.agno, &tmp); + +		error = xfs_refcount_insert(cur, &tmp, +				&found_tmp); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, +				found_tmp == 1, out_error); +		break; +	case XFS_REFCOUNT_ADJUST_COW_FREE: +		/* Removing a CoW reservation, there should be one extent. 
*/ +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, +			ext.rc_startblock == agbno, out_error); +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, +			ext.rc_blockcount == aglen, out_error); +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, +			ext.rc_refcount == 1, out_error); + +		ext.rc_refcount = 0; +		trace_xfs_refcount_modify_extent(cur->bc_mp, +				cur->bc_private.a.agno, &ext); +		error = xfs_refcount_delete(cur, &found_rec); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, +				found_rec == 1, out_error); +		break; +	default: +		ASSERT(0); +	} + +	return error; +out_error: +	trace_xfs_refcount_modify_extent_error(cur->bc_mp, +			cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Add or remove refcount btree entries for CoW reservations. + */ +STATIC int +xfs_refcount_adjust_cow( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		agbno, +	xfs_extlen_t		aglen, +	enum xfs_refc_adjust_op	adj, +	struct xfs_defer_ops	*dfops) +{ +	bool			shape_changed; +	int			error; + +	agbno += XFS_REFC_COW_START; + +	/* +	 * Ensure that no rcextents cross the boundary of the adjustment range. +	 */ +	error = xfs_refcount_split_extent(cur, agbno, &shape_changed); +	if (error) +		goto out_error; + +	error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); +	if (error) +		goto out_error; + +	/* +	 * Try to merge with the left or right extents of the range. +	 */ +	error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, +			XFS_FIND_RCEXT_COW, &shape_changed); +	if (error) +		goto out_error; + +	/* Now that we've taken care of the ends, adjust the middle extents */ +	error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj, +			dfops, NULL); +	if (error) +		goto out_error; + +	return 0; + +out_error: +	trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, +			error, _RET_IP_); +	return error; +} + +/* + * Record a CoW allocation in the refcount btree. + */ +STATIC int +__xfs_refcount_cow_alloc( +	struct xfs_btree_cur	*rcur, +	xfs_agblock_t		agbno, +	xfs_extlen_t		aglen, +	struct xfs_defer_ops	*dfops) +{ +	int			error; + +	trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, +			agbno, aglen); + +	/* Add refcount btree reservation */ +	error = xfs_refcount_adjust_cow(rcur, agbno, aglen, +			XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); +	if (error) +		return error; + +	/* Add rmap entry */ +	if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { +		error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, +				rcur->bc_private.a.agno, +				agbno, aglen, XFS_RMAP_OWN_COW); +		if (error) +			return error; +	} + +	return error; +} + +/* + * Remove a CoW allocation from the refcount btree. + */ +STATIC int +__xfs_refcount_cow_free( +	struct xfs_btree_cur	*rcur, +	xfs_agblock_t		agbno, +	xfs_extlen_t		aglen, +	struct xfs_defer_ops	*dfops) +{ +	int			error; + +	trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, +			agbno, aglen); + +	/* Remove refcount btree reservation */ +	error = xfs_refcount_adjust_cow(rcur, agbno, aglen, +			XFS_REFCOUNT_ADJUST_COW_FREE, dfops); +	if (error) +		return error; + +	/* Remove rmap entry */ +	if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { +		error = xfs_rmap_free_extent(rcur->bc_mp, dfops, +				rcur->bc_private.a.agno, +				agbno, aglen, XFS_RMAP_OWN_COW); +		if (error) +			return error; +	} + +	return error; +} + +/* Record a CoW staging extent in the refcount btree. 
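A short illustration of the keyspace trick used by xfs_refcount_adjust_cow() above: CoW staging records are keyed at agbno + XFS_REFC_COW_START so they can never collide with ordinary shared-extent records. The constant below assumes the CoW start is the top bit of the 32-bit block number space, consistent with the sb_agblocks < XFS_REFC_COW_START check enforced at recovery time; treat it as an assumption rather than the definitive value.

#include <stdio.h>

/* Assumed to mirror XFS_REFC_COW_START: the top bit of the agblock space. */
#define COW_START	(1u << 31)

/* Shift an AG block number into the CoW half of the refcount keyspace. */
static unsigned int cow_key(unsigned int agbno)
{
	return agbno + COW_START;
}

/* Records at or above COW_START describe CoW staging extents. */
static int is_cow_key(unsigned int key)
{
	return key >= COW_START;
}

int main(void)
{
	unsigned int key = cow_key(1234);

	printf("key 0x%x -> %s\n", key, is_cow_key(key) ? "CoW staging" : "shared");
	return 0;
}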
*/ +int +xfs_refcount_alloc_cow_extent( +	struct xfs_mount		*mp, +	struct xfs_defer_ops		*dfops, +	xfs_fsblock_t			fsb, +	xfs_extlen_t			len) +{ +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return 0; + +	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, +			fsb, len); +} + +/* Forget a CoW staging event in the refcount btree. */ +int +xfs_refcount_free_cow_extent( +	struct xfs_mount		*mp, +	struct xfs_defer_ops		*dfops, +	xfs_fsblock_t			fsb, +	xfs_extlen_t			len) +{ +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return 0; + +	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, +			fsb, len); +} + +struct xfs_refcount_recovery { +	struct list_head		rr_list; +	struct xfs_refcount_irec	rr_rrec; +}; + +/* Stuff an extent on the recovery list. */ +STATIC int +xfs_refcount_recover_extent( +	struct xfs_btree_cur		*cur, +	union xfs_btree_rec		*rec, +	void				*priv) +{ +	struct list_head		*debris = priv; +	struct xfs_refcount_recovery	*rr; + +	if (be32_to_cpu(rec->refc.rc_refcount) != 1) +		return -EFSCORRUPTED; + +	rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); +	xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); +	list_add_tail(&rr->rr_list, debris); + +	return 0; +} + +/* Find and remove leftover CoW reservations. */ +int +xfs_refcount_recover_cow_leftovers( +	struct xfs_mount		*mp, +	xfs_agnumber_t			agno) +{ +	struct xfs_trans		*tp; +	struct xfs_btree_cur		*cur; +	struct xfs_buf			*agbp; +	struct xfs_refcount_recovery	*rr, *n; +	struct list_head		debris; +	union xfs_btree_irec		low; +	union xfs_btree_irec		high; +	struct xfs_defer_ops		dfops; +	xfs_fsblock_t			fsb; +	xfs_agblock_t			agbno; +	int				error; + +	if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) +		return -EOPNOTSUPP; + +	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); +	if (error) +		return error; +	cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + +	/* Find all the leftover CoW staging extents. */ +	INIT_LIST_HEAD(&debris); +	memset(&low, 0, sizeof(low)); +	memset(&high, 0, sizeof(high)); +	low.rc.rc_startblock = XFS_REFC_COW_START; +	high.rc.rc_startblock = -1U; +	error = xfs_btree_query_range(cur, &low, &high, +			xfs_refcount_recover_extent, &debris); +	if (error) +		goto out_cursor; +	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); +	xfs_buf_relse(agbp); + +	/* Now iterate the list to free the leftovers */ +	list_for_each_entry(rr, &debris, rr_list) { +		/* Set up transaction. */ +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); +		if (error) +			goto out_free; + +		trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec); + +		/* Free the orphan record */ +		xfs_defer_init(&dfops, &fsb); +		agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; +		fsb = XFS_AGB_TO_FSB(mp, agno, agbno); +		error = xfs_refcount_free_cow_extent(mp, &dfops, fsb, +				rr->rr_rrec.rc_blockcount); +		if (error) +			goto out_defer; + +		/* Free the block. 
*/ +		xfs_bmap_add_free(mp, &dfops, fsb, +				rr->rr_rrec.rc_blockcount, NULL); + +		error = xfs_defer_finish(&tp, &dfops, NULL); +		if (error) +			goto out_defer; + +		error = xfs_trans_commit(tp); +		if (error) +			goto out_free; +	} + +out_free: +	/* Free the leftover list */ +	list_for_each_entry_safe(rr, n, &debris, rr_list) { +		list_del(&rr->rr_list); +		kmem_free(rr); +	} +	return error; + +out_cursor: +	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); +	xfs_buf_relse(agbp); +	goto out_free; + +out_defer: +	xfs_defer_cancel(&dfops); +	xfs_trans_cancel(tp); +	goto out_free; +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h new file mode 100644 index 000000000000..098dc668ab2c --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_H__ +#define __XFS_REFCOUNT_H__ + +extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, +		xfs_agblock_t bno, int *stat); +extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, +		xfs_agblock_t bno, int *stat); +extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, +		struct xfs_refcount_irec *irec, int *stat); + +enum xfs_refcount_intent_type { +	XFS_REFCOUNT_INCREASE = 1, +	XFS_REFCOUNT_DECREASE, +	XFS_REFCOUNT_ALLOC_COW, +	XFS_REFCOUNT_FREE_COW, +}; + +struct xfs_refcount_intent { +	struct list_head			ri_list; +	enum xfs_refcount_intent_type		ri_type; +	xfs_fsblock_t				ri_startblock; +	xfs_extlen_t				ri_blockcount; +}; + +extern int xfs_refcount_increase_extent(struct xfs_mount *mp, +		struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); +extern int xfs_refcount_decrease_extent(struct xfs_mount *mp, +		struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); + +extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, +		struct xfs_btree_cur *rcur, int error); +extern int xfs_refcount_finish_one(struct xfs_trans *tp, +		struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type, +		xfs_fsblock_t startblock, xfs_extlen_t blockcount, +		xfs_fsblock_t *new_fsb, xfs_extlen_t *new_len, +		struct xfs_btree_cur **pcur); + +extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, +		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, +		xfs_extlen_t *flen, bool find_end_of_shared); + +extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp, +		struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, +		xfs_extlen_t len); +extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, +		struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, +		xfs_extlen_t len); +extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, +		xfs_agnumber_t agno); + +#endif	/* __XFS_REFCOUNT_H__ */ diff --git 
a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c new file mode 100644 index 000000000000..453bb2757ec2 --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -0,0 +1,451 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_rmap.h" + +static struct xfs_btree_cur * +xfs_refcountbt_dup_cursor( +	struct xfs_btree_cur	*cur) +{ +	return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, +			cur->bc_private.a.agbp, cur->bc_private.a.agno, +			cur->bc_private.a.dfops); +} + +STATIC void +xfs_refcountbt_set_root( +	struct xfs_btree_cur	*cur, +	union xfs_btree_ptr	*ptr, +	int			inc) +{ +	struct xfs_buf		*agbp = cur->bc_private.a.agbp; +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp); +	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno); +	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno); + +	ASSERT(ptr->s != 0); + +	agf->agf_refcount_root = ptr->s; +	be32_add_cpu(&agf->agf_refcount_level, inc); +	pag->pagf_refcount_level += inc; +	xfs_perag_put(pag); + +	xfs_alloc_log_agf(cur->bc_tp, agbp, +			XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); +} + +STATIC int +xfs_refcountbt_alloc_block( +	struct xfs_btree_cur	*cur, +	union xfs_btree_ptr	*start, +	union xfs_btree_ptr	*new, +	int			*stat) +{ +	struct xfs_buf		*agbp = cur->bc_private.a.agbp; +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp); +	struct xfs_alloc_arg	args;		/* block allocation args */ +	int			error;		/* error return value */ + +	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + +	memset(&args, 0, sizeof(args)); +	args.tp = cur->bc_tp; +	args.mp = cur->bc_mp; +	args.type = XFS_ALLOCTYPE_NEAR_BNO; +	args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, +			xfs_refc_block(args.mp)); +	args.firstblock = args.fsbno; +	xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC); +	args.minlen = args.maxlen = args.prod = 1; +	args.resv = XFS_AG_RESV_METADATA; + +	error = xfs_alloc_vextent(&args); +	if (error) +		goto out_error; +	trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, +			args.agbno, 1); +	if (args.fsbno == NULLFSBLOCK) { +		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +		*stat = 0; +		return 0; +	} +	ASSERT(args.agno == cur->bc_private.a.agno); +	ASSERT(args.len == 1); + +	new->s = cpu_to_be32(args.agbno); +	be32_add_cpu(&agf->agf_refcount_blocks, 1); +	xfs_alloc_log_agf(cur->bc_tp, agbp, 
XFS_AGF_REFCOUNT_BLOCKS); + +	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +	*stat = 1; +	return 0; + +out_error: +	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); +	return error; +} + +STATIC int +xfs_refcountbt_free_block( +	struct xfs_btree_cur	*cur, +	struct xfs_buf		*bp) +{ +	struct xfs_mount	*mp = cur->bc_mp; +	struct xfs_buf		*agbp = cur->bc_private.a.agbp; +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp); +	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); +	struct xfs_owner_info	oinfo; +	int			error; + +	trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, +			XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); +	be32_add_cpu(&agf->agf_refcount_blocks, -1); +	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); +	error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo, +			XFS_AG_RESV_METADATA); +	if (error) +		return error; + +	return error; +} + +STATIC int +xfs_refcountbt_get_minrecs( +	struct xfs_btree_cur	*cur, +	int			level) +{ +	return cur->bc_mp->m_refc_mnr[level != 0]; +} + +STATIC int +xfs_refcountbt_get_maxrecs( +	struct xfs_btree_cur	*cur, +	int			level) +{ +	return cur->bc_mp->m_refc_mxr[level != 0]; +} + +STATIC void +xfs_refcountbt_init_key_from_rec( +	union xfs_btree_key	*key, +	union xfs_btree_rec	*rec) +{ +	key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_refcountbt_init_high_key_from_rec( +	union xfs_btree_key	*key, +	union xfs_btree_rec	*rec) +{ +	__u32			x; + +	x = be32_to_cpu(rec->refc.rc_startblock); +	x += be32_to_cpu(rec->refc.rc_blockcount) - 1; +	key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_refcountbt_init_rec_from_cur( +	struct xfs_btree_cur	*cur, +	union xfs_btree_rec	*rec) +{ +	rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); +	rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); +	rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_refcountbt_init_ptr_from_cur( +	struct xfs_btree_cur	*cur, +	union xfs_btree_ptr	*ptr) +{ +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + +	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); +	ASSERT(agf->agf_refcount_root != 0); + +	ptr->s = agf->agf_refcount_root; +} + +STATIC __int64_t +xfs_refcountbt_key_diff( +	struct xfs_btree_cur	*cur, +	union xfs_btree_key	*key) +{ +	struct xfs_refcount_irec	*rec = &cur->bc_rec.rc; +	struct xfs_refcount_key		*kp = &key->refc; + +	return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; +} + +STATIC __int64_t +xfs_refcountbt_diff_two_keys( +	struct xfs_btree_cur	*cur, +	union xfs_btree_key	*k1, +	union xfs_btree_key	*k2) +{ +	return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) - +			  be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC bool +xfs_refcountbt_verify( +	struct xfs_buf		*bp) +{ +	struct xfs_mount	*mp = bp->b_target->bt_mount; +	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp); +	struct xfs_perag	*pag = bp->b_pag; +	unsigned int		level; + +	if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) +		return false; + +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return false; +	if (!xfs_btree_sblock_v5hdr_verify(bp)) +		return false; + +	level = be16_to_cpu(block->bb_level); +	if (pag && pag->pagf_init) { +		if (level >= pag->pagf_refcount_level) +			return false; +	} else if (level >= mp->m_refc_maxlevels) +		return false; + +	return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); +} + +STATIC void +xfs_refcountbt_read_verify( +	struct xfs_buf	
*bp) +{ +	if (!xfs_btree_sblock_verify_crc(bp)) +		xfs_buf_ioerror(bp, -EFSBADCRC); +	else if (!xfs_refcountbt_verify(bp)) +		xfs_buf_ioerror(bp, -EFSCORRUPTED); + +	if (bp->b_error) { +		trace_xfs_btree_corrupt(bp, _RET_IP_); +		xfs_verifier_error(bp); +	} +} + +STATIC void +xfs_refcountbt_write_verify( +	struct xfs_buf	*bp) +{ +	if (!xfs_refcountbt_verify(bp)) { +		trace_xfs_btree_corrupt(bp, _RET_IP_); +		xfs_buf_ioerror(bp, -EFSCORRUPTED); +		xfs_verifier_error(bp); +		return; +	} +	xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_refcountbt_buf_ops = { +	.name			= "xfs_refcountbt", +	.verify_read		= xfs_refcountbt_read_verify, +	.verify_write		= xfs_refcountbt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_refcountbt_keys_inorder( +	struct xfs_btree_cur	*cur, +	union xfs_btree_key	*k1, +	union xfs_btree_key	*k2) +{ +	return be32_to_cpu(k1->refc.rc_startblock) < +	       be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_refcountbt_recs_inorder( +	struct xfs_btree_cur	*cur, +	union xfs_btree_rec	*r1, +	union xfs_btree_rec	*r2) +{ +	return  be32_to_cpu(r1->refc.rc_startblock) + +		be32_to_cpu(r1->refc.rc_blockcount) <= +		be32_to_cpu(r2->refc.rc_startblock); +} +#endif + +static const struct xfs_btree_ops xfs_refcountbt_ops = { +	.rec_len		= sizeof(struct xfs_refcount_rec), +	.key_len		= sizeof(struct xfs_refcount_key), + +	.dup_cursor		= xfs_refcountbt_dup_cursor, +	.set_root		= xfs_refcountbt_set_root, +	.alloc_block		= xfs_refcountbt_alloc_block, +	.free_block		= xfs_refcountbt_free_block, +	.get_minrecs		= xfs_refcountbt_get_minrecs, +	.get_maxrecs		= xfs_refcountbt_get_maxrecs, +	.init_key_from_rec	= xfs_refcountbt_init_key_from_rec, +	.init_high_key_from_rec	= xfs_refcountbt_init_high_key_from_rec, +	.init_rec_from_cur	= xfs_refcountbt_init_rec_from_cur, +	.init_ptr_from_cur	= xfs_refcountbt_init_ptr_from_cur, +	.key_diff		= xfs_refcountbt_key_diff, +	.buf_ops		= &xfs_refcountbt_buf_ops, +	.diff_two_keys		= xfs_refcountbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) +	.keys_inorder		= xfs_refcountbt_keys_inorder, +	.recs_inorder		= xfs_refcountbt_recs_inorder, +#endif +}; + +/* + * Allocate a new refcount btree cursor. + */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( +	struct xfs_mount	*mp, +	struct xfs_trans	*tp, +	struct xfs_buf		*agbp, +	xfs_agnumber_t		agno, +	struct xfs_defer_ops	*dfops) +{ +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp); +	struct xfs_btree_cur	*cur; + +	ASSERT(agno != NULLAGNUMBER); +	ASSERT(agno < mp->m_sb.sb_agcount); +	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + +	cur->bc_tp = tp; +	cur->bc_mp = mp; +	cur->bc_btnum = XFS_BTNUM_REFC; +	cur->bc_blocklog = mp->m_sb.sb_blocklog; +	cur->bc_ops = &xfs_refcountbt_ops; + +	cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + +	cur->bc_private.a.agbp = agbp; +	cur->bc_private.a.agno = agno; +	cur->bc_private.a.dfops = dfops; +	cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + +	cur->bc_private.a.priv.refc.nr_ops = 0; +	cur->bc_private.a.priv.refc.shape_changes = 0; + +	return cur; +} + +/* + * Calculate the number of records in a refcount btree block. + */ +int +xfs_refcountbt_maxrecs( +	struct xfs_mount	*mp, +	int			blocklen, +	bool			leaf) +{ +	blocklen -= XFS_REFCOUNT_BLOCK_LEN; + +	if (leaf) +		return blocklen / sizeof(struct xfs_refcount_rec); +	return blocklen / (sizeof(struct xfs_refcount_key) + +			   sizeof(xfs_refcount_ptr_t)); +} + +/* Compute the maximum height of a refcount btree. 
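For a sense of scale, the per-block record arithmetic in xfs_refcountbt_maxrecs() above can be worked by hand. The 4096-byte block and 56-byte short-form CRC header below are assumptions chosen for illustration; the record, key and pointer sizes follow from the three be32 record fields, the single be32 key and the be32 short-form block pointer used in this patch.

#include <stdio.h>

#define BLOCKSIZE	4096	/* assumed filesystem block size */
#define SBLOCK_CRC_LEN	56	/* assumed v5 short-form btree block header */
#define REC_LEN		12	/* rc_startblock + rc_blockcount + rc_refcount */
#define KEY_LEN		4	/* rc_startblock only */
#define PTR_LEN		4	/* short-form (AG-relative) block pointer */

int main(void)
{
	int space = BLOCKSIZE - SBLOCK_CRC_LEN;

	/*
	 * Mirrors xfs_refcountbt_maxrecs(): leaf blocks hold records, node
	 * blocks hold key/pointer pairs.
	 */
	printf("leaf records per block:       %d\n", space / REC_LEN);
	printf("node key/ptr pairs per block: %d\n", space / (KEY_LEN + PTR_LEN));
	return 0;
}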
*/ +void +xfs_refcountbt_compute_maxlevels( +	struct xfs_mount		*mp) +{ +	mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp, +			mp->m_refc_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the refcount btree size for some records. */ +xfs_extlen_t +xfs_refcountbt_calc_size( +	struct xfs_mount	*mp, +	unsigned long long	len) +{ +	return xfs_btree_calc_size(mp, mp->m_refc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +xfs_extlen_t +xfs_refcountbt_max_size( +	struct xfs_mount	*mp) +{ +	/* Bail out if we're uninitialized, which can happen in mkfs. */ +	if (mp->m_refc_mxr[0] == 0) +		return 0; + +	return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_refcountbt_calc_reserves( +	struct xfs_mount	*mp, +	xfs_agnumber_t		agno, +	xfs_extlen_t		*ask, +	xfs_extlen_t		*used) +{ +	struct xfs_buf		*agbp; +	struct xfs_agf		*agf; +	xfs_extlen_t		tree_len; +	int			error; + +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return 0; + +	*ask += xfs_refcountbt_max_size(mp); + +	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); +	if (error) +		return error; + +	agf = XFS_BUF_TO_AGF(agbp); +	tree_len = be32_to_cpu(agf->agf_refcount_blocks); +	xfs_buf_relse(agbp); + +	*used += tree_len; + +	return error; +} diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h new file mode 100644 index 000000000000..3be7768bd51a --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_BTREE_H__ +#define	__XFS_REFCOUNT_BTREE_H__ + +/* + * Reference Count Btree on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size + */ +#define XFS_REFCOUNT_BLOCK_LEN	XFS_BTREE_SBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. 
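The record/key/pointer address macros that follow take a 1-based index and step past the block header; the sketch below mirrors that addressing with assumed header and record sizes, not the kernel's definitions:

#include <stdio.h>
#include <stdint.h>

/* Assumed sizes, for illustration only. */
#define BLOCK_HDR_LEN	56
#define REC_LEN		12

/* 1-based record addressing: record 1 sits right after the header. */
static void *rec_addr(void *block, int index)
{
	return (char *)block + BLOCK_HDR_LEN + (index - 1) * REC_LEN;
}

int main(void)
{
	char block[4096];

	printf("rec 1 offset: %td\n", (char *)rec_addr(block, 1) - block);
	printf("rec 2 offset: %td\n", (char *)rec_addr(block, 2) - block);
	return 0;
}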
+ * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_REFCOUNT_REC_ADDR(block, index) \ +	((struct xfs_refcount_rec *) \ +		((char *)(block) + \ +		 XFS_REFCOUNT_BLOCK_LEN + \ +		 (((index) - 1) * sizeof(struct xfs_refcount_rec)))) + +#define XFS_REFCOUNT_KEY_ADDR(block, index) \ +	((struct xfs_refcount_key *) \ +		((char *)(block) + \ +		 XFS_REFCOUNT_BLOCK_LEN + \ +		 ((index) - 1) * sizeof(struct xfs_refcount_key))) + +#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \ +	((xfs_refcount_ptr_t *) \ +		((char *)(block) + \ +		 XFS_REFCOUNT_BLOCK_LEN + \ +		 (maxrecs) * sizeof(struct xfs_refcount_key) + \ +		 ((index) - 1) * sizeof(xfs_refcount_ptr_t))) + +extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, +		struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, +		struct xfs_defer_ops *dfops); +extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen, +		bool leaf); +extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); + +extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, +		unsigned long long len); +extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp); + +extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, +		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + +#endif	/* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 73d05407d663..3a8cc7139912 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -148,6 +148,37 @@ done:  	return error;  } +STATIC int +xfs_rmap_delete( +	struct xfs_btree_cur	*rcur, +	xfs_agblock_t		agbno, +	xfs_extlen_t		len, +	uint64_t		owner, +	uint64_t		offset, +	unsigned int		flags) +{ +	int			i; +	int			error; + +	trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, +			len, owner, offset, flags); + +	error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); +	if (error) +		goto done; +	XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + +	error = xfs_btree_delete(rcur, &i); +	if (error) +		goto done; +	XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); +done: +	if (error) +		trace_xfs_rmap_delete_error(rcur->bc_mp, +				rcur->bc_private.a.agno, error, _RET_IP_); +	return error; +} +  static int  xfs_rmap_btrec_to_irec(  	union xfs_btree_rec	*rec, @@ -180,6 +211,160 @@ xfs_rmap_get_rec(  	return xfs_rmap_btrec_to_irec(rec, irec);  } +struct xfs_find_left_neighbor_info { +	struct xfs_rmap_irec	high; +	struct xfs_rmap_irec	*irec; +	int			*stat; +}; + +/* For each rmap given, figure out if it matches the key we want. 
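The helper below, like its lookup_le_range counterpart further down, is a callback driven over a key-ordered range query: each candidate record either lets the scan continue or is copied out and aborts the scan. A self-contained sketch of that contract over a plain array (all names and return codes invented) looks like:

#include <stdio.h>

#define QUERY_CONTINUE	0	/* keep scanning */
#define QUERY_ABORT	1	/* match found, stop the scan */

struct rec { unsigned int start, len, owner; };

/* Callback: skip records with the wrong owner, copy out the first match. */
static int match_owner(const struct rec *r, void *priv, struct rec *out)
{
	unsigned int want = *(unsigned int *)priv;

	if (r->owner != want)
		return QUERY_CONTINUE;
	*out = *r;
	return QUERY_ABORT;
}

static int query_range(const struct rec *recs, int nr,
		       int (*fn)(const struct rec *, void *, struct rec *),
		       void *priv, struct rec *out)
{
	int i;

	for (i = 0; i < nr; i++)
		if (fn(&recs[i], priv, out) == QUERY_ABORT)
			return QUERY_ABORT;
	return QUERY_CONTINUE;
}

int main(void)
{
	struct rec recs[] = { { 10, 4, 1 }, { 20, 8, 2 }, { 40, 2, 2 } };
	struct rec hit = { 0, 0, 0 };
	unsigned int owner = 2;

	if (query_range(recs, 3, match_owner, &owner, &hit) == QUERY_ABORT)
		printf("found start %u len %u\n", hit.start, hit.len);
	return 0;
}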
*/ +STATIC int +xfs_rmap_find_left_neighbor_helper( +	struct xfs_btree_cur	*cur, +	struct xfs_rmap_irec	*rec, +	void			*priv) +{ +	struct xfs_find_left_neighbor_info	*info = priv; + +	trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, +			cur->bc_private.a.agno, rec->rm_startblock, +			rec->rm_blockcount, rec->rm_owner, rec->rm_offset, +			rec->rm_flags); + +	if (rec->rm_owner != info->high.rm_owner) +		return XFS_BTREE_QUERY_RANGE_CONTINUE; +	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && +	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && +	    rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) +		return XFS_BTREE_QUERY_RANGE_CONTINUE; + +	*info->irec = *rec; +	*info->stat = 1; +	return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and adjacent physical and logical + * block ranges. + */ +int +xfs_rmap_find_left_neighbor( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		bno, +	uint64_t		owner, +	uint64_t		offset, +	unsigned int		flags, +	struct xfs_rmap_irec	*irec, +	int			*stat) +{ +	struct xfs_find_left_neighbor_info	info; +	int			error; + +	*stat = 0; +	if (bno == 0) +		return 0; +	info.high.rm_startblock = bno - 1; +	info.high.rm_owner = owner; +	if (!XFS_RMAP_NON_INODE_OWNER(owner) && +	    !(flags & XFS_RMAP_BMBT_BLOCK)) { +		if (offset == 0) +			return 0; +		info.high.rm_offset = offset - 1; +	} else +		info.high.rm_offset = 0; +	info.high.rm_flags = flags; +	info.high.rm_blockcount = 0; +	info.irec = irec; +	info.stat = stat; + +	trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, +			cur->bc_private.a.agno, bno, 0, owner, offset, flags); + +	error = xfs_rmap_query_range(cur, &info.high, &info.high, +			xfs_rmap_find_left_neighbor_helper, &info); +	if (error == XFS_BTREE_QUERY_RANGE_ABORT) +		error = 0; +	if (*stat) +		trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, +				cur->bc_private.a.agno, irec->rm_startblock, +				irec->rm_blockcount, irec->rm_owner, +				irec->rm_offset, irec->rm_flags); +	return error; +} + +/* For each rmap given, figure out if it matches the key we want. */ +STATIC int +xfs_rmap_lookup_le_range_helper( +	struct xfs_btree_cur	*cur, +	struct xfs_rmap_irec	*rec, +	void			*priv) +{ +	struct xfs_find_left_neighbor_info	*info = priv; + +	trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, +			cur->bc_private.a.agno, rec->rm_startblock, +			rec->rm_blockcount, rec->rm_owner, rec->rm_offset, +			rec->rm_flags); + +	if (rec->rm_owner != info->high.rm_owner) +		return XFS_BTREE_QUERY_RANGE_CONTINUE; +	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && +	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && +	    (rec->rm_offset > info->high.rm_offset || +	     rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) +		return XFS_BTREE_QUERY_RANGE_CONTINUE; + +	*info->irec = *rec; +	*info->stat = 1; +	return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and overlapping physical and logical + * block ranges.  This is the overlapping-interval version of + * xfs_rmap_lookup_le. 
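The practical difference between the two lookups is the predicate applied to each candidate: xfs_rmap_find_left_neighbor wants a record whose range ends immediately before the query point, while xfs_rmap_lookup_le_range wants one that overlaps it. A toy illustration of the two tests, reduced to one dimension with invented names:

#include <stdbool.h>
#include <stdio.h>

struct span { unsigned long long start, len; };

/* find_left_neighbor-style test: candidate ends exactly where we begin. */
static bool is_left_adjacent(struct span rec, unsigned long long off)
{
	return rec.start + rec.len == off;
}

/* lookup_le_range-style test: candidate covers the offset we asked about. */
static bool covers(struct span rec, unsigned long long off)
{
	return rec.start <= off && off < rec.start + rec.len;
}

int main(void)
{
	struct span rec = { 100, 8 };	/* offsets 100..107 */

	printf("adjacent to 108? %d\n", is_left_adjacent(rec, 108));
	printf("covers 104?      %d\n", covers(rec, 104));
	printf("covers 108?      %d\n", covers(rec, 108));
	return 0;
}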
+ */ +int +xfs_rmap_lookup_le_range( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		bno, +	uint64_t		owner, +	uint64_t		offset, +	unsigned int		flags, +	struct xfs_rmap_irec	*irec, +	int			*stat) +{ +	struct xfs_find_left_neighbor_info	info; +	int			error; + +	info.high.rm_startblock = bno; +	info.high.rm_owner = owner; +	if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK)) +		info.high.rm_offset = offset; +	else +		info.high.rm_offset = 0; +	info.high.rm_flags = flags; +	info.high.rm_blockcount = 0; +	*stat = 0; +	info.irec = irec; +	info.stat = stat; + +	trace_xfs_rmap_lookup_le_range(cur->bc_mp, +			cur->bc_private.a.agno, bno, 0, owner, offset, flags); +	error = xfs_rmap_query_range(cur, &info.high, &info.high, +			xfs_rmap_lookup_le_range_helper, &info); +	if (error == XFS_BTREE_QUERY_RANGE_ABORT) +		error = 0; +	if (*stat) +		trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, +				cur->bc_private.a.agno, irec->rm_startblock, +				irec->rm_blockcount, irec->rm_owner, +				irec->rm_offset, irec->rm_flags); +	return error; +} +  /*   * Find the extent in the rmap btree and remove it.   * @@ -1093,11 +1278,704 @@ done:  	return error;  } +/* + * Convert an unwritten extent to a real extent or vice versa.  If there is no + * possibility of overlapping extents, delegate to the simpler convert + * function. + */ +STATIC int +xfs_rmap_convert_shared( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		bno, +	xfs_extlen_t		len, +	bool			unwritten, +	struct xfs_owner_info	*oinfo) +{ +	struct xfs_mount	*mp = cur->bc_mp; +	struct xfs_rmap_irec	r[4];	/* neighbor extent entries */ +					/* left is 0, right is 1, prev is 2 */ +					/* new is 3 */ +	uint64_t		owner; +	uint64_t		offset; +	uint64_t		new_endoff; +	unsigned int		oldext; +	unsigned int		newext; +	unsigned int		flags = 0; +	int			i; +	int			state = 0; +	int			error; + +	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); +	ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) || +			(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); +	oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; +	new_endoff = offset + len; +	trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, +			unwritten, oinfo); + +	/* +	 * For the initial lookup, look for and exact match or the left-adjacent +	 * record for our insertion point. This will also give us the record for +	 * start block contiguity tests. +	 */ +	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, +			&PREV, &i); +	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + +	ASSERT(PREV.rm_offset <= offset); +	ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); +	ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext); +	newext = ~oldext & XFS_RMAP_UNWRITTEN; + +	/* +	 * Set flags determining what part of the previous oldext allocation +	 * extent is being replaced by a newext allocation. +	 */ +	if (PREV.rm_offset == offset) +		state |= RMAP_LEFT_FILLING; +	if (PREV.rm_offset + PREV.rm_blockcount == new_endoff) +		state |= RMAP_RIGHT_FILLING; + +	/* Is there a left record that abuts our range? */ +	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext, +			&LEFT, &i); +	if (error) +		goto done; +	if (i) { +		state |= RMAP_LEFT_VALID; +		XFS_WANT_CORRUPTED_GOTO(mp, +				LEFT.rm_startblock + LEFT.rm_blockcount <= bno, +				done); +		if (xfs_rmap_is_mergeable(&LEFT, owner, newext)) +			state |= RMAP_LEFT_CONTIG; +	} + +	/* Is there a right record that abuts our range? 
*/ +	error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, +			newext, &i); +	if (error) +		goto done; +	if (i) { +		state |= RMAP_RIGHT_VALID; +		error = xfs_rmap_get_rec(cur, &RIGHT, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, +				done); +		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, +				cur->bc_private.a.agno, RIGHT.rm_startblock, +				RIGHT.rm_blockcount, RIGHT.rm_owner, +				RIGHT.rm_offset, RIGHT.rm_flags); +		if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) +			state |= RMAP_RIGHT_CONTIG; +	} + +	/* check that left + prev + right is not too long */ +	if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | +			 RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) == +	    (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | +	     RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) && +	    (unsigned long)LEFT.rm_blockcount + len + +	     RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) +		state &= ~RMAP_RIGHT_CONTIG; + +	trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, +			_RET_IP_); +	/* +	 * Switch out based on the FILLING and CONTIG state bits. +	 */ +	switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | +			 RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) { +	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | +	     RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: +		/* +		 * Setting all of a previous oldext extent to newext. +		 * The left and right neighbors are both contiguous with new. +		 */ +		error = xfs_rmap_delete(cur, RIGHT.rm_startblock, +				RIGHT.rm_blockcount, RIGHT.rm_owner, +				RIGHT.rm_offset, RIGHT.rm_flags); +		if (error) +			goto done; +		error = xfs_rmap_delete(cur, PREV.rm_startblock, +				PREV.rm_blockcount, PREV.rm_owner, +				PREV.rm_offset, PREV.rm_flags); +		if (error) +			goto done; +		NEW = LEFT; +		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; +		error = xfs_rmap_update(cur, &NEW); +		if (error) +			goto done; +		break; + +	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: +		/* +		 * Setting all of a previous oldext extent to newext. +		 * The left neighbor is contiguous, the right is not. +		 */ +		error = xfs_rmap_delete(cur, PREV.rm_startblock, +				PREV.rm_blockcount, PREV.rm_owner, +				PREV.rm_offset, PREV.rm_flags); +		if (error) +			goto done; +		NEW = LEFT; +		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		NEW.rm_blockcount += PREV.rm_blockcount; +		error = xfs_rmap_update(cur, &NEW); +		if (error) +			goto done; +		break; + +	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: +		/* +		 * Setting all of a previous oldext extent to newext. +		 * The right neighbor is contiguous, the left is not. 
+		 */ +		error = xfs_rmap_delete(cur, RIGHT.rm_startblock, +				RIGHT.rm_blockcount, RIGHT.rm_owner, +				RIGHT.rm_offset, RIGHT.rm_flags); +		if (error) +			goto done; +		NEW = PREV; +		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		NEW.rm_blockcount += RIGHT.rm_blockcount; +		NEW.rm_flags = RIGHT.rm_flags; +		error = xfs_rmap_update(cur, &NEW); +		if (error) +			goto done; +		break; + +	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING: +		/* +		 * Setting all of a previous oldext extent to newext. +		 * Neither the left nor right neighbors are contiguous with +		 * the new one. +		 */ +		NEW = PREV; +		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		NEW.rm_flags = newext; +		error = xfs_rmap_update(cur, &NEW); +		if (error) +			goto done; +		break; + +	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG: +		/* +		 * Setting the first part of a previous oldext extent to newext. +		 * The left neighbor is contiguous. +		 */ +		NEW = PREV; +		error = xfs_rmap_delete(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags); +		if (error) +			goto done; +		NEW.rm_offset += len; +		NEW.rm_startblock += len; +		NEW.rm_blockcount -= len; +		error = xfs_rmap_insert(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags); +		if (error) +			goto done; +		NEW = LEFT; +		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		NEW.rm_blockcount += len; +		error = xfs_rmap_update(cur, &NEW); +		if (error) +			goto done; +		break; + +	case RMAP_LEFT_FILLING: +		/* +		 * Setting the first part of a previous oldext extent to newext. +		 * The left neighbor is not contiguous. +		 */ +		NEW = PREV; +		error = xfs_rmap_delete(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags); +		if (error) +			goto done; +		NEW.rm_offset += len; +		NEW.rm_startblock += len; +		NEW.rm_blockcount -= len; +		error = xfs_rmap_insert(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags); +		if (error) +			goto done; +		error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); +		if (error) +			goto done; +		break; + +	case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: +		/* +		 * Setting the last part of a previous oldext extent to newext. +		 * The right neighbor is contiguous with the new allocation. 
+		 */ +		NEW = PREV; +		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		NEW.rm_blockcount = offset - NEW.rm_offset; +		error = xfs_rmap_update(cur, &NEW); +		if (error) +			goto done; +		NEW = RIGHT; +		error = xfs_rmap_delete(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags); +		if (error) +			goto done; +		NEW.rm_offset = offset; +		NEW.rm_startblock = bno; +		NEW.rm_blockcount += len; +		error = xfs_rmap_insert(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags); +		if (error) +			goto done; +		break; + +	case RMAP_RIGHT_FILLING: +		/* +		 * Setting the last part of a previous oldext extent to newext. +		 * The right neighbor is not contiguous. +		 */ +		NEW = PREV; +		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		NEW.rm_blockcount -= len; +		error = xfs_rmap_update(cur, &NEW); +		if (error) +			goto done; +		error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); +		if (error) +			goto done; +		break; + +	case 0: +		/* +		 * Setting the middle part of a previous oldext extent to +		 * newext.  Contiguity is impossible here. +		 * One extent becomes three extents. +		 */ +		/* new right extent - oldext */ +		NEW.rm_startblock = bno + len; +		NEW.rm_owner = owner; +		NEW.rm_offset = new_endoff; +		NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount - +				new_endoff; +		NEW.rm_flags = PREV.rm_flags; +		error = xfs_rmap_insert(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, +				NEW.rm_flags); +		if (error) +			goto done; +		/* new left extent - oldext */ +		NEW = PREV; +		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, +				NEW.rm_offset, NEW.rm_flags, &i); +		if (error) +			goto done; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); +		NEW.rm_blockcount = offset - NEW.rm_offset; +		error = xfs_rmap_update(cur, &NEW); +		if (error) +			goto done; +		/* new middle extent - newext */ +		NEW.rm_startblock = bno; +		NEW.rm_blockcount = len; +		NEW.rm_owner = owner; +		NEW.rm_offset = offset; +		NEW.rm_flags = newext; +		error = xfs_rmap_insert(cur, NEW.rm_startblock, +				NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, +				NEW.rm_flags); +		if (error) +			goto done; +		break; + +	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: +	case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: +	case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG: +	case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: +	case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: +	case RMAP_LEFT_CONTIG: +	case RMAP_RIGHT_CONTIG: +		/* +		 * These cases are all impossible. +		 */ +		ASSERT(0); +	} + +	trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, +			unwritten, oinfo); +done: +	if (error) +		trace_xfs_rmap_convert_error(cur->bc_mp, +				cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} +  #undef	NEW  #undef	LEFT  #undef	RIGHT  #undef	PREV +/* + * Find an extent in the rmap btree and unmap it.  For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. 
+ * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _free function. + */ +STATIC int +xfs_rmap_unmap_shared( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		bno, +	xfs_extlen_t		len, +	bool			unwritten, +	struct xfs_owner_info	*oinfo) +{ +	struct xfs_mount	*mp = cur->bc_mp; +	struct xfs_rmap_irec	ltrec; +	uint64_t		ltoff; +	int			error = 0; +	int			i; +	uint64_t		owner; +	uint64_t		offset; +	unsigned int		flags; + +	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); +	if (unwritten) +		flags |= XFS_RMAP_UNWRITTEN; +	trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, +			unwritten, oinfo); + +	/* +	 * We should always have a left record because there's a static record +	 * for the AG headers at rm_startblock == 0 created by mkfs/growfs that +	 * will not ever be removed from the tree. +	 */ +	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, +			<rec, &i); +	if (error) +		goto out_error; +	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); +	ltoff = ltrec.rm_offset; + +	/* Make sure the extent we found covers the entire freeing range. */ +	XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && +		ltrec.rm_startblock + ltrec.rm_blockcount >= +		bno + len, out_error); + +	/* Make sure the owner matches what we expect to find in the tree. */ +	XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error); + +	/* Make sure the unwritten flag matches. */ +	XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == +			(ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + +	/* Check the offset. */ +	XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error); +	XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount, +			out_error); + +	if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { +		/* Exact match, simply remove the record from rmap tree. */ +		error = xfs_rmap_delete(cur, ltrec.rm_startblock, +				ltrec.rm_blockcount, ltrec.rm_owner, +				ltrec.rm_offset, ltrec.rm_flags); +		if (error) +			goto out_error; +	} else if (ltrec.rm_startblock == bno) { +		/* +		 * Overlap left hand side of extent: move the start, trim the +		 * length and update the current record. +		 * +		 *       ltbno                ltlen +		 * Orig:    |oooooooooooooooooooo| +		 * Freeing: |fffffffff| +		 * Result:            |rrrrrrrrrr| +		 *         bno       len +		 */ + +		/* Delete prev rmap. */ +		error = xfs_rmap_delete(cur, ltrec.rm_startblock, +				ltrec.rm_blockcount, ltrec.rm_owner, +				ltrec.rm_offset, ltrec.rm_flags); +		if (error) +			goto out_error; + +		/* Add an rmap at the new offset. */ +		ltrec.rm_startblock += len; +		ltrec.rm_blockcount -= len; +		ltrec.rm_offset += len; +		error = xfs_rmap_insert(cur, ltrec.rm_startblock, +				ltrec.rm_blockcount, ltrec.rm_owner, +				ltrec.rm_offset, ltrec.rm_flags); +		if (error) +			goto out_error; +	} else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) { +		/* +		 * Overlap right hand side of extent: trim the length and +		 * update the current record. 
+		 * +		 *       ltbno                ltlen +		 * Orig:    |oooooooooooooooooooo| +		 * Freeing:            |fffffffff| +		 * Result:  |rrrrrrrrrr| +		 *                    bno       len +		 */ +		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, +				ltrec.rm_blockcount, ltrec.rm_owner, +				ltrec.rm_offset, ltrec.rm_flags, &i); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); +		ltrec.rm_blockcount -= len; +		error = xfs_rmap_update(cur, <rec); +		if (error) +			goto out_error; +	} else { +		/* +		 * Overlap middle of extent: trim the length of the existing +		 * record to the length of the new left-extent size, increment +		 * the insertion position so we can insert a new record +		 * containing the remaining right-extent space. +		 * +		 *       ltbno                ltlen +		 * Orig:    |oooooooooooooooooooo| +		 * Freeing:       |fffffffff| +		 * Result:  |rrrrr|         |rrrr| +		 *               bno       len +		 */ +		xfs_extlen_t	orig_len = ltrec.rm_blockcount; + +		/* Shrink the left side of the rmap */ +		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, +				ltrec.rm_blockcount, ltrec.rm_owner, +				ltrec.rm_offset, ltrec.rm_flags, &i); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); +		ltrec.rm_blockcount = bno - ltrec.rm_startblock; +		error = xfs_rmap_update(cur, <rec); +		if (error) +			goto out_error; + +		/* Add an rmap at the new offset */ +		error = xfs_rmap_insert(cur, bno + len, +				orig_len - len - ltrec.rm_blockcount, +				ltrec.rm_owner, offset + len, +				ltrec.rm_flags); +		if (error) +			goto out_error; +	} + +	trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, +			unwritten, oinfo); +out_error: +	if (error) +		trace_xfs_rmap_unmap_error(cur->bc_mp, +				cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} + +/* + * Find an extent in the rmap btree and map it.  For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _alloc function. + */ +STATIC int +xfs_rmap_map_shared( +	struct xfs_btree_cur	*cur, +	xfs_agblock_t		bno, +	xfs_extlen_t		len, +	bool			unwritten, +	struct xfs_owner_info	*oinfo) +{ +	struct xfs_mount	*mp = cur->bc_mp; +	struct xfs_rmap_irec	ltrec; +	struct xfs_rmap_irec	gtrec; +	int			have_gt; +	int			have_lt; +	int			error = 0; +	int			i; +	uint64_t		owner; +	uint64_t		offset; +	unsigned int		flags = 0; + +	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); +	if (unwritten) +		flags |= XFS_RMAP_UNWRITTEN; +	trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, +			unwritten, oinfo); + +	/* Is there a left record that abuts our range? */ +	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags, +			<rec, &have_lt); +	if (error) +		goto out_error; +	if (have_lt && +	    !xfs_rmap_is_mergeable(<rec, owner, flags)) +		have_lt = 0; + +	/* Is there a right record that abuts our range? 
*/ +	error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, +			flags, &have_gt); +	if (error) +		goto out_error; +	if (have_gt) { +		error = xfs_rmap_get_rec(cur, >rec, &have_gt); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); +		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, +			cur->bc_private.a.agno, gtrec.rm_startblock, +			gtrec.rm_blockcount, gtrec.rm_owner, +			gtrec.rm_offset, gtrec.rm_flags); + +		if (!xfs_rmap_is_mergeable(>rec, owner, flags)) +			have_gt = 0; +	} + +	if (have_lt && +	    ltrec.rm_startblock + ltrec.rm_blockcount == bno && +	    ltrec.rm_offset + ltrec.rm_blockcount == offset) { +		/* +		 * Left edge contiguous, merge into left record. +		 * +		 *       ltbno     ltlen +		 * orig:   |ooooooooo| +		 * adding:           |aaaaaaaaa| +		 * result: |rrrrrrrrrrrrrrrrrrr| +		 *                  bno       len +		 */ +		ltrec.rm_blockcount += len; +		if (have_gt && +		    bno + len == gtrec.rm_startblock && +		    offset + len == gtrec.rm_offset) { +			/* +			 * Right edge also contiguous, delete right record +			 * and merge into left record. +			 * +			 *       ltbno     ltlen    gtbno     gtlen +			 * orig:   |ooooooooo|         |ooooooooo| +			 * adding:           |aaaaaaaaa| +			 * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| +			 */ +			ltrec.rm_blockcount += gtrec.rm_blockcount; +			error = xfs_rmap_delete(cur, gtrec.rm_startblock, +					gtrec.rm_blockcount, gtrec.rm_owner, +					gtrec.rm_offset, gtrec.rm_flags); +			if (error) +				goto out_error; +		} + +		/* Point the cursor back to the left record and update. */ +		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, +				ltrec.rm_blockcount, ltrec.rm_owner, +				ltrec.rm_offset, ltrec.rm_flags, &i); +		if (error) +			goto out_error; +		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + +		error = xfs_rmap_update(cur, <rec); +		if (error) +			goto out_error; +	} else if (have_gt && +		   bno + len == gtrec.rm_startblock && +		   offset + len == gtrec.rm_offset) { +		/* +		 * Right edge contiguous, merge into right record. +		 * +		 *                 gtbno     gtlen +		 * Orig:             |ooooooooo| +		 * adding: |aaaaaaaaa| +		 * Result: |rrrrrrrrrrrrrrrrrrr| +		 *        bno       len +		 */ +		/* Delete the old record. */ +		error = xfs_rmap_delete(cur, gtrec.rm_startblock, +				gtrec.rm_blockcount, gtrec.rm_owner, +				gtrec.rm_offset, gtrec.rm_flags); +		if (error) +			goto out_error; + +		/* Move the start and re-add it. */ +		gtrec.rm_startblock = bno; +		gtrec.rm_blockcount += len; +		gtrec.rm_offset = offset; +		error = xfs_rmap_insert(cur, gtrec.rm_startblock, +				gtrec.rm_blockcount, gtrec.rm_owner, +				gtrec.rm_offset, gtrec.rm_flags); +		if (error) +			goto out_error; +	} else { +		/* +		 * No contiguous edge with identical owner, insert +		 * new record at current cursor position. 
+		 */ +		error = xfs_rmap_insert(cur, bno, len, owner, offset, flags); +		if (error) +			goto out_error; +	} + +	trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, +			unwritten, oinfo); +out_error: +	if (error) +		trace_xfs_rmap_map_error(cur->bc_mp, +				cur->bc_private.a.agno, error, _RET_IP_); +	return error; +} +  struct xfs_rmap_query_range_info {  	xfs_rmap_query_range_fn	fn;  	void				*priv; @@ -1237,15 +2115,27 @@ xfs_rmap_finish_one(  	case XFS_RMAP_MAP:  		error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);  		break; +	case XFS_RMAP_MAP_SHARED: +		error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, +				&oinfo); +		break;  	case XFS_RMAP_FREE:  	case XFS_RMAP_UNMAP:  		error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,  				&oinfo);  		break; +	case XFS_RMAP_UNMAP_SHARED: +		error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, +				&oinfo); +		break;  	case XFS_RMAP_CONVERT:  		error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,  				&oinfo);  		break; +	case XFS_RMAP_CONVERT_SHARED: +		error = xfs_rmap_convert_shared(rcur, bno, blockcount, +				!unwritten, &oinfo); +		break;  	default:  		ASSERT(0);  		error = -EFSCORRUPTED; @@ -1263,9 +2153,10 @@ out_cur:   */  static bool  xfs_rmap_update_is_needed( -	struct xfs_mount	*mp) +	struct xfs_mount	*mp, +	int			whichfork)  { -	return xfs_sb_version_hasrmapbt(&mp->m_sb); +	return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK;  }  /* @@ -1311,10 +2202,11 @@ xfs_rmap_map_extent(  	int			whichfork,  	struct xfs_bmbt_irec	*PREV)  { -	if (!xfs_rmap_update_is_needed(mp)) +	if (!xfs_rmap_update_is_needed(mp, whichfork))  		return 0; -	return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino, +	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? +			XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,  			whichfork, PREV);  } @@ -1327,10 +2219,11 @@ xfs_rmap_unmap_extent(  	int			whichfork,  	struct xfs_bmbt_irec	*PREV)  { -	if (!xfs_rmap_update_is_needed(mp)) +	if (!xfs_rmap_update_is_needed(mp, whichfork))  		return 0; -	return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino, +	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? +			XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,  			whichfork, PREV);  } @@ -1343,10 +2236,11 @@ xfs_rmap_convert_extent(  	int			whichfork,  	struct xfs_bmbt_irec	*PREV)  { -	if (!xfs_rmap_update_is_needed(mp)) +	if (!xfs_rmap_update_is_needed(mp, whichfork))  		return 0; -	return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino, +	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? 
+			XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,  			whichfork, PREV);  } @@ -1362,7 +2256,7 @@ xfs_rmap_alloc_extent(  {  	struct xfs_bmbt_irec	bmap; -	if (!xfs_rmap_update_is_needed(mp)) +	if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))  		return 0;  	bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); @@ -1386,7 +2280,7 @@ xfs_rmap_free_extent(  {  	struct xfs_bmbt_irec	bmap; -	if (!xfs_rmap_update_is_needed(mp)) +	if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))  		return 0;  	bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 71cf99a4acba..789930599339 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -206,4 +206,11 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,  		xfs_fsblock_t startblock, xfs_filblks_t blockcount,  		xfs_exntst_t state, struct xfs_btree_cur **pcur); +int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, +		uint64_t owner, uint64_t offset, unsigned int flags, +		struct xfs_rmap_irec *irec, int	*stat); +int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, +		uint64_t owner, uint64_t offset, unsigned int flags, +		struct xfs_rmap_irec *irec, int	*stat); +  #endif	/* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 17b8eeb34ac8..83e672ff7577 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -35,6 +35,7 @@  #include "xfs_cksum.h"  #include "xfs_error.h"  #include "xfs_extent_busy.h" +#include "xfs_ag_resv.h"  /*   * Reverse map btree. @@ -512,6 +513,83 @@ void  xfs_rmapbt_compute_maxlevels(  	struct xfs_mount		*mp)  { -	mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, -			mp->m_rmap_mnr, mp->m_sb.sb_agblocks); +	/* +	 * On a non-reflink filesystem, the maximum number of rmap +	 * records is the number of blocks in the AG, hence the max +	 * rmapbt height is log_$maxrecs($agblocks).  However, with +	 * reflink each AG block can have up to 2^32 (per the refcount +	 * record format) owners, which means that theoretically we +	 * could face up to 2^64 rmap records. +	 * +	 * That effectively means that the max rmapbt height must be +	 * XFS_BTREE_MAXLEVELS.  "Fortunately" we'll run out of AG +	 * blocks to feed the rmapbt long before the rmapbt reaches +	 * maximum height.  The reflink code uses ag_resv_critical to +	 * disallow reflinking when less than 10% of the per-AG metadata +	 * block reservation since the fallback is a regular file copy. +	 */ +	if (xfs_sb_version_hasreflink(&mp->m_sb)) +		mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; +	else +		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, +				mp->m_rmap_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the refcount btree size for some records. */ +xfs_extlen_t +xfs_rmapbt_calc_size( +	struct xfs_mount	*mp, +	unsigned long long	len) +{ +	return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +xfs_extlen_t +xfs_rmapbt_max_size( +	struct xfs_mount	*mp) +{ +	/* Bail out if we're uninitialized, which can happen in mkfs. */ +	if (mp->m_rmap_mxr[0] == 0) +		return 0; + +	return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. 
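The comment above explains why reflink forces the rmapbt height to the global maximum rather than a value derived from the AG size; the standalone sketch below (conceptual, with invented fan-out numbers, not the kernel implementation) shows how the level count is derived and how quickly it grows once roughly 2^64 records become possible:

#include <stdio.h>

/* Conceptual sketch: divide the worst-case record count by the minimum
 * fill of each level until the remainder fits in a single root block. */
static unsigned int max_levels(unsigned long long records,
			       unsigned int leaf_minrecs,
			       unsigned int node_minrecs)
{
	unsigned long long blocks;
	unsigned int level = 1;

	blocks = records / leaf_minrecs + (records % leaf_minrecs ? 1 : 0);
	while (blocks > 1) {
		blocks = blocks / node_minrecs +
			 (blocks % node_minrecs ? 1 : 0);
		level++;
	}
	return level;
}

int main(void)
{
	/* Without reflink: at most one rmap per AG block, say a 1M-block AG. */
	printf("agblocks bound: %u levels\n", max_levels(1000000ULL, 84, 126));
	/* With reflink: up to ~2^64 rmaps in theory, so the height has to be
	 * clamped to a global maximum instead. */
	printf("2^64-ish bound: %u levels\n", max_levels(~0ULL, 84, 126));
	return 0;
}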
+ */ +int +xfs_rmapbt_calc_reserves( +	struct xfs_mount	*mp, +	xfs_agnumber_t		agno, +	xfs_extlen_t		*ask, +	xfs_extlen_t		*used) +{ +	struct xfs_buf		*agbp; +	struct xfs_agf		*agf; +	xfs_extlen_t		pool_len; +	xfs_extlen_t		tree_len; +	int			error; + +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) +		return 0; + +	/* Reserve 1% of the AG or enough for 1 block per record. */ +	pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp)); +	*ask += pool_len; + +	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); +	if (error) +		return error; + +	agf = XFS_BUF_TO_AGF(agbp); +	tree_len = be32_to_cpu(agf->agf_rmap_blocks); +	xfs_buf_relse(agbp); + +	*used += tree_len; + +	return error;  } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index e73a55357dab..2a9ac472fb15 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,  int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);  extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); +extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, +		unsigned long long len); +extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp); + +extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, +		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); +  #endif	/* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4aecc5fefe96..a70aec910626 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -38,6 +38,8 @@  #include "xfs_ialloc_btree.h"  #include "xfs_log.h"  #include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h"  /*   * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -737,6 +739,13 @@ xfs_sb_mount_common(  	mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;  	mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; +	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, +			true); +	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, +			false); +	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; +	mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; +  	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);  	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,  					sbp->sb_inopblock); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 0c5b30bd884c..c6f4eb46fe26 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;  extern const struct xfs_buf_ops xfs_agfl_buf_ops;  extern const struct xfs_buf_ops xfs_allocbt_buf_ops;  extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;  extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;  extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;  extern const struct xfs_buf_ops xfs_bmbt_buf_ops; @@ -122,6 +123,7 @@ int	xfs_log_calc_minimum_size(struct xfs_mount *);  #define	XFS_INO_REF		2  #define	XFS_ATTR_BTREE_REF	1  #define	XFS_DQUOT_REF		1 +#define	XFS_REFC_BTREE_REF	1  /*   * Flags for xfs_trans_ichgtime(). diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 301ef2f4dbd6..b456cca1bfb2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -67,13 +67,14 @@ xfs_calc_buf_res(   * Per-extent log reservation for the btree changes involved in freeing or   * allocating an extent.  
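A toy version of the rmapbt reservation sizing above, in which the ask is the larger of 1% of the AG and the worst-case tree size while the used count is whatever the tree already occupies; every block count below is invented for illustration:

#include <stdio.h>

static void calc_reserves(unsigned long long agblocks,
			  unsigned long long max_tree_blocks,
			  unsigned long long tree_blocks_in_use,
			  unsigned long long *ask, unsigned long long *used)
{
	unsigned long long pool = agblocks / 100;	/* 1% of the AG */

	if (max_tree_blocks > pool)
		pool = max_tree_blocks;
	*ask += pool;
	*used += tree_blocks_in_use;
}

int main(void)
{
	unsigned long long ask = 0, used = 0;

	calc_reserves(250000ULL, 9000ULL, 120ULL, &ask, &used);
	printf("ask %llu blocks, used %llu blocks\n", ask, used);
	return 0;
}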
In classic XFS there were two trees that will be   * modified (bnobt + cntbt).  With rmap enabled, there are three trees - * (rmapbt).  The number of blocks reserved is based on the formula: + * (rmapbt).  With reflink, there are four trees (refcountbt).  The number of + * blocks reserved is based on the formula:   *   * num trees * ((2 blocks/level * max depth) - 1)   *   * Keep in mind that max depth is calculated separately for each type of tree.   */ -static uint +uint  xfs_allocfree_log_count(  	struct xfs_mount *mp,  	uint		num_ops) @@ -83,6 +84,8 @@ xfs_allocfree_log_count(  	blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);  	if (xfs_sb_version_hasrmapbt(&mp->m_sb))  		blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); +	if (xfs_sb_version_hasreflink(&mp->m_sb)) +		blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);  	return blocks;  } @@ -809,11 +812,18 @@ xfs_trans_resv_calc(  	 * require a permanent reservation on space.  	 */  	resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); -	resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; +	if (xfs_sb_version_hasreflink(&mp->m_sb)) +		resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; +	else +		resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;  	resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;  	resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); -	resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; +	if (xfs_sb_version_hasreflink(&mp->m_sb)) +		resp->tr_itruncate.tr_logcount = +				XFS_ITRUNCATE_LOG_COUNT_REFLINK; +	else +		resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;  	resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;  	resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -870,7 +880,10 @@ xfs_trans_resv_calc(  	resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;  	resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); -	resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; +	if (xfs_sb_version_hasreflink(&mp->m_sb)) +		resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; +	else +		resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;  	resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;  	/* diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0eb46ed6d404..b7e5357d060a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -87,6 +87,7 @@ struct xfs_trans_resv {  #define	XFS_DEFAULT_LOG_COUNT		1  #define	XFS_DEFAULT_PERM_LOG_COUNT	2  #define	XFS_ITRUNCATE_LOG_COUNT		2 +#define	XFS_ITRUNCATE_LOG_COUNT_REFLINK	8  #define XFS_INACTIVE_LOG_COUNT		2  #define	XFS_CREATE_LOG_COUNT		2  #define	XFS_CREATE_TMPFILE_LOG_COUNT	2 @@ -96,11 +97,13 @@ struct xfs_trans_resv {  #define	XFS_LINK_LOG_COUNT		2  #define	XFS_RENAME_LOG_COUNT		2  #define	XFS_WRITE_LOG_COUNT		2 +#define	XFS_WRITE_LOG_COUNT_REFLINK	8  #define	XFS_ADDAFORK_LOG_COUNT		2  #define	XFS_ATTRINVAL_LOG_COUNT		1  #define	XFS_ATTRSET_LOG_COUNT		3  #define	XFS_ATTRRM_LOG_COUNT		3  void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); +uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);  #endif	/* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 41e0428d8175..7917f6e44286 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,6 +21,8 @@  /*   * Components of space reservations.   
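The reservation formula quoted above, num trees * ((2 blocks/level * max depth) - 1), plus the extra per-tree terms for rmap and reflink, can be sanity-checked with a small standalone calculation; the tree depths in main() are invented for illustration:

#include <stdio.h>

/* Mirror of the per-extent block count: bno+cnt btrees always, plus the
 * rmap and refcount btrees when those features are enabled. */
static unsigned int allocfree_blocks(unsigned int num_ops,
				     unsigned int ag_maxlevels,
				     unsigned int rmap_maxlevels,	/* 0 = no rmapbt */
				     unsigned int refc_maxlevels)	/* 0 = no reflink */
{
	unsigned int blocks;

	blocks = num_ops * 2 * (2 * ag_maxlevels - 1);
	if (rmap_maxlevels)
		blocks += num_ops * (2 * rmap_maxlevels - 1);
	if (refc_maxlevels)
		blocks += num_ops * (2 * refc_maxlevels - 1);
	return blocks;
}

int main(void)
{
	/* e.g. 2 ops, 5-level free space btrees, 9-level rmapbt,
	 * 5-level refcountbt */
	printf("blocks to reserve: %u\n", allocfree_blocks(2, 5, 9, 5));
	return 0;
}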
*/ +#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)    \ +		(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))  #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \  		(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))  #define	XFS_EXTENTADD_SPACE_RES(mp,w)	(XFS_BM_MAXLEVELS(mp,w) - 1) @@ -28,6 +30,13 @@  	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \  	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \  	  XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ +	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ +	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ +	  XFS_EXTENTADD_SPACE_RES(mp,w) + \ +	 ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ +	  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ +	  (mp)->m_rmap_maxlevels)  #define	XFS_DAENTER_1B(mp,w)	\  	((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)  #define	XFS_DAENTER_DBS(mp,w)	\ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 3d503647f26b..8d74870468c2 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -90,6 +90,7 @@ typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */   */  #define	XFS_DATA_FORK	0  #define	XFS_ATTR_FORK	1 +#define	XFS_COW_FORK	2  /*   * Min numbers of data/attr fork btree root pointers. @@ -109,7 +110,7 @@ typedef enum {  typedef enum {  	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, -	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX +	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX  } xfs_btnum_t;  struct xfs_name { diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b6e527b8eccb..b468e041f207 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -236,7 +236,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)  		iattr.ia_valid = ATTR_MODE | ATTR_CTIME;  		iattr.ia_mode = mode; -		iattr.ia_ctime = current_fs_time(inode->i_sb); +		iattr.ia_ctime = current_time(inode);  		error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);  	} @@ -257,16 +257,11 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)  		return error;  	if (type == ACL_TYPE_ACCESS) { -		umode_t mode = inode->i_mode; -		error = posix_acl_equiv_mode(acl, &mode); - -		if (error <= 0) { -			acl = NULL; - -			if (error < 0) -				return error; -		} +		umode_t mode; +		error = posix_acl_update_mode(inode, &mode, &acl); +		if (error) +			return error;  		error = xfs_set_mode(inode, mode);  		if (error)  			return error; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 7575cfc3ad15..3e57a56cf829 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@  #include "xfs_bmap.h"  #include "xfs_bmap_util.h"  #include "xfs_bmap_btree.h" +#include "xfs_reflink.h"  #include <linux/gfp.h>  #include <linux/mpage.h>  #include <linux/pagevec.h> @@ -39,6 +40,7 @@  /* flags for direct write completions */  #define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)  #define XFS_DIO_FLAG_APPEND	(1 << 1) +#define XFS_DIO_FLAG_COW	(1 << 2)  /*   * structure owned by writepages passed to individual writepage calls @@ -200,7 +202,7 @@ xfs_setfilesize_trans_alloc(   * Update on-disk file size now that data has been written to disk.   
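The XFS_MAX_CONTIG_RMAPS_PER_BLOCK and XFS_SWAP_RMAP_SPACE_RES additions above amount to the existing extent-add reservation plus a worst-case rmapbt walk per batch of contiguous rmaps; expanded as a plain calculation (all fan-out and level numbers invented), the macro reads:

#include <stdio.h>

static unsigned long long swap_rmap_space_res(unsigned long long blocks,
					      unsigned int contig_extents_per_block,
					      unsigned int extentadd_res,
					      unsigned int contig_rmaps_per_block,
					      unsigned int rmap_maxlevels)
{
	unsigned long long bmbt_part, rmap_part;

	/* one extent-add reservation per batch of contiguous extents */
	bmbt_part = (blocks + contig_extents_per_block - 1) /
		    contig_extents_per_block * extentadd_res;
	/* plus a full-height rmapbt split per batch of contiguous rmaps */
	rmap_part = (blocks + contig_rmaps_per_block - 1) /
		    contig_rmaps_per_block * rmap_maxlevels;
	return bmbt_part + rmap_part;
}

int main(void)
{
	printf("%llu blocks\n", swap_rmap_space_res(1000ULL, 125, 8, 84, 9));
	return 0;
}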
*/  STATIC int -xfs_setfilesize( +__xfs_setfilesize(  	struct xfs_inode	*ip,  	struct xfs_trans	*tp,  	xfs_off_t		offset, @@ -225,6 +227,23 @@ xfs_setfilesize(  	return xfs_trans_commit(tp);  } +int +xfs_setfilesize( +	struct xfs_inode	*ip, +	xfs_off_t		offset, +	size_t			size) +{ +	struct xfs_mount	*mp = ip->i_mount; +	struct xfs_trans	*tp; +	int			error; + +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); +	if (error) +		return error; + +	return __xfs_setfilesize(ip, tp, offset, size); +} +  STATIC int  xfs_setfilesize_ioend(  	struct xfs_ioend	*ioend, @@ -247,7 +266,7 @@ xfs_setfilesize_ioend(  		return error;  	} -	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); +	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);  }  /* @@ -270,6 +289,25 @@ xfs_end_io(  		error = -EIO;  	/* +	 * For a CoW extent, we need to move the mapping from the CoW fork +	 * to the data fork.  If instead an error happened, just dump the +	 * new blocks. +	 */ +	if (ioend->io_type == XFS_IO_COW) { +		if (error) +			goto done; +		if (ioend->io_bio->bi_error) { +			error = xfs_reflink_cancel_cow_range(ip, +					ioend->io_offset, ioend->io_size); +			goto done; +		} +		error = xfs_reflink_end_cow(ip, ioend->io_offset, +				ioend->io_size); +		if (error) +			goto done; +	} + +	/*  	 * For unwritten extents we need to issue transactions to convert a  	 * range to normal written extens after the data I/O has finished.  	 * Detecting and handling completion IO errors is done individually @@ -284,7 +322,8 @@ xfs_end_io(  	} else if (ioend->io_append_trans) {  		error = xfs_setfilesize_ioend(ioend, error);  	} else { -		ASSERT(!xfs_ioend_is_append(ioend)); +		ASSERT(!xfs_ioend_is_append(ioend) || +		       ioend->io_type == XFS_IO_COW);  	}  done: @@ -298,7 +337,7 @@ xfs_end_bio(  	struct xfs_ioend	*ioend = bio->bi_private;  	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount; -	if (ioend->io_type == XFS_IO_UNWRITTEN) +	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)  		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);  	else if (ioend->io_append_trans)  		queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -324,6 +363,7 @@ xfs_map_blocks(  	if (XFS_FORCED_SHUTDOWN(mp))  		return -EIO; +	ASSERT(type != XFS_IO_COW);  	if (type == XFS_IO_UNWRITTEN)  		bmapi_flags |= XFS_BMAPI_IGSTATE; @@ -338,6 +378,13 @@ xfs_map_blocks(  	offset_fsb = XFS_B_TO_FSBT(mp, offset);  	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,  				imap, &nimaps, bmapi_flags); +	/* +	 * Truncate an overwrite extent if there's a pending CoW +	 * reservation before the end of this extent.  This forces us +	 * to come back to writepage to take care of the CoW. 
+	 */ +	if (nimaps && type == XFS_IO_OVERWRITE) +		xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);  	xfs_iunlock(ip, XFS_ILOCK_SHARED);  	if (error) @@ -345,7 +392,8 @@ xfs_map_blocks(  	if (type == XFS_IO_DELALLOC &&  	    (!nimaps || isnullstartblock(imap->br_startblock))) { -		error = xfs_iomap_write_allocate(ip, offset, imap); +		error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset, +				imap);  		if (!error)  			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);  		return error; @@ -720,6 +768,56 @@ out_invalidate:  	return;  } +static int +xfs_map_cow( +	struct xfs_writepage_ctx *wpc, +	struct inode		*inode, +	loff_t			offset, +	unsigned int		*new_type) +{ +	struct xfs_inode	*ip = XFS_I(inode); +	struct xfs_bmbt_irec	imap; +	bool			is_cow = false, need_alloc = false; +	int			error; + +	/* +	 * If we already have a valid COW mapping keep using it. +	 */ +	if (wpc->io_type == XFS_IO_COW) { +		wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); +		if (wpc->imap_valid) { +			*new_type = XFS_IO_COW; +			return 0; +		} +	} + +	/* +	 * Else we need to check if there is a COW mapping at this offset. +	 */ +	xfs_ilock(ip, XFS_ILOCK_SHARED); +	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); +	xfs_iunlock(ip, XFS_ILOCK_SHARED); + +	if (!is_cow) +		return 0; + +	/* +	 * And if the COW mapping has a delayed extent here we need to +	 * allocate real space for it now. +	 */ +	if (need_alloc) { +		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, +				&imap); +		if (error) +			return error; +	} + +	wpc->io_type = *new_type = XFS_IO_COW; +	wpc->imap_valid = true; +	wpc->imap = imap; +	return 0; +} +  /*   * We implement an immediate ioend submission policy here to avoid needing to   * chain multiple ioends and hence nest mempool allocations which can violate @@ -752,6 +850,7 @@ xfs_writepage_map(  	int			error = 0;  	int			count = 0;  	int			uptodate = 1; +	unsigned int		new_type;  	bh = head = page_buffers(page);  	offset = page_offset(page); @@ -772,22 +871,13 @@ xfs_writepage_map(  			continue;  		} -		if (buffer_unwritten(bh)) { -			if (wpc->io_type != XFS_IO_UNWRITTEN) { -				wpc->io_type = XFS_IO_UNWRITTEN; -				wpc->imap_valid = false; -			} -		} else if (buffer_delay(bh)) { -			if (wpc->io_type != XFS_IO_DELALLOC) { -				wpc->io_type = XFS_IO_DELALLOC; -				wpc->imap_valid = false; -			} -		} else if (buffer_uptodate(bh)) { -			if (wpc->io_type != XFS_IO_OVERWRITE) { -				wpc->io_type = XFS_IO_OVERWRITE; -				wpc->imap_valid = false; -			} -		} else { +		if (buffer_unwritten(bh)) +			new_type = XFS_IO_UNWRITTEN; +		else if (buffer_delay(bh)) +			new_type = XFS_IO_DELALLOC; +		else if (buffer_uptodate(bh)) +			new_type = XFS_IO_OVERWRITE; +		else {  			if (PageUptodate(page))  				ASSERT(buffer_mapped(bh));  			/* @@ -800,6 +890,17 @@ xfs_writepage_map(  			continue;  		} +		if (xfs_is_reflink_inode(XFS_I(inode))) { +			error = xfs_map_cow(wpc, inode, offset, &new_type); +			if (error) +				goto out; +		} + +		if (wpc->io_type != new_type) { +			wpc->io_type = new_type; +			wpc->imap_valid = false; +		} +  		if (wpc->imap_valid)  			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,  							 offset); @@ -1090,18 +1191,24 @@ xfs_map_direct(  	struct inode		*inode,  	struct buffer_head	*bh_result,  	struct xfs_bmbt_irec	*imap, -	xfs_off_t		offset) +	xfs_off_t		offset, +	bool			is_cow)  {  	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;  	xfs_off_t		size = bh_result->b_size;  	trace_xfs_get_blocks_map_direct(XFS_I(inode), 
offset, size, -		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap); +		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : +		XFS_IO_OVERWRITE, imap);  	if (ISUNWRITTEN(imap)) {  		*flags |= XFS_DIO_FLAG_UNWRITTEN;  		set_buffer_defer_completion(bh_result); -	} else if (offset + size > i_size_read(inode) || offset + size < 0) { +	} else if (is_cow) { +		*flags |= XFS_DIO_FLAG_COW; +		set_buffer_defer_completion(bh_result); +	} +	if (offset + size > i_size_read(inode) || offset + size < 0) {  		*flags |= XFS_DIO_FLAG_APPEND;  		set_buffer_defer_completion(bh_result);  	} @@ -1147,6 +1254,44 @@ xfs_map_trim_size(  	bh_result->b_size = mapping_size;  } +/* Bounce unaligned directio writes to the page cache. */ +static int +xfs_bounce_unaligned_dio_write( +	struct xfs_inode	*ip, +	xfs_fileoff_t		offset_fsb, +	struct xfs_bmbt_irec	*imap) +{ +	struct xfs_bmbt_irec	irec; +	xfs_fileoff_t		delta; +	bool			shared; +	bool			x; +	int			error; + +	irec = *imap; +	if (offset_fsb > irec.br_startoff) { +		delta = offset_fsb - irec.br_startoff; +		irec.br_blockcount -= delta; +		irec.br_startblock += delta; +		irec.br_startoff = offset_fsb; +	} +	error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); +	if (error) +		return error; + +	/* +	 * We're here because we're trying to do a directio write to a +	 * region that isn't aligned to a filesystem block.  If any part +	 * of the extent is shared, fall back to buffered mode to handle +	 * the RMW.  This is done by returning -EREMCHG ("remote addr +	 * changed"), which is caught further up the call stack. +	 */ +	if (shared) { +		trace_xfs_reflink_bounce_dio_write(ip, imap); +		return -EREMCHG; +	} +	return 0; +} +  STATIC int  __xfs_get_blocks(  	struct inode		*inode, @@ -1166,6 +1311,8 @@ __xfs_get_blocks(  	xfs_off_t		offset;  	ssize_t			size;  	int			new = 0; +	bool			is_cow = false; +	bool			need_alloc = false;  	BUG_ON(create && !direct); @@ -1191,8 +1338,26 @@ __xfs_get_blocks(  	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);  	offset_fsb = XFS_B_TO_FSBT(mp, offset); -	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, -				&imap, &nimaps, XFS_BMAPI_ENTIRE); +	if (create && direct && xfs_is_reflink_inode(ip)) +		is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, +					&need_alloc); +	if (!is_cow) { +		error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, +					&imap, &nimaps, XFS_BMAPI_ENTIRE); +		/* +		 * Truncate an overwrite extent if there's a pending CoW +		 * reservation before the end of this extent.  This +		 * forces us to come back to get_blocks to take care of +		 * the CoW. 
+		 */ +		if (create && direct && nimaps && +		    imap.br_startblock != HOLESTARTBLOCK && +		    imap.br_startblock != DELAYSTARTBLOCK && +		    !ISUNWRITTEN(&imap)) +			xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, +					&imap); +	} +	ASSERT(!need_alloc);  	if (error)  		goto out_unlock; @@ -1244,6 +1409,13 @@ __xfs_get_blocks(  	if (imap.br_startblock != HOLESTARTBLOCK &&  	    imap.br_startblock != DELAYSTARTBLOCK &&  	    (create || !ISUNWRITTEN(&imap))) { +		if (create && direct && !is_cow) { +			error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, +					&imap); +			if (error) +				return error; +		} +  		xfs_map_buffer(inode, bh_result, &imap, offset);  		if (ISUNWRITTEN(&imap))  			set_buffer_unwritten(bh_result); @@ -1252,7 +1424,8 @@ __xfs_get_blocks(  			if (dax_fault)  				ASSERT(!ISUNWRITTEN(&imap));  			else -				xfs_map_direct(inode, bh_result, &imap, offset); +				xfs_map_direct(inode, bh_result, &imap, offset, +						is_cow);  		}  	} @@ -1336,13 +1509,12 @@ xfs_end_io_direct_write(  {  	struct inode		*inode = file_inode(iocb->ki_filp);  	struct xfs_inode	*ip = XFS_I(inode); -	struct xfs_mount	*mp = ip->i_mount;  	uintptr_t		flags = (uintptr_t)private;  	int			error = 0;  	trace_xfs_end_io_direct_write(ip, offset, size); -	if (XFS_FORCED_SHUTDOWN(mp)) +	if (XFS_FORCED_SHUTDOWN(ip->i_mount))  		return -EIO;  	if (size <= 0) @@ -1375,19 +1547,17 @@ xfs_end_io_direct_write(  		i_size_write(inode, offset + size);  	spin_unlock(&ip->i_flags_lock); +	if (flags & XFS_DIO_FLAG_COW) +		error = xfs_reflink_end_cow(ip, offset, size);  	if (flags & XFS_DIO_FLAG_UNWRITTEN) {  		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);  		error = xfs_iomap_write_unwritten(ip, offset, size); -	} else if (flags & XFS_DIO_FLAG_APPEND) { -		struct xfs_trans *tp; - +	} +	if (flags & XFS_DIO_FLAG_APPEND) {  		trace_xfs_end_io_direct_write_append(ip, offset, size); -		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, -				&tp); -		if (!error) -			error = xfs_setfilesize(ip, tp, offset, size); +		error = xfs_setfilesize(ip, offset, size);  	}  	return error; @@ -1414,6 +1584,17 @@ xfs_vm_bmap(  	trace_xfs_vm_bmap(XFS_I(inode));  	xfs_ilock(ip, XFS_IOLOCK_SHARED); + +	/* +	 * The swap code (ab-)uses ->bmap to get a block mapping and then +	 * bypasseѕ the file system for actual I/O.  We really can't allow +	 * that on reflinks inodes, so we have to skip out here.  And yes, +	 * 0 is the magic code for a bmap error.. +	 */ +	if (xfs_is_reflink_inode(ip)) { +		xfs_iunlock(ip, XFS_IOLOCK_SHARED); +		return 0; +	}  	filemap_write_and_wait(mapping);  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);  	return generic_block_bmap(mapping, block, xfs_get_blocks); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index bf2d9a141a73..b3c6634f9518 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -28,13 +28,15 @@ enum {  	XFS_IO_DELALLOC,	/* covers delalloc region */  	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */  	XFS_IO_OVERWRITE,	/* covers already allocated extent */ +	XFS_IO_COW,		/* covers copy-on-write extent */  };  #define XFS_IO_TYPES \  	{ XFS_IO_INVALID,		"invalid" }, \  	{ XFS_IO_DELALLOC,		"delalloc" }, \  	{ XFS_IO_UNWRITTEN,		"unwritten" }, \ -	{ XFS_IO_OVERWRITE,		"overwrite" } +	{ XFS_IO_OVERWRITE,		"overwrite" }, \ +	{ XFS_IO_COW,			"CoW" }  /*   * Structure for buffered I/O completions. 
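The direct I/O completion changes above stop treating the append case as an else-branch and instead check CoW, unwritten conversion, and the size update as independent flag bits stashed in the buffer_head private pointer; a minimal standalone illustration of that dispatch (flag values invented) is:

#include <stdio.h>
#include <stdint.h>

/* Invented stand-ins for the three completion flags discussed above. */
#define DIO_FLAG_UNWRITTEN	(1u << 0)
#define DIO_FLAG_APPEND		(1u << 1)
#define DIO_FLAG_COW		(1u << 2)

/* Each action is checked on its own, so a single write can need CoW
 * remapping, unwritten conversion, and a file size update. */
static void complete_dio(uintptr_t flags)
{
	if (flags & DIO_FLAG_COW)
		printf("move blocks from the CoW fork to the data fork\n");
	if (flags & DIO_FLAG_UNWRITTEN)
		printf("convert unwritten extents\n");
	if (flags & DIO_FLAG_APPEND)
		printf("update the on-disk file size\n");
}

int main(void)
{
	complete_dio(DIO_FLAG_COW | DIO_FLAG_APPEND);
	return 0;
}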
@@ -62,6 +64,7 @@ int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,  int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,  		ssize_t size, void *private); +int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);  extern void xfs_count_page_state(struct page *, int *, int *);  extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c new file mode 100644 index 000000000000..9bf57c76623b --- /dev/null +++ b/fs/xfs/xfs_bmap_item.c @@ -0,0 +1,508 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_trace.h" + + +kmem_zone_t	*xfs_bui_zone; +kmem_zone_t	*xfs_bud_zone; + +static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip) +{ +	return container_of(lip, struct xfs_bui_log_item, bui_item); +} + +void +xfs_bui_item_free( +	struct xfs_bui_log_item	*buip) +{ +	kmem_zone_free(xfs_bui_zone, buip); +} + +STATIC void +xfs_bui_item_size( +	struct xfs_log_item	*lip, +	int			*nvecs, +	int			*nbytes) +{ +	struct xfs_bui_log_item	*buip = BUI_ITEM(lip); + +	*nvecs += 1; +	*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bui log item. We use only 1 iovec, and we point that + * at the bui_log_format structure embedded in the bui item. + * It is at this point that we assert that all of the extent + * slots in the bui item have been filled. + */ +STATIC void +xfs_bui_item_format( +	struct xfs_log_item	*lip, +	struct xfs_log_vec	*lv) +{ +	struct xfs_bui_log_item	*buip = BUI_ITEM(lip); +	struct xfs_log_iovec	*vecp = NULL; + +	ASSERT(atomic_read(&buip->bui_next_extent) == +			buip->bui_format.bui_nextents); + +	buip->bui_format.bui_type = XFS_LI_BUI; +	buip->bui_format.bui_size = 1; + +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format, +			xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents)); +} + +/* + * Pinning has no meaning for an bui item, so just return. + */ +STATIC void +xfs_bui_item_pin( +	struct xfs_log_item	*lip) +{ +} + +/* + * The unpin operation is the last place an BUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. 
In + * either case, the BUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the BUI to either construct + * and commit the BUD or drop the BUD's reference in the event of error. Simply + * drop the log's BUI reference now that the log is done with it. + */ +STATIC void +xfs_bui_item_unpin( +	struct xfs_log_item	*lip, +	int			remove) +{ +	struct xfs_bui_log_item	*buip = BUI_ITEM(lip); + +	xfs_bui_release(buip); +} + +/* + * BUI items have no locking or pushing.  However, since BUIs are pulled from + * the AIL when their corresponding BUDs are committed to disk, their situation + * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log.  This should help in getting the BUI out of + * the AIL. + */ +STATIC uint +xfs_bui_item_push( +	struct xfs_log_item	*lip, +	struct list_head	*buffer_list) +{ +	return XFS_ITEM_PINNED; +} + +/* + * The BUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an BUD isn't going to be + * constructed and thus we free the BUI here directly. + */ +STATIC void +xfs_bui_item_unlock( +	struct xfs_log_item	*lip) +{ +	if (lip->li_flags & XFS_LI_ABORTED) +		xfs_bui_item_free(BUI_ITEM(lip)); +} + +/* + * The BUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_bui_item_committed( +	struct xfs_log_item	*lip, +	xfs_lsn_t		lsn) +{ +	return lsn; +} + +/* + * The BUI dependency tracking op doesn't do squat.  It can't because + * it doesn't know where the free extent is coming from.  The dependency + * tracking has to be handled by the "enclosing" metadata object.  For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bui_item_committing( +	struct xfs_log_item	*lip, +	xfs_lsn_t		lsn) +{ +} + +/* + * This is the ops vector shared by all bui log items. + */ +static const struct xfs_item_ops xfs_bui_item_ops = { +	.iop_size	= xfs_bui_item_size, +	.iop_format	= xfs_bui_item_format, +	.iop_pin	= xfs_bui_item_pin, +	.iop_unpin	= xfs_bui_item_unpin, +	.iop_unlock	= xfs_bui_item_unlock, +	.iop_committed	= xfs_bui_item_committed, +	.iop_push	= xfs_bui_item_push, +	.iop_committing = xfs_bui_item_committing, +}; + +/* + * Allocate and initialize an bui item with the given number of extents. + */ +struct xfs_bui_log_item * +xfs_bui_init( +	struct xfs_mount		*mp) + +{ +	struct xfs_bui_log_item		*buip; + +	buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); + +	xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); +	buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; +	buip->bui_format.bui_id = (uintptr_t)(void *)buip; +	atomic_set(&buip->bui_next_extent, 0); +	atomic_set(&buip->bui_refcount, 2); + +	return buip; +} + +/* + * Freeing the BUI requires that we remove it from the AIL if it has already + * been placed there. However, the BUI may not yet have been placed in the AIL + * when called by xfs_bui_release() from BUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the BUI. 
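The two-reference scheme described here — one reference dropped when the log unpins the BUI, the other dropped by whoever commits or aborts the corresponding BUD — can be sketched in standalone C; the type and helper names are invented for the sketch:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct intent_item {
	atomic_int	refcount;
};

static struct intent_item *intent_create(void)
{
	struct intent_item *it = calloc(1, sizeof(*it));

	/* One reference for the log (unpin), one for the done item. */
	atomic_init(&it->refcount, 2);
	return it;
}

static void intent_release(struct intent_item *it, const char *who)
{
	/* atomic_fetch_sub returns the old value; 1 means we held the last ref */
	if (atomic_fetch_sub(&it->refcount, 1) == 1) {
		printf("%s dropped the last reference, freeing item\n", who);
		free(it);
	}
}

int main(void)
{
	struct intent_item *it = intent_create();

	intent_release(it, "log unpin");		/* still referenced */
	intent_release(it, "done-item processing");	/* frees it */
	return 0;
}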
+ */ +void +xfs_bui_release( +	struct xfs_bui_log_item	*buip) +{ +	if (atomic_dec_and_test(&buip->bui_refcount)) { +		xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); +		xfs_bui_item_free(buip); +	} +} + +static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) +{ +	return container_of(lip, struct xfs_bud_log_item, bud_item); +} + +STATIC void +xfs_bud_item_size( +	struct xfs_log_item	*lip, +	int			*nvecs, +	int			*nbytes) +{ +	*nvecs += 1; +	*nbytes += sizeof(struct xfs_bud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bud log item. We use only 1 iovec, and we point that + * at the bud_log_format structure embedded in the bud item. + * It is at this point that we assert that all of the extent + * slots in the bud item have been filled. + */ +STATIC void +xfs_bud_item_format( +	struct xfs_log_item	*lip, +	struct xfs_log_vec	*lv) +{ +	struct xfs_bud_log_item	*budp = BUD_ITEM(lip); +	struct xfs_log_iovec	*vecp = NULL; + +	budp->bud_format.bud_type = XFS_LI_BUD; +	budp->bud_format.bud_size = 1; + +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format, +			sizeof(struct xfs_bud_log_format)); +} + +/* + * Pinning has no meaning for an bud item, so just return. + */ +STATIC void +xfs_bud_item_pin( +	struct xfs_log_item	*lip) +{ +} + +/* + * Since pinning has no meaning for an bud item, unpinning does + * not either. + */ +STATIC void +xfs_bud_item_unpin( +	struct xfs_log_item	*lip, +	int			remove) +{ +} + +/* + * There isn't much you can do to push on an bud item.  It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_bud_item_push( +	struct xfs_log_item	*lip, +	struct list_head	*buffer_list) +{ +	return XFS_ITEM_PINNED; +} + +/* + * The BUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the BUI and free the + * BUD. + */ +STATIC void +xfs_bud_item_unlock( +	struct xfs_log_item	*lip) +{ +	struct xfs_bud_log_item	*budp = BUD_ITEM(lip); + +	if (lip->li_flags & XFS_LI_ABORTED) { +		xfs_bui_release(budp->bud_buip); +		kmem_zone_free(xfs_bud_zone, budp); +	} +} + +/* + * When the bud item is committed to disk, all we need to do is delete our + * reference to our partner bui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_bud_item_committed( +	struct xfs_log_item	*lip, +	xfs_lsn_t		lsn) +{ +	struct xfs_bud_log_item	*budp = BUD_ITEM(lip); + +	/* +	 * Drop the BUI reference regardless of whether the BUD has been +	 * aborted. Once the BUD transaction is constructed, it is the sole +	 * responsibility of the BUD to release the BUI (even if the BUI is +	 * aborted due to log I/O error). +	 */ +	xfs_bui_release(budp->bud_buip); +	kmem_zone_free(xfs_bud_zone, budp); + +	return (xfs_lsn_t)-1; +} + +/* + * The BUD dependency tracking op doesn't do squat.  It can't because + * it doesn't know where the free extent is coming from.  The dependency + * tracking has to be handled by the "enclosing" metadata object.  For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bud_item_committing( +	struct xfs_log_item	*lip, +	xfs_lsn_t		lsn) +{ +} + +/* + * This is the ops vector shared by all bud log items. 
+ */ +static const struct xfs_item_ops xfs_bud_item_ops = { +	.iop_size	= xfs_bud_item_size, +	.iop_format	= xfs_bud_item_format, +	.iop_pin	= xfs_bud_item_pin, +	.iop_unpin	= xfs_bud_item_unpin, +	.iop_unlock	= xfs_bud_item_unlock, +	.iop_committed	= xfs_bud_item_committed, +	.iop_push	= xfs_bud_item_push, +	.iop_committing = xfs_bud_item_committing, +}; + +/* + * Allocate and initialize an bud item with the given number of extents. + */ +struct xfs_bud_log_item * +xfs_bud_init( +	struct xfs_mount		*mp, +	struct xfs_bui_log_item		*buip) + +{ +	struct xfs_bud_log_item	*budp; + +	budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); +	xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); +	budp->bud_buip = buip; +	budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + +	return budp; +} + +/* + * Process a bmap update intent item that was recovered from the log. + * We need to update some inode's bmbt. + */ +int +xfs_bui_recover( +	struct xfs_mount		*mp, +	struct xfs_bui_log_item		*buip) +{ +	int				error = 0; +	unsigned int			bui_type; +	struct xfs_map_extent		*bmap; +	xfs_fsblock_t			startblock_fsb; +	xfs_fsblock_t			inode_fsb; +	bool				op_ok; +	struct xfs_bud_log_item		*budp; +	enum xfs_bmap_intent_type	type; +	int				whichfork; +	xfs_exntst_t			state; +	struct xfs_trans		*tp; +	struct xfs_inode		*ip = NULL; +	struct xfs_defer_ops		dfops; +	xfs_fsblock_t			firstfsb; + +	ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + +	/* Only one mapping operation per BUI... */ +	if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { +		set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); +		xfs_bui_release(buip); +		return -EIO; +	} + +	/* +	 * First check the validity of the extent described by the +	 * BUI.  If anything is bad, then toss the BUI. +	 */ +	bmap = &buip->bui_format.bui_extents[0]; +	startblock_fsb = XFS_BB_TO_FSB(mp, +			   XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); +	inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, +			XFS_INO_TO_FSB(mp, bmap->me_owner))); +	switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { +	case XFS_BMAP_MAP: +	case XFS_BMAP_UNMAP: +		op_ok = true; +		break; +	default: +		op_ok = false; +		break; +	} +	if (!op_ok || startblock_fsb == 0 || +	    bmap->me_len == 0 || +	    inode_fsb == 0 || +	    startblock_fsb >= mp->m_sb.sb_dblocks || +	    bmap->me_len >= mp->m_sb.sb_agblocks || +	    inode_fsb >= mp->m_sb.sb_dblocks || +	    (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { +		/* +		 * This will pull the BUI from the AIL and +		 * free the memory associated with it. +		 */ +		set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); +		xfs_bui_release(buip); +		return -EIO; +	} + +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); +	if (error) +		return error; +	budp = xfs_trans_get_bud(tp, buip); + +	/* Grab the inode. */ +	error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); +	if (error) +		goto err_inode; + +	if (VFS_I(ip)->i_nlink == 0) +		xfs_iflags_set(ip, XFS_IRECOVERY); +	xfs_defer_init(&dfops, &firstfsb); + +	/* Process deferred bmap item. */ +	state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? +			XFS_EXT_UNWRITTEN : XFS_EXT_NORM; +	whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? 
+			XFS_ATTR_FORK : XFS_DATA_FORK; +	bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; +	switch (bui_type) { +	case XFS_BMAP_MAP: +	case XFS_BMAP_UNMAP: +		type = bui_type; +		break; +	default: +		error = -EFSCORRUPTED; +		goto err_dfops; +	} +	xfs_trans_ijoin(tp, ip, 0); + +	error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, +			ip, whichfork, bmap->me_startoff, +			bmap->me_startblock, bmap->me_len, +			state); +	if (error) +		goto err_dfops; + +	/* Finish transaction, free inodes. */ +	error = xfs_defer_finish(&tp, &dfops, NULL); +	if (error) +		goto err_dfops; + +	set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); +	error = xfs_trans_commit(tp); +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	IRELE(ip); + +	return error; + +err_dfops: +	xfs_defer_cancel(&dfops); +err_inode: +	xfs_trans_cancel(tp); +	if (ip) { +		xfs_iunlock(ip, XFS_ILOCK_EXCL); +		IRELE(ip); +	} +	return error; +} diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h new file mode 100644 index 000000000000..c867daae4a3c --- /dev/null +++ b/fs/xfs/xfs_bmap_item.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#ifndef	__XFS_BMAP_ITEM_H__ +#define	__XFS_BMAP_ITEM_H__ + +/* + * There are (currently) two pairs of bmap btree redo item types: map & unmap. + * The common abbreviations for these are BUI (bmap update intent) and BUD + * (bmap update done).  The redo item type is encoded in the flags field of + * each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same transaction + * that records the associated bmbt updates. + * + * Should the system crash after the commit of the first transaction but + * before the commit of the final transaction in a series, log recovery will + * use the redo information recorded by the intent items to replay the + * bmbt metadata updates in the non-first transaction. + */ + +/* kernel only BUI/BUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define	XFS_BUI_MAX_FAST_EXTENTS	1 + +/* + * Define BUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define	XFS_BUI_RECOVERED		1 + +/* + * This is the "bmap update intent" log item.  It is used to log the fact that + * some reverse mappings need to change.  It is used in conjunction with the + * "bmap update done" log item described below. + * + * These log items follow the same rules as struct xfs_efi_log_item; see the + * comments about that structure (in xfs_extfree_item.h) for more details. 
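The intent/done pairing rule stated in this header comment is what log recovery relies on: only intents that lack a matching done item in the log get replayed. A minimal standalone model of that rule (the record layout is invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct log_rec {
	bool	intent;		/* true = BUI-style intent, false = BUD-style done */
	int	id;		/* a done item names the intent it completes */
};

/* Replay every intent that lacks a matching done item later in the log. */
static void recover(const struct log_rec *log, int nrecs)
{
	for (int i = 0; i < nrecs; i++) {
		if (!log[i].intent)
			continue;

		bool completed = false;
		for (int j = i + 1; j < nrecs; j++) {
			if (!log[j].intent && log[j].id == log[i].id) {
				completed = true;
				break;
			}
		}
		if (!completed)
			printf("replaying intent %d\n", log[i].id);
	}
}

int main(void)
{
	const struct log_rec log[] = {
		{ true, 1 }, { false, 1 },	/* finished before the crash */
		{ true, 2 },			/* interrupted: must be replayed */
	};

	recover(log, 3);
	return 0;
}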
+ */ +struct xfs_bui_log_item { +	struct xfs_log_item		bui_item; +	atomic_t			bui_refcount; +	atomic_t			bui_next_extent; +	unsigned long			bui_flags;	/* misc flags */ +	struct xfs_bui_log_format	bui_format; +}; + +static inline size_t +xfs_bui_log_item_sizeof( +	unsigned int		nr) +{ +	return offsetof(struct xfs_bui_log_item, bui_format) + +			xfs_bui_log_format_sizeof(nr); +} + +/* + * This is the "bmap update done" log item.  It is used to log the fact that + * some bmbt updates mentioned in an earlier bui item have been performed. + */ +struct xfs_bud_log_item { +	struct xfs_log_item		bud_item; +	struct xfs_bui_log_item		*bud_buip; +	struct xfs_bud_log_format	bud_format; +}; + +extern struct kmem_zone	*xfs_bui_zone; +extern struct kmem_zone	*xfs_bud_zone; + +struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); +struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, +		struct xfs_bui_log_item *); +void xfs_bui_item_free(struct xfs_bui_log_item *); +void xfs_bui_release(struct xfs_bui_log_item *); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); + +#endif	/* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4ece4f2ffc72..552465e011ec 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -42,6 +42,9 @@  #include "xfs_icache.h"  #include "xfs_log.h"  #include "xfs_rmap_btree.h" +#include "xfs_iomap.h" +#include "xfs_reflink.h" +#include "xfs_refcount.h"  /* Kernel only BMAP related definitions and functions */ @@ -182,7 +185,7 @@ xfs_bmap_rtalloc(  					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);  		/* Zero the extent if we were asked to do so */ -		if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) { +		if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {  			error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);  			if (error)  				return error; @@ -389,11 +392,13 @@ xfs_bmap_count_blocks(  STATIC int  xfs_getbmapx_fix_eof_hole(  	xfs_inode_t		*ip,		/* xfs incore inode pointer */ +	int			whichfork,  	struct getbmapx		*out,		/* output structure */  	int			prealloced,	/* this is a file with  						 * preallocated data space */  	__int64_t		end,		/* last block requested */ -	xfs_fsblock_t		startblock) +	xfs_fsblock_t		startblock, +	bool			moretocome)  {  	__int64_t		fixlen;  	xfs_mount_t		*mp;		/* file system mount point */ @@ -418,8 +423,9 @@ xfs_getbmapx_fix_eof_hole(  		else  			out->bmv_block = xfs_fsb_to_db(ip, startblock);  		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); -		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); -		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && +		ifp = XFS_IFORK_PTR(ip, whichfork); +		if (!moretocome && +		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&  		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))  			out->bmv_oflags |= BMV_OF_LAST;  	} @@ -427,6 +433,81 @@ xfs_getbmapx_fix_eof_hole(  	return 1;  } +/* Adjust the reported bmap around shared/unshared extent transitions. */ +STATIC int +xfs_getbmap_adjust_shared( +	struct xfs_inode		*ip, +	int				whichfork, +	struct xfs_bmbt_irec		*map, +	struct getbmapx			*out, +	struct xfs_bmbt_irec		*next_map) +{ +	struct xfs_mount		*mp = ip->i_mount; +	xfs_agnumber_t			agno; +	xfs_agblock_t			agbno; +	xfs_agblock_t			ebno; +	xfs_extlen_t			elen; +	xfs_extlen_t			nlen; +	int				error; + +	next_map->br_startblock = NULLFSBLOCK; +	next_map->br_startoff = NULLFILEOFF; +	next_map->br_blockcount = 0; + +	/* Only written data blocks can be shared. 
*/ +	if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK || +	    map->br_startblock == DELAYSTARTBLOCK || +	    map->br_startblock == HOLESTARTBLOCK || +	    ISUNWRITTEN(map)) +		return 0; + +	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); +	agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); +	error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount, +			&ebno, &elen, true); +	if (error) +		return error; + +	if (ebno == NULLAGBLOCK) { +		/* No shared blocks at all. */ +		return 0; +	} else if (agbno == ebno) { +		/* +		 * Shared extent at (agbno, elen).  Shrink the reported +		 * extent length and prepare to move the start of map[i] +		 * to agbno+elen, with the aim of (re)formatting the new +		 * map[i] the next time through the inner loop. +		 */ +		out->bmv_length = XFS_FSB_TO_BB(mp, elen); +		out->bmv_oflags |= BMV_OF_SHARED; +		if (elen != map->br_blockcount) { +			*next_map = *map; +			next_map->br_startblock += elen; +			next_map->br_startoff += elen; +			next_map->br_blockcount -= elen; +		} +		map->br_blockcount -= elen; +	} else { +		/* +		 * There's an unshared extent (agbno, ebno - agbno) +		 * followed by shared extent at (ebno, elen).  Shrink +		 * the reported extent length to cover only the unshared +		 * extent and prepare to move up the start of map[i] to +		 * ebno, with the aim of (re)formatting the new map[i] +		 * the next time through the inner loop. +		 */ +		*next_map = *map; +		nlen = ebno - agbno; +		out->bmv_length = XFS_FSB_TO_BB(mp, nlen); +		next_map->br_startblock += nlen; +		next_map->br_startoff += nlen; +		next_map->br_blockcount -= nlen; +		map->br_blockcount -= nlen; +	} + +	return 0; +} +  /*   * Get inode's extents as described in bmv, and format for output.   * Calls formatter to fill the user's buffer until all extents @@ -459,12 +540,28 @@ xfs_getbmap(  	int			iflags;		/* interface flags */  	int			bmapi_flags;	/* flags for xfs_bmapi */  	int			cur_ext = 0; +	struct xfs_bmbt_irec	inject_map;  	mp = ip->i_mount;  	iflags = bmv->bmv_iflags; -	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; -	if (whichfork == XFS_ATTR_FORK) { +#ifndef DEBUG +	/* Only allow CoW fork queries if we're debugging. 
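xfs_getbmap_adjust_shared() above boils down to cutting the current mapping at the first shared/unshared transition and deferring the remainder (via inject_map/next_map) to the next pass of the loop. A standalone sketch of that cut, with invented types and a fixed example:

#include <stdbool.h>
#include <stdio.h>

struct mapping {
	unsigned long	start;		/* first block of the mapping */
	unsigned long	len;		/* length in blocks */
	bool		shared;		/* how this piece should be reported */
};

/*
 * Split 'map' at the first shared run [sstart, sstart + slen).  The first
 * piece is reported now; a non-zero remainder is handed back for the next
 * pass, like the inject_map/next_map handling above.
 */
static struct mapping split_at_shared(struct mapping *map,
				      unsigned long sstart, unsigned long slen)
{
	struct mapping rest = { 0, 0, false };

	if (slen == 0)
		return rest;			/* nothing shared: report as-is */

	if (sstart == map->start) {
		/* shared prefix: report it, defer the unshared tail */
		map->shared = true;
		if (slen < map->len) {
			rest.start = map->start + slen;
			rest.len = map->len - slen;
		}
		map->len = slen;
	} else {
		/* unshared prefix: report it, defer from the shared run on */
		rest.start = sstart;
		rest.len = map->len - (sstart - map->start);
		map->len = sstart - map->start;
	}
	return rest;
}

int main(void)
{
	struct mapping m = { 100, 50, false };
	struct mapping rest = split_at_shared(&m, 120, 10);

	printf("report: %lu+%lu (%s); defer: %lu+%lu\n",
	       m.start, m.len, m.shared ? "shared" : "unshared",
	       rest.start, rest.len);
	return 0;
}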
*/ +	if (iflags & BMV_IF_COWFORK) +		return -EINVAL; +#endif +	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK)) +		return -EINVAL; + +	if (iflags & BMV_IF_ATTRFORK) +		whichfork = XFS_ATTR_FORK; +	else if (iflags & BMV_IF_COWFORK) +		whichfork = XFS_COW_FORK; +	else +		whichfork = XFS_DATA_FORK; + +	switch (whichfork) { +	case XFS_ATTR_FORK:  		if (XFS_IFORK_Q(ip)) {  			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&  			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && @@ -480,7 +577,20 @@ xfs_getbmap(  		prealloced = 0;  		fixlen = 1LL << 32; -	} else { +		break; +	case XFS_COW_FORK: +		if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS) +			return -EINVAL; + +		if (xfs_get_cowextsz_hint(ip)) { +			prealloced = 1; +			fixlen = mp->m_super->s_maxbytes; +		} else { +			prealloced = 0; +			fixlen = XFS_ISIZE(ip); +		} +		break; +	default:  		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&  		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&  		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) @@ -494,6 +604,7 @@ xfs_getbmap(  			prealloced = 0;  			fixlen = XFS_ISIZE(ip);  		} +		break;  	}  	if (bmv->bmv_length == -1) { @@ -520,7 +631,8 @@ xfs_getbmap(  		return -ENOMEM;  	xfs_ilock(ip, XFS_IOLOCK_SHARED); -	if (whichfork == XFS_DATA_FORK) { +	switch (whichfork) { +	case XFS_DATA_FORK:  		if (!(iflags & BMV_IF_DELALLOC) &&  		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {  			error = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -538,8 +650,14 @@ xfs_getbmap(  		}  		lock = xfs_ilock_data_map_shared(ip); -	} else { +		break; +	case XFS_COW_FORK: +		lock = XFS_ILOCK_SHARED; +		xfs_ilock(ip, lock); +		break; +	case XFS_ATTR_FORK:  		lock = xfs_ilock_attr_map_shared(ip); +		break;  	}  	/* @@ -581,7 +699,8 @@ xfs_getbmap(  			goto out_free_map;  		ASSERT(nmap <= subnex); -		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { +		for (i = 0; i < nmap && nexleft && bmv->bmv_length && +				cur_ext < bmv->bmv_count; i++) {  			out[cur_ext].bmv_oflags = 0;  			if (map[i].br_state == XFS_EXT_UNWRITTEN)  				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; @@ -614,9 +733,16 @@ xfs_getbmap(  				goto out_free_map;  			} -			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], -					prealloced, bmvend, -					map[i].br_startblock)) +			/* Is this a shared block? */ +			error = xfs_getbmap_adjust_shared(ip, whichfork, +					&map[i], &out[cur_ext], &inject_map); +			if (error) +				goto out_free_map; + +			if (!xfs_getbmapx_fix_eof_hole(ip, whichfork, +					&out[cur_ext], prealloced, bmvend, +					map[i].br_startblock, +					inject_map.br_startblock != NULLFSBLOCK))  				goto out_free_map;  			bmv->bmv_offset = @@ -636,11 +762,16 @@ xfs_getbmap(  				continue;  			} -			nexleft--; +			if (inject_map.br_startblock != NULLFSBLOCK) { +				map[i] = inject_map; +				i--; +			} else +				nexleft--;  			bmv->bmv_entries++;  			cur_ext++;  		} -	} while (nmap && nexleft && bmv->bmv_length); +	} while (nmap && nexleft && bmv->bmv_length && +		 cur_ext < bmv->bmv_count);   out_free_map:  	kmem_free(map); @@ -1433,8 +1564,8 @@ xfs_insert_file_space(   */  static int  xfs_swap_extents_check_format( -	xfs_inode_t	*ip,	/* target inode */ -	xfs_inode_t	*tip)	/* tmp inode */ +	struct xfs_inode	*ip,	/* target inode */ +	struct xfs_inode	*tip)	/* tmp inode */  {  	/* Should never get a local format */ @@ -1450,6 +1581,13 @@ xfs_swap_extents_check_format(  		return -EINVAL;  	/* +	 * If we have to use the (expensive) rmap swap method, we can +	 * handle any number of extents and any format. 
+	 */ +	if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb)) +		return 0; + +	/*  	 * if the target inode is in extent form and the temp inode is in btree  	 * form then we will end up with the target inode in the wrong format  	 * as we already know there are less extents in the temp inode. @@ -1518,125 +1656,161 @@ xfs_swap_extent_flush(  	return 0;  } -int -xfs_swap_extents( -	xfs_inode_t	*ip,	/* target inode */ -	xfs_inode_t	*tip,	/* tmp inode */ -	xfs_swapext_t	*sxp) +/* + * Move extents from one file to another, when rmap is enabled. + */ +STATIC int +xfs_swap_extent_rmap( +	struct xfs_trans		**tpp, +	struct xfs_inode		*ip, +	struct xfs_inode		*tip)  { -	xfs_mount_t	*mp = ip->i_mount; -	xfs_trans_t	*tp; -	xfs_bstat_t	*sbp = &sxp->sx_stat; -	xfs_ifork_t	*tempifp, *ifp, *tifp; -	int		src_log_flags, target_log_flags; -	int		error = 0; -	int		aforkblks = 0; -	int		taforkblks = 0; -	__uint64_t	tmp; -	int		lock_flags; - -	/* XXX: we can't do this with rmap, will fix later */ -	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) -		return -EOPNOTSUPP; - -	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); -	if (!tempifp) { -		error = -ENOMEM; -		goto out; -	} +	struct xfs_bmbt_irec		irec; +	struct xfs_bmbt_irec		uirec; +	struct xfs_bmbt_irec		tirec; +	xfs_fileoff_t			offset_fsb; +	xfs_fileoff_t			end_fsb; +	xfs_filblks_t			count_fsb; +	xfs_fsblock_t			firstfsb; +	struct xfs_defer_ops		dfops; +	int				error; +	xfs_filblks_t			ilen; +	xfs_filblks_t			rlen; +	int				nimaps; +	__uint64_t			tip_flags2;  	/* -	 * Lock the inodes against other IO, page faults and truncate to -	 * begin with.  Then we can ensure the inodes are flushed and have no -	 * page cache safely. Once we have done this we can take the ilocks and -	 * do the rest of the checks. +	 * If the source file has shared blocks, we must flag the donor +	 * file as having shared blocks so that we get the shared-block +	 * rmap functions when we go to fix up the rmaps.  The flags +	 * will be switch for reals later.  	 */ -	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; -	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); -	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); - -	/* Verify that both files have the same format */ -	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { -		error = -EINVAL; -		goto out_unlock; -	} +	tip_flags2 = tip->i_d.di_flags2; +	if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) +		tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + +	offset_fsb = 0; +	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip))); +	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + +	while (count_fsb) { +		/* Read extent from the donor file */ +		nimaps = 1; +		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec, +				&nimaps, 0); +		if (error) +			goto out; +		ASSERT(nimaps == 1); +		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK); + +		trace_xfs_swap_extent_rmap_remap(tip, &tirec); +		ilen = tirec.br_blockcount; + +		/* Unmap the old blocks in the source file. */ +		while (tirec.br_blockcount) { +			xfs_defer_init(&dfops, &firstfsb); +			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec); + +			/* Read extent from the source file */ +			nimaps = 1; +			error = xfs_bmapi_read(ip, tirec.br_startoff, +					tirec.br_blockcount, &irec, +					&nimaps, 0); +			if (error) +				goto out_defer; +			ASSERT(nimaps == 1); +			ASSERT(tirec.br_startoff == irec.br_startoff); +			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); + +			/* Trim the extent. 
*/ +			uirec = tirec; +			uirec.br_blockcount = rlen = min_t(xfs_filblks_t, +					tirec.br_blockcount, +					irec.br_blockcount); +			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); + +			/* Remove the mapping from the donor file. */ +			error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, +					tip, &uirec); +			if (error) +				goto out_defer; -	/* Verify both files are either real-time or non-realtime */ -	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { -		error = -EINVAL; -		goto out_unlock; -	} +			/* Remove the mapping from the source file. */ +			error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, +					ip, &irec); +			if (error) +				goto out_defer; -	error = xfs_swap_extent_flush(ip); -	if (error) -		goto out_unlock; -	error = xfs_swap_extent_flush(tip); -	if (error) -		goto out_unlock; +			/* Map the donor file's blocks into the source file. */ +			error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, +					ip, &uirec); +			if (error) +				goto out_defer; -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); -	if (error) -		goto out_unlock; +			/* Map the source file's blocks into the donor file. */ +			error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, +					tip, &irec); +			if (error) +				goto out_defer; -	/* -	 * Lock and join the inodes to the tansaction so that transaction commit -	 * or cancel will unlock the inodes from this point onwards. -	 */ -	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); -	lock_flags |= XFS_ILOCK_EXCL; -	xfs_trans_ijoin(tp, ip, lock_flags); -	xfs_trans_ijoin(tp, tip, lock_flags); +			error = xfs_defer_finish(tpp, &dfops, ip); +			if (error) +				goto out_defer; +			tirec.br_startoff += rlen; +			if (tirec.br_startblock != HOLESTARTBLOCK && +			    tirec.br_startblock != DELAYSTARTBLOCK) +				tirec.br_startblock += rlen; +			tirec.br_blockcount -= rlen; +		} -	/* Verify all data are being swapped */ -	if (sxp->sx_offset != 0 || -	    sxp->sx_length != ip->i_d.di_size || -	    sxp->sx_length != tip->i_d.di_size) { -		error = -EFAULT; -		goto out_trans_cancel; +		/* Roll on... */ +		count_fsb -= ilen; +		offset_fsb += ilen;  	} -	trace_xfs_swap_extent_before(ip, 0); -	trace_xfs_swap_extent_before(tip, 1); +	tip->i_d.di_flags2 = tip_flags2; +	return 0; -	/* check inode formats now that data is flushed */ -	error = xfs_swap_extents_check_format(ip, tip); -	if (error) { -		xfs_notice(mp, -		    "%s: inode 0x%llx format is incompatible for exchanging.", -				__func__, ip->i_ino); -		goto out_trans_cancel; -	} +out_defer: +	xfs_defer_cancel(&dfops); +out: +	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); +	tip->i_d.di_flags2 = tip_flags2; +	return error; +} + +/* Swap the extents of two files by swapping data forks. */ +STATIC int +xfs_swap_extent_forks( +	struct xfs_trans	*tp, +	struct xfs_inode	*ip, +	struct xfs_inode	*tip, +	int			*src_log_flags, +	int			*target_log_flags) +{ +	struct xfs_ifork	tempifp, *ifp, *tifp; +	int			aforkblks = 0; +	int			taforkblks = 0; +	__uint64_t		tmp; +	int			error; -	/* -	 * Compare the current change & modify times with that -	 * passed in.  If they differ, we abort this swap. -	 * This is the mechanism used to ensure the calling -	 * process that the file was not changed out from -	 * under it. 
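Stripped of the transaction and deferred-op machinery, xfs_swap_extent_rmap() pairs up the two files' extents in lockstep and exchanges one min-length piece per roll of the transaction. A toy version of that pairing loop (arrays of extent lengths stand in for the bmbt):

#include <stdio.h>

/*
 * Walk two extent lists covering the same file range and emit the pieces
 * that would be exchanged: each step swaps min(remaining-in-a, remaining-in-b)
 * blocks, then advances whichever side was fully consumed.
 */
static void pair_extents(const unsigned int *a, int na,
			 const unsigned int *b, int nb)
{
	unsigned int offset = 0, ra = 0, rb = 0;
	int ia = 0, ib = 0;

	while (ia < na && ib < nb) {
		if (ra == 0)
			ra = a[ia];
		if (rb == 0)
			rb = b[ib];

		unsigned int len = ra < rb ? ra : rb;

		printf("swap %u blocks at offset %u\n", len, offset);
		offset += len;
		ra -= len;
		rb -= len;
		if (ra == 0)
			ia++;
		if (rb == 0)
			ib++;
	}
}

int main(void)
{
	const unsigned int file_a[] = { 8, 8 };		/* two 8-block extents */
	const unsigned int file_b[] = { 4, 12 };	/* same size, different layout */

	pair_extents(file_a, 2, file_b, 2);
	return 0;
}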
-	 */ -	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || -	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || -	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || -	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { -		error = -EBUSY; -		goto out_trans_cancel; -	}  	/*  	 * Count the number of extended attribute blocks  	 */  	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&  	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { -		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); +		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, +				&aforkblks);  		if (error) -			goto out_trans_cancel; +			return error;  	}  	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&  	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {  		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, -			&taforkblks); +				&taforkblks);  		if (error) -			goto out_trans_cancel; +			return error;  	}  	/* @@ -1645,31 +1819,23 @@ xfs_swap_extents(  	 * buffers, and so the validation done on read will expect the owner  	 * field to be correctly set. Once we change the owners, we can swap the  	 * inode forks. -	 * -	 * Note the trickiness in setting the log flags - we set the owner log -	 * flag on the opposite inode (i.e. the inode we are setting the new -	 * owner to be) because once we swap the forks and log that, log -	 * recovery is going to see the fork as owned by the swapped inode, -	 * not the pre-swapped inodes.  	 */ -	src_log_flags = XFS_ILOG_CORE; -	target_log_flags = XFS_ILOG_CORE;  	if (ip->i_d.di_version == 3 &&  	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { -		target_log_flags |= XFS_ILOG_DOWNER; +		(*target_log_flags) |= XFS_ILOG_DOWNER;  		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,  					      tip->i_ino, NULL);  		if (error) -			goto out_trans_cancel; +			return error;  	}  	if (tip->i_d.di_version == 3 &&  	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { -		src_log_flags |= XFS_ILOG_DOWNER; +		(*src_log_flags) |= XFS_ILOG_DOWNER;  		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,  					      ip->i_ino, NULL);  		if (error) -			goto out_trans_cancel; +			return error;  	}  	/* @@ -1677,9 +1843,9 @@ xfs_swap_extents(  	 */  	ifp = &ip->i_df;  	tifp = &tip->i_df; -	*tempifp = *ifp;	/* struct copy */ +	tempifp = *ifp;		/* struct copy */  	*ifp = *tifp;		/* struct copy */ -	*tifp = *tempifp;	/* struct copy */ +	*tifp = tempifp;	/* struct copy */  	/*  	 * Fix the on-disk inode values @@ -1719,12 +1885,12 @@ xfs_swap_extents(  			ifp->if_u1.if_extents =  				ifp->if_u2.if_inline_ext;  		} -		src_log_flags |= XFS_ILOG_DEXT; +		(*src_log_flags) |= XFS_ILOG_DEXT;  		break;  	case XFS_DINODE_FMT_BTREE:  		ASSERT(ip->i_d.di_version < 3 || -		       (src_log_flags & XFS_ILOG_DOWNER)); -		src_log_flags |= XFS_ILOG_DBROOT; +		       (*src_log_flags & XFS_ILOG_DOWNER)); +		(*src_log_flags) |= XFS_ILOG_DBROOT;  		break;  	} @@ -1738,15 +1904,166 @@ xfs_swap_extents(  			tifp->if_u1.if_extents =  				tifp->if_u2.if_inline_ext;  		} -		target_log_flags |= XFS_ILOG_DEXT; +		(*target_log_flags) |= XFS_ILOG_DEXT;  		break;  	case XFS_DINODE_FMT_BTREE: -		target_log_flags |= XFS_ILOG_DBROOT; +		(*target_log_flags) |= XFS_ILOG_DBROOT;  		ASSERT(tip->i_d.di_version < 3 || -		       (target_log_flags & XFS_ILOG_DOWNER)); +		       (*target_log_flags & XFS_ILOG_DOWNER));  		break;  	} +	return 0; +} + +int +xfs_swap_extents( +	struct xfs_inode	*ip,	/* target inode */ +	struct xfs_inode	*tip,	/* tmp inode */ +	struct 
xfs_swapext	*sxp) +{ +	struct xfs_mount	*mp = ip->i_mount; +	struct xfs_trans	*tp; +	struct xfs_bstat	*sbp = &sxp->sx_stat; +	int			src_log_flags, target_log_flags; +	int			error = 0; +	int			lock_flags; +	struct xfs_ifork	*cowfp; +	__uint64_t		f; +	int			resblks; + +	/* +	 * Lock the inodes against other IO, page faults and truncate to +	 * begin with.  Then we can ensure the inodes are flushed and have no +	 * page cache safely. Once we have done this we can take the ilocks and +	 * do the rest of the checks. +	 */ +	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; +	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); +	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); + +	/* Verify that both files have the same format */ +	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { +		error = -EINVAL; +		goto out_unlock; +	} + +	/* Verify both files are either real-time or non-realtime */ +	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { +		error = -EINVAL; +		goto out_unlock; +	} + +	error = xfs_swap_extent_flush(ip); +	if (error) +		goto out_unlock; +	error = xfs_swap_extent_flush(tip); +	if (error) +		goto out_unlock; + +	/* +	 * Extent "swapping" with rmap requires a permanent reservation and +	 * a block reservation because it's really just a remap operation +	 * performed with log redo items! +	 */ +	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { +		/* +		 * Conceptually this shouldn't affect the shape of either +		 * bmbt, but since we atomically move extents one by one, +		 * we reserve enough space to rebuild both trees. +		 */ +		resblks = XFS_SWAP_RMAP_SPACE_RES(mp, +				XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK), +				XFS_DATA_FORK) + +			  XFS_SWAP_RMAP_SPACE_RES(mp, +				XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), +				XFS_DATA_FORK); +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, +				0, 0, &tp); +	} else +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, +				0, 0, &tp); +	if (error) +		goto out_unlock; + +	/* +	 * Lock and join the inodes to the tansaction so that transaction commit +	 * or cancel will unlock the inodes from this point onwards. +	 */ +	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); +	lock_flags |= XFS_ILOCK_EXCL; +	xfs_trans_ijoin(tp, ip, 0); +	xfs_trans_ijoin(tp, tip, 0); + + +	/* Verify all data are being swapped */ +	if (sxp->sx_offset != 0 || +	    sxp->sx_length != ip->i_d.di_size || +	    sxp->sx_length != tip->i_d.di_size) { +		error = -EFAULT; +		goto out_trans_cancel; +	} + +	trace_xfs_swap_extent_before(ip, 0); +	trace_xfs_swap_extent_before(tip, 1); + +	/* check inode formats now that data is flushed */ +	error = xfs_swap_extents_check_format(ip, tip); +	if (error) { +		xfs_notice(mp, +		    "%s: inode 0x%llx format is incompatible for exchanging.", +				__func__, ip->i_ino); +		goto out_trans_cancel; +	} + +	/* +	 * Compare the current change & modify times with that +	 * passed in.  If they differ, we abort this swap. +	 * This is the mechanism used to ensure the calling +	 * process that the file was not changed out from +	 * under it. +	 */ +	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || +	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || +	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || +	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { +		error = -EBUSY; +		goto out_trans_cancel; +	} + +	/* +	 * Note the trickiness in setting the log flags - we set the owner log +	 * flag on the opposite inode (i.e. 
the inode we are setting the new +	 * owner to be) because once we swap the forks and log that, log +	 * recovery is going to see the fork as owned by the swapped inode, +	 * not the pre-swapped inodes. +	 */ +	src_log_flags = XFS_ILOG_CORE; +	target_log_flags = XFS_ILOG_CORE; + +	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) +		error = xfs_swap_extent_rmap(&tp, ip, tip); +	else +		error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags, +				&target_log_flags); +	if (error) +		goto out_trans_cancel; + +	/* Do we have to swap reflink flags? */ +	if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^ +	    (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) { +		f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; +		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; +		ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; +		tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; +		tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; +		cowfp = ip->i_cowfp; +		ip->i_cowfp = tip->i_cowfp; +		tip->i_cowfp = cowfp; +		xfs_inode_set_cowblocks_tag(ip); +		xfs_inode_set_cowblocks_tag(tip); +	} +  	xfs_trans_log_inode(tp, ip,  src_log_flags);  	xfs_trans_log_inode(tp, tip, target_log_flags); @@ -1761,16 +2078,16 @@ xfs_swap_extents(  	trace_xfs_swap_extent_after(ip, 0);  	trace_xfs_swap_extent_after(tip, 1); -out: -	kmem_free(tempifp); -	return error; -out_unlock:  	xfs_iunlock(ip, lock_flags);  	xfs_iunlock(tip, lock_flags); -	goto out; +	return error;  out_trans_cancel:  	xfs_trans_cancel(tp); -	goto out; + +out_unlock: +	xfs_iunlock(ip, lock_flags); +	xfs_iunlock(tip, lock_flags); +	return error;  } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e455f9098d49..2975cb2319f4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -865,7 +865,7 @@ xfs_buf_item_log_segment(  	 */  	if (bit) {  		end_bit = MIN(bit + bits_to_set, (uint)NBWORD); -		mask = ((1 << (end_bit - bit)) - 1) << bit; +		mask = ((1U << (end_bit - bit)) - 1) << bit;  		*wordp |= mask;  		wordp++;  		bits_set = end_bit - bit; @@ -888,7 +888,7 @@ xfs_buf_item_log_segment(  	 */  	end_bit = bits_to_set - bits_set;  	if (end_bit) { -		mask = (1 << end_bit) - 1; +		mask = (1U << end_bit) - 1;  		*wordp |= mask;  	}  } @@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error(  	     bp->b_last_error != bp->b_error) {  		bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);  		bp->b_last_error = bp->b_error; -		if (cfg->retry_timeout && !bp->b_first_retry_time) +		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && +		    !bp->b_first_retry_time)  			bp->b_first_retry_time = jiffies;  		xfs_buf_ioerror(bp, 0); @@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error(  	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&  	    ++bp->b_retries > cfg->max_retries)  			goto permanent_error; -	if (cfg->retry_timeout && +	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&  	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))  			goto permanent_error; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index f44f79996978..29816981b50a 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -84,7 +84,8 @@ xfs_dir2_sf_getdents(  	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; -	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); +	if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) +		return -EFSCORRUPTED;  	/*  	 * If the block number in the offset is out of range, we're done. 
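The reflink-flag handling in xfs_swap_extents() above moves the XFS_DIFLAG2_REFLINK bit (and the CoW fork pointer) from whichever inode has it to the other, but only when exactly one of the two has it set. The bit manipulation follows the save/clear/copy pattern shown below; the flag value here is a stand-in for illustration, not the on-disk bit:

#include <stdio.h>

#define FL_REFLINK	(1u << 0)	/* stand-in for the reflink flag bit */

/* Exchange a single flag bit between two flag words. */
static void swap_flag(unsigned int *a, unsigned int *b, unsigned int flag)
{
	unsigned int save = *a & flag;

	*a &= ~flag;
	*a |= *b & flag;
	*b &= ~flag;
	*b |= save;
}

int main(void)
{
	unsigned int src = FL_REFLINK | 0x10;	/* reflinked file, other bits kept */
	unsigned int tmp = 0x20;		/* donor file, not reflinked */

	if ((src ^ tmp) & FL_REFLINK)		/* only if exactly one has it */
		swap_flag(&src, &tmp, FL_REFLINK);

	printf("src=%#x tmp=%#x\n", src, tmp);	/* src=0x10 tmp=0x21 */
	return 0;
}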
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 3d224702fbc0..05f8666733a0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -92,7 +92,11 @@ extern void xfs_verifier_error(struct xfs_buf *bp);  #define XFS_ERRTAG_BMAPIFORMAT				21  #define XFS_ERRTAG_FREE_EXTENT				22  #define XFS_ERRTAG_RMAP_FINISH_ONE			23 -#define XFS_ERRTAG_MAX					24 +#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE		24 +#define XFS_ERRTAG_REFCOUNT_FINISH_ONE			25 +#define XFS_ERRTAG_BMAP_FINISH_ONE			26 +#define XFS_ERRTAG_AG_RESV_CRITICAL			27 +#define XFS_ERRTAG_MAX					28  /*   * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -121,6 +125,10 @@ extern void xfs_verifier_error(struct xfs_buf *bp);  #define	XFS_RANDOM_BMAPIFORMAT				XFS_RANDOM_DEFAULT  #define XFS_RANDOM_FREE_EXTENT				1  #define XFS_RANDOM_RMAP_FINISH_ONE			1 +#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE		1 +#define XFS_RANDOM_REFCOUNT_FINISH_ONE			1 +#define XFS_RANDOM_BMAP_FINISH_ONE			1 +#define XFS_RANDOM_AG_RESV_CRITICAL			4  #ifdef DEBUG  extern int xfs_error_test_active; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index c263e079273e..162dc186cf04 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -384,7 +384,7 @@ restart:  		 * If this is a metadata allocation, try to reuse the busy  		 * extent instead of trimming the allocation.  		 */ -		if (!args->userdata && +		if (!xfs_alloc_is_userdata(args->datatype) &&  		    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {  			if (!xfs_extent_busy_update_extent(args->mp, args->pag,  							  busyp, fbno, flen, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e612a0233710..a314fc7b56fa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@  #include "xfs_icache.h"  #include "xfs_pnfs.h"  #include "xfs_iomap.h" +#include "xfs_reflink.h"  #include <linux/dcache.h>  #include <linux/falloc.h> @@ -269,6 +270,8 @@ xfs_file_dio_aio_read(  		return -EINVAL;  	} +	file_accessed(iocb->ki_filp); +  	/*  	 * Locking is a bit tricky here. 
If we take an exclusive lock for direct  	 * IO, we effectively serialise all new concurrent read IO to this file @@ -317,13 +320,12 @@ xfs_file_dio_aio_read(  	data = *to;  	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,  			xfs_get_blocks_direct, NULL, NULL, 0); -	if (ret > 0) { +	if (ret >= 0) {  		iocb->ki_pos += ret;  		iov_iter_advance(to, ret);  	}  	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); -	file_accessed(iocb->ki_filp);  	return ret;  } @@ -332,10 +334,7 @@ xfs_file_dax_read(  	struct kiocb		*iocb,  	struct iov_iter		*to)  { -	struct address_space	*mapping = iocb->ki_filp->f_mapping; -	struct inode		*inode = mapping->host; -	struct xfs_inode	*ip = XFS_I(inode); -	struct iov_iter		data = *to; +	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);  	size_t			count = iov_iter_count(to);  	ssize_t			ret = 0; @@ -345,11 +344,7 @@ xfs_file_dax_read(  		return 0; /* skip atime */  	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); -	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); -	if (ret > 0) { -		iocb->ki_pos += ret; -		iov_iter_advance(to, ret); -	} +	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);  	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);  	file_accessed(iocb->ki_filp); @@ -399,45 +394,6 @@ xfs_file_read_iter(  	return ret;  } -STATIC ssize_t -xfs_file_splice_read( -	struct file		*infilp, -	loff_t			*ppos, -	struct pipe_inode_info	*pipe, -	size_t			count, -	unsigned int		flags) -{ -	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host); -	ssize_t			ret; - -	XFS_STATS_INC(ip->i_mount, xs_read_calls); - -	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) -		return -EIO; - -	trace_xfs_file_splice_read(ip, count, *ppos); - -	/* -	 * DAX inodes cannot ues the page cache for splice, so we have to push -	 * them through the VFS IO path. This means it goes through -	 * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we -	 * cannot lock the splice operation at this level for DAX inodes. -	 */ -	if (IS_DAX(VFS_I(ip))) { -		ret = default_file_splice_read(infilp, ppos, pipe, count, -					       flags); -		goto out; -	} - -	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); -	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); -	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); -out: -	if (ret > 0) -		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); -	return ret; -} -  /*   * Zero any on disk space between the current EOF and the new, larger EOF.   * @@ -679,6 +635,13 @@ xfs_file_dio_aio_write(  	trace_xfs_file_direct_write(ip, count, iocb->ki_pos); +	/* If this is a block-aligned directio CoW, remap immediately. 
*/ +	if (xfs_is_reflink_inode(ip) && !unaligned_io) { +		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); +		if (ret) +			goto out; +	} +  	data = *from;  	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,  			xfs_get_blocks_direct, xfs_end_io_direct_write, @@ -711,70 +674,32 @@ xfs_file_dax_write(  	struct kiocb		*iocb,  	struct iov_iter		*from)  { -	struct address_space	*mapping = iocb->ki_filp->f_mapping; -	struct inode		*inode = mapping->host; +	struct inode		*inode = iocb->ki_filp->f_mapping->host;  	struct xfs_inode	*ip = XFS_I(inode); -	struct xfs_mount	*mp = ip->i_mount; -	ssize_t			ret = 0; -	int			unaligned_io = 0; -	int			iolock; -	struct iov_iter		data; +	int			iolock = XFS_IOLOCK_EXCL; +	ssize_t			ret, error = 0; +	size_t			count; +	loff_t			pos; -	/* "unaligned" here means not aligned to a filesystem block */ -	if ((iocb->ki_pos & mp->m_blockmask) || -	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { -		unaligned_io = 1; -		iolock = XFS_IOLOCK_EXCL; -	} else if (mapping->nrpages) { -		iolock = XFS_IOLOCK_EXCL; -	} else { -		iolock = XFS_IOLOCK_SHARED; -	}  	xfs_rw_ilock(ip, iolock); -  	ret = xfs_file_aio_write_checks(iocb, from, &iolock);  	if (ret)  		goto out; -	/* -	 * Yes, even DAX files can have page cache attached to them:  A zeroed -	 * page is inserted into the pagecache when we have to serve a write -	 * fault on a hole.  It should never be dirtied and can simply be -	 * dropped from the pagecache once we get real data for the page. -	 * -	 * XXX: This is racy against mmap, and there's nothing we can do about -	 * it. dax_do_io() should really do this invalidation internally as -	 * it will know if we've allocated over a holei for this specific IO and -	 * if so it needs to update the mapping tree and invalidate existing -	 * PTEs over the newly allocated range. Remove this invalidation when -	 * dax_do_io() is fixed up. -	 */ -	if (mapping->nrpages) { -		loff_t end = iocb->ki_pos + iov_iter_count(from) - 1; +	pos = iocb->ki_pos; +	count = iov_iter_count(from); -		ret = invalidate_inode_pages2_range(mapping, -						    iocb->ki_pos >> PAGE_SHIFT, -						    end >> PAGE_SHIFT); -		WARN_ON_ONCE(ret); -	} +	trace_xfs_file_dax_write(ip, count, pos); -	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { -		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); -		iolock = XFS_IOLOCK_SHARED; +	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); +	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { +		i_size_write(inode, iocb->ki_pos); +		error = xfs_setfilesize(ip, pos, ret);  	} -	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); - -	data = *from; -	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, -			xfs_end_io_direct_write, 0); -	if (ret > 0) { -		iocb->ki_pos += ret; -		iov_iter_advance(from, ret); -	}  out:  	xfs_rw_iunlock(ip, iolock); -	return ret; +	return error ? error : ret;  }  STATIC ssize_t @@ -818,6 +743,9 @@ write_retry:  		enospc = xfs_inode_free_quota_eofblocks(ip);  		if (enospc)  			goto write_retry; +		enospc = xfs_inode_free_quota_cowblocks(ip); +		if (enospc) +			goto write_retry;  	} else if (ret == -ENOSPC && !enospc) {  		struct xfs_eofblocks eofb = {0}; @@ -857,10 +785,20 @@ xfs_file_write_iter(  	if (IS_DAX(inode))  		ret = xfs_file_dax_write(iocb, from); -	else if (iocb->ki_flags & IOCB_DIRECT) +	else if (iocb->ki_flags & IOCB_DIRECT) { +		/* +		 * Allow a directio write to fall back to a buffered +		 * write *only* in the case that we're doing a reflink +		 * CoW.  
In all other directio scenarios we do not +		 * allow an operation to fall back to buffered mode. +		 */  		ret = xfs_file_dio_aio_write(iocb, from); -	else +		if (ret == -EREMCHG) +			goto buffered; +	} else { +buffered:  		ret = xfs_file_buffered_aio_write(iocb, from); +	}  	if (ret > 0) {  		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); @@ -874,7 +812,7 @@ xfs_file_write_iter(  #define	XFS_FALLOC_FL_SUPPORTED						\  		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\  		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\ -		 FALLOC_FL_INSERT_RANGE) +		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)  STATIC long  xfs_file_fallocate( @@ -964,9 +902,15 @@ xfs_file_fallocate(  		if (mode & FALLOC_FL_ZERO_RANGE)  			error = xfs_zero_file_space(ip, offset, len); -		else +		else { +			if (mode & FALLOC_FL_UNSHARE_RANGE) { +				error = xfs_reflink_unshare(ip, offset, len); +				if (error) +					goto out_unlock; +			}  			error = xfs_alloc_file_space(ip, offset, len,  						     XFS_BMAPI_PREALLOC); +		}  		if (error)  			goto out_unlock;  	} @@ -984,7 +928,7 @@ xfs_file_fallocate(  		iattr.ia_valid = ATTR_SIZE;  		iattr.ia_size = new_size; -		error = xfs_setattr_size(ip, &iattr); +		error = xfs_vn_setattr_size(file_dentry(file), &iattr);  		if (error)  			goto out_unlock;  	} @@ -1003,6 +947,189 @@ out_unlock:  	return error;  } +/* + * Flush all file writes out to disk. + */ +static int +xfs_file_wait_for_io( +	struct inode	*inode, +	loff_t		offset, +	size_t		len) +{ +	loff_t		rounding; +	loff_t		ioffset; +	loff_t		iendoffset; +	loff_t		bs; +	int		ret; + +	bs = inode->i_sb->s_blocksize; +	inode_dio_wait(inode); + +	rounding = max_t(xfs_off_t, bs, PAGE_SIZE); +	ioffset = round_down(offset, rounding); +	iendoffset = round_up(offset + len, rounding) - 1; +	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, +					   iendoffset); +	return ret; +} + +/* Hook up to the VFS reflink function */ +STATIC int +xfs_file_share_range( +	struct file	*file_in, +	loff_t		pos_in, +	struct file	*file_out, +	loff_t		pos_out, +	u64		len, +	bool		is_dedupe) +{ +	struct inode	*inode_in; +	struct inode	*inode_out; +	ssize_t		ret; +	loff_t		bs; +	loff_t		isize; +	int		same_inode; +	loff_t		blen; +	unsigned int	flags = 0; + +	inode_in = file_inode(file_in); +	inode_out = file_inode(file_out); +	bs = inode_out->i_sb->s_blocksize; + +	/* Don't touch certain kinds of inodes */ +	if (IS_IMMUTABLE(inode_out)) +		return -EPERM; +	if (IS_SWAPFILE(inode_in) || +	    IS_SWAPFILE(inode_out)) +		return -ETXTBSY; + +	/* Reflink only works within this filesystem. */ +	if (inode_in->i_sb != inode_out->i_sb) +		return -EXDEV; +	same_inode = (inode_in->i_ino == inode_out->i_ino); + +	/* Don't reflink dirs, pipes, sockets... */ +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) +		return -EISDIR; +	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) +		return -EINVAL; +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) +		return -EINVAL; + +	/* Don't share DAX file data for now. */ +	if (IS_DAX(inode_in) || IS_DAX(inode_out)) +		return -EINVAL; + +	/* Are we going all the way to the end? 
*/ +	isize = i_size_read(inode_in); +	if (isize == 0) +		return 0; +	if (len == 0) +		len = isize - pos_in; + +	/* Ensure offsets don't wrap and the input is inside i_size */ +	if (pos_in + len < pos_in || pos_out + len < pos_out || +	    pos_in + len > isize) +		return -EINVAL; + +	/* Don't allow dedupe past EOF in the dest file */ +	if (is_dedupe) { +		loff_t	disize; + +		disize = i_size_read(inode_out); +		if (pos_out >= disize || pos_out + len > disize) +			return -EINVAL; +	} + +	/* If we're linking to EOF, continue to the block boundary. */ +	if (pos_in + len == isize) +		blen = ALIGN(isize, bs) - pos_in; +	else +		blen = len; + +	/* Only reflink if we're aligned to block boundaries */ +	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || +	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) +		return -EINVAL; + +	/* Don't allow overlapped reflink within the same file */ +	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) +		return -EINVAL; + +	/* Wait for the completion of any pending IOs on srcfile */ +	ret = xfs_file_wait_for_io(inode_in, pos_in, len); +	if (ret) +		goto out; +	ret = xfs_file_wait_for_io(inode_out, pos_out, len); +	if (ret) +		goto out; + +	if (is_dedupe) +		flags |= XFS_REFLINK_DEDUPE; +	ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), +			pos_out, len, flags); +	if (ret < 0) +		goto out; + +out: +	return ret; +} + +STATIC ssize_t +xfs_file_copy_range( +	struct file	*file_in, +	loff_t		pos_in, +	struct file	*file_out, +	loff_t		pos_out, +	size_t		len, +	unsigned int	flags) +{ +	int		error; + +	error = xfs_file_share_range(file_in, pos_in, file_out, pos_out, +				     len, false); +	if (error) +		return error; +	return len; +} + +STATIC int +xfs_file_clone_range( +	struct file	*file_in, +	loff_t		pos_in, +	struct file	*file_out, +	loff_t		pos_out, +	u64		len) +{ +	return xfs_file_share_range(file_in, pos_in, file_out, pos_out, +				     len, false); +} + +#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024) +STATIC ssize_t +xfs_file_dedupe_range( +	struct file	*src_file, +	u64		loff, +	u64		len, +	struct file	*dst_file, +	u64		dst_loff) +{ +	int		error; + +	/* +	 * Limit the total length we will dedupe for each operation. +	 * This is intended to bound the total time spent in this +	 * ioctl to something sane. +	 */ +	if (len > XFS_MAX_DEDUPE_LEN) +		len = XFS_MAX_DEDUPE_LEN; + +	error = xfs_file_share_range(src_file, loff, dst_file, dst_loff, +				     len, true); +	if (error) +		return error; +	return len; +}  STATIC int  xfs_file_open( @@ -1513,7 +1640,7 @@ xfs_filemap_page_mkwrite(  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);  	if (IS_DAX(inode)) { -		ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); +		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);  	} else {  		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);  		ret = block_page_mkwrite_return(ret); @@ -1547,7 +1674,7 @@ xfs_filemap_fault(  		 * changes to xfs_get_blocks_direct() to map unwritten extent  		 * ioend for conversion on read-only mappings.  		 
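Most of xfs_file_share_range() is range validation: offsets and length must be block aligned (with the one exception that a clone ending exactly at EOF may round its length up to the next block boundary), and a clone within a single file must not overlap itself. A reduced standalone model of just those two checks, assuming a 4096-byte block size:

#include <stdbool.h>
#include <stdio.h>

#define BLKSZ	4096u			/* assumed filesystem block size */
#define ALIGNED(x)	(((x) % BLKSZ) == 0)

static bool clone_range_ok(unsigned long long pos_in, unsigned long long pos_out,
			   unsigned long long len, unsigned long long isize,
			   bool same_inode)
{
	unsigned long long blen = len;

	/* Linking up to EOF: extend the checked length to the block boundary. */
	if (pos_in + len == isize)
		blen = ((isize + BLKSZ - 1) / BLKSZ) * BLKSZ - pos_in;

	if (!ALIGNED(pos_in) || !ALIGNED(pos_out) ||
	    !ALIGNED(pos_in + blen) || !ALIGNED(pos_out + blen))
		return false;

	/* Reject an overlapping clone within one file. */
	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
		return false;

	return true;
}

int main(void)
{
	/* Clone the whole of a 6000-byte file: EOF rounding makes it legal. */
	printf("%d\n", clone_range_ok(0, 0, 6000, 6000, false));		/* 1 */
	/* Overlapping clone inside the same file is rejected. */
	printf("%d\n", clone_range_ok(0, BLKSZ, 4 * BLKSZ, 1 << 20, true));	/* 0 */
	return 0;
}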
*/ -		ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); +		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);  	} else  		ret = filemap_fault(vma, vmf);  	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1652,7 +1779,7 @@ const struct file_operations xfs_file_operations = {  	.llseek		= xfs_file_llseek,  	.read_iter	= xfs_file_read_iter,  	.write_iter	= xfs_file_write_iter, -	.splice_read	= xfs_file_splice_read, +	.splice_read	= generic_file_splice_read,  	.splice_write	= iter_file_splice_write,  	.unlocked_ioctl	= xfs_file_ioctl,  #ifdef CONFIG_COMPAT @@ -1662,7 +1789,11 @@ const struct file_operations xfs_file_operations = {  	.open		= xfs_file_open,  	.release	= xfs_file_release,  	.fsync		= xfs_file_fsync, +	.get_unmapped_area = thp_get_unmapped_area,  	.fallocate	= xfs_file_fallocate, +	.copy_file_range = xfs_file_copy_range, +	.clone_file_range = xfs_file_clone_range, +	.dedupe_file_range = xfs_file_dedupe_range,  };  const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 4a33a3304369..043ca3808ea2 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -30,6 +30,7 @@  #include "xfs_mru_cache.h"  #include "xfs_filestream.h"  #include "xfs_trace.h" +#include "xfs_ag_resv.h"  struct xfs_fstrm_item {  	struct xfs_mru_cache_elem	mru; @@ -198,7 +199,8 @@ xfs_filestream_pick_ag(  		}  		longest = xfs_alloc_longest_free_extent(mp, pag, -					xfs_alloc_min_freelist(mp, pag)); +				xfs_alloc_min_freelist(mp, pag), +				xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));  		if (((minlen && longest >= minlen) ||  		     (!minlen && pag->pagf_freeblks >= minfree)) &&  		    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || @@ -369,7 +371,8 @@ xfs_filestream_new_ag(  	struct xfs_mount	*mp = ip->i_mount;  	xfs_extlen_t		minlen = ap->length;  	xfs_agnumber_t		startag = 0; -	int			flags, err = 0; +	int			flags = 0; +	int			err = 0;  	struct xfs_mru_cache_elem *mru;  	*agp = NULLAGNUMBER; @@ -385,8 +388,10 @@ xfs_filestream_new_ag(  		startag = (item->ag + 1) % mp->m_sb.sb_agcount;  	} -	flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | -	        (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0); +	if (xfs_alloc_is_userdata(ap->datatype)) +		flags |= XFS_PICK_USERDATA; +	if (ap->dfops->dop_low) +		flags |= XFS_PICK_LOWSPACE;  	err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 0b7f986745c1..93d12fa2670d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -43,6 +43,7 @@  #include "xfs_log.h"  #include "xfs_filestream.h"  #include "xfs_rmap.h" +#include "xfs_ag_resv.h"  /*   * File system operations @@ -108,7 +109,9 @@ xfs_fs_geometry(  			(xfs_sb_version_hassparseinodes(&mp->m_sb) ?  				XFS_FSOP_GEOM_FLAGS_SPINODES : 0) |  			(xfs_sb_version_hasrmapbt(&mp->m_sb) ? -				XFS_FSOP_GEOM_FLAGS_RMAPBT : 0); +				XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) | +			(xfs_sb_version_hasreflink(&mp->m_sb) ? +				XFS_FSOP_GEOM_FLAGS_REFLINK : 0);  		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?  				
mp->m_sb.sb_logsectsize : BBSIZE;  		geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -259,6 +262,12 @@ xfs_growfs_data_private(  		agf->agf_longest = cpu_to_be32(tmpsize);  		if (xfs_sb_version_hascrc(&mp->m_sb))  			uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); +		if (xfs_sb_version_hasreflink(&mp->m_sb)) { +			agf->agf_refcount_root = cpu_to_be32( +					xfs_refc_block(mp)); +			agf->agf_refcount_level = cpu_to_be32(1); +			agf->agf_refcount_blocks = cpu_to_be32(1); +		}  		error = xfs_bwrite(bp);  		xfs_buf_relse(bp); @@ -450,6 +459,17 @@ xfs_growfs_data_private(  			rrec->rm_offset = 0;  			be16_add_cpu(&block->bb_numrecs, 1); +			/* account for refc btree root */ +			if (xfs_sb_version_hasreflink(&mp->m_sb)) { +				rrec = XFS_RMAP_REC_ADDR(block, 5); +				rrec->rm_startblock = cpu_to_be32( +						xfs_refc_block(mp)); +				rrec->rm_blockcount = cpu_to_be32(1); +				rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); +				rrec->rm_offset = 0; +				be16_add_cpu(&block->bb_numrecs, 1); +			} +  			error = xfs_bwrite(bp);  			xfs_buf_relse(bp);  			if (error) @@ -507,6 +527,28 @@ xfs_growfs_data_private(  				goto error0;  		} +		/* +		 * refcount btree root block +		 */ +		if (xfs_sb_version_hasreflink(&mp->m_sb)) { +			bp = xfs_growfs_get_hdr_buf(mp, +				XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)), +				BTOBB(mp->m_sb.sb_blocksize), 0, +				&xfs_refcountbt_buf_ops); +			if (!bp) { +				error = -ENOMEM; +				goto error0; +			} + +			xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, +					     0, 0, agno, +					     XFS_BTREE_CRC_BLOCKS); + +			error = xfs_bwrite(bp); +			xfs_buf_relse(bp); +			if (error) +				goto error0; +		}  	}  	xfs_trans_agblocks_delta(tp, nfree);  	/* @@ -553,7 +595,7 @@ xfs_growfs_data_private(  		error = xfs_free_extent(tp,  				XFS_AGB_TO_FSB(mp, agno,  					be32_to_cpu(agf->agf_length) - new), -				new, &oinfo); +				new, &oinfo, XFS_AG_RESV_NONE);  		if (error)  			goto error0;  	} @@ -589,6 +631,11 @@ xfs_growfs_data_private(  	xfs_set_low_space_thresholds(mp);  	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); +	/* Reserve AG metadata blocks. */ +	error = xfs_fs_reserve_ag_blocks(mp); +	if (error && error != -ENOSPC) +		goto out; +  	/* update secondary superblocks. */  	for (agno = 1; agno < nagcount; agno++) {  		error = 0; @@ -639,6 +686,8 @@ xfs_growfs_data_private(  			continue;  		}  	} + + out:  	return saved_error ? saved_error : error;   error0: @@ -948,3 +997,59 @@ xfs_do_force_shutdown(  	"Please umount the filesystem and rectify the problem(s)");  	}  } + +/* + * Reserve free space for per-AG metadata. + */ +int +xfs_fs_reserve_ag_blocks( +	struct xfs_mount	*mp) +{ +	xfs_agnumber_t		agno; +	struct xfs_perag	*pag; +	int			error = 0; +	int			err2; + +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { +		pag = xfs_perag_get(mp, agno); +		err2 = xfs_ag_resv_init(pag); +		xfs_perag_put(pag); +		if (err2 && !error) +			error = err2; +	} + +	if (error && error != -ENOSPC) { +		xfs_warn(mp, +	"Error %d reserving per-AG metadata reserve pool.", error); +		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +	} + +	return error; +} + +/* + * Free space reserved for per-AG metadata. 
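
Both xfs_fs_reserve_ag_blocks() and its unreserve counterpart below use the same walk-everything pattern: visit every AG, let each reservation fail on its own, and only report the first error once all AGs have been processed, so one full AG does not prevent the rest from being set up or torn down. A minimal standalone sketch of that loop shape (the error value and AG count are invented for the demo):

#include <stdio.h>

/*
 * Shape of xfs_fs_reserve_ag_blocks()/xfs_fs_unreserve_ag_blocks(): walk
 * every AG, let each one fail on its own, and return the first error only
 * after the whole loop has run.  No kernel types; the failure is faked.
 */
static int
reserve_one_ag(
	unsigned int	agno)
{
	return agno == 2 ? -28 /* pretend AG 2 hits ENOSPC */ : 0;
}

static int
reserve_all_ags(
	unsigned int	agcount)
{
	unsigned int	agno;
	int		error = 0;
	int		err2;

	for (agno = 0; agno < agcount; agno++) {
		err2 = reserve_one_ag(agno);
		if (err2 && !error)
			error = err2;	/* remember the first error, keep going */
	}
	return error;
}

int main(void)
{
	printf("first error: %d\n", reserve_all_ags(4));
	return 0;
}
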
+ */ +int +xfs_fs_unreserve_ag_blocks( +	struct xfs_mount	*mp) +{ +	xfs_agnumber_t		agno; +	struct xfs_perag	*pag; +	int			error = 0; +	int			err2; + +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { +		pag = xfs_perag_get(mp, agno); +		err2 = xfs_ag_resv_free(pag); +		xfs_perag_put(pag); +		if (err2 && !error) +			error = err2; +	} + +	if (error) +		xfs_warn(mp, +	"Error %d freeing per-AG metadata reserve pool.", error); + +	return error; +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f32713f14f9a..f34915898fea 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,  				xfs_fsop_resblks_t *outval);  extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); +  #endif	/* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d41b241298f..687a4b01fc53 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,8 +21,8 @@  /*   * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,   * other XFS code uses these values.  Times are measured in centisecs (i.e. - * 100ths of a second) with the exception of eofb_timer, which is measured in - * seconds. + * 100ths of a second) with the exception of eofb_timer and cowb_timer, which + * are measured in seconds.   */  xfs_param_t xfs_params = {  			  /*	MIN		DFLT		MAX	*/ @@ -42,6 +42,7 @@ xfs_param_t xfs_params = {  	.inherit_nodfrg	= {	0,		1,		1	},  	.fstrm_timer	= {	1,		30*100,		3600*100},  	.eofb_timer	= {	1,		300,		3600*24}, +	.cowb_timer	= {	1,		1800,		3600*24},  };  struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fb39a66914dd..14796b744e0a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,6 +33,7 @@  #include "xfs_bmap_util.h"  #include "xfs_dquot_item.h"  #include "xfs_dquot.h" +#include "xfs_reflink.h"  #include <linux/kthread.h>  #include <linux/freezer.h> @@ -76,6 +77,9 @@ xfs_inode_alloc(  	ip->i_mount = mp;  	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));  	ip->i_afp = NULL; +	ip->i_cowfp = NULL; +	ip->i_cnextents = 0; +	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;  	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));  	ip->i_flags = 0;  	ip->i_delayed_blks = 0; @@ -101,6 +105,8 @@ xfs_inode_free_callback(  	if (ip->i_afp)  		xfs_idestroy_fork(ip, XFS_ATTR_FORK); +	if (ip->i_cowfp) +		xfs_idestroy_fork(ip, XFS_COW_FORK);  	if (ip->i_itemp) {  		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); @@ -787,6 +793,33 @@ xfs_eofblocks_worker(  	xfs_queue_eofblocks(mp);  } +/* + * Background scanning to trim preallocated CoW space. This is queued + * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). + * (We'll just piggyback on the post-EOF prealloc space workqueue.) 
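
The CoW trimming worker introduced here re-arms itself only while at least one AG still carries the COWBLOCKS tag, so a filesystem with no leftover CoW reservations stops scheduling the scan altogether. A toy userspace model of that self-rearming loop, assuming a fake tag check in place of the radix-tree lookup and a 1-second pass instead of the 1800-second cowb_timer default:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Toy model of the self-rearming CoW scan: each pass trims stale CoW
 * reservations, and another pass is queued only while some AG still
 * carries the COWBLOCKS tag, so an idle filesystem schedules nothing.
 */
static bool
any_ag_tagged(void)
{
	static int	tagged_passes = 3;	/* pretend three passes find work */

	return tagged_passes-- > 0;
}

static void
free_cowblocks(void)
{
	printf("trimming leftover CoW reservations in tagged AGs\n");
}

int main(void)
{
	while (any_ag_tagged()) {	/* xfs_queue_cowblocks() re-arms here */
		free_cowblocks();
		sleep(1);		/* the real cadence is cowb_timer (1800s) */
	}
	return 0;
}
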
+ */ +STATIC void +xfs_queue_cowblocks( +	struct xfs_mount *mp) +{ +	rcu_read_lock(); +	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) +		queue_delayed_work(mp->m_eofblocks_workqueue, +				   &mp->m_cowblocks_work, +				   msecs_to_jiffies(xfs_cowb_secs * 1000)); +	rcu_read_unlock(); +} + +void +xfs_cowblocks_worker( +	struct work_struct *work) +{ +	struct xfs_mount *mp = container_of(to_delayed_work(work), +				struct xfs_mount, m_cowblocks_work); +	xfs_icache_free_cowblocks(mp, NULL); +	xfs_queue_cowblocks(mp); +} +  int  xfs_inode_ag_iterator(  	struct xfs_mount	*mp, @@ -1343,18 +1376,30 @@ xfs_inode_free_eofblocks(  	return ret;  } -int -xfs_icache_free_eofblocks( +static int +__xfs_icache_free_eofblocks(  	struct xfs_mount	*mp, -	struct xfs_eofblocks	*eofb) +	struct xfs_eofblocks	*eofb, +	int			(*execute)(struct xfs_inode *ip, int flags, +					   void *args), +	int			tag)  {  	int flags = SYNC_TRYLOCK;  	if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))  		flags = SYNC_WAIT; -	return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, -					 eofb, XFS_ICI_EOFBLOCKS_TAG); +	return xfs_inode_ag_iterator_tag(mp, execute, flags, +					 eofb, tag); +} + +int +xfs_icache_free_eofblocks( +	struct xfs_mount	*mp, +	struct xfs_eofblocks	*eofb) +{ +	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, +			XFS_ICI_EOFBLOCKS_TAG);  }  /* @@ -1363,9 +1408,11 @@ xfs_icache_free_eofblocks(   * failure. We make a best effort by including each quota under low free space   * conditions (less than 1% free space) in the scan.   */ -int -xfs_inode_free_quota_eofblocks( -	struct xfs_inode *ip) +static int +__xfs_inode_free_quota_eofblocks( +	struct xfs_inode	*ip, +	int			(*execute)(struct xfs_mount *mp, +					   struct xfs_eofblocks	*eofb))  {  	int scan = 0;  	struct xfs_eofblocks eofb = {0}; @@ -1401,41 +1448,58 @@ xfs_inode_free_quota_eofblocks(  	}  	if (scan) -		xfs_icache_free_eofblocks(ip->i_mount, &eofb); +		execute(ip->i_mount, &eofb);  	return scan;  } -void -xfs_inode_set_eofblocks_tag( -	xfs_inode_t	*ip) +int +xfs_inode_free_quota_eofblocks( +	struct xfs_inode *ip) +{ +	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); +} + +static void +__xfs_inode_set_eofblocks_tag( +	xfs_inode_t	*ip, +	void		(*execute)(struct xfs_mount *mp), +	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, +				  int error, unsigned long caller_ip), +	int		tag)  {  	struct xfs_mount *mp = ip->i_mount;  	struct xfs_perag *pag;  	int tagged; +	/* +	 * Don't bother locking the AG and looking up in the radix trees +	 * if we already know that we have the tag set. 
+	 */ +	if (ip->i_flags & XFS_IEOFBLOCKS) +		return; +	spin_lock(&ip->i_flags_lock); +	ip->i_flags |= XFS_IEOFBLOCKS; +	spin_unlock(&ip->i_flags_lock); +  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));  	spin_lock(&pag->pag_ici_lock); -	trace_xfs_inode_set_eofblocks_tag(ip); -	tagged = radix_tree_tagged(&pag->pag_ici_root, -				   XFS_ICI_EOFBLOCKS_TAG); +	tagged = radix_tree_tagged(&pag->pag_ici_root, tag);  	radix_tree_tag_set(&pag->pag_ici_root, -			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), -			   XFS_ICI_EOFBLOCKS_TAG); +			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);  	if (!tagged) {  		/* propagate the eofblocks tag up into the perag radix tree */  		spin_lock(&ip->i_mount->m_perag_lock);  		radix_tree_tag_set(&ip->i_mount->m_perag_tree,  				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), -				   XFS_ICI_EOFBLOCKS_TAG); +				   tag);  		spin_unlock(&ip->i_mount->m_perag_lock);  		/* kick off background trimming */ -		xfs_queue_eofblocks(ip->i_mount); +		execute(ip->i_mount); -		trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, -					      -1, _RET_IP_); +		set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);  	}  	spin_unlock(&pag->pag_ici_lock); @@ -1443,31 +1507,166 @@ xfs_inode_set_eofblocks_tag(  }  void -xfs_inode_clear_eofblocks_tag( +xfs_inode_set_eofblocks_tag(  	xfs_inode_t	*ip)  { +	trace_xfs_inode_set_eofblocks_tag(ip); +	return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, +			trace_xfs_perag_set_eofblocks, +			XFS_ICI_EOFBLOCKS_TAG); +} + +static void +__xfs_inode_clear_eofblocks_tag( +	xfs_inode_t	*ip, +	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, +				    int error, unsigned long caller_ip), +	int		tag) +{  	struct xfs_mount *mp = ip->i_mount;  	struct xfs_perag *pag; +	spin_lock(&ip->i_flags_lock); +	ip->i_flags &= ~XFS_IEOFBLOCKS; +	spin_unlock(&ip->i_flags_lock); +  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));  	spin_lock(&pag->pag_ici_lock); -	trace_xfs_inode_clear_eofblocks_tag(ip);  	radix_tree_tag_clear(&pag->pag_ici_root, -			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), -			     XFS_ICI_EOFBLOCKS_TAG); -	if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { +			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); +	if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {  		/* clear the eofblocks tag from the perag radix tree */  		spin_lock(&ip->i_mount->m_perag_lock);  		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,  				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), -				     XFS_ICI_EOFBLOCKS_TAG); +				     tag);  		spin_unlock(&ip->i_mount->m_perag_lock); -		trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, -					       -1, _RET_IP_); +		clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);  	}  	spin_unlock(&pag->pag_ici_lock);  	xfs_perag_put(pag);  } +void +xfs_inode_clear_eofblocks_tag( +	xfs_inode_t	*ip) +{ +	trace_xfs_inode_clear_eofblocks_tag(ip); +	return __xfs_inode_clear_eofblocks_tag(ip, +			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); +} + +/* + * Automatic CoW Reservation Freeing + * + * These functions automatically garbage collect leftover CoW reservations + * that were made on behalf of a cowextsize hint when we start to run out + * of quota or when the reservations sit around for too long.  If the file + * has dirty pages or is undergoing writeback, its CoW reservations will + * be retained. + * + * The actual garbage collection piggybacks off the same code that runs + * the speculative EOF preallocation garbage collector. 
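
The EOF-blocks and CoW-blocks tags share one mechanism: a cheap inode flag for the fast path, a per-AG radix tree recording which inodes need scanning, and a per-mount tree recording which AGs contain any tagged inode at all, with the upper level only written (and the background worker kicked) on the first tag set in an AG. A standalone sketch of that propagation, with the radix trees reduced to a counter and a flag purely for illustration:

#include <stdbool.h>
#include <stdio.h>

/*
 * Two-level tag propagation, reduced to a counter and a flag: the per-AG
 * tree tracks which inodes need a scan, the per-mount tree tracks which
 * AGs have any tagged inode, and the upper level is only touched (and the
 * worker kicked) on the first tag set in an AG.
 */
struct ag {
	int	tagged_inodes;		/* stands in for the per-AG radix tree */
	bool	tagged_in_mount;	/* stands in for the per-mount tree */
};

static void
kick_background_scan(void)
{
	printf("queueing background trim worker\n");
}

static void
set_tag(
	struct ag	*ag)
{
	bool		was_tagged = ag->tagged_inodes > 0;

	ag->tagged_inodes++;
	if (!was_tagged) {
		ag->tagged_in_mount = true;	/* propagate up */
		kick_background_scan();
	}
}

static void
clear_tag(
	struct ag	*ag)
{
	if (ag->tagged_inodes > 0 && --ag->tagged_inodes == 0)
		ag->tagged_in_mount = false;	/* last tagged inode is gone */
}

int main(void)
{
	struct ag	a = { 0 };

	set_tag(&a);	/* first inode: propagates and kicks the worker */
	set_tag(&a);	/* second inode: upper level untouched */
	clear_tag(&a);
	clear_tag(&a);	/* AG drops back out of the per-mount tree */
	return 0;
}
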
+ */ +STATIC int +xfs_inode_free_cowblocks( +	struct xfs_inode	*ip, +	int			flags, +	void			*args) +{ +	int ret; +	struct xfs_eofblocks *eofb = args; +	bool need_iolock = true; +	int match; + +	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); + +	if (!xfs_reflink_has_real_cow_blocks(ip)) { +		trace_xfs_inode_free_cowblocks_invalid(ip); +		xfs_inode_clear_cowblocks_tag(ip); +		return 0; +	} + +	/* +	 * If the mapping is dirty or under writeback we cannot touch the +	 * CoW fork.  Leave it alone if we're in the midst of a directio. +	 */ +	if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || +	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || +	    atomic_read(&VFS_I(ip)->i_dio_count)) +		return 0; + +	if (eofb) { +		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) +			match = xfs_inode_match_id_union(ip, eofb); +		else +			match = xfs_inode_match_id(ip, eofb); +		if (!match) +			return 0; + +		/* skip the inode if the file size is too small */ +		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && +		    XFS_ISIZE(ip) < eofb->eof_min_file_size) +			return 0; + +		/* +		 * A scan owner implies we already hold the iolock. Skip it in +		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates +		 * the possibility of EAGAIN being returned. +		 */ +		if (eofb->eof_scan_owner == ip->i_ino) +			need_iolock = false; +	} + +	/* Free the CoW blocks */ +	if (need_iolock) { +		xfs_ilock(ip, XFS_IOLOCK_EXCL); +		xfs_ilock(ip, XFS_MMAPLOCK_EXCL); +	} + +	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + +	if (need_iolock) { +		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); +		xfs_iunlock(ip, XFS_IOLOCK_EXCL); +	} + +	return ret; +} + +int +xfs_icache_free_cowblocks( +	struct xfs_mount	*mp, +	struct xfs_eofblocks	*eofb) +{ +	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, +			XFS_ICI_COWBLOCKS_TAG); +} + +int +xfs_inode_free_quota_cowblocks( +	struct xfs_inode *ip) +{ +	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); +} + +void +xfs_inode_set_cowblocks_tag( +	xfs_inode_t	*ip) +{ +	trace_xfs_inode_set_eofblocks_tag(ip); +	return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, +			trace_xfs_perag_set_eofblocks, +			XFS_ICI_COWBLOCKS_TAG); +} + +void +xfs_inode_clear_cowblocks_tag( +	xfs_inode_t	*ip) +{ +	trace_xfs_inode_clear_eofblocks_tag(ip); +	return __xfs_inode_clear_eofblocks_tag(ip, +			trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 05bac99bef75..a1e02f4708ab 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,6 +40,7 @@ struct xfs_eofblocks {  					   in xfs_inode_ag_iterator */  #define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */  #define XFS_ICI_EOFBLOCKS_TAG	1	/* inode has blocks beyond EOF */ +#define XFS_ICI_COWBLOCKS_TAG	2	/* inode can have cow blocks to gc */  /*   * Flags for xfs_iget() @@ -70,6 +71,12 @@ int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);  void xfs_eofblocks_worker(struct work_struct *);  void xfs_queue_eofblocks(struct xfs_mount *); +void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); +int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); +void xfs_cowblocks_worker(struct work_struct *); +  int xfs_inode_ag_iterator(struct xfs_mount *mp,  	int (*execute)(struct xfs_inode *ip, int flags, void *args),  	int flags, void *args); diff --git 
a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e08eaea6327b..4e560e6a12c1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -49,6 +49,7 @@  #include "xfs_trans_priv.h"  #include "xfs_log.h"  #include "xfs_bmap_btree.h" +#include "xfs_reflink.h"  kmem_zone_t *xfs_inode_zone; @@ -77,6 +78,29 @@ xfs_get_extsz_hint(  }  /* + * Helper function to extract CoW extent size hint from inode. + * Between the extent size hint and the CoW extent size hint, we + * return the greater of the two.  If the value is zero (automatic), + * use the default size. + */ +xfs_extlen_t +xfs_get_cowextsz_hint( +	struct xfs_inode	*ip) +{ +	xfs_extlen_t		a, b; + +	a = 0; +	if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) +		a = ip->i_d.di_cowextsize; +	b = xfs_get_extsz_hint(ip); + +	a = max(a, b); +	if (a == 0) +		return XFS_DEFAULT_COWEXTSZ_HINT; +	return a; +} + +/*   * These two are wrapper routines around the xfs_ilock() routine used to   * centralize some grungy code.  They are used in places that wish to lock the   * inode solely for reading the extents.  The reason these places can't just @@ -651,6 +675,8 @@ _xfs_dic2xflags(  	if (di_flags2 & XFS_DIFLAG2_ANY) {  		if (di_flags2 & XFS_DIFLAG2_DAX)  			flags |= FS_XFLAG_DAX; +		if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) +			flags |= FS_XFLAG_COWEXTSIZE;  	}  	if (has_attr) @@ -821,7 +847,7 @@ xfs_ialloc(  	ip->i_d.di_nextents = 0;  	ASSERT(ip->i_d.di_nblocks == 0); -	tv = current_fs_time(mp->m_super); +	tv = current_time(inode);  	inode->i_mtime = tv;  	inode->i_atime = tv;  	inode->i_ctime = tv; @@ -834,6 +860,7 @@ xfs_ialloc(  	if (ip->i_d.di_version == 3) {  		inode->i_version = 1;  		ip->i_d.di_flags2 = 0; +		ip->i_d.di_cowextsize = 0;  		ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;  		ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;  	} @@ -896,6 +923,15 @@ xfs_ialloc(  			ip->i_d.di_flags |= di_flags;  			ip->i_d.di_flags2 |= di_flags2;  		} +		if (pip && +		    (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && +		    pip->i_d.di_version == 3 && +		    ip->i_d.di_version == 3) { +			if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { +				ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; +				ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; +			} +		}  		/* FALLTHROUGH */  	case S_IFLNK:  		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; @@ -1586,6 +1622,20 @@ xfs_itruncate_extents(  			goto out;  	} +	/* Remove all pending CoW reservations. */ +	error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, +			last_block); +	if (error) +		goto out; + +	/* +	 * Clear the reflink flag if we truncated everything. +	 */ +	if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { +		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; +		xfs_inode_clear_cowblocks_tag(ip); +	} +  	/*  	 * Always re-log the inode so that our permanent transaction can keep  	 * on rolling it forward in the log. @@ -1710,7 +1760,7 @@ xfs_inactive_truncate(  	/*  	 * Log the inode size first to prevent stale data exposure in the event  	 * of a system crash before the truncate completes. See the related -	 * comment in xfs_setattr_size() for details. +	 * comment in xfs_vn_setattr_size() for details.  	 
*/  	ip->i_d.di_size = 0;  	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1850,6 +1900,7 @@ xfs_inactive(  	}  	mp = ip->i_mount; +	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));  	/* If this is a read-only mount, don't do this (would generate I/O) */  	if (mp->m_flags & XFS_MOUNT_RDONLY) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e1a411e08f00..f14c1de2549d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -47,6 +47,7 @@ typedef struct xfs_inode {  	/* Extent information. */  	xfs_ifork_t		*i_afp;		/* attribute fork pointer */ +	xfs_ifork_t		*i_cowfp;	/* copy on write extents */  	xfs_ifork_t		i_df;		/* data fork */  	/* operations vectors */ @@ -65,6 +66,9 @@ typedef struct xfs_inode {  	struct xfs_icdinode	i_d;		/* most of ondisk inode */ +	xfs_extnum_t		i_cnextents;	/* # of extents in cow fork */ +	unsigned int		i_cformat;	/* format of cow fork */ +  	/* VFS inode */  	struct inode		i_vnode;	/* embedded VFS inode */  } xfs_inode_t; @@ -202,6 +206,11 @@ xfs_get_initial_prid(struct xfs_inode *dp)  	return XFS_PROJID_DEFAULT;  } +static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) +{ +	return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; +} +  /*   * In-core inode flags.   */ @@ -216,6 +225,13 @@ xfs_get_initial_prid(struct xfs_inode *dp)  #define __XFS_IPINNED_BIT	8	 /* wakeup key for zero pin count */  #define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)  #define XFS_IDONTCACHE		(1 << 9) /* don't cache the inode long term */ +#define XFS_IEOFBLOCKS		(1 << 10)/* has the preallocblocks tag set */ +/* + * If this unlinked inode is in the middle of recovery, don't let drop_inode + * truncate and free the inode.  This can happen if we iget the inode during + * log recovery to replay a bmap operation on the inode. + */ +#define XFS_IRECOVERY		(1 << 11)  /*   * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -410,6 +426,7 @@ int		xfs_iflush(struct xfs_inode *, struct xfs_buf **);  void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);  xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);  int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,  			       xfs_nlink_t, xfs_dev_t, prid_t, int, @@ -473,4 +490,7 @@ do { \  extern struct kmem_zone	*xfs_inode_zone; +/* The default CoW extent size hint. 
*/ +#define XFS_DEFAULT_COWEXTSZ_HINT 32 +  #endif	/* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 892c2aced207..9610e9c00952 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -368,7 +368,7 @@ xfs_inode_to_log_dinode(  		to->di_crtime.t_sec = from->di_crtime.t_sec;  		to->di_crtime.t_nsec = from->di_crtime.t_nsec;  		to->di_flags2 = from->di_flags2; - +		to->di_cowextsize = from->di_cowextsize;  		to->di_ino = ip->i_ino;  		to->di_lsn = lsn;  		memset(to->di_pad2, 0, sizeof(to->di_pad2)); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 96a70fd1f5d6..c245bed3249b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -720,7 +720,7 @@ xfs_ioc_space(  		iattr.ia_valid = ATTR_SIZE;  		iattr.ia_size = bf->l_start; -		error = xfs_setattr_size(ip, &iattr); +		error = xfs_vn_setattr_size(file_dentry(filp), &iattr);  		break;  	default:  		ASSERT(0); @@ -903,6 +903,8 @@ xfs_ioc_fsgetxattr(  	xfs_ilock(ip, XFS_ILOCK_SHARED);  	fa.fsx_xflags = xfs_ip2xflags(ip);  	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; +	fa.fsx_cowextsize = ip->i_d.di_cowextsize << +			ip->i_mount->m_sb.sb_blocklog;  	fa.fsx_projid = xfs_get_projid(ip);  	if (attr) { @@ -973,12 +975,13 @@ xfs_set_diflags(  	if (ip->i_d.di_version < 3)  		return; -	di_flags2 = 0; +	di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);  	if (xflags & FS_XFLAG_DAX)  		di_flags2 |= XFS_DIFLAG2_DAX; +	if (xflags & FS_XFLAG_COWEXTSIZE) +		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;  	ip->i_d.di_flags2 = di_flags2; -  }  STATIC void @@ -1031,6 +1034,14 @@ xfs_ioctl_setattr_xflags(  			return -EINVAL;  	} +	/* Clear reflink if we are actually able to set the rt flag. */ +	if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) +		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + +	/* Don't allow us to set DAX mode for a reflinked file for now. */ +	if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) +		return -EINVAL; +  	/*  	 * Can't modify an immutable/append-only file unless  	 * we have appropriate permission. @@ -1219,6 +1230,56 @@ xfs_ioctl_setattr_check_extsize(  	return 0;  } +/* + * CoW extent size hint validation rules are: + * + * 1. CoW extent size hint can only be set if reflink is enabled on the fs. + *    The inode does not have to have any shared blocks, but it must be a v3. + * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; + *    for a directory, the hint is propagated to new files. + * 3. Can be changed on files & directories at any time. + * 4. CoW extsize hint of 0 turns off hints, clears inode flags. + * 5. Extent size must be a multiple of the appropriate block size. + * 6. The extent size hint must be limited to half the AG size to avoid + *    alignment extending the extent beyond the limits of the AG. 
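
Outside the ioctl plumbing, the rules listed above reduce to a handful of arithmetic checks on the requested hint. A standalone sketch of those checks, assuming MAXEXTLEN is the usual 2^21 - 1 block limit and using made-up block and AG sizes in the demo:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAXEXTLEN	((1U << 21) - 1)	/* max extent length, in blocks */

/*
 * The hint checks as plain arithmetic: convert the byte-sized hint to
 * blocks (rounding up), require it to fit in one extent and in half an
 * AG, and require it to be a whole multiple of the block size.
 */
static bool
cowextsize_ok(
	uint64_t	hint_bytes,
	uint32_t	blocksize,
	uint32_t	agblocks)
{
	uint64_t	hint_fsb;

	if (hint_bytes == 0)
		return true;			/* zero just clears the hint */

	hint_fsb = (hint_bytes + blocksize - 1) / blocksize;
	if (hint_fsb > MAXEXTLEN)
		return false;
	if (hint_fsb > agblocks / 2)
		return false;
	if (hint_bytes % blocksize)
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", cowextsize_ok(1024 * 1024, 4096, 1U << 20));	/* 1 */
	printf("%d\n", cowextsize_ok(4096 + 512, 4096, 1U << 20));	/* 0: not block aligned */
	return 0;
}
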
+ */ +static int +xfs_ioctl_setattr_check_cowextsize( +	struct xfs_inode	*ip, +	struct fsxattr		*fa) +{ +	struct xfs_mount	*mp = ip->i_mount; + +	if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) +		return 0; + +	if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || +	    ip->i_d.di_version != 3) +		return -EINVAL; + +	if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode)) +		return -EINVAL; + +	if (fa->fsx_cowextsize != 0) { +		xfs_extlen_t    size; +		xfs_fsblock_t   cowextsize_fsb; + +		cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); +		if (cowextsize_fsb > MAXEXTLEN) +			return -EINVAL; + +		size = mp->m_sb.sb_blocksize; +		if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2) +			return -EINVAL; + +		if (fa->fsx_cowextsize % size) +			return -EINVAL; +	} else +		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + +	return 0; +} +  static int  xfs_ioctl_setattr_check_projid(  	struct xfs_inode	*ip, @@ -1311,6 +1372,10 @@ xfs_ioctl_setattr(  	if (code)  		goto error_trans_cancel; +	code = xfs_ioctl_setattr_check_cowextsize(ip, fa); +	if (code) +		goto error_trans_cancel; +  	code = xfs_ioctl_setattr_xflags(tp, ip, fa);  	if (code)  		goto error_trans_cancel; @@ -1346,6 +1411,12 @@ xfs_ioctl_setattr(  		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;  	else  		ip->i_d.di_extsize = 0; +	if (ip->i_d.di_version == 3 && +	    (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) +		ip->i_d.di_cowextsize = fa->fsx_cowextsize >> +				mp->m_sb.sb_blocklog; +	else +		ip->i_d.di_cowextsize = 0;  	code = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2af0dda1c978..d907eb9f8ef3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1,5 +1,6 @@  /*   * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2016 Christoph Hellwig.   * All Rights Reserved.   
*   * This program is free software; you can redistribute it and/or @@ -38,21 +39,45 @@  #include "xfs_quota.h"  #include "xfs_dquot_item.h"  #include "xfs_dquot.h" +#include "xfs_reflink.h"  #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \  						<< mp->m_writeio_log) -#define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP -STATIC int -xfs_iomap_eof_align_last_fsb( -	xfs_mount_t	*mp, -	xfs_inode_t	*ip, -	xfs_extlen_t	extsize, -	xfs_fileoff_t	*last_fsb) +void +xfs_bmbt_to_iomap( +	struct xfs_inode	*ip, +	struct iomap		*iomap, +	struct xfs_bmbt_irec	*imap) +{ +	struct xfs_mount	*mp = ip->i_mount; + +	if (imap->br_startblock == HOLESTARTBLOCK) { +		iomap->blkno = IOMAP_NULL_BLOCK; +		iomap->type = IOMAP_HOLE; +	} else if (imap->br_startblock == DELAYSTARTBLOCK) { +		iomap->blkno = IOMAP_NULL_BLOCK; +		iomap->type = IOMAP_DELALLOC; +	} else { +		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); +		if (imap->br_state == XFS_EXT_UNWRITTEN) +			iomap->type = IOMAP_UNWRITTEN; +		else +			iomap->type = IOMAP_MAPPED; +	} +	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); +	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); +	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +xfs_extlen_t +xfs_eof_alignment( +	struct xfs_inode	*ip, +	xfs_extlen_t		extsize)  { -	xfs_extlen_t	align = 0; -	int		eof, error; +	struct xfs_mount	*mp = ip->i_mount; +	xfs_extlen_t		align = 0;  	if (!XFS_IS_REALTIME_INODE(ip)) {  		/* @@ -83,8 +108,21 @@ xfs_iomap_eof_align_last_fsb(  			align = extsize;  	} +	return align; +} + +STATIC int +xfs_iomap_eof_align_last_fsb( +	struct xfs_inode	*ip, +	xfs_extlen_t		extsize, +	xfs_fileoff_t		*last_fsb) +{ +	xfs_extlen_t		align = xfs_eof_alignment(ip, extsize); +  	if (align) {  		xfs_fileoff_t	new_last_fsb = roundup_64(*last_fsb, align); +		int		eof, error; +  		error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);  		if (error)  			return error; @@ -154,7 +192,7 @@ xfs_iomap_write_direct(  		 */  		ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &  								XFS_IFEXTENTS); -		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); +		error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);  		if (error)  			goto out_unlock;  	} else { @@ -274,130 +312,6 @@ out_trans_cancel:  	goto out_unlock;  } -/* - * If the caller is doing a write at the end of the file, then extend the - * allocation out to the file system's write iosize.  We clean up any extra - * space left over when the file is closed in xfs_inactive(). - * - * If we find we already have delalloc preallocation beyond EOF, don't do more - * preallocation as it it not needed. - */ -STATIC int -xfs_iomap_eof_want_preallocate( -	xfs_mount_t	*mp, -	xfs_inode_t	*ip, -	xfs_off_t	offset, -	size_t		count, -	xfs_bmbt_irec_t *imap, -	int		nimaps, -	int		*prealloc) -{ -	xfs_fileoff_t   start_fsb; -	xfs_filblks_t   count_fsb; -	int		n, error, imaps; -	int		found_delalloc = 0; - -	*prealloc = 0; -	if (offset + count <= XFS_ISIZE(ip)) -		return 0; - -	/* -	 * If the file is smaller than the minimum prealloc and we are using -	 * dynamic preallocation, don't do any preallocation at all as it is -	 * likely this is the only write to the file that is going to be done. -	 */ -	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && -	    XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) -		return 0; - -	/* -	 * If there are any real blocks past eof, then don't -	 * do any speculative allocation. 
-	 */ -	start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); -	count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); -	while (count_fsb > 0) { -		imaps = nimaps; -		error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, -				       0); -		if (error) -			return error; -		for (n = 0; n < imaps; n++) { -			if ((imap[n].br_startblock != HOLESTARTBLOCK) && -			    (imap[n].br_startblock != DELAYSTARTBLOCK)) -				return 0; -			start_fsb += imap[n].br_blockcount; -			count_fsb -= imap[n].br_blockcount; - -			if (imap[n].br_startblock == DELAYSTARTBLOCK) -				found_delalloc = 1; -		} -	} -	if (!found_delalloc) -		*prealloc = 1; -	return 0; -} - -/* - * Determine the initial size of the preallocation. We are beyond the current - * EOF here, but we need to take into account whether this is a sparse write or - * an extending write when determining the preallocation size.  Hence we need to - * look up the extent that ends at the current write offset and use the result - * to determine the preallocation size. - * - * If the extent is a hole, then preallocation is essentially disabled. - * Otherwise we take the size of the preceeding data extent as the basis for the - * preallocation size. If the size of the extent is greater than half the - * maximum extent length, then use the current offset as the basis. This ensures - * that for large files the preallocation size always extends to MAXEXTLEN - * rather than falling short due to things like stripe unit/width alignment of - * real extents. - */ -STATIC xfs_fsblock_t -xfs_iomap_eof_prealloc_initial_size( -	struct xfs_mount	*mp, -	struct xfs_inode	*ip, -	xfs_off_t		offset, -	xfs_bmbt_irec_t		*imap, -	int			nimaps) -{ -	xfs_fileoff_t   start_fsb; -	int		imaps = 1; -	int		error; - -	ASSERT(nimaps >= imaps); - -	/* if we are using a specific prealloc size, return now */ -	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) -		return 0; - -	/* If the file is small, then use the minimum prealloc */ -	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) -		return 0; - -	/* -	 * As we write multiple pages, the offset will always align to the -	 * start of a page and hence point to a hole at EOF. i.e. if the size is -	 * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) -	 * will return FSB 1. Hence if there are blocks in the file, we want to -	 * point to the block prior to the EOF block and not the hole that maps -	 * directly at @offset. -	 */ -	start_fsb = XFS_B_TO_FSB(mp, offset); -	if (start_fsb) -		start_fsb--; -	error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); -	if (error) -		return 0; - -	ASSERT(imaps == 1); -	if (imap[0].br_startblock == HOLESTARTBLOCK) -		return 0; -	if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) -		return imap[0].br_blockcount << 1; -	return XFS_B_TO_FSB(mp, offset); -} -  STATIC bool  xfs_quota_need_throttle(  	struct xfs_inode *ip, @@ -459,27 +373,76 @@ xfs_quota_calc_throttle(  }  /* + * If we are doing a write at the end of the file and there are no allocations + * past this one, then extend the allocation out to the file system's write + * iosize. + *   * If we don't have a user specified preallocation size, dynamically increase - * the preallocation size as the size of the file grows. Cap the maximum size + * the preallocation size as the size of the file grows.  Cap the maximum size   * at a single extent or less if the filesystem is near full. The closer the   * filesystem is to full, the smaller the maximum prealocation. 
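
The replacement xfs_iomap_prealloc_size() below keeps that policy but applies it to the delalloc reservation: start from twice the preceding extent, never exceed a single extent, shrink the answer as free space (and quota) runs low, and never drop below the minimum write size. A condensed standalone model of that shape, where the single factor-of-16 clamp stands in for the real free-space and quota shift logic:

#include <stdint.h>
#include <stdio.h>

#define MAXEXTLEN	((1U << 21) - 1)	/* max extent length, in blocks */

/*
 * Condensed model of the sizing policy: start from twice the preceding
 * extent, cap at a single extent, keep shrinking (by factors of 16 here)
 * while the answer exceeds the available free space, and never go below
 * the minimum write size.
 */
static uint64_t
prealloc_blocks(
	uint64_t	prev_extent_blocks,
	uint64_t	free_blocks,
	uint64_t	min_writeio_blocks)
{
	uint64_t	alloc = prev_extent_blocks * 2;

	if (alloc > MAXEXTLEN)
		alloc = MAXEXTLEN;

	/* The closer the fs is to full, the smaller the preallocation. */
	while (alloc && alloc >= free_blocks)
		alloc >>= 4;

	if (alloc < min_writeio_blocks)
		alloc = min_writeio_blocks;
	return alloc;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)prealloc_blocks(4096, 1ULL << 30, 64)); /* plenty of space */
	printf("%llu\n", (unsigned long long)prealloc_blocks(4096, 1024, 64));       /* nearly full */
	return 0;
}
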
+ * + * As an exception we don't do any preallocation at all if the file is smaller + * than the minimum preallocation and we are using the default dynamic + * preallocation scheme, as it is likely this is the only write to the file that + * is going to be done. + * + * We clean up any extra space left over when the file is closed in + * xfs_inactive().   */  STATIC xfs_fsblock_t  xfs_iomap_prealloc_size( -	struct xfs_mount	*mp,  	struct xfs_inode	*ip, -	xfs_off_t		offset, -	struct xfs_bmbt_irec	*imap, -	int			nimaps) +	loff_t			offset, +	loff_t			count, +	xfs_extnum_t		idx, +	struct xfs_bmbt_irec	*prev)  { -	xfs_fsblock_t		alloc_blocks = 0; +	struct xfs_mount	*mp = ip->i_mount; +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);  	int			shift = 0;  	int64_t			freesp;  	xfs_fsblock_t		qblocks;  	int			qshift = 0; +	xfs_fsblock_t		alloc_blocks = 0; + +	if (offset + count <= XFS_ISIZE(ip)) +		return 0; + +	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && +	    (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))) +		return 0; -	alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, -							   imap, nimaps); +	/* +	 * If an explicit allocsize is set, the file is small, or we +	 * are writing behind a hole, then use the minimum prealloc: +	 */ +	if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || +	    XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || +	    idx == 0 || +	    prev->br_startoff + prev->br_blockcount < offset_fsb) +		return mp->m_writeio_blocks; + +	/* +	 * Determine the initial size of the preallocation. We are beyond the +	 * current EOF here, but we need to take into account whether this is +	 * a sparse write or an extending write when determining the +	 * preallocation size.  Hence we need to look up the extent that ends +	 * at the current write offset and use the result to determine the +	 * preallocation size. +	 * +	 * If the extent is a hole, then preallocation is essentially disabled. +	 * Otherwise we take the size of the preceding data extent as the basis +	 * for the preallocation size. If the size of the extent is greater than +	 * half the maximum extent length, then use the current offset as the +	 * basis. This ensures that for large files the preallocation size +	 * always extends to MAXEXTLEN rather than falling short due to things +	 * like stripe unit/width alignment of real extents. +	 */ +	if (prev->br_blockcount <= (MAXEXTLEN >> 1)) +		alloc_blocks = prev->br_blockcount << 1; +	else +		alloc_blocks = XFS_B_TO_FSB(mp, offset);  	if (!alloc_blocks)  		goto check_writeio;  	qblocks = alloc_blocks; @@ -550,120 +513,145 @@ xfs_iomap_prealloc_size(  	 */  	while (alloc_blocks && alloc_blocks >= freesp)  		alloc_blocks >>= 4; -  check_writeio:  	if (alloc_blocks < mp->m_writeio_blocks)  		alloc_blocks = mp->m_writeio_blocks; -  	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,  				      mp->m_writeio_blocks); -  	return alloc_blocks;  } -int -xfs_iomap_write_delay( -	xfs_inode_t	*ip, -	xfs_off_t	offset, -	size_t		count, -	xfs_bmbt_irec_t *ret_imap) +static int +xfs_file_iomap_begin_delay( +	struct inode		*inode, +	loff_t			offset, +	loff_t			count, +	unsigned		flags, +	struct iomap		*iomap)  { -	xfs_mount_t	*mp = ip->i_mount; -	xfs_fileoff_t	offset_fsb; -	xfs_fileoff_t	last_fsb; -	xfs_off_t	aligned_offset; -	xfs_fileoff_t	ioalign; -	xfs_extlen_t	extsz; -	int		nimaps; -	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; -	int		prealloc; -	int		error; - -	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - -	/* -	 * Make sure that the dquots are there. 
This doesn't hold -	 * the ilock across a disk read. -	 */ -	error = xfs_qm_dqattach_locked(ip, 0); -	if (error) -		return error; +	struct xfs_inode	*ip = XFS_I(inode); +	struct xfs_mount	*mp = ip->i_mount; +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset); +	xfs_fileoff_t		maxbytes_fsb = +		XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); +	xfs_fileoff_t		end_fsb, orig_end_fsb; +	int			error = 0, eof = 0; +	struct xfs_bmbt_irec	got; +	struct xfs_bmbt_irec	prev; +	xfs_extnum_t		idx; -	extsz = xfs_get_extsz_hint(ip); -	offset_fsb = XFS_B_TO_FSBT(mp, offset); +	ASSERT(!XFS_IS_REALTIME_INODE(ip)); +	ASSERT(!xfs_get_extsz_hint(ip)); -	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, -				imap, XFS_WRITE_IMAPS, &prealloc); -	if (error) -		return error; +	xfs_ilock(ip, XFS_ILOCK_EXCL); -retry: -	if (prealloc) { -		xfs_fsblock_t	alloc_blocks; +	if (unlikely(XFS_TEST_ERROR( +	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && +	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), +	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { +		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); +		error = -EFSCORRUPTED; +		goto out_unlock; +	} -		alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, -						       XFS_WRITE_IMAPS); +	XFS_STATS_INC(mp, xs_blk_mapw); -		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); -		ioalign = XFS_B_TO_FSBT(mp, aligned_offset); -		last_fsb = ioalign + alloc_blocks; -	} else { -		last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); +	if (!(ifp->if_flags & XFS_IFEXTENTS)) { +		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); +		if (error) +			goto out_unlock;  	} -	if (prealloc || extsz) { -		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); -		if (error) -			return error; +	xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, +			&got, &prev); +	if (!eof && got.br_startoff <= offset_fsb) { +		trace_xfs_iomap_found(ip, offset, count, 0, &got); +		goto done;  	} +	error = xfs_qm_dqattach_locked(ip, 0); +	if (error) +		goto out_unlock; +  	/* -	 * Make sure preallocation does not create extents beyond the range we -	 * actually support in this filesystem. +	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages +	 * to keep the chunks of work done where somewhat symmetric with the +	 * work writeback does. This is a completely arbitrary number pulled +	 * out of thin air as a best guess for initial testing. +	 * +	 * Note that the values needs to be less than 32-bits wide until +	 * the lower level functions are updated.  	 
*/ -	if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) -		last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); +	count = min_t(loff_t, count, 1024 * PAGE_SIZE); +	end_fsb = orig_end_fsb = +		min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + +	if (eof) { +		xfs_fsblock_t	prealloc_blocks; + +		prealloc_blocks = +			xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); +		if (prealloc_blocks) { +			xfs_extlen_t	align; +			xfs_off_t	end_offset; + +			end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); +			end_fsb = XFS_B_TO_FSBT(mp, end_offset) + +				prealloc_blocks; -	ASSERT(last_fsb > offset_fsb); +			align = xfs_eof_alignment(ip, 0); +			if (align) +				end_fsb = roundup_64(end_fsb, align); + +			end_fsb = min(end_fsb, maxbytes_fsb); +			ASSERT(end_fsb > offset_fsb); +		} +	} -	nimaps = XFS_WRITE_IMAPS; -	error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, -				imap, &nimaps, XFS_BMAPI_ENTIRE); +retry: +	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, +			end_fsb - offset_fsb, &got, +			&prev, &idx, eof);  	switch (error) {  	case 0: +		break;  	case -ENOSPC:  	case -EDQUOT: -		break; -	default: -		return error; -	} - -	/* -	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry -	 * without EOF preallocation. -	 */ -	if (nimaps == 0) { +		/* retry without any preallocation */  		trace_xfs_delalloc_enospc(ip, offset, count); -		if (prealloc) { -			prealloc = 0; -			error = 0; +		if (end_fsb != orig_end_fsb) { +			end_fsb = orig_end_fsb;  			goto retry;  		} -		return error ? error : -ENOSPC; +		/*FALLTHRU*/ +	default: +		goto out_unlock;  	} -	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) -		return xfs_alert_fsblock_zero(ip, &imap[0]); -  	/*  	 * Tag the inode as speculatively preallocated so we can reclaim this  	 * space on demand, if necessary.  	 */ -	if (prealloc) +	if (end_fsb != orig_end_fsb)  		xfs_inode_set_eofblocks_tag(ip); -	*ret_imap = imap[0]; -	return 0; +	trace_xfs_iomap_alloc(ip, offset, count, 0, &got); +done: +	if (isnullstartblock(got.br_startblock)) +		got.br_startblock = DELAYSTARTBLOCK; + +	if (!got.br_startblock) { +		error = xfs_alert_fsblock_zero(ip, &got); +		if (error) +			goto out_unlock; +	} + +	xfs_bmbt_to_iomap(ip, iomap, &got); + +out_unlock: +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	return error;  }  /* @@ -679,6 +667,7 @@ retry:  int  xfs_iomap_write_allocate(  	xfs_inode_t	*ip, +	int		whichfork,  	xfs_off_t	offset,  	xfs_bmbt_irec_t *imap)  { @@ -691,8 +680,12 @@ xfs_iomap_write_allocate(  	xfs_trans_t	*tp;  	int		nimaps;  	int		error = 0; +	int		flags = 0;  	int		nres; +	if (whichfork == XFS_COW_FORK) +		flags |= XFS_BMAPI_COWFORK; +  	/*  	 * Make sure that the dquots are there.  	 */ @@ -786,7 +779,7 @@ xfs_iomap_write_allocate(  			 * pointer that the caller gave to us.  			 
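
The delalloc path above reserves the write plus its speculative preallocation in one go and, if that runs into ENOSPC or EDQUOT, retries with exactly the range being written. A minimal standalone sketch of that fallback, with a stand-in reservation function rather than the real bmap call:

#include <errno.h>
#include <stdio.h>

/*
 * Fallback shape of the delalloc reservation: ask for the write plus the
 * speculative preallocation first; if that fails for lack of space or
 * quota, retry with exactly the blocks being written.
 */
static int
reserve_delalloc(
	unsigned long long	blocks)
{
	return blocks > 1000 ? -ENOSPC : 0;	/* pretend space is tight */
}

static int
reserve_with_fallback(
	unsigned long long	want,
	unsigned long long	want_plus_prealloc)
{
	unsigned long long	ask = want_plus_prealloc;
	int			error;

retry:
	error = reserve_delalloc(ask);
	if ((error == -ENOSPC || error == -EDQUOT) && ask != want) {
		ask = want;		/* drop the speculative part and retry */
		goto retry;
	}
	return error;
}

int main(void)
{
	printf("%d\n", reserve_with_fallback(8, 4096));	/* falls back, then succeeds */
	return 0;
}
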
*/  			error = xfs_bmapi_write(tp, ip, map_start_fsb, -						count_fsb, 0, &first_block, +						count_fsb, flags, &first_block,  						nres, imap, &nimaps,  						&dfops);  			if (error) @@ -947,37 +940,13 @@ error_on_bmapi_transaction:  	return error;  } -void -xfs_bmbt_to_iomap( -	struct xfs_inode	*ip, -	struct iomap		*iomap, -	struct xfs_bmbt_irec	*imap) -{ -	struct xfs_mount	*mp = ip->i_mount; - -	if (imap->br_startblock == HOLESTARTBLOCK) { -		iomap->blkno = IOMAP_NULL_BLOCK; -		iomap->type = IOMAP_HOLE; -	} else if (imap->br_startblock == DELAYSTARTBLOCK) { -		iomap->blkno = IOMAP_NULL_BLOCK; -		iomap->type = IOMAP_DELALLOC; -	} else { -		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); -		if (imap->br_state == XFS_EXT_UNWRITTEN) -			iomap->type = IOMAP_UNWRITTEN; -		else -			iomap->type = IOMAP_MAPPED; -	} -	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); -	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); -	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); -} - -static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +static inline bool imap_needs_alloc(struct inode *inode, +		struct xfs_bmbt_irec *imap, int nimaps)  {  	return !nimaps ||  		imap->br_startblock == HOLESTARTBLOCK || -		imap->br_startblock == DELAYSTARTBLOCK; +		imap->br_startblock == DELAYSTARTBLOCK || +		(IS_DAX(inode) && ISUNWRITTEN(imap));  }  static int @@ -992,12 +961,27 @@ xfs_file_iomap_begin(  	struct xfs_mount	*mp = ip->i_mount;  	struct xfs_bmbt_irec	imap;  	xfs_fileoff_t		offset_fsb, end_fsb; +	bool			shared, trimmed;  	int			nimaps = 1, error = 0; +	unsigned		lockmode;  	if (XFS_FORCED_SHUTDOWN(mp))  		return -EIO; -	xfs_ilock(ip, XFS_ILOCK_EXCL); +	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { +		error = xfs_reflink_reserve_cow_range(ip, offset, length); +		if (error < 0) +			return error; +	} + +	if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && +		   !xfs_get_extsz_hint(ip)) { +		/* Reserve delalloc blocks for regular writeback. */ +		return xfs_file_iomap_begin_delay(inode, offset, length, flags, +				iomap); +	} + +	lockmode = xfs_ilock_data_map_shared(ip);  	ASSERT(offset <= mp->m_super->s_maxbytes);  	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) @@ -1006,13 +990,20 @@ xfs_file_iomap_begin(  	end_fsb = XFS_B_TO_FSB(mp, offset + length);  	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, -			       &nimaps, XFS_BMAPI_ENTIRE); +			       &nimaps, 0);  	if (error) { -		xfs_iunlock(ip, XFS_ILOCK_EXCL); +		xfs_iunlock(ip, lockmode);  		return error;  	} -	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { +	/* Trim the mapping to the nearest shared extent boundary. */ +	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); +	if (error) { +		xfs_iunlock(ip, lockmode); +		return error; +	} + +	if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {  		/*  		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES  		 * pages to keep the chunks of work done where somewhat symmetric @@ -1024,31 +1015,29 @@ xfs_file_iomap_begin(  		 * the lower level functions are updated.  		 */  		length = min_t(loff_t, length, 1024 * PAGE_SIZE); -		if (xfs_get_extsz_hint(ip)) { -			/* -			 * xfs_iomap_write_direct() expects the shared lock. It -			 * is unlocked on return. 
-			 */ -			xfs_ilock_demote(ip, XFS_ILOCK_EXCL); -			error = xfs_iomap_write_direct(ip, offset, length, &imap, -					nimaps); -		} else { -			error = xfs_iomap_write_delay(ip, offset, length, &imap); -			xfs_iunlock(ip, XFS_ILOCK_EXCL); -		} - +		/* +		 * xfs_iomap_write_direct() expects the shared lock. It +		 * is unlocked on return. +		 */ +		if (lockmode == XFS_ILOCK_EXCL) +			xfs_ilock_demote(ip, lockmode); +		error = xfs_iomap_write_direct(ip, offset, length, &imap, +				nimaps);  		if (error)  			return error; +		iomap->flags = IOMAP_F_NEW;  		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);  	} else {  		ASSERT(nimaps); -		xfs_iunlock(ip, XFS_ILOCK_EXCL); +		xfs_iunlock(ip, lockmode);  		trace_xfs_iomap_found(ip, offset, length, 0, &imap);  	}  	xfs_bmbt_to_iomap(ip, iomap, &imap); +	if (shared) +		iomap->flags |= IOMAP_F_SHARED;  	return 0;  } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index fb8aca3d69ab..6d45cf01fcff 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -25,14 +25,13 @@ struct xfs_bmbt_irec;  int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,  			struct xfs_bmbt_irec *, int); -int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, -			struct xfs_bmbt_irec *); -int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, +int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,  			struct xfs_bmbt_irec *);  int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);  void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,  		struct xfs_bmbt_irec *); +xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);  extern struct iomap_ops xfs_iomap_ops;  extern struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b24c3102fa93..405a65cd9d6b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -542,6 +542,28 @@ xfs_setattr_time(  		inode->i_mtime = iattr->ia_mtime;  } +static int +xfs_vn_change_ok( +	struct dentry	*dentry, +	struct iattr	*iattr) +{ +	struct xfs_mount	*mp = XFS_I(d_inode(dentry))->i_mount; + +	if (mp->m_flags & XFS_MOUNT_RDONLY) +		return -EROFS; + +	if (XFS_FORCED_SHUTDOWN(mp)) +		return -EIO; + +	return setattr_prepare(dentry, iattr); +} + +/* + * Set non-size attributes of an inode. + * + * Caution: The caller of this function is responsible for calling + * setattr_prepare() or otherwise verifying the change is fine. + */  int  xfs_setattr_nonsize(  	struct xfs_inode	*ip, @@ -558,21 +580,6 @@ xfs_setattr_nonsize(  	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;  	struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL; -	trace_xfs_setattr(ip); - -	/* If acls are being inherited, we already have this checked */ -	if (!(flags & XFS_ATTR_NOACL)) { -		if (mp->m_flags & XFS_MOUNT_RDONLY) -			return -EROFS; - -		if (XFS_FORCED_SHUTDOWN(mp)) -			return -EIO; - -		error = inode_change_ok(inode, iattr); -		if (error) -			return error; -	} -  	ASSERT((mask & ATTR_SIZE) == 0);  	/* @@ -743,8 +750,27 @@ out_dqrele:  	return error;  } +int +xfs_vn_setattr_nonsize( +	struct dentry		*dentry, +	struct iattr		*iattr) +{ +	struct xfs_inode	*ip = XFS_I(d_inode(dentry)); +	int error; + +	trace_xfs_setattr(ip); + +	error = xfs_vn_change_ok(dentry, iattr); +	if (error) +		return error; +	return xfs_setattr_nonsize(ip, iattr, 0); +} +  /*   * Truncate file.  Must have write permission and not be a directory. + * + * Caution: The caller of this function is responsible for calling + * setattr_prepare() or otherwise verifying the change is fine.   
*/  int  xfs_setattr_size( @@ -759,18 +785,6 @@ xfs_setattr_size(  	uint			lock_flags = 0;  	bool			did_zeroing = false; -	trace_xfs_setattr(ip); - -	if (mp->m_flags & XFS_MOUNT_RDONLY) -		return -EROFS; - -	if (XFS_FORCED_SHUTDOWN(mp)) -		return -EIO; - -	error = inode_change_ok(inode, iattr); -	if (error) -		return error; -  	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));  	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));  	ASSERT(S_ISREG(inode->i_mode)); @@ -882,7 +896,7 @@ xfs_setattr_size(  	if (newsize != oldsize &&  	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {  		iattr->ia_ctime = iattr->ia_mtime = -			current_fs_time(inode->i_sb); +			current_time(inode);  		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;  	} @@ -942,16 +956,32 @@ out_trans_cancel:  	goto out_unlock;  } +int +xfs_vn_setattr_size( +	struct dentry		*dentry, +	struct iattr		*iattr) +{ +	struct xfs_inode	*ip = XFS_I(d_inode(dentry)); +	int error; + +	trace_xfs_setattr(ip); + +	error = xfs_vn_change_ok(dentry, iattr); +	if (error) +		return error; +	return xfs_setattr_size(ip, iattr); +} +  STATIC int  xfs_vn_setattr(  	struct dentry		*dentry,  	struct iattr		*iattr)  { -	struct xfs_inode	*ip = XFS_I(d_inode(dentry));  	int			error;  	if (iattr->ia_valid & ATTR_SIZE) { -		uint		iolock = XFS_IOLOCK_EXCL; +		struct xfs_inode	*ip = XFS_I(d_inode(dentry)); +		uint			iolock = XFS_IOLOCK_EXCL;  		xfs_ilock(ip, iolock);  		error = xfs_break_layouts(d_inode(dentry), &iolock, true); @@ -959,11 +989,11 @@ xfs_vn_setattr(  			xfs_ilock(ip, XFS_MMAPLOCK_EXCL);  			iolock |= XFS_MMAPLOCK_EXCL; -			error = xfs_setattr_size(ip, iattr); +			error = xfs_vn_setattr_size(dentry, iattr);  		}  		xfs_iunlock(ip, iolock);  	} else { -		error = xfs_setattr_nonsize(ip, iattr, 0); +		error = xfs_vn_setattr_nonsize(dentry, iattr);  	}  	return error; @@ -1036,9 +1066,6 @@ static const struct inode_operations xfs_inode_operations = {  	.set_acl		= xfs_set_acl,  	.getattr		= xfs_vn_getattr,  	.setattr		= xfs_vn_setattr, -	.setxattr		= generic_setxattr, -	.getxattr		= generic_getxattr, -	.removexattr		= generic_removexattr,  	.listxattr		= xfs_vn_listxattr,  	.fiemap			= xfs_vn_fiemap,  	.update_time		= xfs_vn_update_time, @@ -1059,14 +1086,11 @@ static const struct inode_operations xfs_dir_inode_operations = {  	 */  	.rmdir			= xfs_vn_unlink,  	.mknod			= xfs_vn_mknod, -	.rename2		= xfs_vn_rename, +	.rename			= xfs_vn_rename,  	.get_acl		= xfs_get_acl,  	.set_acl		= xfs_set_acl,  	.getattr		= xfs_vn_getattr,  	.setattr		= xfs_vn_setattr, -	.setxattr		= generic_setxattr, -	.getxattr		= generic_getxattr, -	.removexattr		= generic_removexattr,  	.listxattr		= xfs_vn_listxattr,  	.update_time		= xfs_vn_update_time,  	.tmpfile		= xfs_vn_tmpfile, @@ -1087,14 +1111,11 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {  	 */  	.rmdir			= xfs_vn_unlink,  	.mknod			= xfs_vn_mknod, -	.rename2		= xfs_vn_rename, +	.rename			= xfs_vn_rename,  	.get_acl		= xfs_get_acl,  	.set_acl		= xfs_set_acl,  	.getattr		= xfs_vn_getattr,  	.setattr		= xfs_vn_setattr, -	.setxattr		= generic_setxattr, -	.getxattr		= generic_getxattr, -	.removexattr		= generic_removexattr,  	.listxattr		= xfs_vn_listxattr,  	.update_time		= xfs_vn_update_time,  	.tmpfile		= xfs_vn_tmpfile, @@ -1105,9 +1126,6 @@ static const struct inode_operations xfs_symlink_inode_operations = {  	.get_link		= xfs_vn_get_link,  	.getattr		= xfs_vn_getattr,  	.setattr		= xfs_vn_setattr, -	.setxattr		= generic_setxattr, -	.getxattr		= generic_getxattr, -	.removexattr		= generic_removexattr, 
 	.listxattr		= xfs_vn_listxattr,  	.update_time		= xfs_vn_update_time,  }; @@ -1117,9 +1135,6 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = {  	.get_link		= xfs_vn_get_link_inline,  	.getattr		= xfs_vn_getattr,  	.setattr		= xfs_vn_setattr, -	.setxattr		= generic_setxattr, -	.getxattr		= generic_getxattr, -	.removexattr		= generic_removexattr,  	.listxattr		= xfs_vn_listxattr,  	.update_time		= xfs_vn_update_time,  }; @@ -1144,6 +1159,7 @@ xfs_diflags_to_iflags(  		inode->i_flags |= S_NOATIME;  	if (S_ISREG(inode->i_mode) &&  	    ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && +	    !xfs_is_reflink_inode(ip) &&  	    (ip->i_mount->m_flags & XFS_MOUNT_DAX ||  	     ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))  		inode->i_flags |= S_DAX; diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index a0f84abb0d09..0259a383721a 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -33,6 +33,7 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);  extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);  extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,  			       int flags); -extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); +extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap); +extern int xfs_vn_setattr_size(struct dentry *dentry, struct iattr *vap);  #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ce73eb34620d..66e881790c17 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,7 +66,7 @@ xfs_bulkstat_one_int(  	if (!buffer || xfs_internal_inum(mp, ino))  		return -EINVAL; -	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); +	buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);  	if (!buf)  		return -ENOMEM; @@ -111,6 +111,12 @@ xfs_bulkstat_one_int(  	buf->bs_aextents = dic->di_anextents;  	buf->bs_forkoff = XFS_IFORK_BOFF(ip); +	if (dic->di_version == 3) { +		if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) +			buf->bs_cowextsize = dic->di_cowextsize << +					mp->m_sb.sb_blocklog; +	} +  	switch (dic->di_format) {  	case XFS_DINODE_FMT_DEV:  		buf->bs_rdev = ip->i_df.if_u2.if_rdev; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index b8d64d520e12..68640fb63a54 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -116,6 +116,7 @@ typedef __u32			xfs_nlink_t;  #define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val  #define xfs_fstrm_centisecs	xfs_params.fstrm_timer.val  #define xfs_eofb_secs		xfs_params.eofb_timer.val +#define xfs_cowb_secs		xfs_params.cowb_timer.val  #define current_cpu()		(raw_smp_processor_id())  #define current_pid()		(current->pid) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 765f084759b5..2b6eec52178e 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -413,7 +413,8 @@ struct xlog {  	/* log record crc error injection factor */  	uint32_t		l_badcrc_factor;  #endif - +	/* log recovery lsn tracking (for buffer submission */ +	xfs_lsn_t		l_recovery_lsn;  };  #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e8638fd2c0c3..9b3d7c76915d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -44,6 +44,9 @@  #include "xfs_error.h"  #include "xfs_dir2.h"  #include "xfs_rmap_item.h" +#include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h"  #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1) @@ -381,6 +384,15 @@ xlog_recover_iodone(  						
SHUTDOWN_META_IO_ERROR);  		}  	} + +	/* +	 * On v5 supers, a bli could be attached to update the metadata LSN. +	 * Clean it up. +	 */ +	if (bp->b_fspriv) +		xfs_buf_item_relse(bp); +	ASSERT(bp->b_fspriv == NULL); +  	bp->b_iodone = NULL;  	xfs_buf_ioend(bp);  } @@ -1914,6 +1926,10 @@ xlog_recover_reorder_trans(  		case XFS_LI_EFI:  		case XFS_LI_RUI:  		case XFS_LI_RUD: +		case XFS_LI_CUI: +		case XFS_LI_CUD: +		case XFS_LI_BUI: +		case XFS_LI_BUD:  			trace_xfs_log_recover_item_reorder_tail(log,  							trans, item, pass);  			list_move_tail(&item->ri_list, &inode_list); @@ -2232,6 +2248,7 @@ xlog_recover_get_buf_lsn(  	case XFS_ABTB_MAGIC:  	case XFS_ABTC_MAGIC:  	case XFS_RMAP_CRC_MAGIC: +	case XFS_REFC_CRC_MAGIC:  	case XFS_IBT_CRC_MAGIC:  	case XFS_IBT_MAGIC: {  		struct xfs_btree_block *btb = blk; @@ -2360,12 +2377,14 @@ static void  xlog_recover_validate_buf_type(  	struct xfs_mount	*mp,  	struct xfs_buf		*bp, -	xfs_buf_log_format_t	*buf_f) +	xfs_buf_log_format_t	*buf_f, +	xfs_lsn_t		current_lsn)  {  	struct xfs_da_blkinfo	*info = bp->b_addr;  	__uint32_t		magic32;  	__uint16_t		magic16;  	__uint16_t		magicda; +	char			*warnmsg = NULL;  	/*  	 * We can only do post recovery validation on items on CRC enabled @@ -2403,32 +2422,31 @@ xlog_recover_validate_buf_type(  		case XFS_RMAP_CRC_MAGIC:  			bp->b_ops = &xfs_rmapbt_buf_ops;  			break; +		case XFS_REFC_CRC_MAGIC: +			bp->b_ops = &xfs_refcountbt_buf_ops; +			break;  		default: -			xfs_warn(mp, "Bad btree block magic!"); -			ASSERT(0); +			warnmsg = "Bad btree block magic!";  			break;  		}  		break;  	case XFS_BLFT_AGF_BUF:  		if (magic32 != XFS_AGF_MAGIC) { -			xfs_warn(mp, "Bad AGF block magic!"); -			ASSERT(0); +			warnmsg = "Bad AGF block magic!";  			break;  		}  		bp->b_ops = &xfs_agf_buf_ops;  		break;  	case XFS_BLFT_AGFL_BUF:  		if (magic32 != XFS_AGFL_MAGIC) { -			xfs_warn(mp, "Bad AGFL block magic!"); -			ASSERT(0); +			warnmsg = "Bad AGFL block magic!";  			break;  		}  		bp->b_ops = &xfs_agfl_buf_ops;  		break;  	case XFS_BLFT_AGI_BUF:  		if (magic32 != XFS_AGI_MAGIC) { -			xfs_warn(mp, "Bad AGI block magic!"); -			ASSERT(0); +			warnmsg = "Bad AGI block magic!";  			break;  		}  		bp->b_ops = &xfs_agi_buf_ops; @@ -2438,8 +2456,7 @@ xlog_recover_validate_buf_type(  	case XFS_BLFT_GDQUOT_BUF:  #ifdef CONFIG_XFS_QUOTA  		if (magic16 != XFS_DQUOT_MAGIC) { -			xfs_warn(mp, "Bad DQUOT block magic!"); -			ASSERT(0); +			warnmsg = "Bad DQUOT block magic!";  			break;  		}  		bp->b_ops = &xfs_dquot_buf_ops; @@ -2451,16 +2468,14 @@ xlog_recover_validate_buf_type(  		break;  	case XFS_BLFT_DINO_BUF:  		if (magic16 != XFS_DINODE_MAGIC) { -			xfs_warn(mp, "Bad INODE block magic!"); -			ASSERT(0); +			warnmsg = "Bad INODE block magic!";  			break;  		}  		bp->b_ops = &xfs_inode_buf_ops;  		break;  	case XFS_BLFT_SYMLINK_BUF:  		if (magic32 != XFS_SYMLINK_MAGIC) { -			xfs_warn(mp, "Bad symlink block magic!"); -			ASSERT(0); +			warnmsg = "Bad symlink block magic!";  			break;  		}  		bp->b_ops = &xfs_symlink_buf_ops; @@ -2468,8 +2483,7 @@ xlog_recover_validate_buf_type(  	case XFS_BLFT_DIR_BLOCK_BUF:  		if (magic32 != XFS_DIR2_BLOCK_MAGIC &&  		    magic32 != XFS_DIR3_BLOCK_MAGIC) { -			xfs_warn(mp, "Bad dir block magic!"); -			ASSERT(0); +			warnmsg = "Bad dir block magic!";  			break;  		}  		bp->b_ops = &xfs_dir3_block_buf_ops; @@ -2477,8 +2491,7 @@ xlog_recover_validate_buf_type(  	case XFS_BLFT_DIR_DATA_BUF:  		if (magic32 != XFS_DIR2_DATA_MAGIC &&  		    magic32 != XFS_DIR3_DATA_MAGIC) { -			xfs_warn(mp, "Bad dir data 
magic!"); -			ASSERT(0); +			warnmsg = "Bad dir data magic!";  			break;  		}  		bp->b_ops = &xfs_dir3_data_buf_ops; @@ -2486,8 +2499,7 @@ xlog_recover_validate_buf_type(  	case XFS_BLFT_DIR_FREE_BUF:  		if (magic32 != XFS_DIR2_FREE_MAGIC &&  		    magic32 != XFS_DIR3_FREE_MAGIC) { -			xfs_warn(mp, "Bad dir3 free magic!"); -			ASSERT(0); +			warnmsg = "Bad dir3 free magic!";  			break;  		}  		bp->b_ops = &xfs_dir3_free_buf_ops; @@ -2495,8 +2507,7 @@ xlog_recover_validate_buf_type(  	case XFS_BLFT_DIR_LEAF1_BUF:  		if (magicda != XFS_DIR2_LEAF1_MAGIC &&  		    magicda != XFS_DIR3_LEAF1_MAGIC) { -			xfs_warn(mp, "Bad dir leaf1 magic!"); -			ASSERT(0); +			warnmsg = "Bad dir leaf1 magic!";  			break;  		}  		bp->b_ops = &xfs_dir3_leaf1_buf_ops; @@ -2504,8 +2515,7 @@ xlog_recover_validate_buf_type(  	case XFS_BLFT_DIR_LEAFN_BUF:  		if (magicda != XFS_DIR2_LEAFN_MAGIC &&  		    magicda != XFS_DIR3_LEAFN_MAGIC) { -			xfs_warn(mp, "Bad dir leafn magic!"); -			ASSERT(0); +			warnmsg = "Bad dir leafn magic!";  			break;  		}  		bp->b_ops = &xfs_dir3_leafn_buf_ops; @@ -2513,8 +2523,7 @@ xlog_recover_validate_buf_type(  	case XFS_BLFT_DA_NODE_BUF:  		if (magicda != XFS_DA_NODE_MAGIC &&  		    magicda != XFS_DA3_NODE_MAGIC) { -			xfs_warn(mp, "Bad da node magic!"); -			ASSERT(0); +			warnmsg = "Bad da node magic!";  			break;  		}  		bp->b_ops = &xfs_da3_node_buf_ops; @@ -2522,24 +2531,21 @@ xlog_recover_validate_buf_type(  	case XFS_BLFT_ATTR_LEAF_BUF:  		if (magicda != XFS_ATTR_LEAF_MAGIC &&  		    magicda != XFS_ATTR3_LEAF_MAGIC) { -			xfs_warn(mp, "Bad attr leaf magic!"); -			ASSERT(0); +			warnmsg = "Bad attr leaf magic!";  			break;  		}  		bp->b_ops = &xfs_attr3_leaf_buf_ops;  		break;  	case XFS_BLFT_ATTR_RMT_BUF:  		if (magic32 != XFS_ATTR3_RMT_MAGIC) { -			xfs_warn(mp, "Bad attr remote magic!"); -			ASSERT(0); +			warnmsg = "Bad attr remote magic!";  			break;  		}  		bp->b_ops = &xfs_attr3_rmt_buf_ops;  		break;  	case XFS_BLFT_SB_BUF:  		if (magic32 != XFS_SB_MAGIC) { -			xfs_warn(mp, "Bad SB block magic!"); -			ASSERT(0); +			warnmsg = "Bad SB block magic!";  			break;  		}  		bp->b_ops = &xfs_sb_buf_ops; @@ -2556,6 +2562,40 @@ xlog_recover_validate_buf_type(  			 xfs_blft_from_flags(buf_f));  		break;  	} + +	/* +	 * Nothing else to do in the case of a NULL current LSN as this means +	 * the buffer is more recent than the change in the log and will be +	 * skipped. +	 */ +	if (current_lsn == NULLCOMMITLSN) +		return; + +	if (warnmsg) { +		xfs_warn(mp, warnmsg); +		ASSERT(0); +	} + +	/* +	 * We must update the metadata LSN of the buffer as it is written out to +	 * ensure that older transactions never replay over this one and corrupt +	 * the buffer. This can occur if log recovery is interrupted at some +	 * point after the current transaction completes, at which point a +	 * subsequent mount starts recovery from the beginning. +	 * +	 * Write verifiers update the metadata LSN from log items attached to +	 * the buffer. Therefore, initialize a bli purely to carry the LSN to +	 * the verifier. We'll clean it up in our ->iodone() callback. 
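[Editor's aside: the verifier half of this handshake is not part of this hunk. A minimal sketch of how a v5 write verifier is expected to pick up the carried LSN, modelled on the existing AGF write verifier; only bp->b_fspriv and bli_item.li_lsn are taken from this patch, the rest is illustrative.]

STATIC void
example_agf_write_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_buf_log_item	*bip = bp->b_fspriv;

	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	/* Stamp the on-disk LSN from the attached log item, if present. */
	if (bip)
		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}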
+	 */ +	if (bp->b_ops) { +		struct xfs_buf_log_item	*bip; + +		ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); +		bp->b_iodone = xlog_recover_iodone; +		xfs_buf_item_init(bp, mp); +		bip = bp->b_fspriv; +		bip->bli_item.li_lsn = current_lsn; +	}  }  /* @@ -2569,7 +2609,8 @@ xlog_recover_do_reg_buffer(  	struct xfs_mount	*mp,  	xlog_recover_item_t	*item,  	struct xfs_buf		*bp, -	xfs_buf_log_format_t	*buf_f) +	xfs_buf_log_format_t	*buf_f, +	xfs_lsn_t		current_lsn)  {  	int			i;  	int			bit; @@ -2642,7 +2683,7 @@ xlog_recover_do_reg_buffer(  	/* Shouldn't be any more regions */  	ASSERT(i == item->ri_total); -	xlog_recover_validate_buf_type(mp, bp, buf_f); +	xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);  }  /* @@ -2685,7 +2726,7 @@ xlog_recover_do_dquot_buffer(  	if (log->l_quotaoffs_flag & type)  		return false; -	xlog_recover_do_reg_buffer(mp, item, bp, buf_f); +	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);  	return true;  } @@ -2773,7 +2814,8 @@ xlog_recover_buffer_pass2(  	 */  	lsn = xlog_recover_get_buf_lsn(mp, bp);  	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { -		xlog_recover_validate_buf_type(mp, bp, buf_f); +		trace_xfs_log_recover_buf_skip(log, buf_f); +		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);  		goto out_release;  	} @@ -2789,7 +2831,7 @@ xlog_recover_buffer_pass2(  		if (!dirty)  			goto out_release;  	} else { -		xlog_recover_do_reg_buffer(mp, item, bp, buf_f); +		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);  	}  	/* @@ -3515,6 +3557,242 @@ xlog_recover_rud_pass2(  }  /* + * Copy an CUI format buffer from the given buf, and into the destination + * CUI format structure.  The CUI/CUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_cui_copy_format( +	struct xfs_log_iovec		*buf, +	struct xfs_cui_log_format	*dst_cui_fmt) +{ +	struct xfs_cui_log_format	*src_cui_fmt; +	uint				len; + +	src_cui_fmt = buf->i_addr; +	len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + +	if (buf->i_len == len) { +		memcpy(dst_cui_fmt, src_cui_fmt, len); +		return 0; +	} +	return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent refcount update + * item from the cui format structure which was logged on disk. + * It allocates an in-core cui, copies the extents from the format + * structure into it, and adds the cui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_cui_pass2( +	struct xlog			*log, +	struct xlog_recover_item	*item, +	xfs_lsn_t			lsn) +{ +	int				error; +	struct xfs_mount		*mp = log->l_mp; +	struct xfs_cui_log_item		*cuip; +	struct xfs_cui_log_format	*cui_formatp; + +	cui_formatp = item->ri_buf[0].i_addr; + +	cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); +	error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); +	if (error) { +		xfs_cui_item_free(cuip); +		return error; +	} +	atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + +	spin_lock(&log->l_ailp->xa_lock); +	/* +	 * The CUI has two references. One for the CUD and one for CUI to ensure +	 * it makes it into the AIL. Insert the CUI into the AIL directly and +	 * drop the CUI reference. Note that xfs_trans_ail_update() drops the +	 * AIL lock. +	 */ +	xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn); +	xfs_cui_release(cuip); +	return 0; +} + + +/* + * This routine is called when an CUD format structure is found in a committed + * transaction in the log. 
Its purpose is to cancel the corresponding CUI if it + * was still in the log. To do this it searches the AIL for the CUI with an id + * equal to that in the CUD format structure. If we find it we drop the CUD + * reference, which removes the CUI from the AIL and frees it. + */ +STATIC int +xlog_recover_cud_pass2( +	struct xlog			*log, +	struct xlog_recover_item	*item) +{ +	struct xfs_cud_log_format	*cud_formatp; +	struct xfs_cui_log_item		*cuip = NULL; +	struct xfs_log_item		*lip; +	__uint64_t			cui_id; +	struct xfs_ail_cursor		cur; +	struct xfs_ail			*ailp = log->l_ailp; + +	cud_formatp = item->ri_buf[0].i_addr; +	if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) +		return -EFSCORRUPTED; +	cui_id = cud_formatp->cud_cui_id; + +	/* +	 * Search for the CUI with the id in the CUD format structure in the +	 * AIL. +	 */ +	spin_lock(&ailp->xa_lock); +	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); +	while (lip != NULL) { +		if (lip->li_type == XFS_LI_CUI) { +			cuip = (struct xfs_cui_log_item *)lip; +			if (cuip->cui_format.cui_id == cui_id) { +				/* +				 * Drop the CUD reference to the CUI. This +				 * removes the CUI from the AIL and frees it. +				 */ +				spin_unlock(&ailp->xa_lock); +				xfs_cui_release(cuip); +				spin_lock(&ailp->xa_lock); +				break; +			} +		} +		lip = xfs_trans_ail_cursor_next(ailp, &cur); +	} + +	xfs_trans_ail_cursor_done(&cur); +	spin_unlock(&ailp->xa_lock); + +	return 0; +} + +/* + * Copy an BUI format buffer from the given buf, and into the destination + * BUI format structure.  The BUI/BUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_bui_copy_format( +	struct xfs_log_iovec		*buf, +	struct xfs_bui_log_format	*dst_bui_fmt) +{ +	struct xfs_bui_log_format	*src_bui_fmt; +	uint				len; + +	src_bui_fmt = buf->i_addr; +	len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + +	if (buf->i_len == len) { +		memcpy(dst_bui_fmt, src_bui_fmt, len); +		return 0; +	} +	return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent bmap update + * item from the bui format structure which was logged on disk. + * It allocates an in-core bui, copies the extents from the format + * structure into it, and adds the bui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_bui_pass2( +	struct xlog			*log, +	struct xlog_recover_item	*item, +	xfs_lsn_t			lsn) +{ +	int				error; +	struct xfs_mount		*mp = log->l_mp; +	struct xfs_bui_log_item		*buip; +	struct xfs_bui_log_format	*bui_formatp; + +	bui_formatp = item->ri_buf[0].i_addr; + +	if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) +		return -EFSCORRUPTED; +	buip = xfs_bui_init(mp); +	error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); +	if (error) { +		xfs_bui_item_free(buip); +		return error; +	} +	atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); + +	spin_lock(&log->l_ailp->xa_lock); +	/* +	 * The RUI has two references. One for the RUD and one for RUI to ensure +	 * it makes it into the AIL. Insert the RUI into the AIL directly and +	 * drop the RUI reference. Note that xfs_trans_ail_update() drops the +	 * AIL lock. +	 */ +	xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn); +	xfs_bui_release(buip); +	return 0; +} + + +/* + * This routine is called when an BUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding BUI if it + * was still in the log. 
To do this it searches the AIL for the BUI with an id + * equal to that in the BUD format structure. If we find it we drop the BUD + * reference, which removes the BUI from the AIL and frees it. + */ +STATIC int +xlog_recover_bud_pass2( +	struct xlog			*log, +	struct xlog_recover_item	*item) +{ +	struct xfs_bud_log_format	*bud_formatp; +	struct xfs_bui_log_item		*buip = NULL; +	struct xfs_log_item		*lip; +	__uint64_t			bui_id; +	struct xfs_ail_cursor		cur; +	struct xfs_ail			*ailp = log->l_ailp; + +	bud_formatp = item->ri_buf[0].i_addr; +	if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) +		return -EFSCORRUPTED; +	bui_id = bud_formatp->bud_bui_id; + +	/* +	 * Search for the BUI with the id in the BUD format structure in the +	 * AIL. +	 */ +	spin_lock(&ailp->xa_lock); +	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); +	while (lip != NULL) { +		if (lip->li_type == XFS_LI_BUI) { +			buip = (struct xfs_bui_log_item *)lip; +			if (buip->bui_format.bui_id == bui_id) { +				/* +				 * Drop the BUD reference to the BUI. This +				 * removes the BUI from the AIL and frees it. +				 */ +				spin_unlock(&ailp->xa_lock); +				xfs_bui_release(buip); +				spin_lock(&ailp->xa_lock); +				break; +			} +		} +		lip = xfs_trans_ail_cursor_next(ailp, &cur); +	} + +	xfs_trans_ail_cursor_done(&cur); +	spin_unlock(&ailp->xa_lock); + +	return 0; +} + +/*   * This routine is called when an inode create format structure is found in a   * committed transaction in the log.  It's purpose is to initialise the inodes   * being allocated on disk. This requires us to get inode cluster buffers that @@ -3741,6 +4019,10 @@ xlog_recover_ra_pass2(  	case XFS_LI_QUOTAOFF:  	case XFS_LI_RUI:  	case XFS_LI_RUD: +	case XFS_LI_CUI: +	case XFS_LI_CUD: +	case XFS_LI_BUI: +	case XFS_LI_BUD:  	default:  		break;  	} @@ -3766,6 +4048,10 @@ xlog_recover_commit_pass1(  	case XFS_LI_ICREATE:  	case XFS_LI_RUI:  	case XFS_LI_RUD: +	case XFS_LI_CUI: +	case XFS_LI_CUD: +	case XFS_LI_BUI: +	case XFS_LI_BUD:  		/* nothing to do in pass 1 */  		return 0;  	default: @@ -3800,6 +4086,14 @@ xlog_recover_commit_pass2(  		return xlog_recover_rui_pass2(log, item, trans->r_lsn);  	case XFS_LI_RUD:  		return xlog_recover_rud_pass2(log, item); +	case XFS_LI_CUI: +		return xlog_recover_cui_pass2(log, item, trans->r_lsn); +	case XFS_LI_CUD: +		return xlog_recover_cud_pass2(log, item); +	case XFS_LI_BUI: +		return xlog_recover_bui_pass2(log, item, trans->r_lsn); +	case XFS_LI_BUD: +		return xlog_recover_bud_pass2(log, item);  	case XFS_LI_DQUOT:  		return xlog_recover_dquot_pass2(log, buffer_list, item,  						trans->r_lsn); @@ -3846,14 +4140,13 @@ STATIC int  xlog_recover_commit_trans(  	struct xlog		*log,  	struct xlog_recover	*trans, -	int			pass) +	int			pass, +	struct list_head	*buffer_list)  {  	int				error = 0; -	int				error2;  	int				items_queued = 0;  	struct xlog_recover_item	*item;  	struct xlog_recover_item	*next; -	LIST_HEAD			(buffer_list);  	LIST_HEAD			(ra_list);  	LIST_HEAD			(done_list); @@ -3876,7 +4169,7 @@ xlog_recover_commit_trans(  			items_queued++;  			if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {  				error = xlog_recover_items_pass2(log, trans, -						&buffer_list, &ra_list); +						buffer_list, &ra_list);  				list_splice_tail_init(&ra_list, &done_list);  				items_queued = 0;  			} @@ -3894,15 +4187,14 @@ out:  	if (!list_empty(&ra_list)) {  		if (!error)  			error = xlog_recover_items_pass2(log, trans, -					&buffer_list, &ra_list); +					buffer_list, &ra_list);  		list_splice_tail_init(&ra_list, 
&done_list);  	}  	if (!list_empty(&done_list))  		list_splice_init(&done_list, &trans->r_itemq); -	error2 = xfs_buf_delwri_submit(&buffer_list); -	return error ? error : error2; +	return error;  }  STATIC void @@ -4085,7 +4377,8 @@ xlog_recovery_process_trans(  	char			*dp,  	unsigned int		len,  	unsigned int		flags, -	int			pass) +	int			pass, +	struct list_head	*buffer_list)  {  	int			error = 0;  	bool			freeit = false; @@ -4109,7 +4402,8 @@ xlog_recovery_process_trans(  		error = xlog_recover_add_to_cont_trans(log, trans, dp, len);  		break;  	case XLOG_COMMIT_TRANS: -		error = xlog_recover_commit_trans(log, trans, pass); +		error = xlog_recover_commit_trans(log, trans, pass, +						  buffer_list);  		/* success or fail, we are now done with this transaction. */  		freeit = true;  		break; @@ -4191,10 +4485,12 @@ xlog_recover_process_ophdr(  	struct xlog_op_header	*ohead,  	char			*dp,  	char			*end, -	int			pass) +	int			pass, +	struct list_head	*buffer_list)  {  	struct xlog_recover	*trans;  	unsigned int		len; +	int			error;  	/* Do we understand who wrote this op? */  	if (ohead->oh_clientid != XFS_TRANSACTION && @@ -4221,8 +4517,39 @@ xlog_recover_process_ophdr(  		return 0;  	} +	/* +	 * The recovered buffer queue is drained only once we know that all +	 * recovery items for the current LSN have been processed. This is +	 * required because: +	 * +	 * - Buffer write submission updates the metadata LSN of the buffer. +	 * - Log recovery skips items with a metadata LSN >= the current LSN of +	 *   the recovery item. +	 * - Separate recovery items against the same metadata buffer can share +	 *   a current LSN. I.e., consider that the LSN of a recovery item is +	 *   defined as the starting LSN of the first record in which its +	 *   transaction appears, that a record can hold multiple transactions, +	 *   and/or that a transaction can span multiple records. +	 * +	 * In other words, we are allowed to submit a buffer from log recovery +	 * once per current LSN. Otherwise, we may incorrectly skip recovery +	 * items and cause corruption. +	 * +	 * We don't know up front whether buffers are updated multiple times per +	 * LSN. Therefore, track the current LSN of each commit log record as it +	 * is processed and drain the queue when it changes. Use commit records +	 * because they are ordered correctly by the logging code. 
+	 */ +	if (log->l_recovery_lsn != trans->r_lsn && +	    ohead->oh_flags & XLOG_COMMIT_TRANS) { +		error = xfs_buf_delwri_submit(buffer_list); +		if (error) +			return error; +		log->l_recovery_lsn = trans->r_lsn; +	} +  	return xlog_recovery_process_trans(log, trans, dp, len, -					   ohead->oh_flags, pass); +					   ohead->oh_flags, pass, buffer_list);  }  /* @@ -4240,7 +4567,8 @@ xlog_recover_process_data(  	struct hlist_head	rhash[],  	struct xlog_rec_header	*rhead,  	char			*dp, -	int			pass) +	int			pass, +	struct list_head	*buffer_list)  {  	struct xlog_op_header	*ohead;  	char			*end; @@ -4254,6 +4582,7 @@ xlog_recover_process_data(  	if (xlog_header_check_recover(log->l_mp, rhead))  		return -EIO; +	trace_xfs_log_recover_record(log, rhead, pass);  	while ((dp < end) && num_logops) {  		ohead = (struct xlog_op_header *)dp; @@ -4262,7 +4591,7 @@ xlog_recover_process_data(  		/* errors will abort recovery */  		error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, -						    dp, end, pass); +						   dp, end, pass, buffer_list);  		if (error)  			return error; @@ -4352,12 +4681,94 @@ xlog_recover_cancel_rui(  	spin_lock(&ailp->xa_lock);  } +/* Recover the CUI if necessary. */ +STATIC int +xlog_recover_process_cui( +	struct xfs_mount		*mp, +	struct xfs_ail			*ailp, +	struct xfs_log_item		*lip) +{ +	struct xfs_cui_log_item		*cuip; +	int				error; + +	/* +	 * Skip CUIs that we've already processed. +	 */ +	cuip = container_of(lip, struct xfs_cui_log_item, cui_item); +	if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) +		return 0; + +	spin_unlock(&ailp->xa_lock); +	error = xfs_cui_recover(mp, cuip); +	spin_lock(&ailp->xa_lock); + +	return error; +} + +/* Release the CUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_cui( +	struct xfs_mount		*mp, +	struct xfs_ail			*ailp, +	struct xfs_log_item		*lip) +{ +	struct xfs_cui_log_item		*cuip; + +	cuip = container_of(lip, struct xfs_cui_log_item, cui_item); + +	spin_unlock(&ailp->xa_lock); +	xfs_cui_release(cuip); +	spin_lock(&ailp->xa_lock); +} + +/* Recover the BUI if necessary. */ +STATIC int +xlog_recover_process_bui( +	struct xfs_mount		*mp, +	struct xfs_ail			*ailp, +	struct xfs_log_item		*lip) +{ +	struct xfs_bui_log_item		*buip; +	int				error; + +	/* +	 * Skip BUIs that we've already processed. +	 */ +	buip = container_of(lip, struct xfs_bui_log_item, bui_item); +	if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) +		return 0; + +	spin_unlock(&ailp->xa_lock); +	error = xfs_bui_recover(mp, buip); +	spin_lock(&ailp->xa_lock); + +	return error; +} + +/* Release the BUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_bui( +	struct xfs_mount		*mp, +	struct xfs_ail			*ailp, +	struct xfs_log_item		*lip) +{ +	struct xfs_bui_log_item		*buip; + +	buip = container_of(lip, struct xfs_bui_log_item, bui_item); + +	spin_unlock(&ailp->xa_lock); +	xfs_bui_release(buip); +	spin_lock(&ailp->xa_lock); +} +  /* Is this log item a deferred action intent? 
*/  static inline bool xlog_item_is_intent(struct xfs_log_item *lip)  {  	switch (lip->li_type) {  	case XFS_LI_EFI:  	case XFS_LI_RUI: +	case XFS_LI_CUI: +	case XFS_LI_BUI:  		return true;  	default:  		return false; @@ -4421,6 +4832,12 @@ xlog_recover_process_intents(  		case XFS_LI_RUI:  			error = xlog_recover_process_rui(log->l_mp, ailp, lip);  			break; +		case XFS_LI_CUI: +			error = xlog_recover_process_cui(log->l_mp, ailp, lip); +			break; +		case XFS_LI_BUI: +			error = xlog_recover_process_bui(log->l_mp, ailp, lip); +			break;  		}  		if (error)  			goto out; @@ -4468,6 +4885,12 @@ xlog_recover_cancel_intents(  		case XFS_LI_RUI:  			xlog_recover_cancel_rui(log->l_mp, ailp, lip);  			break; +		case XFS_LI_CUI: +			xlog_recover_cancel_cui(log->l_mp, ailp, lip); +			break; +		case XFS_LI_BUI: +			xlog_recover_cancel_bui(log->l_mp, ailp, lip); +			break;  		}  		lip = xfs_trans_ail_cursor_next(ailp, &cur); @@ -4546,6 +4969,7 @@ xlog_recover_process_one_iunlink(  	if (error)  		goto fail_iput; +	xfs_iflags_clear(ip, XFS_IRECOVERY);  	ASSERT(VFS_I(ip)->i_nlink == 0);  	ASSERT(VFS_I(ip)->i_mode != 0); @@ -4685,7 +5109,8 @@ xlog_recover_process(  	struct hlist_head	rhash[],  	struct xlog_rec_header	*rhead,  	char			*dp, -	int			pass) +	int			pass, +	struct list_head	*buffer_list)  {  	int			error;  	__le32			crc; @@ -4732,7 +5157,8 @@ xlog_recover_process(  	if (error)  		return error; -	return xlog_recover_process_data(log, rhash, rhead, dp, pass); +	return xlog_recover_process_data(log, rhash, rhead, dp, pass, +					 buffer_list);  }  STATIC int @@ -4793,9 +5219,11 @@ xlog_do_recovery_pass(  	char			*offset;  	xfs_buf_t		*hbp, *dbp;  	int			error = 0, h_size, h_len; +	int			error2 = 0;  	int			bblks, split_bblks;  	int			hblks, split_hblks, wrapped_hblks;  	struct hlist_head	rhash[XLOG_RHASH_SIZE]; +	LIST_HEAD		(buffer_list);  	ASSERT(head_blk != tail_blk);  	rhead_blk = 0; @@ -4981,7 +5409,7 @@ xlog_do_recovery_pass(  			}  			error = xlog_recover_process(log, rhash, rhead, offset, -						     pass); +						     pass, &buffer_list);  			if (error)  				goto bread_err2; @@ -5012,7 +5440,8 @@ xlog_do_recovery_pass(  		if (error)  			goto bread_err2; -		error = xlog_recover_process(log, rhash, rhead, offset, pass); +		error = xlog_recover_process(log, rhash, rhead, offset, pass, +					     &buffer_list);  		if (error)  			goto bread_err2; @@ -5025,10 +5454,17 @@ xlog_do_recovery_pass(   bread_err1:  	xlog_put_bp(hbp); +	/* +	 * Submit buffers that have been added from the last record processed, +	 * regardless of error status. +	 */ +	if (!list_empty(&buffer_list)) +		error2 = xfs_buf_delwri_submit(&buffer_list); +  	if (error && first_bad)  		*first_bad = rhead_blk; -	return error; +	return error ? error : error2;  }  /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index faeead671f9f..fc7873942bea 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,6 +43,8 @@  #include "xfs_icache.h"  #include "xfs_sysfs.h"  #include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_reflink.h"  static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -684,6 +686,7 @@ xfs_mountfs(  	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);  	xfs_ialloc_compute_maxlevels(mp);  	xfs_rmapbt_compute_maxlevels(mp); +	xfs_refcountbt_compute_maxlevels(mp);  	xfs_set_maxicount(mp); @@ -923,6 +926,15 @@ xfs_mountfs(  	}  	/* +	 * During the second phase of log recovery, we need iget and +	 * iput to behave like they do for an active filesystem. 
+	 * xfs_fs_drop_inode needs to be able to prevent the deletion +	 * of inodes before we're done replaying log items on those +	 * inodes. +	 */ +	mp->m_super->s_flags |= MS_ACTIVE; + +	/*  	 * Finish recovering the file system.  This part needed to be delayed  	 * until after the root and real-time bitmap inodes were consistently  	 * read in. @@ -934,6 +946,20 @@ xfs_mountfs(  	}  	/* +	 * Now the log is fully replayed, we can transition to full read-only +	 * mode for read-only mounts. This will sync all the metadata and clean +	 * the log so that the recovery we just performed does not have to be +	 * replayed again on the next mount. +	 * +	 * We use the same quiesce mechanism as the rw->ro remount, as they are +	 * semantically identical operations. +	 */ +	if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == +							XFS_MOUNT_RDONLY) { +		xfs_quiesce_attr(mp); +	} + +	/*  	 * Complete the quota initialisation, post-log-replay component.  	 */  	if (quotamount) { @@ -960,10 +986,28 @@ xfs_mountfs(  		if (error)  			xfs_warn(mp,  	"Unable to allocate reserve blocks. Continuing without reserve pool."); + +		/* Recover any CoW blocks that never got remapped. */ +		error = xfs_reflink_recover_cow(mp); +		if (error) { +			xfs_err(mp, +	"Error %d recovering leftover CoW allocations.", error); +			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +			goto out_quota; +		} + +		/* Reserve AG blocks for future btree expansion. */ +		error = xfs_fs_reserve_ag_blocks(mp); +		if (error && error != -ENOSPC) +			goto out_agresv;  	}  	return 0; + out_agresv: +	xfs_fs_unreserve_ag_blocks(mp); + out_quota: +	xfs_qm_unmount_quotas(mp);   out_rtunmount:  	xfs_rtunmount_inodes(mp);   out_rele_rip: @@ -1005,7 +1049,9 @@ xfs_unmountfs(  	int			error;  	cancel_delayed_work_sync(&mp->m_eofblocks_work); +	cancel_delayed_work_sync(&mp->m_cowblocks_work); +	xfs_fs_unreserve_ag_blocks(mp);  	xfs_qm_unmount_quotas(mp);  	xfs_rtunmount_inodes(mp);  	IRELE(mp->m_rootip); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b36676cde103..819b80b15bfb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -57,10 +57,16 @@ enum {  #define XFS_ERR_RETRY_FOREVER	-1 +/* + * Although retry_timeout is in jiffies which is normally an unsigned long, + * we limit the retry timeout to 86400 seconds, or one day.  So even a + * signed 32-bit long is sufficient for a HZ value up to 24855.  Making it + * signed lets us store the special "-1" value, meaning retry forever. + */  struct xfs_error_cfg {  	struct xfs_kobj	kobj;  	int		max_retries; -	unsigned long	retry_timeout;	/* in jiffies, 0 = no timeout */ +	long		retry_timeout;	/* in jiffies, -1 = infinite */  };  typedef struct xfs_mount { @@ -118,10 +124,13 @@ typedef struct xfs_mount {  	uint			m_inobt_mnr[2];	/* min inobt btree records */  	uint			m_rmap_mxr[2];	/* max rmap btree records */  	uint			m_rmap_mnr[2];	/* min rmap btree records */ +	uint			m_refc_mxr[2];	/* max refc btree records */ +	uint			m_refc_mnr[2];	/* min refc btree records */  	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */  	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */  	uint			m_in_maxlevels;	/* max inobt btree levels. 
*/  	uint			m_rmap_maxlevels; /* max rmap btree levels */ +	uint			m_refc_maxlevels; /* max refcount btree level */  	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */  	uint			m_alloc_set_aside; /* space we can't use */  	uint			m_ag_max_usable; /* max space per AG */ @@ -155,6 +164,8 @@ typedef struct xfs_mount {  	struct delayed_work	m_reclaim_work;	/* background inode reclaim */  	struct delayed_work	m_eofblocks_work; /* background eof blocks  						     trimming */ +	struct delayed_work	m_cowblocks_work; /* background cow blocks +						     trimming */  	bool			m_update_sb;	/* sb needs update in mount */  	int64_t			m_low_space[XFS_LOWSP_MAX];  						/* low free space thresholds */ @@ -325,6 +336,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp)  }  #endif +/* per-AG block reservation data structures*/ +enum xfs_ag_resv_type { +	XFS_AG_RESV_NONE = 0, +	XFS_AG_RESV_METADATA, +	XFS_AG_RESV_AGFL, +}; + +struct xfs_ag_resv { +	/* number of blocks originally reserved here */ +	xfs_extlen_t			ar_orig_reserved; +	/* number of blocks reserved here */ +	xfs_extlen_t			ar_reserved; +	/* number of blocks originally asked for */ +	xfs_extlen_t			ar_asked; +}; +  /*   * Per-ag incore structure, copies of information in agf and agi, to improve the   * performance of allocation group selection. @@ -372,8 +399,31 @@ typedef struct xfs_perag {  	/* for rcu-safe freeing */  	struct rcu_head	rcu_head;  	int		pagb_count;	/* pagb slots in use */ + +	/* Blocks reserved for all kinds of metadata. */ +	struct xfs_ag_resv	pag_meta_resv; +	/* Blocks reserved for just AGFL-based metadata. */ +	struct xfs_ag_resv	pag_agfl_resv; + +	/* reference count */ +	__uint8_t		pagf_refcount_level;  } xfs_perag_t; +static inline struct xfs_ag_resv * +xfs_perag_resv( +	struct xfs_perag	*pag, +	enum xfs_ag_resv_type	type) +{ +	switch (type) { +	case XFS_AG_RESV_METADATA: +		return &pag->pag_meta_resv; +	case XFS_AG_RESV_AGFL: +		return &pag->pag_agfl_resv; +	default: +		return NULL; +	} +} +  extern void	xfs_uuid_table_free(void);  extern int	xfs_log_sbcount(xfs_mount_t *);  extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 69e2986a3776..0c381d71b242 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -49,6 +49,8 @@ xfs_check_ondisk_structs(void)  	XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr,		56);  	XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key,		4);  	XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec,		16); +	XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key,		4); +	XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec,		12);  	XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key,		20);  	XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec,		24);  	XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp,		8); @@ -56,6 +58,7 @@ xfs_check_ondisk_structs(void)  	XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t,			4);  	XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t,			8);  	XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t,			4); +	XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t,		4);  	XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t,			4);  	/* dir/attr trees */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 0f14b2e4bf6c..93a7aafa56d6 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -114,6 +114,13 @@ xfs_fs_map_blocks(  		return -ENXIO;  	/* +	 * The pNFS block layout spec actually supports reflink like +	 * functionality, but the Linux pNFS server doesn't implement it yet. 
+	 */ +	if (xfs_is_reflink_inode(ip)) +		return -ENXIO; + +	/*  	 * Lock out any other I/O before we flush and invalidate the pagecache,  	 * and then hand out a layout to the remote system.  This is very  	 * similar to direct I/O, except that the synchronization is much more diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c new file mode 100644 index 000000000000..fe86a668a57e --- /dev/null +++ b/fs/xfs/xfs_refcount_item.c @@ -0,0 +1,539 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_log.h" +#include "xfs_refcount.h" + + +kmem_zone_t	*xfs_cui_zone; +kmem_zone_t	*xfs_cud_zone; + +static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip) +{ +	return container_of(lip, struct xfs_cui_log_item, cui_item); +} + +void +xfs_cui_item_free( +	struct xfs_cui_log_item	*cuip) +{ +	if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) +		kmem_free(cuip); +	else +		kmem_zone_free(xfs_cui_zone, cuip); +} + +STATIC void +xfs_cui_item_size( +	struct xfs_log_item	*lip, +	int			*nvecs, +	int			*nbytes) +{ +	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip); + +	*nvecs += 1; +	*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cui log item. We use only 1 iovec, and we point that + * at the cui_log_format structure embedded in the cui item. + * It is at this point that we assert that all of the extent + * slots in the cui item have been filled. + */ +STATIC void +xfs_cui_item_format( +	struct xfs_log_item	*lip, +	struct xfs_log_vec	*lv) +{ +	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip); +	struct xfs_log_iovec	*vecp = NULL; + +	ASSERT(atomic_read(&cuip->cui_next_extent) == +			cuip->cui_format.cui_nextents); + +	cuip->cui_format.cui_type = XFS_LI_CUI; +	cuip->cui_format.cui_size = 1; + +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, +			xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents)); +} + +/* + * Pinning has no meaning for an cui item, so just return. + */ +STATIC void +xfs_cui_item_pin( +	struct xfs_log_item	*lip) +{ +} + +/* + * The unpin operation is the last place an CUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the CUI transaction has been successfully committed to make it + * this far. 
Therefore, we expect whoever committed the CUI to either construct + * and commit the CUD or drop the CUD's reference in the event of error. Simply + * drop the log's CUI reference now that the log is done with it. + */ +STATIC void +xfs_cui_item_unpin( +	struct xfs_log_item	*lip, +	int			remove) +{ +	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip); + +	xfs_cui_release(cuip); +} + +/* + * CUI items have no locking or pushing.  However, since CUIs are pulled from + * the AIL when their corresponding CUDs are committed to disk, their situation + * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log.  This should help in getting the CUI out of + * the AIL. + */ +STATIC uint +xfs_cui_item_push( +	struct xfs_log_item	*lip, +	struct list_head	*buffer_list) +{ +	return XFS_ITEM_PINNED; +} + +/* + * The CUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an CUD isn't going to be + * constructed and thus we free the CUI here directly. + */ +STATIC void +xfs_cui_item_unlock( +	struct xfs_log_item	*lip) +{ +	if (lip->li_flags & XFS_LI_ABORTED) +		xfs_cui_item_free(CUI_ITEM(lip)); +} + +/* + * The CUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_cui_item_committed( +	struct xfs_log_item	*lip, +	xfs_lsn_t		lsn) +{ +	return lsn; +} + +/* + * The CUI dependency tracking op doesn't do squat.  It can't because + * it doesn't know where the free extent is coming from.  The dependency + * tracking has to be handled by the "enclosing" metadata object.  For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cui_item_committing( +	struct xfs_log_item	*lip, +	xfs_lsn_t		lsn) +{ +} + +/* + * This is the ops vector shared by all cui log items. + */ +static const struct xfs_item_ops xfs_cui_item_ops = { +	.iop_size	= xfs_cui_item_size, +	.iop_format	= xfs_cui_item_format, +	.iop_pin	= xfs_cui_item_pin, +	.iop_unpin	= xfs_cui_item_unpin, +	.iop_unlock	= xfs_cui_item_unlock, +	.iop_committed	= xfs_cui_item_committed, +	.iop_push	= xfs_cui_item_push, +	.iop_committing = xfs_cui_item_committing, +}; + +/* + * Allocate and initialize an cui item with the given number of extents. + */ +struct xfs_cui_log_item * +xfs_cui_init( +	struct xfs_mount		*mp, +	uint				nextents) + +{ +	struct xfs_cui_log_item		*cuip; + +	ASSERT(nextents > 0); +	if (nextents > XFS_CUI_MAX_FAST_EXTENTS) +		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), +				KM_SLEEP); +	else +		cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); + +	xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); +	cuip->cui_format.cui_nextents = nextents; +	cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; +	atomic_set(&cuip->cui_next_extent, 0); +	atomic_set(&cuip->cui_refcount, 2); + +	return cuip; +} + +/* + * Freeing the CUI requires that we remove it from the AIL if it has already + * been placed there. However, the CUI may not yet have been placed in the AIL + * when called by xfs_cui_release() from CUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the CUI. 
+ */ +void +xfs_cui_release( +	struct xfs_cui_log_item	*cuip) +{ +	if (atomic_dec_and_test(&cuip->cui_refcount)) { +		xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); +		xfs_cui_item_free(cuip); +	} +} + +static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) +{ +	return container_of(lip, struct xfs_cud_log_item, cud_item); +} + +STATIC void +xfs_cud_item_size( +	struct xfs_log_item	*lip, +	int			*nvecs, +	int			*nbytes) +{ +	*nvecs += 1; +	*nbytes += sizeof(struct xfs_cud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cud log item. We use only 1 iovec, and we point that + * at the cud_log_format structure embedded in the cud item. + * It is at this point that we assert that all of the extent + * slots in the cud item have been filled. + */ +STATIC void +xfs_cud_item_format( +	struct xfs_log_item	*lip, +	struct xfs_log_vec	*lv) +{ +	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip); +	struct xfs_log_iovec	*vecp = NULL; + +	cudp->cud_format.cud_type = XFS_LI_CUD; +	cudp->cud_format.cud_size = 1; + +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, +			sizeof(struct xfs_cud_log_format)); +} + +/* + * Pinning has no meaning for an cud item, so just return. + */ +STATIC void +xfs_cud_item_pin( +	struct xfs_log_item	*lip) +{ +} + +/* + * Since pinning has no meaning for an cud item, unpinning does + * not either. + */ +STATIC void +xfs_cud_item_unpin( +	struct xfs_log_item	*lip, +	int			remove) +{ +} + +/* + * There isn't much you can do to push on an cud item.  It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_cud_item_push( +	struct xfs_log_item	*lip, +	struct list_head	*buffer_list) +{ +	return XFS_ITEM_PINNED; +} + +/* + * The CUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the CUI and free the + * CUD. + */ +STATIC void +xfs_cud_item_unlock( +	struct xfs_log_item	*lip) +{ +	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip); + +	if (lip->li_flags & XFS_LI_ABORTED) { +		xfs_cui_release(cudp->cud_cuip); +		kmem_zone_free(xfs_cud_zone, cudp); +	} +} + +/* + * When the cud item is committed to disk, all we need to do is delete our + * reference to our partner cui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_cud_item_committed( +	struct xfs_log_item	*lip, +	xfs_lsn_t		lsn) +{ +	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip); + +	/* +	 * Drop the CUI reference regardless of whether the CUD has been +	 * aborted. Once the CUD transaction is constructed, it is the sole +	 * responsibility of the CUD to release the CUI (even if the CUI is +	 * aborted due to log I/O error). +	 */ +	xfs_cui_release(cudp->cud_cuip); +	kmem_zone_free(xfs_cud_zone, cudp); + +	return (xfs_lsn_t)-1; +} + +/* + * The CUD dependency tracking op doesn't do squat.  It can't because + * it doesn't know where the free extent is coming from.  The dependency + * tracking has to be handled by the "enclosing" metadata object.  For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cud_item_committing( +	struct xfs_log_item	*lip, +	xfs_lsn_t		lsn) +{ +} + +/* + * This is the ops vector shared by all cud log items. 
+ */ +static const struct xfs_item_ops xfs_cud_item_ops = { +	.iop_size	= xfs_cud_item_size, +	.iop_format	= xfs_cud_item_format, +	.iop_pin	= xfs_cud_item_pin, +	.iop_unpin	= xfs_cud_item_unpin, +	.iop_unlock	= xfs_cud_item_unlock, +	.iop_committed	= xfs_cud_item_committed, +	.iop_push	= xfs_cud_item_push, +	.iop_committing = xfs_cud_item_committing, +}; + +/* + * Allocate and initialize an cud item with the given number of extents. + */ +struct xfs_cud_log_item * +xfs_cud_init( +	struct xfs_mount		*mp, +	struct xfs_cui_log_item		*cuip) + +{ +	struct xfs_cud_log_item	*cudp; + +	cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); +	xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); +	cudp->cud_cuip = cuip; +	cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + +	return cudp; +} + +/* + * Process a refcount update intent item that was recovered from the log. + * We need to update the refcountbt. + */ +int +xfs_cui_recover( +	struct xfs_mount		*mp, +	struct xfs_cui_log_item		*cuip) +{ +	int				i; +	int				error = 0; +	unsigned int			refc_type; +	struct xfs_phys_extent		*refc; +	xfs_fsblock_t			startblock_fsb; +	bool				op_ok; +	struct xfs_cud_log_item		*cudp; +	struct xfs_trans		*tp; +	struct xfs_btree_cur		*rcur = NULL; +	enum xfs_refcount_intent_type	type; +	xfs_fsblock_t			firstfsb; +	xfs_fsblock_t			new_fsb; +	xfs_extlen_t			new_len; +	struct xfs_bmbt_irec		irec; +	struct xfs_defer_ops		dfops; +	bool				requeue_only = false; + +	ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + +	/* +	 * First check the validity of the extents described by the +	 * CUI.  If any are bad, then assume that all are bad and +	 * just toss the CUI. +	 */ +	for (i = 0; i < cuip->cui_format.cui_nextents; i++) { +		refc = &cuip->cui_format.cui_extents[i]; +		startblock_fsb = XFS_BB_TO_FSB(mp, +				   XFS_FSB_TO_DADDR(mp, refc->pe_startblock)); +		switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { +		case XFS_REFCOUNT_INCREASE: +		case XFS_REFCOUNT_DECREASE: +		case XFS_REFCOUNT_ALLOC_COW: +		case XFS_REFCOUNT_FREE_COW: +			op_ok = true; +			break; +		default: +			op_ok = false; +			break; +		} +		if (!op_ok || startblock_fsb == 0 || +		    refc->pe_len == 0 || +		    startblock_fsb >= mp->m_sb.sb_dblocks || +		    refc->pe_len >= mp->m_sb.sb_agblocks || +		    (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) { +			/* +			 * This will pull the CUI from the AIL and +			 * free the memory associated with it. +			 */ +			set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); +			xfs_cui_release(cuip); +			return -EIO; +		} +	} + +	/* +	 * Under normal operation, refcount updates are deferred, so we +	 * wouldn't be adding them directly to a transaction.  All +	 * refcount updates manage reservation usage internally and +	 * dynamically by deferring work that won't fit in the +	 * transaction.  Normally, any work that needs to be deferred +	 * gets attached to the same defer_ops that scheduled the +	 * refcount update.  However, we're in log recovery here, so we +	 * we create our own defer_ops and use that to finish up any +	 * work that doesn't fit. 
+	 */ +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); +	if (error) +		return error; +	cudp = xfs_trans_get_cud(tp, cuip); + +	xfs_defer_init(&dfops, &firstfsb); +	for (i = 0; i < cuip->cui_format.cui_nextents; i++) { +		refc = &cuip->cui_format.cui_extents[i]; +		refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; +		switch (refc_type) { +		case XFS_REFCOUNT_INCREASE: +		case XFS_REFCOUNT_DECREASE: +		case XFS_REFCOUNT_ALLOC_COW: +		case XFS_REFCOUNT_FREE_COW: +			type = refc_type; +			break; +		default: +			error = -EFSCORRUPTED; +			goto abort_error; +		} +		if (requeue_only) { +			new_fsb = refc->pe_startblock; +			new_len = refc->pe_len; +		} else +			error = xfs_trans_log_finish_refcount_update(tp, cudp, +				&dfops, type, refc->pe_startblock, refc->pe_len, +				&new_fsb, &new_len, &rcur); +		if (error) +			goto abort_error; + +		/* Requeue what we didn't finish. */ +		if (new_len > 0) { +			irec.br_startblock = new_fsb; +			irec.br_blockcount = new_len; +			switch (type) { +			case XFS_REFCOUNT_INCREASE: +				error = xfs_refcount_increase_extent( +						tp->t_mountp, &dfops, &irec); +				break; +			case XFS_REFCOUNT_DECREASE: +				error = xfs_refcount_decrease_extent( +						tp->t_mountp, &dfops, &irec); +				break; +			case XFS_REFCOUNT_ALLOC_COW: +				error = xfs_refcount_alloc_cow_extent( +						tp->t_mountp, &dfops, +						irec.br_startblock, +						irec.br_blockcount); +				break; +			case XFS_REFCOUNT_FREE_COW: +				error = xfs_refcount_free_cow_extent( +						tp->t_mountp, &dfops, +						irec.br_startblock, +						irec.br_blockcount); +				break; +			default: +				ASSERT(0); +			} +			if (error) +				goto abort_error; +			requeue_only = true; +		} +	} + +	xfs_refcount_finish_one_cleanup(tp, rcur, error); +	error = xfs_defer_finish(&tp, &dfops, NULL); +	if (error) +		goto abort_error; +	set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); +	error = xfs_trans_commit(tp); +	return error; + +abort_error: +	xfs_refcount_finish_one_cleanup(tp, rcur, error); +	xfs_defer_cancel(&dfops); +	xfs_trans_cancel(tp); +	return error; +} diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h new file mode 100644 index 000000000000..5b74dddfa64b --- /dev/null +++ b/fs/xfs/xfs_refcount_item.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#ifndef	__XFS_REFCOUNT_ITEM_H__ +#define	__XFS_REFCOUNT_ITEM_H__ + +/* + * There are (currently) two pairs of refcount btree redo item types: + * increase and decrease.  The log items for these are CUI (refcount + * update intent) and CUD (refcount update done).  The redo item type + * is encoded in the flags field of each xfs_map_extent. 
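[Editor's aside: a minimal sketch of how a caller might queue refcount adjustments during normal operation. Only the xfs_refcount_increase_extent/xfs_refcount_decrease_extent and xfs_defer_* calls are interfaces this patch actually uses; the wrapper function and its transaction handling are hypothetical.]

static int
example_adjust_refcounts(
	struct xfs_trans	**tpp,
	struct xfs_bmbt_irec	*add,	/* extent gaining a reference */
	struct xfs_bmbt_irec	*drop)	/* extent dropping a reference */
{
	struct xfs_mount	*mp = (*tpp)->t_mountp;
	struct xfs_defer_ops	dfops;
	xfs_fsblock_t		firstfsb;
	int			error;

	xfs_defer_init(&dfops, &firstfsb);

	/*
	 * Queue the refcount changes.  The deferred-ops machinery logs the
	 * CUI in this transaction and rolls it, so the CUD and the actual
	 * refcountbt updates land in a later transaction of the same chain.
	 */
	error = xfs_refcount_increase_extent(mp, &dfops, add);
	if (!error)
		error = xfs_refcount_decrease_extent(mp, &dfops, drop);
	if (error) {
		xfs_defer_cancel(&dfops);
		return error;
	}

	return xfs_defer_finish(tpp, &dfops, NULL);
}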
+ * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same + * transaction that records the associated refcountbt updates. + * + * Should the system crash after the commit of the first transaction + * but before the commit of the final transaction in a series, log + * recovery will use the redo information recorded by the intent items + * to replay the refcountbt metadata updates. + */ + +/* kernel only CUI/CUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define	XFS_CUI_MAX_FAST_EXTENTS	16 + +/* + * Define CUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define	XFS_CUI_RECOVERED		1 + +/* + * This is the "refcount update intent" log item.  It is used to log + * the fact that some reverse mappings need to change.  It is used in + * conjunction with the "refcount update done" log item described + * below. + * + * These log items follow the same rules as struct xfs_efi_log_item; + * see the comments about that structure (in xfs_extfree_item.h) for + * more details. + */ +struct xfs_cui_log_item { +	struct xfs_log_item		cui_item; +	atomic_t			cui_refcount; +	atomic_t			cui_next_extent; +	unsigned long			cui_flags;	/* misc flags */ +	struct xfs_cui_log_format	cui_format; +}; + +static inline size_t +xfs_cui_log_item_sizeof( +	unsigned int		nr) +{ +	return offsetof(struct xfs_cui_log_item, cui_format) + +			xfs_cui_log_format_sizeof(nr); +} + +/* + * This is the "refcount update done" log item.  It is used to log the + * fact that some refcountbt updates mentioned in an earlier cui item + * have been performed. + */ +struct xfs_cud_log_item { +	struct xfs_log_item		cud_item; +	struct xfs_cui_log_item		*cud_cuip; +	struct xfs_cud_log_format	cud_format; +}; + +extern struct kmem_zone	*xfs_cui_zone; +extern struct kmem_zone	*xfs_cud_zone; + +struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); +struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, +		struct xfs_cui_log_item *); +void xfs_cui_item_free(struct xfs_cui_log_item *); +void xfs_cui_release(struct xfs_cui_log_item *); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); + +#endif	/* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c new file mode 100644 index 000000000000..5965e9455d91 --- /dev/null +++ b/fs/xfs/xfs_reflink.c @@ -0,0 +1,1688 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_icache.h" +#include "xfs_pnfs.h" +#include "xfs_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_reflink.h" +#include "xfs_iomap.h" +#include "xfs_rmap_btree.h" +#include "xfs_sb.h" +#include "xfs_ag_resv.h" + +/* + * Copy on Write of Shared Blocks + * + * XFS must preserve "the usual" file semantics even when two files share + * the same physical blocks.  This means that a write to one file must not + * alter the blocks in a different file; the way that we'll do that is + * through the use of a copy-on-write mechanism.  At a high level, that + * means that when we want to write to a shared block, we allocate a new + * block, write the data to the new block, and if that succeeds we map the + * new block into the file. + * + * XFS provides a "delayed allocation" mechanism that defers the allocation + * of disk blocks to dirty-but-not-yet-mapped file blocks as long as + * possible.  This reduces fragmentation by enabling the filesystem to ask + * for bigger chunks less often, which is exactly what we want for CoW. + * + * The delalloc mechanism begins when the kernel wants to make a block + * writable (write_begin or page_mkwrite).  If the offset is not mapped, we + * create a delalloc mapping, which is a regular in-core extent, but without + * a real startblock.  (For delalloc mappings, the startblock encodes both + * a flag that this is a delalloc mapping, and a worst-case estimate of how + * many blocks might be required to put the mapping into the BMBT.)  delalloc + * mappings are a reservation against the free space in the filesystem; + * adjacent mappings can also be combined into fewer larger mappings. + * + * When dirty pages are being written out (typically in writepage), the + * delalloc reservations are converted into real mappings by allocating + * blocks and replacing the delalloc mapping with real ones.  A delalloc + * mapping can be replaced by several real ones if the free space is + * fragmented. + * + * We want to adapt the delalloc mechanism for copy-on-write, since the + * write paths are similar.  The first two steps (creating the reservation + * and allocating the blocks) are exactly the same as delalloc except that + * the mappings must be stored in a separate CoW fork because we do not want + * to disturb the mapping in the data fork until we're sure that the write + * succeeded.  IO completion in this case is the process of removing the old + * mapping from the data fork and moving the new mapping from the CoW fork to + * the data fork.  This will be discussed shortly. + * + * For now, unaligned directio writes will be bounced back to the page cache. + * Block-aligned directio writes will use the same mechanism as buffered + * writes. 
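[Editor's aside: a rough sketch of how the reservation step is expected to be driven from a buffered write. The real hook-up into the write path is not part of this section; only xfs_is_reflink_inode() and xfs_reflink_reserve_cow_range() are taken from this patch, the surrounding caller is illustrative.]

STATIC ssize_t
example_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	int			error;

	/*
	 * Before the page cache is dirtied over a possibly shared range,
	 * reserve delalloc blocks in the CoW fork for it.  Writeback later
	 * converts the reservation to real blocks, and I/O completion moves
	 * the new mapping from the CoW fork into the data fork.
	 */
	if (xfs_is_reflink_inode(ip)) {
		error = xfs_reflink_reserve_cow_range(ip, iocb->ki_pos,
				iov_iter_count(from));
		if (error)
			return error;
	}

	return generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
}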
+ * + * CoW remapping must be done after the data block write completes, + * because we don't want to destroy the old data fork map until we're sure + * the new block has been written.  Since the new mappings are kept in a + * separate fork, we can simply iterate these mappings to find the ones + * that cover the file blocks that we just CoW'd.  For each extent, simply + * unmap the corresponding range in the data fork, map the new range into + * the data fork, and remove the extent from the CoW fork. + * + * Since the remapping operation can be applied to an arbitrary file + * range, we record the need for the remap step as a flag in the ioend + * instead of declaring a new IO type.  This is required for direct io + * because we only have ioend for the whole dio, and we have to be able to + * remember the presence of unwritten blocks and CoW blocks with a single + * ioend structure.  Better yet, the more ground we can cover with one + * ioend, the better. + */ + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen.  If + * find_end_of_shared is true, return the longest contiguous extent of + * shared blocks.  If there are no shared extents, fbno and flen will + * be set to NULLAGBLOCK and 0, respectively. + */ +int +xfs_reflink_find_shared( +	struct xfs_mount	*mp, +	xfs_agnumber_t		agno, +	xfs_agblock_t		agbno, +	xfs_extlen_t		aglen, +	xfs_agblock_t		*fbno, +	xfs_extlen_t		*flen, +	bool			find_end_of_shared) +{ +	struct xfs_buf		*agbp; +	struct xfs_btree_cur	*cur; +	int			error; + +	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); +	if (error) +		return error; + +	cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + +	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, +			find_end_of_shared); + +	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + +	xfs_buf_relse(agbp); +	return error; +} + +/* + * Trim the mapping to the next block where there's a change in the + * shared/unshared status.  More specifically, this means that we + * find the lowest-numbered extent of shared blocks that coincides with + * the given block mapping.  If the shared extent overlaps the start of + * the mapping, trim the mapping to the end of the shared extent.  If + * the shared region intersects the mapping, trim the mapping to the + * start of the shared extent.  If there are no shared regions that + * overlap, just return the original extent. + */ +int +xfs_reflink_trim_around_shared( +	struct xfs_inode	*ip, +	struct xfs_bmbt_irec	*irec, +	bool			*shared, +	bool			*trimmed) +{ +	xfs_agnumber_t		agno; +	xfs_agblock_t		agbno; +	xfs_extlen_t		aglen; +	xfs_agblock_t		fbno; +	xfs_extlen_t		flen; +	int			error = 0; + +	/* Holes, unwritten, and delalloc extents cannot be shared */ +	if (!xfs_is_reflink_inode(ip) || +	    ISUNWRITTEN(irec) || +	    irec->br_startblock == HOLESTARTBLOCK || +	    irec->br_startblock == DELAYSTARTBLOCK) { +		*shared = false; +		return 0; +	} + +	trace_xfs_reflink_trim_around_shared(ip, irec); + +	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); +	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); +	aglen = irec->br_blockcount; + +	error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, +			aglen, &fbno, &flen, true); +	if (error) +		return error; + +	*shared = *trimmed = false; +	if (fbno == NULLAGBLOCK) { +		/* No shared blocks at all. */ +		return 0; +	} else if (fbno == agbno) { +		/* +		 * The start of this extent is shared.  
Truncate the +		 * mapping at the end of the shared region so that a +		 * subsequent iteration starts at the start of the +		 * unshared region. +		 */ +		irec->br_blockcount = flen; +		*shared = true; +		if (flen != aglen) +			*trimmed = true; +		return 0; +	} else { +		/* +		 * There's a shared extent midway through this extent. +		 * Truncate the mapping at the start of the shared +		 * extent so that a subsequent iteration starts at the +		 * start of the shared region. +		 */ +		irec->br_blockcount = fbno - agbno; +		*trimmed = true; +		return 0; +	} +} + +/* Create a CoW reservation for a range of blocks within a file. */ +static int +__xfs_reflink_reserve_cow( +	struct xfs_inode	*ip, +	xfs_fileoff_t		*offset_fsb, +	xfs_fileoff_t		end_fsb, +	bool			*skipped) +{ +	struct xfs_bmbt_irec	got, prev, imap; +	xfs_fileoff_t		orig_end_fsb; +	int			nimaps, eof = 0, error = 0; +	bool			shared = false, trimmed = false; +	xfs_extnum_t		idx; +	xfs_extlen_t		align; + +	/* Already reserved?  Skip the refcount btree access. */ +	xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx, +			&got, &prev); +	if (!eof && got.br_startoff <= *offset_fsb) { +		end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount; +		trace_xfs_reflink_cow_found(ip, &got); +		goto done; +	} + +	/* Read extent from the source file. */ +	nimaps = 1; +	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, +			&imap, &nimaps, 0); +	if (error) +		goto out_unlock; +	ASSERT(nimaps == 1); + +	/* Trim the mapping to the nearest shared extent boundary. */ +	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); +	if (error) +		goto out_unlock; + +	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount; + +	/* Not shared?  Just report the (potentially capped) extent. */ +	if (!shared) { +		*skipped = true; +		goto done; +	} + +	/* +	 * Fork all the shared blocks from our write offset until the end of +	 * the extent. +	 */ +	error = xfs_qm_dqattach_locked(ip, 0); +	if (error) +		goto out_unlock; + +	align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); +	if (align) +		end_fsb = roundup_64(end_fsb, align); + +retry: +	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb, +			end_fsb - *offset_fsb, &got, +			&prev, &idx, eof); +	switch (error) { +	case 0: +		break; +	case -ENOSPC: +	case -EDQUOT: +		/* retry without any preallocation */ +		trace_xfs_reflink_cow_enospc(ip, &imap); +		if (end_fsb != orig_end_fsb) { +			end_fsb = orig_end_fsb; +			goto retry; +		} +		/*FALLTHRU*/ +	default: +		goto out_unlock; +	} + +	if (end_fsb != orig_end_fsb) +		xfs_inode_set_cowblocks_tag(ip); + +	trace_xfs_reflink_cow_alloc(ip, &got); +done: +	*offset_fsb = end_fsb; +out_unlock: +	return error; +} + +/* Create a CoW reservation for part of a file. 
*/ +int +xfs_reflink_reserve_cow_range( +	struct xfs_inode	*ip, +	xfs_off_t		offset, +	xfs_off_t		count) +{ +	struct xfs_mount	*mp = ip->i_mount; +	xfs_fileoff_t		offset_fsb, end_fsb; +	bool			skipped = false; +	int			error; + +	trace_xfs_reflink_reserve_cow_range(ip, offset, count); + +	offset_fsb = XFS_B_TO_FSBT(mp, offset); +	end_fsb = XFS_B_TO_FSB(mp, offset + count); + +	xfs_ilock(ip, XFS_ILOCK_EXCL); +	while (offset_fsb < end_fsb) { +		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb, +				&skipped); +		if (error) { +			trace_xfs_reflink_reserve_cow_range_error(ip, error, +				_RET_IP_); +			break; +		} +	} +	xfs_iunlock(ip, XFS_ILOCK_EXCL); + +	return error; +} + +/* Allocate all CoW reservations covering a range of blocks in a file. */ +static int +__xfs_reflink_allocate_cow( +	struct xfs_inode	*ip, +	xfs_fileoff_t		*offset_fsb, +	xfs_fileoff_t		end_fsb) +{ +	struct xfs_mount	*mp = ip->i_mount; +	struct xfs_bmbt_irec	imap; +	struct xfs_defer_ops	dfops; +	struct xfs_trans	*tp; +	xfs_fsblock_t		first_block; +	xfs_fileoff_t		next_fsb; +	int			nimaps = 1, error; +	bool			skipped = false; + +	xfs_defer_init(&dfops, &first_block); + +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, +			XFS_TRANS_RESERVE, &tp); +	if (error) +		return error; + +	xfs_ilock(ip, XFS_ILOCK_EXCL); + +	next_fsb = *offset_fsb; +	error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped); +	if (error) +		goto out_trans_cancel; + +	if (skipped) { +		*offset_fsb = next_fsb; +		goto out_trans_cancel; +	} + +	xfs_trans_ijoin(tp, ip, 0); +	error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb, +			XFS_BMAPI_COWFORK, &first_block, +			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), +			&imap, &nimaps, &dfops); +	if (error) +		goto out_trans_cancel; + +	/* We might not have been able to map the whole delalloc extent */ +	*offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb); + +	error = xfs_defer_finish(&tp, &dfops, NULL); +	if (error) +		goto out_trans_cancel; + +	error = xfs_trans_commit(tp); + +out_unlock: +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	return error; +out_trans_cancel: +	xfs_defer_cancel(&dfops); +	xfs_trans_cancel(tp); +	goto out_unlock; +} + +/* Allocate all CoW reservations covering a part of a file. */ +int +xfs_reflink_allocate_cow_range( +	struct xfs_inode	*ip, +	xfs_off_t		offset, +	xfs_off_t		count) +{ +	struct xfs_mount	*mp = ip->i_mount; +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset); +	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count); +	int			error; + +	ASSERT(xfs_is_reflink_inode(ip)); + +	trace_xfs_reflink_allocate_cow_range(ip, offset, count); + +	/* +	 * Make sure that the dquots are there. +	 */ +	error = xfs_qm_dqattach(ip, 0); +	if (error) +		return error; + +	while (offset_fsb < end_fsb) { +		error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); +		if (error) { +			trace_xfs_reflink_allocate_cow_range_error(ip, error, +					_RET_IP_); +			break; +		} +	} + +	return error; +} + +/* + * Find the CoW reservation (and whether or not it needs block allocation) + * for a given byte offset of a file. + */ +bool +xfs_reflink_find_cow_mapping( +	struct xfs_inode		*ip, +	xfs_off_t			offset, +	struct xfs_bmbt_irec		*imap, +	bool				*need_alloc) +{ +	struct xfs_bmbt_irec		irec; +	struct xfs_ifork		*ifp; +	struct xfs_bmbt_rec_host	*gotp; +	xfs_fileoff_t			bno; +	xfs_extnum_t			idx; + +	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); +	ASSERT(xfs_is_reflink_inode(ip)); + +	/* Find the extent in the CoW fork. 
*/ +	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); +	bno = XFS_B_TO_FSBT(ip->i_mount, offset); +	gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); +	if (!gotp) +		return false; + +	xfs_bmbt_get_all(gotp, &irec); +	if (bno >= irec.br_startoff + irec.br_blockcount || +	    bno < irec.br_startoff) +		return false; + +	trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, +			&irec); + +	/* If it's still delalloc, we must allocate later. */ +	*imap = irec; +	*need_alloc = !!(isnullstartblock(irec.br_startblock)); + +	return true; +} + +/* + * Trim an extent to end at the next CoW reservation past offset_fsb. + */ +int +xfs_reflink_trim_irec_to_next_cow( +	struct xfs_inode		*ip, +	xfs_fileoff_t			offset_fsb, +	struct xfs_bmbt_irec		*imap) +{ +	struct xfs_bmbt_irec		irec; +	struct xfs_ifork		*ifp; +	struct xfs_bmbt_rec_host	*gotp; +	xfs_extnum_t			idx; + +	if (!xfs_is_reflink_inode(ip)) +		return 0; + +	/* Find the extent in the CoW fork. */ +	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); +	gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); +	if (!gotp) +		return 0; +	xfs_bmbt_get_all(gotp, &irec); + +	/* This is the extent before; try sliding up one. */ +	if (irec.br_startoff < offset_fsb) { +		idx++; +		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) +			return 0; +		gotp = xfs_iext_get_ext(ifp, idx); +		xfs_bmbt_get_all(gotp, &irec); +	} + +	if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) +		return 0; + +	imap->br_blockcount = irec.br_startoff - imap->br_startoff; +	trace_xfs_reflink_trim_irec(ip, imap); + +	return 0; +} + +/* + * Cancel all pending CoW reservations for some block range of an inode. + */ +int +xfs_reflink_cancel_cow_blocks( +	struct xfs_inode		*ip, +	struct xfs_trans		**tpp, +	xfs_fileoff_t			offset_fsb, +	xfs_fileoff_t			end_fsb) +{ +	struct xfs_bmbt_irec		irec; +	xfs_filblks_t			count_fsb; +	xfs_fsblock_t			firstfsb; +	struct xfs_defer_ops		dfops; +	int				error = 0; +	int				nimaps; + +	if (!xfs_is_reflink_inode(ip)) +		return 0; + +	/* Go find the old extent in the CoW fork. */ +	while (offset_fsb < end_fsb) { +		nimaps = 1; +		count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); +		error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, +				&nimaps, XFS_BMAPI_COWFORK); +		if (error) +			break; +		ASSERT(nimaps == 1); + +		trace_xfs_reflink_cancel_cow(ip, &irec); + +		if (irec.br_startblock == DELAYSTARTBLOCK) { +			/* Free a delayed allocation. */ +			xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount, +					false); +			ip->i_delayed_blks -= irec.br_blockcount; + +			/* Remove the mapping from the CoW fork. */ +			error = xfs_bunmapi_cow(ip, &irec); +			if (error) +				break; +		} else if (irec.br_startblock == HOLESTARTBLOCK) { +			/* empty */ +		} else { +			xfs_trans_ijoin(*tpp, ip, 0); +			xfs_defer_init(&dfops, &firstfsb); + +			/* Free the CoW orphan record. */ +			error = xfs_refcount_free_cow_extent(ip->i_mount, +					&dfops, irec.br_startblock, +					irec.br_blockcount); +			if (error) +				break; + +			xfs_bmap_add_free(ip->i_mount, &dfops, +					irec.br_startblock, irec.br_blockcount, +					NULL); + +			/* Update quota accounting */ +			xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, +					-(long)irec.br_blockcount); + +			/* Roll the transaction */ +			error = xfs_defer_finish(tpp, &dfops, ip); +			if (error) { +				xfs_defer_cancel(&dfops); +				break; +			} + +			/* Remove the mapping from the CoW fork. */ +			error = xfs_bunmapi_cow(ip, &irec); +			if (error) +				break; +		} + +		/* Roll on... 
*/ +		offset_fsb = irec.br_startoff + irec.br_blockcount; +	} + +	return error; +} + +/* + * Cancel all pending CoW reservations for some byte range of an inode. + */ +int +xfs_reflink_cancel_cow_range( +	struct xfs_inode	*ip, +	xfs_off_t		offset, +	xfs_off_t		count) +{ +	struct xfs_trans	*tp; +	xfs_fileoff_t		offset_fsb; +	xfs_fileoff_t		end_fsb; +	int			error; + +	trace_xfs_reflink_cancel_cow_range(ip, offset, count); +	ASSERT(xfs_is_reflink_inode(ip)); + +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); +	if (count == NULLFILEOFF) +		end_fsb = NULLFILEOFF; +	else +		end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + +	/* Start a rolling transaction to remove the mappings */ +	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, +			0, 0, 0, &tp); +	if (error) +		goto out; + +	xfs_ilock(ip, XFS_ILOCK_EXCL); +	xfs_trans_ijoin(tp, ip, 0); + +	/* Scrape out the old CoW reservations */ +	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); +	if (error) +		goto out_cancel; + +	error = xfs_trans_commit(tp); + +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	return error; + +out_cancel: +	xfs_trans_cancel(tp); +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: +	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); +	return error; +} + +/* + * Remap parts of a file's data fork after a successful CoW. + */ +int +xfs_reflink_end_cow( +	struct xfs_inode		*ip, +	xfs_off_t			offset, +	xfs_off_t			count) +{ +	struct xfs_bmbt_irec		irec; +	struct xfs_bmbt_irec		uirec; +	struct xfs_trans		*tp; +	xfs_fileoff_t			offset_fsb; +	xfs_fileoff_t			end_fsb; +	xfs_filblks_t			count_fsb; +	xfs_fsblock_t			firstfsb; +	struct xfs_defer_ops		dfops; +	int				error; +	unsigned int			resblks; +	xfs_filblks_t			ilen; +	xfs_filblks_t			rlen; +	int				nimaps; + +	trace_xfs_reflink_end_cow(ip, offset, count); + +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); +	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); +	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + +	/* Start a rolling transaction to switch the mappings */ +	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); +	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, +			resblks, 0, 0, &tp); +	if (error) +		goto out; + +	xfs_ilock(ip, XFS_ILOCK_EXCL); +	xfs_trans_ijoin(tp, ip, 0); + +	/* Go find the old extent in the CoW fork. */ +	while (offset_fsb < end_fsb) { +		/* Read extent from the source file */ +		nimaps = 1; +		count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); +		error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, +				&nimaps, XFS_BMAPI_COWFORK); +		if (error) +			goto out_cancel; +		ASSERT(nimaps == 1); + +		ASSERT(irec.br_startblock != DELAYSTARTBLOCK); +		trace_xfs_reflink_cow_remap(ip, &irec); + +		/* +		 * We can have a hole in the CoW fork if part of a directio +		 * write is CoW but part of it isn't. +		 */ +		rlen = ilen = irec.br_blockcount; +		if (irec.br_startblock == HOLESTARTBLOCK) +			goto next_extent; + +		/* Unmap the old blocks in the data fork. */ +		while (rlen) { +			xfs_defer_init(&dfops, &firstfsb); +			error = __xfs_bunmapi(tp, ip, irec.br_startoff, +					&rlen, 0, 1, &firstfsb, &dfops); +			if (error) +				goto out_defer; + +			/* +			 * Trim the extent to whatever got unmapped. +			 * Remember, bunmapi works backwards. 
+			 */ +			uirec.br_startblock = irec.br_startblock + rlen; +			uirec.br_startoff = irec.br_startoff + rlen; +			uirec.br_blockcount = irec.br_blockcount - rlen; +			irec.br_blockcount = rlen; +			trace_xfs_reflink_cow_remap_piece(ip, &uirec); + +			/* Free the CoW orphan record. */ +			error = xfs_refcount_free_cow_extent(tp->t_mountp, +					&dfops, uirec.br_startblock, +					uirec.br_blockcount); +			if (error) +				goto out_defer; + +			/* Map the new blocks into the data fork. */ +			error = xfs_bmap_map_extent(tp->t_mountp, &dfops, +					ip, &uirec); +			if (error) +				goto out_defer; + +			/* Remove the mapping from the CoW fork. */ +			error = xfs_bunmapi_cow(ip, &uirec); +			if (error) +				goto out_defer; + +			error = xfs_defer_finish(&tp, &dfops, ip); +			if (error) +				goto out_defer; +		} + +next_extent: +		/* Roll on... */ +		offset_fsb = irec.br_startoff + ilen; +	} + +	error = xfs_trans_commit(tp); +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	if (error) +		goto out; +	return 0; + +out_defer: +	xfs_defer_cancel(&dfops); +out_cancel: +	xfs_trans_cancel(tp); +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: +	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); +	return error; +} + +/* + * Free leftover CoW reservations that didn't get cleaned out. + */ +int +xfs_reflink_recover_cow( +	struct xfs_mount	*mp) +{ +	xfs_agnumber_t		agno; +	int			error = 0; + +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return 0; + +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { +		error = xfs_refcount_recover_cow_leftovers(mp, agno); +		if (error) +			break; +	} + +	return error; +} + +/* + * Reflinking (Block) Ranges of Two Files Together + * + * First, ensure that the reflink flag is set on both inodes.  The flag is an + * optimization to avoid unnecessary refcount btree lookups in the write path. + * + * Now we can iteratively remap the range of extents (and holes) in src to the + * corresponding ranges in dest.  Let drange and srange denote the ranges of + * logical blocks in dest and src touched by the reflink operation. + * + * While the length of drange is greater than zero, + *    - Read src's bmbt at the start of srange ("imap") + *    - If imap doesn't exist, make imap appear to start at the end of srange + *      with zero length. + *    - If imap starts before srange, advance imap to start at srange. + *    - If imap goes beyond srange, truncate imap to end at the end of srange. + *    - Punch (imap start - srange start + imap len) blocks from dest at + *      offset (drange start). + *    - If imap points to a real range of pblks, + *         > Increase the refcount of the imap's pblks + *         > Map imap's pblks into dest at the offset + *           (drange start + imap start - srange start) + *    - Advance drange and srange by (imap start - srange start + imap len) + * + * Finally, if the reflink made dest longer, update both the in-core and + * on-disk file sizes. + * + * ASCII Art Demonstration: + * + * Let's say we want to reflink this source file: + * + * ----SSSSSSS-SSSSS----SSSSSS (src file) + *   <--------------------> + * + * into this destination file: + * + * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) + *        <--------------------> + * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. + * Observe that the range has different logical offsets in either file. + * + * Consider that the first extent in the source file doesn't line up with our + * reflink range.  
Unmapping  and remapping are separate operations, so we can + * unmap more blocks from the destination file than we remap. + * + * ----SSSSSSS-SSSSS----SSSSSS + *   <-------> + * --DDDDD---------DDDDD--DDD + *        <-------> + * + * Now remap the source extent into the destination file: + * + * ----SSSSSSS-SSSSS----SSSSSS + *   <-------> + * --DDDDD--SSSSSSSDDDDD--DDD + *        <-------> + * + * Do likewise with the second hole and extent in our range.  Holes in the + * unmap range don't affect our operation. + * + * ----SSSSSSS-SSSSS----SSSSSS + *            <----> + * --DDDDD--SSSSSSS-SSSSS-DDD + *                 <----> + * + * Finally, unmap and remap part of the third extent.  This will increase the + * size of the destination file. + * + * ----SSSSSSS-SSSSS----SSSSSS + *                  <-----> + * --DDDDD--SSSSSSS-SSSSS----SSS + *                       <-----> + * + * Once we update the destination file's i_size, we're done. + */ + +/* + * Ensure the reflink bit is set in both inodes. + */ +STATIC int +xfs_reflink_set_inode_flag( +	struct xfs_inode	*src, +	struct xfs_inode	*dest) +{ +	struct xfs_mount	*mp = src->i_mount; +	int			error; +	struct xfs_trans	*tp; + +	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) +		return 0; + +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); +	if (error) +		goto out_error; + +	/* Lock both files against IO */ +	if (src->i_ino == dest->i_ino) +		xfs_ilock(src, XFS_ILOCK_EXCL); +	else +		xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); + +	if (!xfs_is_reflink_inode(src)) { +		trace_xfs_reflink_set_inode_flag(src); +		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); +		src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; +		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); +		xfs_ifork_init_cow(src); +	} else +		xfs_iunlock(src, XFS_ILOCK_EXCL); + +	if (src->i_ino == dest->i_ino) +		goto commit_flags; + +	if (!xfs_is_reflink_inode(dest)) { +		trace_xfs_reflink_set_inode_flag(dest); +		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); +		dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; +		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); +		xfs_ifork_init_cow(dest); +	} else +		xfs_iunlock(dest, XFS_ILOCK_EXCL); + +commit_flags: +	error = xfs_trans_commit(tp); +	if (error) +		goto out_error; +	return error; + +out_error: +	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); +	return error; +} + +/* + * Update destination inode size & cowextsize hint, if necessary. + */ +STATIC int +xfs_reflink_update_dest( +	struct xfs_inode	*dest, +	xfs_off_t		newlen, +	xfs_extlen_t		cowextsize) +{ +	struct xfs_mount	*mp = dest->i_mount; +	struct xfs_trans	*tp; +	int			error; + +	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) +		return 0; + +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); +	if (error) +		goto out_error; + +	xfs_ilock(dest, XFS_ILOCK_EXCL); +	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + +	if (newlen > i_size_read(VFS_I(dest))) { +		trace_xfs_reflink_update_inode_size(dest, newlen); +		i_size_write(VFS_I(dest), newlen); +		dest->i_d.di_size = newlen; +	} + +	if (cowextsize) { +		dest->i_d.di_cowextsize = cowextsize; +		dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; +	} + +	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + +	error = xfs_trans_commit(tp); +	if (error) +		goto out_error; +	return error; + +out_error: +	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); +	return error; +} + +/* + * Do we have enough reserve in this AG to handle a reflink?  
The refcount + * btree already reserved all the space it needs, but the rmap btree can grow + * infinitely, so we won't allow more reflinks when the AG is down to the + * btree reserves. + */ +static int +xfs_reflink_ag_has_free_space( +	struct xfs_mount	*mp, +	xfs_agnumber_t		agno) +{ +	struct xfs_perag	*pag; +	int			error = 0; + +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) +		return 0; + +	pag = xfs_perag_get(mp, agno); +	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || +	    xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) +		error = -ENOSPC; +	xfs_perag_put(pag); +	return error; +} + +/* + * Unmap a range of blocks from a file, then map other blocks into the hole. + * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). + * The extent irec is mapped into dest at irec->br_startoff. + */ +STATIC int +xfs_reflink_remap_extent( +	struct xfs_inode	*ip, +	struct xfs_bmbt_irec	*irec, +	xfs_fileoff_t		destoff, +	xfs_off_t		new_isize) +{ +	struct xfs_mount	*mp = ip->i_mount; +	struct xfs_trans	*tp; +	xfs_fsblock_t		firstfsb; +	unsigned int		resblks; +	struct xfs_defer_ops	dfops; +	struct xfs_bmbt_irec	uirec; +	bool			real_extent; +	xfs_filblks_t		rlen; +	xfs_filblks_t		unmap_len; +	xfs_off_t		newlen; +	int			error; + +	unmap_len = irec->br_startoff + irec->br_blockcount - destoff; +	trace_xfs_reflink_punch_range(ip, destoff, unmap_len); + +	/* Only remap normal extents. */ +	real_extent =  (irec->br_startblock != HOLESTARTBLOCK && +			irec->br_startblock != DELAYSTARTBLOCK && +			!ISUNWRITTEN(irec)); + +	/* No reflinking if we're low on space */ +	if (real_extent) { +		error = xfs_reflink_ag_has_free_space(mp, +				XFS_FSB_TO_AGNO(mp, irec->br_startblock)); +		if (error) +			goto out; +	} + +	/* Start a rolling transaction to switch the mappings */ +	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); +	if (error) +		goto out; + +	xfs_ilock(ip, XFS_ILOCK_EXCL); +	xfs_trans_ijoin(tp, ip, 0); + +	/* If we're not just clearing space, then do we have enough quota? */ +	if (real_extent) { +		error = xfs_trans_reserve_quota_nblks(tp, ip, +				irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); +		if (error) +			goto out_cancel; +	} + +	trace_xfs_reflink_remap(ip, irec->br_startoff, +				irec->br_blockcount, irec->br_startblock); + +	/* Unmap the old blocks in the data fork. */ +	rlen = unmap_len; +	while (rlen) { +		xfs_defer_init(&dfops, &firstfsb); +		error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, +				&firstfsb, &dfops); +		if (error) +			goto out_defer; + +		/* +		 * Trim the extent to whatever got unmapped. +		 * Remember, bunmapi works backwards. +		 */ +		uirec.br_startblock = irec->br_startblock + rlen; +		uirec.br_startoff = irec->br_startoff + rlen; +		uirec.br_blockcount = unmap_len - rlen; +		unmap_len = rlen; + +		/* If this isn't a real mapping, we're done. */ +		if (!real_extent || uirec.br_blockcount == 0) +			goto next_extent; + +		trace_xfs_reflink_remap(ip, uirec.br_startoff, +				uirec.br_blockcount, uirec.br_startblock); + +		/* Update the refcount tree */ +		error = xfs_refcount_increase_extent(mp, &dfops, &uirec); +		if (error) +			goto out_defer; + +		/* Map the new blocks into the data fork. */ +		error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec); +		if (error) +			goto out_defer; + +		/* Update quota accounting. */ +		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, +				uirec.br_blockcount); + +		/* Update dest isize if needed. 
*/ +		newlen = XFS_FSB_TO_B(mp, +				uirec.br_startoff + uirec.br_blockcount); +		newlen = min_t(xfs_off_t, newlen, new_isize); +		if (newlen > i_size_read(VFS_I(ip))) { +			trace_xfs_reflink_update_inode_size(ip, newlen); +			i_size_write(VFS_I(ip), newlen); +			ip->i_d.di_size = newlen; +			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +		} + +next_extent: +		/* Process all the deferred stuff. */ +		error = xfs_defer_finish(&tp, &dfops, ip); +		if (error) +			goto out_defer; +	} + +	error = xfs_trans_commit(tp); +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	if (error) +		goto out; +	return 0; + +out_defer: +	xfs_defer_cancel(&dfops); +out_cancel: +	xfs_trans_cancel(tp); +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: +	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); +	return error; +} + +/* + * Iteratively remap one file's extents (and holes) to another's. + */ +STATIC int +xfs_reflink_remap_blocks( +	struct xfs_inode	*src, +	xfs_fileoff_t		srcoff, +	struct xfs_inode	*dest, +	xfs_fileoff_t		destoff, +	xfs_filblks_t		len, +	xfs_off_t		new_isize) +{ +	struct xfs_bmbt_irec	imap; +	int			nimaps; +	int			error = 0; +	xfs_filblks_t		range_len; + +	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ +	while (len) { +		trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, +				dest, destoff); +		/* Read extent from the source file */ +		nimaps = 1; +		xfs_ilock(src, XFS_ILOCK_EXCL); +		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); +		xfs_iunlock(src, XFS_ILOCK_EXCL); +		if (error) +			goto err; +		ASSERT(nimaps == 1); + +		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, +				&imap); + +		/* Translate imap into the destination file. */ +		range_len = imap.br_startoff + imap.br_blockcount - srcoff; +		imap.br_startoff += destoff - srcoff; + +		/* Clear dest from destoff to the end of imap and map it in. */ +		error = xfs_reflink_remap_extent(dest, &imap, destoff, +				new_isize); +		if (error) +			goto err; + +		if (fatal_signal_pending(current)) { +			error = -EINTR; +			goto err; +		} + +		/* Advance drange/srange */ +		srcoff += range_len; +		destoff += range_len; +		len -= range_len; +	} + +	return 0; + +err: +	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); +	return error; +} + +/* + * Read a page's worth of file data into the page cache.  Return the page + * locked. + */ +static struct page * +xfs_get_page( +	struct inode	*inode, +	xfs_off_t	offset) +{ +	struct address_space	*mapping; +	struct page		*page; +	pgoff_t			n; + +	n = offset >> PAGE_SHIFT; +	mapping = inode->i_mapping; +	page = read_mapping_page(mapping, n, NULL); +	if (IS_ERR(page)) +		return page; +	if (!PageUptodate(page)) { +		put_page(page); +		return ERR_PTR(-EIO); +	} +	lock_page(page); +	return page; +} + +/* + * Compare extents of two files to see if they are the same. 
+ */ +static int +xfs_compare_extents( +	struct inode	*src, +	xfs_off_t	srcoff, +	struct inode	*dest, +	xfs_off_t	destoff, +	xfs_off_t	len, +	bool		*is_same) +{ +	xfs_off_t	src_poff; +	xfs_off_t	dest_poff; +	void		*src_addr; +	void		*dest_addr; +	struct page	*src_page; +	struct page	*dest_page; +	xfs_off_t	cmp_len; +	bool		same; +	int		error; + +	error = -EINVAL; +	same = true; +	while (len) { +		src_poff = srcoff & (PAGE_SIZE - 1); +		dest_poff = destoff & (PAGE_SIZE - 1); +		cmp_len = min(PAGE_SIZE - src_poff, +			      PAGE_SIZE - dest_poff); +		cmp_len = min(cmp_len, len); +		ASSERT(cmp_len > 0); + +		trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, +				XFS_I(dest), destoff); + +		src_page = xfs_get_page(src, srcoff); +		if (IS_ERR(src_page)) { +			error = PTR_ERR(src_page); +			goto out_error; +		} +		dest_page = xfs_get_page(dest, destoff); +		if (IS_ERR(dest_page)) { +			error = PTR_ERR(dest_page); +			unlock_page(src_page); +			put_page(src_page); +			goto out_error; +		} +		src_addr = kmap_atomic(src_page); +		dest_addr = kmap_atomic(dest_page); + +		flush_dcache_page(src_page); +		flush_dcache_page(dest_page); + +		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) +			same = false; + +		kunmap_atomic(dest_addr); +		kunmap_atomic(src_addr); +		unlock_page(dest_page); +		unlock_page(src_page); +		put_page(dest_page); +		put_page(src_page); + +		if (!same) +			break; + +		srcoff += cmp_len; +		destoff += cmp_len; +		len -= cmp_len; +	} + +	*is_same = same; +	return 0; + +out_error: +	trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); +	return error; +} + +/* + * Link a range of blocks from one file to another. + */ +int +xfs_reflink_remap_range( +	struct xfs_inode	*src, +	xfs_off_t		srcoff, +	struct xfs_inode	*dest, +	xfs_off_t		destoff, +	xfs_off_t		len, +	unsigned int		flags) +{ +	struct xfs_mount	*mp = src->i_mount; +	xfs_fileoff_t		sfsbno, dfsbno; +	xfs_filblks_t		fsblen; +	int			error; +	xfs_extlen_t		cowextsize; +	bool			is_same; + +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return -EOPNOTSUPP; + +	if (XFS_FORCED_SHUTDOWN(mp)) +		return -EIO; + +	/* Don't reflink realtime inodes */ +	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) +		return -EINVAL; + +	if (flags & ~XFS_REFLINK_ALL) +		return -EINVAL; + +	trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff); + +	/* Lock both files against IO */ +	if (src->i_ino == dest->i_ino) { +		xfs_ilock(src, XFS_IOLOCK_EXCL); +		xfs_ilock(src, XFS_MMAPLOCK_EXCL); +	} else { +		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); +		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); +	} + +	/* +	 * Check that the extents are the same. +	 */ +	if (flags & XFS_REFLINK_DEDUPE) { +		is_same = false; +		error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest), +				destoff, len, &is_same); +		if (error) +			goto out_error; +		if (!is_same) { +			error = -EBADE; +			goto out_error; +		} +	} + +	error = xfs_reflink_set_inode_flag(src, dest); +	if (error) +		goto out_error; + +	/* +	 * Invalidate the page cache so that we can clear any CoW mappings +	 * in the destination file. 
+	 */ +	truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff, +				   PAGE_ALIGN(destoff + len) - 1); + +	dfsbno = XFS_B_TO_FSBT(mp, destoff); +	sfsbno = XFS_B_TO_FSBT(mp, srcoff); +	fsblen = XFS_B_TO_FSB(mp, len); +	error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, +			destoff + len); +	if (error) +		goto out_error; + +	/* +	 * Carry the cowextsize hint from src to dest if we're sharing the +	 * entire source file to the entire destination file, the source file +	 * has a cowextsize hint, and the destination file does not. +	 */ +	cowextsize = 0; +	if (srcoff == 0 && len == i_size_read(VFS_I(src)) && +	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && +	    destoff == 0 && len >= i_size_read(VFS_I(dest)) && +	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) +		cowextsize = src->i_d.di_cowextsize; + +	error = xfs_reflink_update_dest(dest, destoff + len, cowextsize); +	if (error) +		goto out_error; + +out_error: +	xfs_iunlock(src, XFS_MMAPLOCK_EXCL); +	xfs_iunlock(src, XFS_IOLOCK_EXCL); +	if (src->i_ino != dest->i_ino) { +		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); +		xfs_iunlock(dest, XFS_IOLOCK_EXCL); +	} +	if (error) +		trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_); +	return error; +} + +/* + * The user wants to preemptively CoW all shared blocks in this file, + * which enables us to turn off the reflink flag.  Iterate all + * extents which are not prealloc/delalloc to see which ranges are + * mentioned in the refcount tree, then read those blocks into the + * pagecache, dirty them, fsync them back out, and then we can update + * the inode flag.  What happens if we run out of memory? :) + */ +STATIC int +xfs_reflink_dirty_extents( +	struct xfs_inode	*ip, +	xfs_fileoff_t		fbno, +	xfs_filblks_t		end, +	xfs_off_t		isize) +{ +	struct xfs_mount	*mp = ip->i_mount; +	xfs_agnumber_t		agno; +	xfs_agblock_t		agbno; +	xfs_extlen_t		aglen; +	xfs_agblock_t		rbno; +	xfs_extlen_t		rlen; +	xfs_off_t		fpos; +	xfs_off_t		flen; +	struct xfs_bmbt_irec	map[2]; +	int			nmaps; +	int			error = 0; + +	while (end - fbno > 0) { +		nmaps = 1; +		/* +		 * Look for extents in the file.  Skip holes, delalloc, or +		 * unwritten extents; they can't be reflinked. +		 */ +		error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); +		if (error) +			goto out; +		if (nmaps == 0) +			break; +		if (map[0].br_startblock == HOLESTARTBLOCK || +		    map[0].br_startblock == DELAYSTARTBLOCK || +		    ISUNWRITTEN(&map[0])) +			goto next; + +		map[1] = map[0]; +		while (map[1].br_blockcount) { +			agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); +			agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); +			aglen = map[1].br_blockcount; + +			error = xfs_reflink_find_shared(mp, agno, agbno, aglen, +					&rbno, &rlen, true); +			if (error) +				goto out; +			if (rbno == NULLAGBLOCK) +				break; + +			/* Dirty the pages */ +			xfs_iunlock(ip, XFS_ILOCK_EXCL); +			fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + +					(rbno - agbno)); +			flen = XFS_FSB_TO_B(mp, rlen); +			if (fpos + flen > isize) +				flen = isize - fpos; +			error = iomap_file_dirty(VFS_I(ip), fpos, flen, +					&xfs_iomap_ops); +			xfs_ilock(ip, XFS_ILOCK_EXCL); +			if (error) +				goto out; + +			map[1].br_blockcount -= (rbno - agbno + rlen); +			map[1].br_startoff += (rbno - agbno + rlen); +			map[1].br_startblock += (rbno - agbno + rlen); +		} + +next: +		fbno = map[0].br_startoff + map[0].br_blockcount; +	} +out: +	return error; +} + +/* Clear the inode reflink flag if there are no shared extents. 
*/ +int +xfs_reflink_clear_inode_flag( +	struct xfs_inode	*ip, +	struct xfs_trans	**tpp) +{ +	struct xfs_mount	*mp = ip->i_mount; +	xfs_fileoff_t		fbno; +	xfs_filblks_t		end; +	xfs_agnumber_t		agno; +	xfs_agblock_t		agbno; +	xfs_extlen_t		aglen; +	xfs_agblock_t		rbno; +	xfs_extlen_t		rlen; +	struct xfs_bmbt_irec	map; +	int			nmaps; +	int			error = 0; + +	ASSERT(xfs_is_reflink_inode(ip)); + +	fbno = 0; +	end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); +	while (end - fbno > 0) { +		nmaps = 1; +		/* +		 * Look for extents in the file.  Skip holes, delalloc, or +		 * unwritten extents; they can't be reflinked. +		 */ +		error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); +		if (error) +			return error; +		if (nmaps == 0) +			break; +		if (map.br_startblock == HOLESTARTBLOCK || +		    map.br_startblock == DELAYSTARTBLOCK || +		    ISUNWRITTEN(&map)) +			goto next; + +		agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); +		agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); +		aglen = map.br_blockcount; + +		error = xfs_reflink_find_shared(mp, agno, agbno, aglen, +				&rbno, &rlen, false); +		if (error) +			return error; +		/* Is there still a shared block here? */ +		if (rbno != NULLAGBLOCK) +			return 0; +next: +		fbno = map.br_startoff + map.br_blockcount; +	} + +	/* +	 * We didn't find any shared blocks so turn off the reflink flag. +	 * First, get rid of any leftover CoW mappings. +	 */ +	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); +	if (error) +		return error; + +	/* Clear the inode flag. */ +	trace_xfs_reflink_unset_inode_flag(ip); +	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; +	xfs_inode_clear_cowblocks_tag(ip); +	xfs_trans_ijoin(*tpp, ip, 0); +	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + +	return error; +} + +/* + * Clear the inode reflink flag if there are no shared extents and the size + * hasn't changed. + */ +STATIC int +xfs_reflink_try_clear_inode_flag( +	struct xfs_inode	*ip) +{ +	struct xfs_mount	*mp = ip->i_mount; +	struct xfs_trans	*tp; +	int			error = 0; + +	/* Start a rolling transaction to remove the mappings */ +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); +	if (error) +		return error; + +	xfs_ilock(ip, XFS_ILOCK_EXCL); +	xfs_trans_ijoin(tp, ip, 0); + +	error = xfs_reflink_clear_inode_flag(ip, &tp); +	if (error) +		goto cancel; + +	error = xfs_trans_commit(tp); +	if (error) +		goto out; + +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	return 0; +cancel: +	xfs_trans_cancel(tp); +out: +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +	return error; +} + +/* + * Pre-COW all shared blocks within a given byte range of a file and turn off + * the reflink flag if we unshare all of the file's blocks. + */ +int +xfs_reflink_unshare( +	struct xfs_inode	*ip, +	xfs_off_t		offset, +	xfs_off_t		len) +{ +	struct xfs_mount	*mp = ip->i_mount; +	xfs_fileoff_t		fbno; +	xfs_filblks_t		end; +	xfs_off_t		isize; +	int			error; + +	if (!xfs_is_reflink_inode(ip)) +		return 0; + +	trace_xfs_reflink_unshare(ip, offset, len); + +	inode_dio_wait(VFS_I(ip)); + +	/* Try to CoW the selected ranges */ +	xfs_ilock(ip, XFS_ILOCK_EXCL); +	fbno = XFS_B_TO_FSBT(mp, offset); +	isize = i_size_read(VFS_I(ip)); +	end = XFS_B_TO_FSB(mp, offset + len); +	error = xfs_reflink_dirty_extents(ip, fbno, end, isize); +	if (error) +		goto out_unlock; +	xfs_iunlock(ip, XFS_ILOCK_EXCL); + +	/* Wait for the IO to finish */ +	error = filemap_write_and_wait(VFS_I(ip)->i_mapping); +	if (error) +		goto out; + +	/* Turn off the reflink flag if possible. 
*/ +	error = xfs_reflink_try_clear_inode_flag(ip); +	if (error) +		goto out; + +	return 0; + +out_unlock: +	xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: +	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); +	return error; +} + +/* + * Does this inode have any real CoW reservations? + */ +bool +xfs_reflink_has_real_cow_blocks( +	struct xfs_inode		*ip) +{ +	struct xfs_bmbt_irec		irec; +	struct xfs_ifork		*ifp; +	struct xfs_bmbt_rec_host	*gotp; +	xfs_extnum_t			idx; + +	if (!xfs_is_reflink_inode(ip)) +		return false; + +	/* Go find the old extent in the CoW fork. */ +	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); +	gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); +	while (gotp) { +		xfs_bmbt_get_all(gotp, &irec); + +		if (!isnullstartblock(irec.br_startblock)) +			return true; + +		/* Roll on... */ +		idx++; +		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) +			break; +		gotp = xfs_iext_get_ext(ifp, idx); +	} + +	return false; +} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h new file mode 100644 index 000000000000..5dc3c8ac12aa --- /dev/null +++ b/fs/xfs/xfs_reflink.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. 
+ */ +#ifndef __XFS_REFLINK_H +#define __XFS_REFLINK_H 1 + +extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, +		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, +		xfs_extlen_t *flen, bool find_maximal); +extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, +		struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); + +extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip, +		xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, +		xfs_off_t offset, xfs_off_t count); +extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, +		struct xfs_bmbt_irec *imap, bool *need_alloc); +extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, +		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); + +extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, +		struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, +		xfs_fileoff_t end_fsb); +extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, +		xfs_off_t count); +extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, +		xfs_off_t count); +extern int xfs_reflink_recover_cow(struct xfs_mount *mp); +#define XFS_REFLINK_DEDUPE	1	/* only reflink if contents match */ +#define XFS_REFLINK_ALL		(XFS_REFLINK_DEDUPE) +extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff, +		struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len, +		unsigned int flags); +extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, +		struct xfs_trans **tpp); +extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, +		xfs_off_t len); + +extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); + +#endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 2500f28689d5..73c827831551 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -51,28 +51,16 @@ xfs_rui_item_free(  		kmem_zone_free(xfs_rui_zone, ruip);  } -/* - * This returns the number of iovecs needed to log the given rui item. - * We only need 1 iovec for an rui item.  It just logs the rui_log_format - * structure. 
- */ -static inline int -xfs_rui_item_sizeof( -	struct xfs_rui_log_item *ruip) -{ -	return sizeof(struct xfs_rui_log_format) + -			(ruip->rui_format.rui_nextents - 1) * -			sizeof(struct xfs_map_extent); -} -  STATIC void  xfs_rui_item_size(  	struct xfs_log_item	*lip,  	int			*nvecs,  	int			*nbytes)  { +	struct xfs_rui_log_item	*ruip = RUI_ITEM(lip); +  	*nvecs += 1; -	*nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip)); +	*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);  }  /* @@ -97,7 +85,7 @@ xfs_rui_item_format(  	ruip->rui_format.rui_size = 1;  	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, -			xfs_rui_item_sizeof(ruip)); +			xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents));  }  /* @@ -205,16 +193,12 @@ xfs_rui_init(  {  	struct xfs_rui_log_item		*ruip; -	uint				size;  	ASSERT(nextents > 0); -	if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { -		size = (uint)(sizeof(struct xfs_rui_log_item) + -			((nextents - 1) * sizeof(struct xfs_map_extent))); -		ruip = kmem_zalloc(size, KM_SLEEP); -	} else { +	if (nextents > XFS_RUI_MAX_FAST_EXTENTS) +		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); +	else  		ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); -	}  	xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);  	ruip->rui_format.rui_nextents = nextents; @@ -239,14 +223,12 @@ xfs_rui_copy_format(  	uint				len;  	src_rui_fmt = buf->i_addr; -	len = sizeof(struct xfs_rui_log_format) + -			(src_rui_fmt->rui_nextents - 1) * -			sizeof(struct xfs_map_extent); +	len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);  	if (buf->i_len != len)  		return -EFSCORRUPTED; -	memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len); +	memcpy(dst_rui_fmt, src_rui_fmt, len);  	return 0;  } @@ -459,8 +441,11 @@ xfs_rui_recover(  				   XFS_FSB_TO_DADDR(mp, rmap->me_startblock));  		switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {  		case XFS_RMAP_EXTENT_MAP: +		case XFS_RMAP_EXTENT_MAP_SHARED:  		case XFS_RMAP_EXTENT_UNMAP: +		case XFS_RMAP_EXTENT_UNMAP_SHARED:  		case XFS_RMAP_EXTENT_CONVERT: +		case XFS_RMAP_EXTENT_CONVERT_SHARED:  		case XFS_RMAP_EXTENT_ALLOC:  		case XFS_RMAP_EXTENT_FREE:  			op_ok = true; @@ -499,12 +484,21 @@ xfs_rui_recover(  		case XFS_RMAP_EXTENT_MAP:  			type = XFS_RMAP_MAP;  			break; +		case XFS_RMAP_EXTENT_MAP_SHARED: +			type = XFS_RMAP_MAP_SHARED; +			break;  		case XFS_RMAP_EXTENT_UNMAP:  			type = XFS_RMAP_UNMAP;  			break; +		case XFS_RMAP_EXTENT_UNMAP_SHARED: +			type = XFS_RMAP_UNMAP_SHARED; +			break;  		case XFS_RMAP_EXTENT_CONVERT:  			type = XFS_RMAP_CONVERT;  			break; +		case XFS_RMAP_EXTENT_CONVERT_SHARED: +			type = XFS_RMAP_CONVERT_SHARED; +			break;  		case XFS_RMAP_EXTENT_ALLOC:  			type = XFS_RMAP_ALLOC;  			break; diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index aefcc3a318a5..340c968e1f9c 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -70,6 +70,14 @@ struct xfs_rui_log_item {  	struct xfs_rui_log_format	rui_format;  }; +static inline size_t +xfs_rui_log_item_sizeof( +	unsigned int		nr) +{ +	return offsetof(struct xfs_rui_log_item, rui_format) + +			xfs_rui_log_format_sizeof(nr); +} +  /*   * This is the "rmap update done" log item.  It is used to log the fact that   * some rmapbt updates mentioned in an earlier rui item have been performed. 
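The xfs_rui_log_item_sizeof() helper added in the hunk above, together with the xfs_rui_log_format_sizeof() calls that replace the open-coded arithmetic in xfs_rmap_item.c, expresses one idiom: size an in-core log item whose format structure ends in a variable-length run of extents. Below is a stand-alone sketch of that idiom; the demo_* names are invented, and xfs_rui_log_format_sizeof() itself is defined elsewhere in this series, so only the shape of the calculation is taken from the patch. The removed arithmetic subtracts one extent, which implies the format embeds a single-element array; the demo does the same.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_extent {
	long long	startblock;
	long long	blockcount;
};

/* Log format: fixed header plus a one-element extent array that is
 * over-allocated to hold nr extents (the classic "struct hack"). */
struct demo_format {
	unsigned int		nextents;
	struct demo_extent	extents[1];
};

/* In-core log item: bookkeeping fields with the format embedded at the end. */
struct demo_item {
	void			*li_ops;	/* stand-in for the item header */
	struct demo_format	fmt;
};

static size_t demo_format_sizeof(unsigned int nr)
{
	return sizeof(struct demo_format) +
			(nr - 1) * sizeof(struct demo_extent);
}

/* Same shape as xfs_rui_log_item_sizeof(): everything up to the embedded
 * format, plus the format sized for nr extents. */
static size_t demo_item_sizeof(unsigned int nr)
{
	return offsetof(struct demo_item, fmt) + demo_format_sizeof(nr);
}

int main(void)
{
	unsigned int		nr = 16;
	struct demo_item	*ip = calloc(1, demo_item_sizeof(nr));

	if (!ip)
		return 1;
	ip->fmt.nextents = nr;
	printf("format: %zu bytes, item: %zu bytes for %u extents\n",
	       demo_format_sizeof(nr), demo_item_sizeof(nr), nr);
	free(ip);
	return 0;
}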
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 6e812fe0fd43..12d48cd8f8a4 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -62,6 +62,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)  		{ "ibt2",		XFSSTAT_END_IBT_V2		},  		{ "fibt2",		XFSSTAT_END_FIBT_V2		},  		{ "rmapbt",		XFSSTAT_END_RMAP_V2		}, +		{ "refcntbt",		XFSSTAT_END_REFCOUNT		},  		/* we print both series of quota information together */  		{ "qm",			XFSSTAT_END_QM			},  	}; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 657865f51e78..79ad2e69fc33 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -213,7 +213,23 @@ struct xfsstats {  	__uint32_t		xs_rmap_2_alloc;  	__uint32_t		xs_rmap_2_free;  	__uint32_t		xs_rmap_2_moves; -#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_RMAP_V2+6) +#define XFSSTAT_END_REFCOUNT		(XFSSTAT_END_RMAP_V2 + 15) +	__uint32_t		xs_refcbt_2_lookup; +	__uint32_t		xs_refcbt_2_compare; +	__uint32_t		xs_refcbt_2_insrec; +	__uint32_t		xs_refcbt_2_delrec; +	__uint32_t		xs_refcbt_2_newroot; +	__uint32_t		xs_refcbt_2_killroot; +	__uint32_t		xs_refcbt_2_increment; +	__uint32_t		xs_refcbt_2_decrement; +	__uint32_t		xs_refcbt_2_lshift; +	__uint32_t		xs_refcbt_2_rshift; +	__uint32_t		xs_refcbt_2_split; +	__uint32_t		xs_refcbt_2_join; +	__uint32_t		xs_refcbt_2_alloc; +	__uint32_t		xs_refcbt_2_free; +	__uint32_t		xs_refcbt_2_moves; +#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_REFCOUNT + 6)  	__uint32_t		xs_qm_dqreclaims;  	__uint32_t		xs_qm_dqreclaim_misses;  	__uint32_t		xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fd6be45b3a1e..ade4691e3f74 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,9 @@  #include "xfs_sysfs.h"  #include "xfs_ondisk.h"  #include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h" +#include "xfs_reflink.h"  #include <linux/namei.h>  #include <linux/init.h> @@ -936,6 +939,7 @@ xfs_fs_destroy_inode(  	struct inode		*inode)  {  	struct xfs_inode	*ip = XFS_I(inode); +	int			error;  	trace_xfs_destroy_inode(ip); @@ -943,6 +947,14 @@ xfs_fs_destroy_inode(  	XFS_STATS_INC(ip->i_mount, vn_rele);  	XFS_STATS_INC(ip->i_mount, vn_remove); +	if (xfs_is_reflink_inode(ip)) { +		error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); +		if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) +			xfs_warn(ip->i_mount, +"Error %d while evicting CoW blocks for inode %llu.", +					error, ip->i_ino); +	} +  	xfs_inactive(ip);  	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -1006,6 +1018,16 @@ xfs_fs_drop_inode(  {  	struct xfs_inode	*ip = XFS_I(inode); +	/* +	 * If this unlinked inode is in the middle of recovery, don't +	 * drop the inode just yet; log recovery will take care of +	 * that.  See the comment for this inode flag. +	 */ +	if (ip->i_flags & XFS_IRECOVERY) { +		ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED); +		return 0; +	} +  	return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);  } @@ -1137,7 +1159,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)   * Note: xfs_log_quiesce() stops background log work - the callers must ensure   * it is started again when appropriate.   */ -static void +void  xfs_quiesce_attr(  	struct xfs_mount	*mp)  { @@ -1296,10 +1318,31 @@ xfs_fs_remount(  		xfs_restore_resvblks(mp);  		xfs_log_work_queue(mp);  		xfs_queue_eofblocks(mp); + +		/* Recover any CoW blocks that never got remapped. 
*/ +		error = xfs_reflink_recover_cow(mp); +		if (error) { +			xfs_err(mp, +	"Error %d recovering leftover CoW allocations.", error); +			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +			return error; +		} + +		/* Create the per-AG metadata reservation pool .*/ +		error = xfs_fs_reserve_ag_blocks(mp); +		if (error && error != -ENOSPC) +			return error;  	}  	/* rw -> ro */  	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { +		/* Free the per-AG metadata reservation pool. */ +		error = xfs_fs_unreserve_ag_blocks(mp); +		if (error) { +			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +			return error; +		} +  		/*  		 * Before we sync the metadata, we need to free up the reserve  		 * block pool so that the used block count in the superblock on @@ -1490,6 +1533,7 @@ xfs_fs_fill_super(  	atomic_set(&mp->m_active_trans, 0);  	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);  	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); +	INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);  	mp->m_kobj.kobject.kset = xfs_kset;  	mp->m_super = sb; @@ -1572,6 +1616,9 @@ xfs_fs_fill_super(  			"DAX unsupported by block device. Turning off DAX.");  			mp->m_flags &= ~XFS_MOUNT_DAX;  		} +		if (xfs_sb_version_hasreflink(&mp->m_sb)) +			xfs_alert(mp, +		"DAX and reflink have not been tested together!");  	}  	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { @@ -1585,6 +1632,10 @@ xfs_fs_fill_super(  	"EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");  	} +	if (xfs_sb_version_hasreflink(&mp->m_sb)) +		xfs_alert(mp, +	"EXPERIMENTAL reflink feature enabled. Use at your own risk!"); +  	error = xfs_mountfs(mp);  	if (error)  		goto out_filestream_unmount; @@ -1782,15 +1833,44 @@ xfs_init_zones(void)  	if (!xfs_rud_zone)  		goto out_destroy_icreate_zone; -	xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) + -			((XFS_RUI_MAX_FAST_EXTENTS - 1) * -				sizeof(struct xfs_map_extent))), +	xfs_rui_zone = kmem_zone_init( +			xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),  			"xfs_rui_item");  	if (!xfs_rui_zone)  		goto out_destroy_rud_zone; +	xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item), +			"xfs_cud_item"); +	if (!xfs_cud_zone) +		goto out_destroy_rui_zone; + +	xfs_cui_zone = kmem_zone_init( +			xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), +			"xfs_cui_item"); +	if (!xfs_cui_zone) +		goto out_destroy_cud_zone; + +	xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item), +			"xfs_bud_item"); +	if (!xfs_bud_zone) +		goto out_destroy_cui_zone; + +	xfs_bui_zone = kmem_zone_init( +			xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), +			"xfs_bui_item"); +	if (!xfs_bui_zone) +		goto out_destroy_bud_zone; +  	return 0; + out_destroy_bud_zone: +	kmem_zone_destroy(xfs_bud_zone); + out_destroy_cui_zone: +	kmem_zone_destroy(xfs_cui_zone); + out_destroy_cud_zone: +	kmem_zone_destroy(xfs_cud_zone); + out_destroy_rui_zone: +	kmem_zone_destroy(xfs_rui_zone);   out_destroy_rud_zone:  	kmem_zone_destroy(xfs_rud_zone);   out_destroy_icreate_zone: @@ -1833,6 +1913,10 @@ xfs_destroy_zones(void)  	 * destroy caches.  	 
*/  	rcu_barrier(); +	kmem_zone_destroy(xfs_bui_zone); +	kmem_zone_destroy(xfs_bud_zone); +	kmem_zone_destroy(xfs_cui_zone); +	kmem_zone_destroy(xfs_cud_zone);  	kmem_zone_destroy(xfs_rui_zone);  	kmem_zone_destroy(xfs_rud_zone);  	kmem_zone_destroy(xfs_icreate_zone); @@ -1886,6 +1970,8 @@ init_xfs_fs(void)  	xfs_extent_free_init_defer_op();  	xfs_rmap_update_init_defer_op(); +	xfs_refcount_update_init_defer_op(); +	xfs_bmap_update_init_defer_op();  	xfs_dir_startup(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 529bce9fc37e..b6418abd85ad 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,6 +61,7 @@ struct xfs_mount;  struct xfs_buftarg;  struct block_device; +extern void xfs_quiesce_attr(struct xfs_mount *mp);  extern void xfs_flush_inodes(struct xfs_mount *mp);  extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);  extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index aed74d3f8da9..afe1f66aaa69 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -184,6 +184,15 @@ static struct ctl_table xfs_table[] = {  		.extra1		= &xfs_params.eofb_timer.min,  		.extra2		= &xfs_params.eofb_timer.max,  	}, +	{ +		.procname	= "speculative_cow_prealloc_lifetime", +		.data		= &xfs_params.cowb_timer.val, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &xfs_params.cowb_timer.min, +		.extra2		= &xfs_params.cowb_timer.max, +	},  	/* please keep this the last entry */  #ifdef CONFIG_PROC_FS  	{ diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index ffef45375754..984a3499cfe3 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -48,6 +48,7 @@ typedef struct xfs_param {  	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */  	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. 
*/  	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */ +	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */  } xfs_param_t;  /* diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 79cfd3fc5324..5f8d55d29a11 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -393,9 +393,15 @@ max_retries_show(  	struct kobject	*kobject,  	char		*buf)  { +	int		retries;  	struct xfs_error_cfg *cfg = to_error_cfg(kobject); -	return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); +	if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) +		retries = -1; +	else +		retries = cfg->max_retries; + +	return snprintf(buf, PAGE_SIZE, "%d\n", retries);  }  static ssize_t @@ -415,7 +421,10 @@ max_retries_store(  	if (val < -1)  		return -EINVAL; -	cfg->max_retries = val; +	if (val == -1) +		cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; +	else +		cfg->max_retries = val;  	return count;  }  XFS_SYSFS_ATTR_RW(max_retries); @@ -425,10 +434,15 @@ retry_timeout_seconds_show(  	struct kobject	*kobject,  	char		*buf)  { +	int		timeout;  	struct xfs_error_cfg *cfg = to_error_cfg(kobject); -	return snprintf(buf, PAGE_SIZE, "%ld\n", -			jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); +	if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) +		timeout = -1; +	else +		timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC; + +	return snprintf(buf, PAGE_SIZE, "%d\n", timeout);  }  static ssize_t @@ -445,11 +459,16 @@ retry_timeout_seconds_store(  	if (ret)  		return ret; -	/* 1 day timeout maximum */ -	if (val < 0 || val > 86400) +	/* 1 day timeout maximum, -1 means infinite */ +	if (val < -1 || val > 86400)  		return -EINVAL; -	cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); +	if (val == -1) +		cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; +	else { +		cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); +		ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX); +	}  	return count;  }  XFS_SYSFS_ATTR_RW(retry_timeout_seconds); @@ -519,18 +538,19 @@ struct xfs_error_init {  static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {  	{ .name = "default",  	  .max_retries = XFS_ERR_RETRY_FOREVER, -	  .retry_timeout = 0, +	  .retry_timeout = XFS_ERR_RETRY_FOREVER,  	},  	{ .name = "EIO",  	  .max_retries = XFS_ERR_RETRY_FOREVER, -	  .retry_timeout = 0, +	  .retry_timeout = XFS_ERR_RETRY_FOREVER,  	},  	{ .name = "ENOSPC",  	  .max_retries = XFS_ERR_RETRY_FOREVER, -	  .retry_timeout = 0, +	  .retry_timeout = XFS_ERR_RETRY_FOREVER,  	},  	{ .name = "ENODEV", -	  .max_retries = 0, +	  .max_retries = 0,	/* We can't recover from devices disappearing */ +	  .retry_timeout = 0,  	},  }; @@ -561,7 +581,10 @@ xfs_error_sysfs_init_class(  			goto out_error;  		cfg->max_retries = init[i].max_retries; -		cfg->retry_timeout = msecs_to_jiffies( +		if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) +			cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; +		else +			cfg->retry_timeout = msecs_to_jiffies(  					init[i].retry_timeout * MSEC_PER_SEC);  	}  	return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d303a665dba9..ad188d3a83f3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -39,6 +39,7 @@ struct xfs_buf_log_format;  struct xfs_inode_log_format;  struct xfs_bmbt_irec;  struct xfs_btree_cur; +struct xfs_refcount_irec;  DECLARE_EVENT_CLASS(xfs_attr_list_class,  	TP_PROTO(struct xfs_attr_list_context *ctx), @@ -135,6 +136,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);  DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);  
DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);  DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks);  DECLARE_EVENT_CLASS(xfs_ag_class,  	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), @@ -268,10 +271,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,  		__field(unsigned long, caller_ip)  	),  	TP_fast_assign( -		struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ? -						ip->i_afp : &ip->i_df; +		struct xfs_ifork	*ifp;  		struct xfs_bmbt_irec	r; +		ifp = xfs_iext_state_to_fork(ip, state);  		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);  		__entry->dev = VFS_I(ip)->i_sb->s_dev;  		__entry->ino = ip->i_ino; @@ -686,6 +689,9 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach);  DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);  DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);  DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);  DEFINE_INODE_EVENT(xfs_filemap_fault);  DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); @@ -1170,7 +1176,6 @@ DEFINE_RW_EVENT(xfs_file_dax_read);  DEFINE_RW_EVENT(xfs_file_buffered_write);  DEFINE_RW_EVENT(xfs_file_direct_write);  DEFINE_RW_EVENT(xfs_file_dax_write); -DEFINE_RW_EVENT(xfs_file_splice_read);  DECLARE_EVENT_CLASS(xfs_page_class,  	TP_PROTO(struct inode *inode, struct page *page, unsigned long off, @@ -1570,14 +1575,15 @@ TRACE_EVENT(xfs_agf,  TRACE_EVENT(xfs_free_extent,  	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, -		 xfs_extlen_t len, bool isfl, int haveleft, int haveright), -	TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), +		 xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft, +		 int haveright), +	TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),  	TP_STRUCT__entry(  		__field(dev_t, dev)  		__field(xfs_agnumber_t, agno)  		__field(xfs_agblock_t, agbno)  		__field(xfs_extlen_t, len) -		__field(int, isfl) +		__field(int, resv)  		__field(int, haveleft)  		__field(int, haveright)  	), @@ -1586,16 +1592,16 @@ TRACE_EVENT(xfs_free_extent,  		__entry->agno = agno;  		__entry->agbno = agbno;  		__entry->len = len; -		__entry->isfl = isfl; +		__entry->resv = resv;  		__entry->haveleft = haveleft;  		__entry->haveright = haveright;  	), -	TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", +	TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",  		  MAJOR(__entry->dev), MINOR(__entry->dev),  		  __entry->agno,  		  __entry->agbno,  		  __entry->len, -		  __entry->isfl, +		  __entry->resv,  		  __entry->haveleft ?  			(__entry->haveright ? "both" : "left") :  			(__entry->haveright ? 
"right" : "none")) @@ -1622,8 +1628,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,  		__field(short, otype)  		__field(char, wasdel)  		__field(char, wasfromfl) -		__field(char, isfl) -		__field(char, userdata) +		__field(int, resv) +		__field(int, datatype)  		__field(xfs_fsblock_t, firstblock)  	),  	TP_fast_assign( @@ -1643,14 +1649,14 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,  		__entry->otype = args->otype;  		__entry->wasdel = args->wasdel;  		__entry->wasfromfl = args->wasfromfl; -		__entry->isfl = args->isfl; -		__entry->userdata = args->userdata; +		__entry->resv = args->resv; +		__entry->datatype = args->datatype;  		__entry->firstblock = args->firstblock;  	),  	TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "  		  "prod %u minleft %u total %u alignment %u minalignslop %u " -		  "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " -		  "userdata %d firstblock 0x%llx", +		  "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " +		  "datatype 0x%x firstblock 0x%llx",  		  MAJOR(__entry->dev), MINOR(__entry->dev),  		  __entry->agno,  		  __entry->agbno, @@ -1667,8 +1673,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,  		  __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),  		  __entry->wasdel,  		  __entry->wasfromfl, -		  __entry->isfl, -		  __entry->userdata, +		  __entry->resv, +		  __entry->datatype,  		  (unsigned long long)__entry->firstblock)  ) @@ -1984,6 +1990,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \  DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);  DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover_record, +	TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), +	TP_ARGS(log, rhead, pass), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_lsn_t, lsn) +		__field(int, len) +		__field(int, num_logops) +		__field(int, pass) +	), +	TP_fast_assign( +		__entry->dev = log->l_mp->m_super->s_dev; +		__entry->lsn = be64_to_cpu(rhead->h_lsn); +		__entry->len = be32_to_cpu(rhead->h_len); +		__entry->num_logops = be32_to_cpu(rhead->h_num_logops); +		__entry->pass = pass; +	), +	TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->lsn, __entry->len, __entry->num_logops, +		   __entry->pass) +) +  DECLARE_EVENT_CLASS(xfs_log_recover_item_class,  	TP_PROTO(struct xlog *log, struct xlog_recover *trans,  		struct xlog_recover_item *item, int pass), @@ -1992,6 +2021,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,  		__field(dev_t, dev)  		__field(unsigned long, item)  		__field(xlog_tid_t, tid) +		__field(xfs_lsn_t, lsn)  		__field(int, type)  		__field(int, pass)  		__field(int, count) @@ -2001,15 +2031,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,  		__entry->dev = log->l_mp->m_super->s_dev;  		__entry->item = (unsigned long)item;  		__entry->tid = trans->r_log_tid; +		__entry->lsn = trans->r_lsn;  		__entry->type = ITEM_TYPE(item);  		__entry->pass = pass;  		__entry->count = item->ri_cnt;  		__entry->total = item->ri_total;  	), -	TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " -		  "item region count/total %d/%d", +	TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, " +		  "item type %s item region count/total %d/%d",  		  MAJOR(__entry->dev), MINOR(__entry->dev),  		  __entry->tid, +		  __entry->lsn,  		  __entry->pass,  		  (void *)__entry->item,  		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), @@ -2068,6 +2100,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);  
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);  DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);  DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip);  DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);  DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);  DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); @@ -2554,10 +2587,796 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_delete);  DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);  DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);  DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error); + +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range);  DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);  DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);  DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); +/* deferred bmbt updates */ +#define DEFINE_BMAP_DEFERRED_EVENT	DEFINE_RMAP_DEFERRED_EVENT +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); + +/* per-AG reservation */ +DECLARE_EVENT_CLASS(xfs_ag_resv_class, +	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, +		 xfs_extlen_t len), +	TP_ARGS(pag, resv, len), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_agnumber_t, agno) +		__field(int, resv) +		__field(xfs_extlen_t, freeblks) +		__field(xfs_extlen_t, flcount) +		__field(xfs_extlen_t, reserved) +		__field(xfs_extlen_t, asked) +		__field(xfs_extlen_t, len) +	), +	TP_fast_assign( +		struct xfs_ag_resv	*r = xfs_perag_resv(pag, resv); + +		__entry->dev = pag->pag_mount->m_super->s_dev; +		__entry->agno = pag->pag_agno; +		__entry->resv = resv; +		__entry->freeblks = pag->pagf_freeblks; +		__entry->flcount = pag->pagf_flcount; +		__entry->reserved = r ? r->ar_reserved : 0; +		__entry->asked = r ? 
r->ar_asked : 0; +		__entry->len = len; +	), +	TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->agno, +		  __entry->resv, +		  __entry->freeblks, +		  __entry->flcount, +		  __entry->reserved, +		  __entry->asked, +		  __entry->len) +) +#define DEFINE_AG_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_ag_resv_class, name, \ +	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \ +		 xfs_extlen_t len), \ +	TP_ARGS(pag, type, len)) + +/* per-AG reservation tracepoints */ +DEFINE_AG_RESV_EVENT(xfs_ag_resv_init); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); + +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); + +/* refcount tracepoint classes */ + +/* reuse the discard trace class for agbno/aglen-based traces */ +#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name) + +/* ag btree lookup tracepoint class */ +#define XFS_AG_BTREE_CMP_FORMAT_STR \ +	{ XFS_LOOKUP_EQ,	"eq" }, \ +	{ XFS_LOOKUP_LE,	"le" }, \ +	{ XFS_LOOKUP_GE,	"ge" } +DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +		 xfs_agblock_t agbno, xfs_lookup_t dir), +	TP_ARGS(mp, agno, agbno, dir), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_agnumber_t, agno) +		__field(xfs_agblock_t, agbno) +		__field(xfs_lookup_t, dir) +	), +	TP_fast_assign( +		__entry->dev = mp->m_super->s_dev; +		__entry->agno = agno; +		__entry->agbno = agbno; +		__entry->dir = dir; +	), +	TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->agno, +		  __entry->agbno, +		  __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), +		  __entry->dir) +) + +#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \ +DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \ +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ +		 xfs_agblock_t agbno, xfs_lookup_t dir), \ +	TP_ARGS(mp, agno, agbno, dir)) + +/* single-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_class, +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +		 struct xfs_refcount_irec *irec), +	TP_ARGS(mp, agno, irec), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_agnumber_t, agno) +		__field(xfs_agblock_t, startblock) +		__field(xfs_extlen_t, blockcount) +		__field(xfs_nlink_t, refcount) +	), +	TP_fast_assign( +		__entry->dev = mp->m_super->s_dev; +		__entry->agno = agno; +		__entry->startblock = irec->rc_startblock; +		__entry->blockcount = irec->rc_blockcount; +		__entry->refcount = irec->rc_refcount; +	), +	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->agno, +		  __entry->startblock, +		  __entry->blockcount, +		  __entry->refcount) +) + +#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_class, name, \ +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ +		 struct xfs_refcount_irec *irec), \ +	TP_ARGS(mp, agno, irec)) + +/* single-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +		 struct xfs_refcount_irec *irec, xfs_agblock_t agbno), +	TP_ARGS(mp, agno, irec, agbno), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_agnumber_t, 
agno) +		__field(xfs_agblock_t, startblock) +		__field(xfs_extlen_t, blockcount) +		__field(xfs_nlink_t, refcount) +		__field(xfs_agblock_t, agbno) +	), +	TP_fast_assign( +		__entry->dev = mp->m_super->s_dev; +		__entry->agno = agno; +		__entry->startblock = irec->rc_startblock; +		__entry->blockcount = irec->rc_blockcount; +		__entry->refcount = irec->rc_refcount; +		__entry->agbno = agbno; +	), +	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->agno, +		  __entry->startblock, +		  __entry->blockcount, +		  __entry->refcount, +		  __entry->agbno) +) + +#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ +		 struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \ +	TP_ARGS(mp, agno, irec, agbno)) + +/* double-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), +	TP_ARGS(mp, agno, i1, i2), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_agnumber_t, agno) +		__field(xfs_agblock_t, i1_startblock) +		__field(xfs_extlen_t, i1_blockcount) +		__field(xfs_nlink_t, i1_refcount) +		__field(xfs_agblock_t, i2_startblock) +		__field(xfs_extlen_t, i2_blockcount) +		__field(xfs_nlink_t, i2_refcount) +	), +	TP_fast_assign( +		__entry->dev = mp->m_super->s_dev; +		__entry->agno = agno; +		__entry->i1_startblock = i1->rc_startblock; +		__entry->i1_blockcount = i1->rc_blockcount; +		__entry->i1_refcount = i1->rc_refcount; +		__entry->i2_startblock = i2->rc_startblock; +		__entry->i2_blockcount = i2->rc_blockcount; +		__entry->i2_refcount = i2->rc_refcount; +	), +	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " +		  "agbno %u len %u refcount %u\n", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->agno, +		  __entry->i1_startblock, +		  __entry->i1_blockcount, +		  __entry->i1_refcount, +		  __entry->i2_startblock, +		  __entry->i2_blockcount, +		  __entry->i2_refcount) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ +		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \ +	TP_ARGS(mp, agno, i1, i2)) + +/* double-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, +		 xfs_agblock_t agbno), +	TP_ARGS(mp, agno, i1, i2, agbno), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_agnumber_t, agno) +		__field(xfs_agblock_t, i1_startblock) +		__field(xfs_extlen_t, i1_blockcount) +		__field(xfs_nlink_t, i1_refcount) +		__field(xfs_agblock_t, i2_startblock) +		__field(xfs_extlen_t, i2_blockcount) +		__field(xfs_nlink_t, i2_refcount) +		__field(xfs_agblock_t, agbno) +	), +	TP_fast_assign( +		__entry->dev = mp->m_super->s_dev; +		__entry->agno = agno; +		__entry->i1_startblock = i1->rc_startblock; +		__entry->i1_blockcount = i1->rc_blockcount; +		__entry->i1_refcount = i1->rc_refcount; +		__entry->i2_startblock = i2->rc_startblock; +		__entry->i2_blockcount = i2->rc_blockcount; +		__entry->i2_refcount = i2->rc_refcount; +		__entry->agbno = agbno; +	), +	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " +		  "agbno %u len %u refcount %u @ agbno 
%u\n", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->agno, +		  __entry->i1_startblock, +		  __entry->i1_blockcount, +		  __entry->i1_refcount, +		  __entry->i2_startblock, +		  __entry->i2_blockcount, +		  __entry->i2_refcount, +		  __entry->agbno) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ +		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ +		 xfs_agblock_t agbno), \ +	TP_ARGS(mp, agno, i1, i2, agbno)) + +/* triple-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, +		 struct xfs_refcount_irec *i3), +	TP_ARGS(mp, agno, i1, i2, i3), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_agnumber_t, agno) +		__field(xfs_agblock_t, i1_startblock) +		__field(xfs_extlen_t, i1_blockcount) +		__field(xfs_nlink_t, i1_refcount) +		__field(xfs_agblock_t, i2_startblock) +		__field(xfs_extlen_t, i2_blockcount) +		__field(xfs_nlink_t, i2_refcount) +		__field(xfs_agblock_t, i3_startblock) +		__field(xfs_extlen_t, i3_blockcount) +		__field(xfs_nlink_t, i3_refcount) +	), +	TP_fast_assign( +		__entry->dev = mp->m_super->s_dev; +		__entry->agno = agno; +		__entry->i1_startblock = i1->rc_startblock; +		__entry->i1_blockcount = i1->rc_blockcount; +		__entry->i1_refcount = i1->rc_refcount; +		__entry->i2_startblock = i2->rc_startblock; +		__entry->i2_blockcount = i2->rc_blockcount; +		__entry->i2_refcount = i2->rc_refcount; +		__entry->i3_startblock = i3->rc_startblock; +		__entry->i3_blockcount = i3->rc_blockcount; +		__entry->i3_refcount = i3->rc_refcount; +	), +	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " +		  "agbno %u len %u refcount %u -- " +		  "agbno %u len %u refcount %u\n", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->agno, +		  __entry->i1_startblock, +		  __entry->i1_blockcount, +		  __entry->i1_refcount, +		  __entry->i2_startblock, +		  __entry->i2_blockcount, +		  __entry->i2_refcount, +		  __entry->i3_startblock, +		  __entry->i3_blockcount, +		  __entry->i3_refcount) +); + +#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ +		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ +		 struct xfs_refcount_irec *i3), \ +	TP_ARGS(mp, agno, i1, i2, i3)) + +/* refcount btree tracepoints */ +DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block); +DEFINE_BUSY_EVENT(xfs_refcountbt_free_block); +DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete); +DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error); + +/* refcount adjustment tracepoints */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease); +DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent); 
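The single-, double- and triple-rcext classes above exist because a refcount btree update can touch one, two or three adjacent records in a single step; a center merge, for example, collapses three neighbouring records into one, which is what xfs_refcount_merge_center_extents reports. A small sketch of that merge arithmetic, using a simplified stand-in for struct xfs_refcount_irec and assuming all three records already share the same reference count:

/* Sketch only: simplified stand-in for struct xfs_refcount_irec. */
#include <stdio.h>
#include <stdint.h>

struct rc_irec {
	uint32_t	rc_startblock;	/* starting AG block */
	uint32_t	rc_blockcount;	/* length in blocks */
	uint32_t	rc_refcount;	/* reference count */
};

/* Collapse three adjacent records with equal refcounts into one. */
static struct rc_irec merge_center(const struct rc_irec *l,
				   const struct rc_irec *c,
				   const struct rc_irec *r)
{
	struct rc_irec out = {
		.rc_startblock	= l->rc_startblock,
		.rc_blockcount	= l->rc_blockcount + c->rc_blockcount +
				  r->rc_blockcount,
		.rc_refcount	= c->rc_refcount,
	};
	return out;
}

int main(void)
{
	struct rc_irec l = { 100, 8, 2 }, c = { 108, 4, 2 }, r = { 112, 16, 2 };
	struct rc_irec m = merge_center(&l, &c, &r);

	printf("agbno %u len %u refcount %u\n",
	       m.rc_startblock, m.rc_blockcount, m.rc_refcount);
	return 0;
}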
+DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error); + +/* reflink helpers */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error); +#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer); +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); + +TRACE_EVENT(xfs_refcount_finish_one_leftover, +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +		 int type, xfs_agblock_t agbno, xfs_extlen_t len, +		 xfs_agblock_t new_agbno, xfs_extlen_t new_len), +	TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_agnumber_t, agno) +		__field(int, type) +		__field(xfs_agblock_t, agbno) +		__field(xfs_extlen_t, len) +		__field(xfs_agblock_t, new_agbno) +		__field(xfs_extlen_t, new_len) +	), +	TP_fast_assign( +		__entry->dev = mp->m_super->s_dev; +		__entry->agno = agno; +		__entry->type = type; +		__entry->agbno = agbno; +		__entry->len = len; +		__entry->new_agbno = new_agbno; +		__entry->new_len = new_len; +	), +	TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->type, +		  __entry->agno, +		  __entry->agbno, +		  __entry->len, +		  __entry->new_agbno, +		  __entry->new_len) +); + +/* simple inode-based error/%ip tracepoint class */ +DECLARE_EVENT_CLASS(xfs_inode_error_class, +	TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip), +	TP_ARGS(ip, error, caller_ip), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_ino_t, ino) +		__field(int, error) +		__field(unsigned long, caller_ip) +	), +	TP_fast_assign( +		__entry->dev = VFS_I(ip)->i_sb->s_dev; +		__entry->ino = ip->i_ino; +		__entry->error = error; +		__entry->caller_ip = caller_ip; +	), +	TP_printk("dev %d:%d ino %llx error %d caller %ps", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->ino, +		  __entry->error, +		  (char *)__entry->caller_ip) +); + +#define DEFINE_INODE_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_inode_error_class, name, \ +	TP_PROTO(struct xfs_inode *ip, int error, \ +		 unsigned long caller_ip), \ +	TP_ARGS(ip, error, caller_ip)) + +/* reflink allocator */ +TRACE_EVENT(xfs_bmap_remap_alloc, +	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno, +		 xfs_extlen_t len), +	TP_ARGS(ip, fsbno, len), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_ino_t, ino) +		__field(xfs_fsblock_t, fsbno) +		__field(xfs_extlen_t, len) +	), +	TP_fast_assign( +		__entry->dev = VFS_I(ip)->i_sb->s_dev; +		__entry->ino = ip->i_ino; +		__entry->fsbno = fsbno; +		__entry->len = 
len; +	), +	TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->ino, +		  __entry->fsbno, +		  __entry->len) +); +DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error); + +/* reflink tracepoint classes */ + +/* two-file io tracepoint class */ +DECLARE_EVENT_CLASS(xfs_double_io_class, +	TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, +		 struct xfs_inode *dest, xfs_off_t doffset), +	TP_ARGS(src, soffset, len, dest, doffset), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_ino_t, src_ino) +		__field(loff_t, src_isize) +		__field(loff_t, src_disize) +		__field(loff_t, src_offset) +		__field(size_t, len) +		__field(xfs_ino_t, dest_ino) +		__field(loff_t, dest_isize) +		__field(loff_t, dest_disize) +		__field(loff_t, dest_offset) +	), +	TP_fast_assign( +		__entry->dev = VFS_I(src)->i_sb->s_dev; +		__entry->src_ino = src->i_ino; +		__entry->src_isize = VFS_I(src)->i_size; +		__entry->src_disize = src->i_d.di_size; +		__entry->src_offset = soffset; +		__entry->len = len; +		__entry->dest_ino = dest->i_ino; +		__entry->dest_isize = VFS_I(dest)->i_size; +		__entry->dest_disize = dest->i_d.di_size; +		__entry->dest_offset = doffset; +	), +	TP_printk("dev %d:%d count %zd " +		  "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> " +		  "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->len, +		  __entry->src_ino, +		  __entry->src_isize, +		  __entry->src_disize, +		  __entry->src_offset, +		  __entry->dest_ino, +		  __entry->dest_isize, +		  __entry->dest_disize, +		  __entry->dest_offset) +) + +#define DEFINE_DOUBLE_IO_EVENT(name)	\ +DEFINE_EVENT(xfs_double_io_class, name,	\ +	TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \ +		 struct xfs_inode *dest, xfs_off_t doffset), \ +	TP_ARGS(src, soffset, len, dest, doffset)) + +/* two-file vfs io tracepoint class */ +DECLARE_EVENT_CLASS(xfs_double_vfs_io_class, +	TP_PROTO(struct inode *src, u64 soffset, u64 len, +		 struct inode *dest, u64 doffset), +	TP_ARGS(src, soffset, len, dest, doffset), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(unsigned long, src_ino) +		__field(loff_t, src_isize) +		__field(loff_t, src_offset) +		__field(size_t, len) +		__field(unsigned long, dest_ino) +		__field(loff_t, dest_isize) +		__field(loff_t, dest_offset) +	), +	TP_fast_assign( +		__entry->dev = src->i_sb->s_dev; +		__entry->src_ino = src->i_ino; +		__entry->src_isize = i_size_read(src); +		__entry->src_offset = soffset; +		__entry->len = len; +		__entry->dest_ino = dest->i_ino; +		__entry->dest_isize = i_size_read(dest); +		__entry->dest_offset = doffset; +	), +	TP_printk("dev %d:%d count %zd " +		  "ino 0x%lx isize 0x%llx offset 0x%llx -> " +		  "ino 0x%lx isize 0x%llx offset 0x%llx", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->len, +		  __entry->src_ino, +		  __entry->src_isize, +		  __entry->src_offset, +		  __entry->dest_ino, +		  __entry->dest_isize, +		  __entry->dest_offset) +) + +#define DEFINE_DOUBLE_VFS_IO_EVENT(name)	\ +DEFINE_EVENT(xfs_double_vfs_io_class, name,	\ +	TP_PROTO(struct inode *src, u64 soffset, u64 len, \ +		 struct inode *dest, u64 doffset), \ +	TP_ARGS(src, soffset, len, dest, doffset)) + +/* CoW write tracepoint */ +DECLARE_EVENT_CLASS(xfs_copy_on_write_class, +	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, +		 xfs_extlen_t len, xfs_fsblock_t new_pblk), +	TP_ARGS(ip, lblk, pblk, len, new_pblk), +	TP_STRUCT__entry( +		
__field(dev_t, dev) +		__field(xfs_ino_t, ino) +		__field(xfs_fileoff_t, lblk) +		__field(xfs_fsblock_t, pblk) +		__field(xfs_extlen_t, len) +		__field(xfs_fsblock_t, new_pblk) +	), +	TP_fast_assign( +		__entry->dev = VFS_I(ip)->i_sb->s_dev; +		__entry->ino = ip->i_ino; +		__entry->lblk = lblk; +		__entry->pblk = pblk; +		__entry->len = len; +		__entry->new_pblk = new_pblk; +	), +	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx " +		  "len 0x%x new_pblk %llu", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->ino, +		  __entry->lblk, +		  __entry->pblk, +		  __entry->len, +		  __entry->new_pblk) +) + +#define DEFINE_COW_EVENT(name)	\ +DEFINE_EVENT(xfs_copy_on_write_class, name,	\ +	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \ +		 xfs_extlen_t len, xfs_fsblock_t new_pblk), \ +	TP_ARGS(ip, lblk, pblk, len, new_pblk)) + +/* inode/irec events */ +DECLARE_EVENT_CLASS(xfs_inode_irec_class, +	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), +	TP_ARGS(ip, irec), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_ino_t, ino) +		__field(xfs_fileoff_t, lblk) +		__field(xfs_extlen_t, len) +		__field(xfs_fsblock_t, pblk) +	), +	TP_fast_assign( +		__entry->dev = VFS_I(ip)->i_sb->s_dev; +		__entry->ino = ip->i_ino; +		__entry->lblk = irec->br_startoff; +		__entry->len = irec->br_blockcount; +		__entry->pblk = irec->br_startblock; +	), +	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->ino, +		  __entry->lblk, +		  __entry->len, +		  __entry->pblk) +); +#define DEFINE_INODE_IREC_EVENT(name) \ +DEFINE_EVENT(xfs_inode_irec_class, name, \ +	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ +	TP_ARGS(ip, irec)) + +/* refcount/reflink tracepoint definitions */ + +/* reflink tracepoints */ +DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); +DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); +DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); +DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +TRACE_EVENT(xfs_reflink_remap_blocks_loop, +	TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, +		 xfs_filblks_t len, struct xfs_inode *dest, +		 xfs_fileoff_t doffset), +	TP_ARGS(src, soffset, len, dest, doffset), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_ino_t, src_ino) +		__field(xfs_fileoff_t, src_lblk) +		__field(xfs_filblks_t, len) +		__field(xfs_ino_t, dest_ino) +		__field(xfs_fileoff_t, dest_lblk) +	), +	TP_fast_assign( +		__entry->dev = VFS_I(src)->i_sb->s_dev; +		__entry->src_ino = src->i_ino; +		__entry->src_lblk = soffset; +		__entry->len = len; +		__entry->dest_ino = dest->i_ino; +		__entry->dest_lblk = doffset; +	), +	TP_printk("dev %d:%d len 0x%llx " +		  "ino 0x%llx offset 0x%llx blocks -> " +		  "ino 0x%llx offset 0x%llx blocks", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->len, +		  __entry->src_ino, +		  __entry->src_lblk, +		  __entry->dest_ino, +		  __entry->dest_lblk) +); +TRACE_EVENT(xfs_reflink_punch_range, +	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, +		 xfs_extlen_t len), +	TP_ARGS(ip, lblk, len), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_ino_t, ino) +		__field(xfs_fileoff_t, lblk) +		__field(xfs_extlen_t, len) +	), +	TP_fast_assign( +		__entry->dev = VFS_I(ip)->i_sb->s_dev; +		__entry->ino = ip->i_ino; +		__entry->lblk = lblk; +		__entry->len = len; +	), +	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->ino, +		  
__entry->lblk, +		  __entry->len) +); +TRACE_EVENT(xfs_reflink_remap, +	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, +		 xfs_extlen_t len, xfs_fsblock_t new_pblk), +	TP_ARGS(ip, lblk, len, new_pblk), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(xfs_ino_t, ino) +		__field(xfs_fileoff_t, lblk) +		__field(xfs_extlen_t, len) +		__field(xfs_fsblock_t, new_pblk) +	), +	TP_fast_assign( +		__entry->dev = VFS_I(ip)->i_sb->s_dev; +		__entry->ino = ip->i_ino; +		__entry->lblk = lblk; +		__entry->len = len; +		__entry->new_pblk = new_pblk; +	), +	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->ino, +		  __entry->lblk, +		  __entry->len, +		  __entry->new_pblk) +); +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); + +/* dedupe tracepoints */ +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); + +/* ioctl tracepoints */ +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink); +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range); +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same); +TRACE_EVENT(xfs_ioctl_clone, +	TP_PROTO(struct inode *src, struct inode *dest), +	TP_ARGS(src, dest), +	TP_STRUCT__entry( +		__field(dev_t, dev) +		__field(unsigned long, src_ino) +		__field(loff_t, src_isize) +		__field(unsigned long, dest_ino) +		__field(loff_t, dest_isize) +	), +	TP_fast_assign( +		__entry->dev = src->i_sb->s_dev; +		__entry->src_ino = src->i_ino; +		__entry->src_isize = i_size_read(src); +		__entry->dest_ino = dest->i_ino; +		__entry->dest_isize = i_size_read(dest); +	), +	TP_printk("dev %d:%d " +		  "ino 0x%lx isize 0x%llx -> " +		  "ino 0x%lx isize 0x%llx\n", +		  MAJOR(__entry->dev), MINOR(__entry->dev), +		  __entry->src_ino, +		  __entry->src_isize, +		  __entry->dest_ino, +		  __entry->dest_isize) +); + +/* unshare tracepoints */ +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block); +DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); + +/* copy on write */ +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); + +DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range); +DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); + +DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); + +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece); + +DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); 
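Like every other event in this file, the reflink and CoW tracepoints above are consumed through tracefs once defined. A short userspace sketch of arming one of them, assuming debugfs is mounted at the conventional /sys/kernel/debug location; hits can then be read from trace_pipe in the same tracing directory:

/* Sketch only: enable the xfs_reflink_remap tracepoint via tracefs. */
#include <stdio.h>

int main(void)
{
	const char *enable =
		"/sys/kernel/debug/tracing/events/xfs/xfs_reflink_remap/enable";
	FILE *f = fopen(enable, "w");

	if (!f) {
		perror("fopen");	/* typically requires root */
		return 1;
	}
	fputs("1\n", f);		/* write "0" to disarm again */
	fclose(f);
	return 0;
}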
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); + +DEFINE_COW_EVENT(xfs_reflink_fork_buf); +DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); + +DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); + +/* rmap swapext tracepoints */ +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); +DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); +  #endif /* _TRACE_XFS_H */  #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 5f3d33d16e67..70f42ea86dfb 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -217,7 +217,7 @@ undo_log:  undo_blocks:  	if (blocks > 0) { -		xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); +		xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);  		tp->t_blk_res = 0;  	} @@ -318,7 +318,6 @@ xfs_trans_mod_sb(  		 * in-core superblock's counter.  This should only  		 * be applied to the on-disk superblock.  		 */ -		ASSERT(delta < 0);  		tp->t_res_fdblocks_delta += delta;  		if (xfs_sb_version_haslazysbcount(&mp->m_sb))  			flags &= ~XFS_TRANS_SB_DIRTY; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e2bf86aad33d..61b7fbdd3ebd 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -36,6 +36,11 @@ struct xfs_busy_extent;  struct xfs_rud_log_item;  struct xfs_rui_log_item;  struct xfs_btree_cur; +struct xfs_cui_log_item; +struct xfs_cud_log_item; +struct xfs_defer_ops; +struct xfs_bui_log_item; +struct xfs_bud_log_item;  typedef struct xfs_log_item {  	struct list_head		li_ail;		/* AIL pointers */ @@ -248,4 +253,28 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,  		xfs_fsblock_t startblock, xfs_filblks_t blockcount,  		xfs_exntst_t state, struct xfs_btree_cur **pcur); +/* refcount updates */ +enum xfs_refcount_intent_type; + +void xfs_refcount_update_init_defer_op(void); +struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp, +		struct xfs_cui_log_item *cuip); +int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, +		struct xfs_cud_log_item *cudp, struct xfs_defer_ops *dfops, +		enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, +		xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, +		xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); + +/* mapping updates */ +enum xfs_bmap_intent_type; + +void xfs_bmap_update_init_defer_op(void); +struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp, +		struct xfs_bui_log_item *buip); +int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, +		struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, +		enum xfs_bmap_intent_type type, struct xfs_inode *ip, +		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, +		xfs_filblks_t blockcount, xfs_exntst_t state); +  #endif	/* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c new file mode 100644 index 000000000000..6408e7d7c08c --- /dev/null +++ b/fs/xfs/xfs_trans_bmap.c @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_bmap_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_inode.h" + +/* + * This routine is called to allocate a "bmap update done" + * log item. + */ +struct xfs_bud_log_item * +xfs_trans_get_bud( +	struct xfs_trans		*tp, +	struct xfs_bui_log_item		*buip) +{ +	struct xfs_bud_log_item		*budp; + +	budp = xfs_bud_init(tp->t_mountp, buip); +	xfs_trans_add_item(tp, &budp->bud_item); +	return budp; +} + +/* + * Finish an bmap update and log it to the BUD. Note that the + * transaction is marked dirty regardless of whether the bmap update + * succeeds or fails to support the BUI/BUD lifecycle rules. + */ +int +xfs_trans_log_finish_bmap_update( +	struct xfs_trans		*tp, +	struct xfs_bud_log_item		*budp, +	struct xfs_defer_ops		*dop, +	enum xfs_bmap_intent_type	type, +	struct xfs_inode		*ip, +	int				whichfork, +	xfs_fileoff_t			startoff, +	xfs_fsblock_t			startblock, +	xfs_filblks_t			blockcount, +	xfs_exntst_t			state) +{ +	int				error; + +	error = xfs_bmap_finish_one(tp, dop, ip, type, whichfork, startoff, +			startblock, blockcount, state); + +	/* +	 * Mark the transaction dirty, even on error. This ensures the +	 * transaction is aborted, which: +	 * +	 * 1.) releases the BUI and frees the BUD +	 * 2.) shuts down the filesystem +	 */ +	tp->t_flags |= XFS_TRANS_DIRTY; +	budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + +	return error; +} + +/* Sort bmap intents by inode. */ +static int +xfs_bmap_update_diff_items( +	void				*priv, +	struct list_head		*a, +	struct list_head		*b) +{ +	struct xfs_bmap_intent		*ba; +	struct xfs_bmap_intent		*bb; + +	ba = container_of(a, struct xfs_bmap_intent, bi_list); +	bb = container_of(b, struct xfs_bmap_intent, bi_list); +	return ba->bi_owner->i_ino - bb->bi_owner->i_ino; +} + +/* Get an BUI. */ +STATIC void * +xfs_bmap_update_create_intent( +	struct xfs_trans		*tp, +	unsigned int			count) +{ +	struct xfs_bui_log_item		*buip; + +	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); +	ASSERT(tp != NULL); + +	buip = xfs_bui_init(tp->t_mountp); +	ASSERT(buip != NULL); + +	/* +	 * Get a log_item_desc to point at the new item. +	 */ +	xfs_trans_add_item(tp, &buip->bui_item); +	return buip; +} + +/* Set the map extent flags for this mapping. 
*/ +static void +xfs_trans_set_bmap_flags( +	struct xfs_map_extent		*bmap, +	enum xfs_bmap_intent_type	type, +	int				whichfork, +	xfs_exntst_t			state) +{ +	bmap->me_flags = 0; +	switch (type) { +	case XFS_BMAP_MAP: +	case XFS_BMAP_UNMAP: +		bmap->me_flags = type; +		break; +	default: +		ASSERT(0); +	} +	if (state == XFS_EXT_UNWRITTEN) +		bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; +	if (whichfork == XFS_ATTR_FORK) +		bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; +} + +/* Log bmap updates in the intent item. */ +STATIC void +xfs_bmap_update_log_item( +	struct xfs_trans		*tp, +	void				*intent, +	struct list_head		*item) +{ +	struct xfs_bui_log_item		*buip = intent; +	struct xfs_bmap_intent		*bmap; +	uint				next_extent; +	struct xfs_map_extent		*map; + +	bmap = container_of(item, struct xfs_bmap_intent, bi_list); + +	tp->t_flags |= XFS_TRANS_DIRTY; +	buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + +	/* +	 * atomic_inc_return gives us the value after the increment; +	 * we want to use it as an array index so we need to subtract 1 from +	 * it. +	 */ +	next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; +	ASSERT(next_extent < buip->bui_format.bui_nextents); +	map = &buip->bui_format.bui_extents[next_extent]; +	map->me_owner = bmap->bi_owner->i_ino; +	map->me_startblock = bmap->bi_bmap.br_startblock; +	map->me_startoff = bmap->bi_bmap.br_startoff; +	map->me_len = bmap->bi_bmap.br_blockcount; +	xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, +			bmap->bi_bmap.br_state); +} + +/* Get an BUD so we can process all the deferred rmap updates. */ +STATIC void * +xfs_bmap_update_create_done( +	struct xfs_trans		*tp, +	void				*intent, +	unsigned int			count) +{ +	return xfs_trans_get_bud(tp, intent); +} + +/* Process a deferred rmap update. */ +STATIC int +xfs_bmap_update_finish_item( +	struct xfs_trans		*tp, +	struct xfs_defer_ops		*dop, +	struct list_head		*item, +	void				*done_item, +	void				**state) +{ +	struct xfs_bmap_intent		*bmap; +	int				error; + +	bmap = container_of(item, struct xfs_bmap_intent, bi_list); +	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, +			bmap->bi_type, +			bmap->bi_owner, bmap->bi_whichfork, +			bmap->bi_bmap.br_startoff, +			bmap->bi_bmap.br_startblock, +			bmap->bi_bmap.br_blockcount, +			bmap->bi_bmap.br_state); +	kmem_free(bmap); +	return error; +} + +/* Abort all pending BUIs. */ +STATIC void +xfs_bmap_update_abort_intent( +	void				*intent) +{ +	xfs_bui_release(intent); +} + +/* Cancel a deferred rmap update. */ +STATIC void +xfs_bmap_update_cancel_item( +	struct list_head		*item) +{ +	struct xfs_bmap_intent		*bmap; + +	bmap = container_of(item, struct xfs_bmap_intent, bi_list); +	kmem_free(bmap); +} + +static const struct xfs_defer_op_type xfs_bmap_update_defer_type = { +	.type		= XFS_DEFER_OPS_TYPE_BMAP, +	.max_items	= XFS_BUI_MAX_FAST_EXTENTS, +	.diff_items	= xfs_bmap_update_diff_items, +	.create_intent	= xfs_bmap_update_create_intent, +	.abort_intent	= xfs_bmap_update_abort_intent, +	.log_item	= xfs_bmap_update_log_item, +	.create_done	= xfs_bmap_update_create_done, +	.finish_item	= xfs_bmap_update_finish_item, +	.cancel_item	= xfs_bmap_update_cancel_item, +}; + +/* Register the deferred op type. 
*/ +void +xfs_bmap_update_init_defer_op(void) +{ +	xfs_defer_init_op_type(&xfs_bmap_update_defer_type); +} diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 459ddec137a4..ab438647592a 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -79,7 +79,8 @@ xfs_trans_free_extent(  	trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); -	error = xfs_free_extent(tp, start_block, ext_len, oinfo); +	error = xfs_free_extent(tp, start_block, ext_len, oinfo, +			XFS_AG_RESV_NONE);  	/*  	 * Mark the transaction dirty, even on error. This ensures the diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 11a3af08b5c7..dab8daa676f9 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -73,7 +73,7 @@ xfs_trans_ichgtime(  	ASSERT(tp);  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); -	tv = current_fs_time(inode->i_sb); +	tv = current_time(inode);  	if (flags & XFS_ICHGTIME_MOD)  		inode->i_mtime = tv; diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c new file mode 100644 index 000000000000..94c1877af834 --- /dev/null +++ b/fs/xfs/xfs_trans_refcount.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2016 Oracle.  All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_refcount_item.h" +#include "xfs_alloc.h" +#include "xfs_refcount.h" + +/* + * This routine is called to allocate a "refcount update done" + * log item. + */ +struct xfs_cud_log_item * +xfs_trans_get_cud( +	struct xfs_trans		*tp, +	struct xfs_cui_log_item		*cuip) +{ +	struct xfs_cud_log_item		*cudp; + +	cudp = xfs_cud_init(tp->t_mountp, cuip); +	xfs_trans_add_item(tp, &cudp->cud_item); +	return cudp; +} + +/* + * Finish an refcount update and log it to the CUD. Note that the + * transaction is marked dirty regardless of whether the refcount + * update succeeds or fails to support the CUI/CUD lifecycle rules. + */ +int +xfs_trans_log_finish_refcount_update( +	struct xfs_trans		*tp, +	struct xfs_cud_log_item		*cudp, +	struct xfs_defer_ops		*dop, +	enum xfs_refcount_intent_type	type, +	xfs_fsblock_t			startblock, +	xfs_extlen_t			blockcount, +	xfs_fsblock_t			*new_fsb, +	xfs_extlen_t			*new_len, +	struct xfs_btree_cur		**pcur) +{ +	int				error; + +	error = xfs_refcount_finish_one(tp, dop, type, startblock, +			blockcount, new_fsb, new_len, pcur); + +	/* +	 * Mark the transaction dirty, even on error. This ensures the +	 * transaction is aborted, which: +	 * +	 * 1.) releases the CUI and frees the CUD +	 * 2.) 
shuts down the filesystem +	 */ +	tp->t_flags |= XFS_TRANS_DIRTY; +	cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + +	return error; +} + +/* Sort refcount intents by AG. */ +static int +xfs_refcount_update_diff_items( +	void				*priv, +	struct list_head		*a, +	struct list_head		*b) +{ +	struct xfs_mount		*mp = priv; +	struct xfs_refcount_intent	*ra; +	struct xfs_refcount_intent	*rb; + +	ra = container_of(a, struct xfs_refcount_intent, ri_list); +	rb = container_of(b, struct xfs_refcount_intent, ri_list); +	return  XFS_FSB_TO_AGNO(mp, ra->ri_startblock) - +		XFS_FSB_TO_AGNO(mp, rb->ri_startblock); +} + +/* Get an CUI. */ +STATIC void * +xfs_refcount_update_create_intent( +	struct xfs_trans		*tp, +	unsigned int			count) +{ +	struct xfs_cui_log_item		*cuip; + +	ASSERT(tp != NULL); +	ASSERT(count > 0); + +	cuip = xfs_cui_init(tp->t_mountp, count); +	ASSERT(cuip != NULL); + +	/* +	 * Get a log_item_desc to point at the new item. +	 */ +	xfs_trans_add_item(tp, &cuip->cui_item); +	return cuip; +} + +/* Set the phys extent flags for this reverse mapping. */ +static void +xfs_trans_set_refcount_flags( +	struct xfs_phys_extent		*refc, +	enum xfs_refcount_intent_type	type) +{ +	refc->pe_flags = 0; +	switch (type) { +	case XFS_REFCOUNT_INCREASE: +	case XFS_REFCOUNT_DECREASE: +	case XFS_REFCOUNT_ALLOC_COW: +	case XFS_REFCOUNT_FREE_COW: +		refc->pe_flags |= type; +		break; +	default: +		ASSERT(0); +	} +} + +/* Log refcount updates in the intent item. */ +STATIC void +xfs_refcount_update_log_item( +	struct xfs_trans		*tp, +	void				*intent, +	struct list_head		*item) +{ +	struct xfs_cui_log_item		*cuip = intent; +	struct xfs_refcount_intent	*refc; +	uint				next_extent; +	struct xfs_phys_extent		*ext; + +	refc = container_of(item, struct xfs_refcount_intent, ri_list); + +	tp->t_flags |= XFS_TRANS_DIRTY; +	cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + +	/* +	 * atomic_inc_return gives us the value after the increment; +	 * we want to use it as an array index so we need to subtract 1 from +	 * it. +	 */ +	next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; +	ASSERT(next_extent < cuip->cui_format.cui_nextents); +	ext = &cuip->cui_format.cui_extents[next_extent]; +	ext->pe_startblock = refc->ri_startblock; +	ext->pe_len = refc->ri_blockcount; +	xfs_trans_set_refcount_flags(ext, refc->ri_type); +} + +/* Get an CUD so we can process all the deferred refcount updates. */ +STATIC void * +xfs_refcount_update_create_done( +	struct xfs_trans		*tp, +	void				*intent, +	unsigned int			count) +{ +	return xfs_trans_get_cud(tp, intent); +} + +/* Process a deferred refcount update. */ +STATIC int +xfs_refcount_update_finish_item( +	struct xfs_trans		*tp, +	struct xfs_defer_ops		*dop, +	struct list_head		*item, +	void				*done_item, +	void				**state) +{ +	struct xfs_refcount_intent	*refc; +	xfs_fsblock_t			new_fsb; +	xfs_extlen_t			new_aglen; +	int				error; + +	refc = container_of(item, struct xfs_refcount_intent, ri_list); +	error = xfs_trans_log_finish_refcount_update(tp, done_item, dop, +			refc->ri_type, +			refc->ri_startblock, +			refc->ri_blockcount, +			&new_fsb, &new_aglen, +			(struct xfs_btree_cur **)state); +	/* Did we run out of reservation?  Requeue what we didn't finish. 
*/ +	if (!error && new_aglen > 0) { +		ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || +		       refc->ri_type == XFS_REFCOUNT_DECREASE); +		refc->ri_startblock = new_fsb; +		refc->ri_blockcount = new_aglen; +		return -EAGAIN; +	} +	kmem_free(refc); +	return error; +} + +/* Clean up after processing deferred refcounts. */ +STATIC void +xfs_refcount_update_finish_cleanup( +	struct xfs_trans	*tp, +	void			*state, +	int			error) +{ +	struct xfs_btree_cur	*rcur = state; + +	xfs_refcount_finish_one_cleanup(tp, rcur, error); +} + +/* Abort all pending CUIs. */ +STATIC void +xfs_refcount_update_abort_intent( +	void				*intent) +{ +	xfs_cui_release(intent); +} + +/* Cancel a deferred refcount update. */ +STATIC void +xfs_refcount_update_cancel_item( +	struct list_head		*item) +{ +	struct xfs_refcount_intent	*refc; + +	refc = container_of(item, struct xfs_refcount_intent, ri_list); +	kmem_free(refc); +} + +static const struct xfs_defer_op_type xfs_refcount_update_defer_type = { +	.type		= XFS_DEFER_OPS_TYPE_REFCOUNT, +	.max_items	= XFS_CUI_MAX_FAST_EXTENTS, +	.diff_items	= xfs_refcount_update_diff_items, +	.create_intent	= xfs_refcount_update_create_intent, +	.abort_intent	= xfs_refcount_update_abort_intent, +	.log_item	= xfs_refcount_update_log_item, +	.create_done	= xfs_refcount_update_create_done, +	.finish_item	= xfs_refcount_update_finish_item, +	.finish_cleanup = xfs_refcount_update_finish_cleanup, +	.cancel_item	= xfs_refcount_update_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_refcount_update_init_defer_op(void) +{ +	xfs_defer_init_op_type(&xfs_refcount_update_defer_type); +} diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 5a50ef881568..9ead064b5e90 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -48,12 +48,21 @@ xfs_trans_set_rmap_flags(  	case XFS_RMAP_MAP:  		rmap->me_flags |= XFS_RMAP_EXTENT_MAP;  		break; +	case XFS_RMAP_MAP_SHARED: +		rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; +		break;  	case XFS_RMAP_UNMAP:  		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;  		break; +	case XFS_RMAP_UNMAP_SHARED: +		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; +		break;  	case XFS_RMAP_CONVERT:  		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;  		break; +	case XFS_RMAP_CONVERT_SHARED: +		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; +		break;  	case XFS_RMAP_ALLOC:  		rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;  		break; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index ea62245fee26..62900938f26d 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -147,6 +147,7 @@ __xfs_xattr_put_listent(  	arraytop = context->count + prefix_len + namelen + 1;  	if (arraytop > context->firstu) {  		context->count = -1;	/* insufficient space */ +		context->seen_enough = 1;  		return 0;  	}  	offset = (char *)context->alist + context->count;  | 
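Both new files above, xfs_trans_bmap.c and xfs_trans_refcount.c, plug into the deferred-operation machinery through a struct xfs_defer_op_type: an intent item is logged first, each queued work item is then finished and logged to the matching done item, and an item that could not be completed within its reservation is requeued by returning -EAGAIN, as xfs_refcount_update_finish_item does for partially processed extents. A compact stand-alone sketch of that pattern with toy names (not the kernel API):

/*
 * Sketch only: the intent/finish/requeue shape of a deferred op type,
 * with all names and types invented for the example.
 */
#include <stdio.h>
#include <stdlib.h>

#define TOY_EAGAIN	11			/* stand-in for EAGAIN */

struct toy_item {
	unsigned int	blocks;			/* work remaining on this item */
};

struct toy_defer_op_type {
	void	(*create_intent)(unsigned int count);
	void	(*create_done)(void);
	int	(*finish_item)(struct toy_item *item);
	void	(*cancel_item)(struct toy_item *item);
};

static void toy_create_intent(unsigned int count)
{
	printf("log intent covering %u item(s)\n", count);
}

static void toy_create_done(void)
{
	printf("log done item\n");
}

/* Pretend only four blocks fit in one transaction roll. */
static int toy_finish_item(struct toy_item *item)
{
	unsigned int step = item->blocks < 4 ? item->blocks : 4;

	item->blocks -= step;
	printf("finished %u block(s), %u left\n", step, item->blocks);
	return item->blocks ? -TOY_EAGAIN : 0;	/* requeue unfinished work */
}

static void toy_cancel_item(struct toy_item *item)
{
	free(item);
}

static const struct toy_defer_op_type toy_defer_type = {
	.create_intent	= toy_create_intent,
	.create_done	= toy_create_done,
	.finish_item	= toy_finish_item,
	.cancel_item	= toy_cancel_item,
};

int main(void)
{
	struct toy_item *item = calloc(1, sizeof(*item));

	if (!item)
		return 1;
	item->blocks = 10;

	toy_defer_type.create_intent(1);
	toy_defer_type.create_done();
	/* Keep finishing until the item stops asking to be requeued. */
	while (toy_defer_type.finish_item(item) == -TOY_EAGAIN)
		;
	toy_defer_type.cancel_item(item);	/* in the kernel this path runs on abort */
	return 0;
}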

