diff options
-rw-r--r-- | fs/xfs/xfs_buf_item.c | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot.c | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot_item.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_extfree_item.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_inode_item.c | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 258 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.h | 1 |
7 files changed, 202 insertions, 64 deletions
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 34257992934c..4561b1e2f2f9 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -949,6 +949,7 @@ xfs_buf_item_free( xfs_buf_log_item_t *bip) { xfs_buf_item_free_format(bip); + kmem_free(bip->bli_item.li_lv_shadow); kmem_zone_free(xfs_buf_item_zone, bip); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e0646659ce16..ccb0811963b2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -74,6 +74,7 @@ xfs_qm_dqdestroy( { ASSERT(list_empty(&dqp->q_lru)); + kmem_free(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 814cff94e78f..2c7a1629e064 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed( spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + kmem_free(qfs->qql_item.li_lv_shadow); + kmem_free(lip->li_lv_shadow); kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 4aa0153214f9..ab779460ecbf 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -40,6 +40,7 @@ void xfs_efi_item_free( struct xfs_efi_log_item *efip) { + kmem_free(efip->efi_item.li_lv_shadow); if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else @@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) STATIC void xfs_efd_item_free(struct xfs_efd_log_item *efdp) { + kmem_free(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a1b07612224c..892c2aced207 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -651,6 +651,7 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { + kmem_free(ip->i_itemp->ili_item.li_lv_shadow); kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5e54e7955ea6..a4ab192e1792 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -78,6 +78,157 @@ xlog_cil_init_post_recovery( log->l_cilp->xc_ctx->sequence = 1; } +static inline int +xlog_cil_iovec_space( + uint niovecs) +{ + return round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); +} + +/* + * Allocate or pin log vector buffers for CIL insertion. + * + * The CIL currently uses disposable buffers for copying a snapshot of the + * modified items into the log during a push. The biggest problem with this is + * the requirement to allocate the disposable buffer during the commit if: + * a) does not exist; or + * b) it is too small + * + * If we do this allocation within xlog_cil_insert_format_items(), it is done + * under the xc_ctx_lock, which means that a CIL push cannot occur during + * the memory allocation. This means that we have a potential deadlock situation + * under low memory conditions when we have lots of dirty metadata pinned in + * the CIL and we need a CIL commit to occur to free memory. + * + * To avoid this, we need to move the memory allocation outside the + * xc_ctx_lock, but because the log vector buffers are disposable, that opens + * up a TOCTOU race condition w.r.t. the CIL committing and removing the log + * vector buffers between the check and the formatting of the item into the + * log vector buffer within the xc_ctx_lock. + * + * Because the log vector buffer needs to be unchanged during the CIL push + * process, we cannot share the buffer between the transaction commit (which + * modifies the buffer) and the CIL push context that is writing the changes + * into the log. This means skipping preallocation of buffer space is + * unreliable, but we most definitely do not want to be allocating and freeing + * buffers unnecessarily during commits when overwrites can be done safely. + * + * The simplest solution to this problem is to allocate a shadow buffer when a + * log item is committed for the second time, and then to only use this buffer + * if necessary. The buffer can remain attached to the log item until such time + * it is needed, and this is the buffer that is reallocated to match the size of + * the incoming modification. Then during the formatting of the item we can swap + * the active buffer with the new one if we can't reuse the existing buffer. We + * don't free the old buffer as it may be reused on the next modification if + * it's size is right, otherwise we'll free and reallocate it at that point. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and attaches the vector to the log item in preparation + * for the formatting step which occurs under the xc_ctx_lock. + * + * While this means the memory footprint goes up, it avoids the repeated + * alloc/free pattern that repeated modifications of an item would otherwise + * cause, and hence minimises the CPU overhead of such behaviour. + */ +static void +xlog_cil_alloc_shadow_bufs( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + xlog_cil_iovec_space(niovecs); + + /* + * if we have no shadow buffer, or it is too small, we need to + * reallocate it. + */ + if (!lip->li_lv_shadow || + buf_size > lip->li_lv_shadow->lv_size) { + + /* + * We free and allocate here as a realloc would copy + * unecessary data. We don't use kmem_zalloc() for the + * same reason - we don't need to zero the data area in + * the buffer, only the log vector header and the iovec + * storage. + */ + kmem_free(lip->li_lv_shadow); + + lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS); + memset(lv, 0, xlog_cil_iovec_space(niovecs)); + + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + lip->li_lv_shadow = lv; + } else { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv_shadow; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_next = NULL; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs); + } + +} + /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -100,16 +251,19 @@ xfs_cil_prepare_item( /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the - * old_lv, then remove the space it accounts for and free it. + * old_lv, then remove the space it accounts for and make it the shadow + * buffer for later freeing. In both cases we are now switching to the + * shadow buffer, so update the the pointer to it appropriately. */ - if (!old_lv) + if (!old_lv) { lv->lv_item->li_ops->iop_pin(lv->lv_item); - else if (old_lv != lv) { + lv->lv_item->li_lv_shadow = NULL; + } else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; - kmem_free(old_lv); + lv->lv_item->li_lv_shadow = old_lv; } /* attach new log vector to log item */ @@ -133,11 +287,13 @@ xfs_cil_prepare_item( * write it out asynchronously without needing to relock the object that was * modified at the time it gets written into the iclog. * - * This function builds a vector for the changes in each log item in the - * transaction. It then works out the length of the buffer needed for each log - * item, allocates them and formats the vector for the item into the buffer. - * The buffer is then attached to the log item are then inserted into the - * Committed Item List for tracking until the next checkpoint is written out. + * This function takes the prepared log vectors attached to each log item, and + * formats the changes into the log vector buffer. The buffer it uses is + * dependent on the current state of the vector in the CIL - the shadow lv is + * guaranteed to be large enough for the current modification, but we will only + * use that if we can't reuse the existing lv. If we can't reuse the existing + * lv, then simple swap it out for the shadow lv. We don't free it - that is + * done lazily either by th enext modification or the freeing of the log item. * * We don't set up region headers during this process; we simply copy the * regions into the flat buffer. We can do this because we still have to do a @@ -170,59 +326,29 @@ xlog_cil_insert_format_items( list_for_each_entry(lidp, &tp->t_items, lid_trans) { struct xfs_log_item *lip = lidp->lid_item; struct xfs_log_vec *lv; - struct xfs_log_vec *old_lv; - int niovecs = 0; - int nbytes = 0; - int buf_size; + struct xfs_log_vec *old_lv = NULL; + struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - /* get number of vecs and size of data to be stored */ - lip->li_ops->iop_size(lip, &niovecs, &nbytes); - - /* Skip items that do not have any vectors for writing */ - if (!niovecs) - continue; - /* - * Ordered items need to be tracked but we do not wish to write - * them. We need a logvec to track the object, but we do not - * need an iovec or buffer to be allocated for copying data. + * The formatting size information is already attached to + * the shadow lv on the log item. */ - if (niovecs == XFS_LOG_VEC_ORDERED) { + shadow = lip->li_lv_shadow; + if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) ordered = true; - niovecs = 0; - nbytes = 0; - } - /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. - */ - nbytes += niovecs * sizeof(uint64_t); - nbytes = round_up(nbytes, sizeof(uint64_t)); - - /* grab the old item if it exists for reservation accounting */ - old_lv = lip->li_lv; - - /* - * The data buffer needs to start 64-bit aligned, so round up - * that space to ensure we can align it appropriately and not - * overrun the buffer. - */ - buf_size = nbytes + - round_up((sizeof(struct xfs_log_vec) + - niovecs * sizeof(struct xfs_log_iovec)), - sizeof(uint64_t)); + /* Skip items that do not have any vectors for writing */ + if (!shadow->lv_niovecs && !ordered) + continue; /* compare to existing item size */ - if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + old_lv = lip->li_lv; + if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; lv->lv_next = NULL; @@ -236,32 +362,29 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = shadow->lv_niovecs; + + /* reset the lv buffer information for new formatting */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + + xlog_cil_iovec_space(lv->lv_niovecs); } else { - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + /* switch to shadow buffer! */ + lv = shadow; lv->lv_item = lip; - lv->lv_size = buf_size; if (ordered) { /* track as an ordered logvec */ ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; goto insert; } - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - - /* The allocated data region lies beyond the iovec region */ - lv->lv_buf_len = 0; - lv->lv_bytes = 0; - lv->lv_buf = (char *)lv + buf_size - nbytes; ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); - lip->li_ops->iop_format(lip, lv); insert: - ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); } } @@ -783,6 +906,13 @@ xfs_log_commit_cil( struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + /* + * Do all necessary memory allocation before we lock the CIL. + * This ensures the allocation does not deadlock with a CIL + * push in memory reclaim (e.g. from kswapd). + */ + xlog_cil_alloc_shadow_bufs(log, tp); + /* lock out background commit */ down_read(&cil->xc_ctx_lock); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9a462e892e4f..9b2b9fa89331 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -52,6 +52,7 @@ typedef struct xfs_log_item { /* delayed logging */ struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ + struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; |