Diffstat (limited to 'fs/xfs')
150 files changed, 8020 insertions, 8702 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 06b68b6115bc..aceca2f9a3db 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -27,7 +27,6 @@ xfs-y += $(addprefix libxfs/, \ xfs_bmap_btree.o \ xfs_btree.o \ xfs_da_btree.o \ - xfs_da_format.o \ xfs_defer.o \ xfs_dir2.o \ xfs_dir2_block.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 16bb9a328678..1da94237a8cf 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -3,10 +3,10 @@ * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ -#include <linux/sched/mm.h> +#include "xfs.h" #include <linux/backing-dev.h> -#include "kmem.h" #include "xfs_message.h" +#include "xfs_trace.h" void * kmem_alloc(size_t size, xfs_km_flags_t flags) @@ -15,9 +15,11 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_alloc(size, flags, _RET_IP_); + do { ptr = kmalloc(size, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, @@ -28,28 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) } while (1); } -void * -kmem_alloc_large(size_t size, xfs_km_flags_t flags) + +/* + * __vmalloc() will allocate data pages and auxiliary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence + * we need to tell memory reclaim that we are in such a context via + * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here + * and potentially deadlocking. + */ +static void * +__kmem_vmalloc(size_t size, xfs_km_flags_t flags) { unsigned nofs_flag = 0; void *ptr; - gfp_t lflags; - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) - return ptr; + gfp_t lflags = kmem_flags_convert(flags); - /* - * __vmalloc() will allocate data pages and auxillary structures (e.g. - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context - * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering - * the filesystem here and potentially deadlocking. - */ if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags, PAGE_KERNEL); if (flags & KM_NOFS) @@ -58,6 +56,44 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) return ptr; } +/* + * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned + * to the @align_mask. We only guarantee alignment up to page size, we'll clamp + * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE + * aligned region. 
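A note on the kmem_alloc_io() helper this comment introduces: because align_mask has the form 2^n - 1, a single AND is enough to test whether the fast-path kmalloc() result is already adequately aligned. A minimal userspace model of the try-kmalloc-then-fall-back-to-vmalloc dance (alloc_io() and fallback_alloc_aligned() are illustrative names, not kernel API; posix_memalign() stands in for vmalloc's page-aligned guarantee):

    #include <stdint.h>
    #include <stdlib.h>

    static void *fallback_alloc_aligned(size_t size)
    {
            void *p = NULL;

            /* models __vmalloc(): always returns page-aligned memory */
            if (posix_memalign(&p, 4096, size))
                    return NULL;
            return p;
    }

    void *alloc_io(size_t size, uintptr_t align_mask)
    {
            void *ptr = malloc(size);       /* fast path, may be misaligned */

            if (ptr) {
                    if (!((uintptr_t)ptr & align_mask))
                            return ptr;     /* already sufficiently aligned */
                    free(ptr);              /* misaligned: take the slow path */
            }
            return fallback_alloc_aligned(size);
    }
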
+ */ +void * +kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags) +{ + void *ptr; + + trace_kmem_alloc_io(size, flags, _RET_IP_); + + if (WARN_ON_ONCE(align_mask >= PAGE_SIZE)) + align_mask = PAGE_SIZE - 1; + + ptr = kmem_alloc(size, flags | KM_MAYFAIL); + if (ptr) { + if (!((uintptr_t)ptr & align_mask)) + return ptr; + kfree(ptr); + } + return __kmem_vmalloc(size, flags); +} + +void * +kmem_alloc_large(size_t size, xfs_km_flags_t flags) +{ + void *ptr; + + trace_kmem_alloc_large(size, flags, _RET_IP_); + + ptr = kmem_alloc(size, flags | KM_MAYFAIL); + if (ptr) + return ptr; + return __kmem_vmalloc(size, flags); +} + void * kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { @@ -65,9 +101,11 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_realloc(newsize, flags, _RET_IP_); + do { ptr = krealloc(old, newsize, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, @@ -85,9 +123,10 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_); do { ptr = kmem_cache_alloc(zone, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 267655acd426..6143117770e9 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -16,8 +16,6 @@ */ typedef unsigned __bitwise xfs_km_flags_t; -#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) -#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) #define KM_ZERO ((__force xfs_km_flags_t)0x0010u) @@ -32,15 +30,11 @@ kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); + BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO)); - if (flags & KM_NOSLEEP) { - lflags = GFP_ATOMIC | __GFP_NOWARN; - } else { - lflags = GFP_KERNEL | __GFP_NOWARN; - if (flags & KM_NOFS) - lflags &= ~__GFP_FS; - } + lflags = GFP_KERNEL | __GFP_NOWARN; + if (flags & KM_NOFS) + lflags &= ~__GFP_FS; /* * Default page/slab allocator behavior is to retry for ever @@ -59,6 +53,7 @@ kmem_flags_convert(xfs_km_flags_t flags) } extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) @@ -83,39 +78,9 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * Zone interfaces */ -#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN -#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT -#define KM_ZONE_SPREAD SLAB_MEM_SPREAD -#define KM_ZONE_ACCOUNT SLAB_ACCOUNT - #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache -static inline kmem_zone_t * -kmem_zone_init(int size, char *zone_name) -{ - return kmem_cache_create(zone_name, size, 0, 0, NULL); -} - -static inline kmem_zone_t * -kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags, - void (*construct)(void *)) -{ - return kmem_cache_create(zone_name, size, 0, flags, construct); -} - -static inline void -kmem_zone_free(kmem_zone_t *zone, void *ptr) -{ - kmem_cache_free(zone, ptr); -} - -static inline 
void -kmem_zone_destroy(kmem_zone_t *zone) -{ - kmem_cache_destroy(zone); -} - extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); static inline void * diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 5de296b34ab1..08d6beb54f8c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -23,26 +23,28 @@ #include "xfs_ag_resv.h" #include "xfs_health.h" -static struct xfs_buf * +static int xfs_get_aghdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, - int flags, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + int error; - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); - if (!bp) - return NULL; + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + if (error) + return error; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; - return bp; + *bpp = bp; + return 0; } static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id) @@ -341,13 +343,13 @@ xfs_ag_init_hdr( struct aghdr_init_data *id, aghdr_init_work_f work, const struct xfs_buf_ops *ops) - { struct xfs_buf *bp; + int error; - bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops); - if (!bp) - return -ENOMEM; + error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops); + if (error) + return error; (*work)(mp, bp, id); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 87a9747f1d36..fdfe6dc0d307 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -19,6 +19,8 @@ #include "xfs_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ialloc_btree.h" +#include "xfs_sb.h" +#include "xfs_ag_resv.h" /* * Per-AG Block Reservations diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 372ad55631fc..d8053bc96c4d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -146,9 +146,13 @@ xfs_alloc_lookup_eq( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { + int error; + cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); + error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); + cur->bc_private.a.priv.abt.active = (*stat == 1); + return error; } /* @@ -162,9 +166,13 @@ xfs_alloc_lookup_ge( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { + int error; + cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); + error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); + cur->bc_private.a.priv.abt.active = (*stat == 1); + return error; } /* @@ -178,9 +186,19 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { + int error; cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + cur->bc_private.a.priv.abt.active = (*stat == 1); + return error; +} + +static inline bool +xfs_alloc_cur_active( + struct xfs_btree_cur *cur) +{ + return cur && cur->bc_private.a.priv.abt.active; } /* @@ -313,7 +331,7 @@ xfs_alloc_compute_diff( xfs_extlen_t newlen1=0; /* length with newbno1 */ xfs_extlen_t newlen2=0; /* length with newbno2 */ xfs_agblock_t wantend; /* end of target extent */ - bool userdata = xfs_alloc_is_userdata(datatype); + bool userdata = datatype & XFS_ALLOC_USERDATA; ASSERT(freelen >= wantlen); freeend = freebno + freelen; @@ -433,13 
+451,17 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, - i == 1 && nfbno1 == fbno && nflen1 == flen); + if (XFS_IS_CORRUPT(mp, + i != 1 || + nfbno1 != fbno || + nflen1 != flen)) + return -EFSCORRUPTED; #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } /* * Look up the record in the by-block tree if necessary. @@ -448,13 +470,17 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, - i == 1 && nfbno1 == fbno && nflen1 == flen); + if (XFS_IS_CORRUPT(mp, + i != 1 || + nfbno1 != fbno || + nflen1 != flen)) + return -EFSCORRUPTED; #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } #ifdef DEBUG @@ -465,8 +491,10 @@ xfs_alloc_fixup_trees( bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); - XFS_WANT_CORRUPTED_RETURN(mp, - bnoblock->bb_numrecs == cntblock->bb_numrecs); + if (XFS_IS_CORRUPT(mp, + bnoblock->bb_numrecs != + cntblock->bb_numrecs)) + return -EFSCORRUPTED; } #endif @@ -496,25 +524,30 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; /* * Add new by-size btree entry(s). */ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) + return -EFSCORRUPTED; if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) + return -EFSCORRUPTED; if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } /* * Fix up the by-block btree entry(s). @@ -525,7 +558,8 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } else { /* * Update the by-block entry to start later|be shorter. @@ -539,10 +573,12 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) + return -EFSCORRUPTED; if ((error = xfs_btree_insert(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } return 0; } @@ -684,16 +720,298 @@ xfs_alloc_update_counters( xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > - be32_to_cpu(agf->agf_length))) + be32_to_cpu(agf->agf_length))) { + xfs_buf_corruption_error(agbp); return -EFSCORRUPTED; + } xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); return 0; } /* - * Allocation group level functions. + * Block allocation algorithm and data structures. 
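Most of the churn in the hunks above mechanically converts XFS_WANT_CORRUPTED_RETURN()/XFS_WANT_CORRUPTED_GOTO() into open-coded checks. The shape of that conversion, sketched with a toy stand-in for XFS_IS_CORRUPT() (the real macro also files a corruption report against the mount):

    #include <stdbool.h>
    #include <stdio.h>

    #define EFSCORRUPTED 117        /* EUCLEAN, matching the kernel's value */

    /* toy stand-in; (mp) is unused here but kept for shape */
    #define XFS_IS_CORRUPT(mp, check) \
            ((check) ? (fprintf(stderr, "corruption: %s\n", #check), true) : false)

    static int btree_lookup_eq(int *stat)  /* stand-in for xfs_alloc_lookup_eq */
    {
            *stat = 1;
            return 0;
    }

    int fixup_step(void *mp)
    {
            int i, error;

            error = btree_lookup_eq(&i);
            if (error)
                    return error;
            /* was: XFS_WANT_CORRUPTED_RETURN(mp, i == 1); */
            if (XFS_IS_CORRUPT(mp, i != 1))
                    return -EFSCORRUPTED;
            return 0;
    }

Note the condition inverts across the conversion: the old macros asserted the good case, while XFS_IS_CORRUPT() tests for the bad one.
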
+ */ +struct xfs_alloc_cur { + struct xfs_btree_cur *cnt; /* btree cursors */ + struct xfs_btree_cur *bnolt; + struct xfs_btree_cur *bnogt; + xfs_extlen_t cur_len;/* current search length */ + xfs_agblock_t rec_bno;/* extent startblock */ + xfs_extlen_t rec_len;/* extent length */ + xfs_agblock_t bno; /* alloc bno */ + xfs_extlen_t len; /* alloc len */ + xfs_extlen_t diff; /* diff from search bno */ + unsigned int busy_gen;/* busy state */ + bool busy; +}; + +/* + * Set up cursors, etc. in the extent allocation cursor. This function can be + * called multiple times to reset an initialized structure without having to + * reallocate cursors. + */ +static int +xfs_alloc_cur_setup( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) +{ + int error; + int i; + + ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO); + + acur->cur_len = args->maxlen; + acur->rec_bno = 0; + acur->rec_len = 0; + acur->bno = 0; + acur->len = 0; + acur->diff = -1; + acur->busy = false; + acur->busy_gen = 0; + + /* + * Perform an initial cntbt lookup to check for availability of maxlen + * extents. If this fails, we'll return -ENOSPC to signal the caller to + * attempt a small allocation. + */ + if (!acur->cnt) + acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_CNT); + error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); + if (error) + return error; + + /* + * Allocate the bnobt left and right search cursors. + */ + if (!acur->bnolt) + acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); + if (!acur->bnogt) + acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); + return i == 1 ? 0 : -ENOSPC; +} + +static void +xfs_alloc_cur_close( + struct xfs_alloc_cur *acur, + bool error) +{ + int cur_error = XFS_BTREE_NOERROR; + + if (error) + cur_error = XFS_BTREE_ERROR; + + if (acur->cnt) + xfs_btree_del_cursor(acur->cnt, cur_error); + if (acur->bnolt) + xfs_btree_del_cursor(acur->bnolt, cur_error); + if (acur->bnogt) + xfs_btree_del_cursor(acur->bnogt, cur_error); + acur->cnt = acur->bnolt = acur->bnogt = NULL; +} + +/* + * Check an extent for allocation and track the best available candidate in the + * allocation structure. The cursor is deactivated if it has entered an out of + * range state based on allocation arguments. Optionally return the extent + * extent geometry and allocation status if requested by the caller. + */ +static int +xfs_alloc_cur_check( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + struct xfs_btree_cur *cur, + int *new) +{ + int error, i; + xfs_agblock_t bno, bnoa, bnew; + xfs_extlen_t len, lena, diff = -1; + bool busy; + unsigned busy_gen = 0; + bool deactivate = false; + bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO; + + *new = 0; + + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (error) + return error; + if (XFS_IS_CORRUPT(args->mp, i != 1)) + return -EFSCORRUPTED; + + /* + * Check minlen and deactivate a cntbt cursor if out of acceptable size + * range (i.e., walking backwards looking for a minlen extent). 
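The candidate-tracking rule that xfs_alloc_cur_check() implements can be stated as a two-field ordering; a model, with illustrative struct and function names:

    #include <stdbool.h>
    #include <stdint.h>

    struct best {
            uint32_t len;   /* length of the current best candidate */
            uint32_t diff;  /* its locality diff; lower is better */
    };

    /*
     * A new extent replaces the best candidate if it is longer, or
     * equally long with an equal-or-smaller locality diff.
     */
    static bool candidate_beats(const struct best *b, uint32_t len,
                                uint32_t diff)
    {
            if (len > b->len)
                    return true;
            return len == b->len && diff <= b->diff;
    }
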
+ */ + if (len < args->minlen) { + deactivate = !isbnobt; + goto out; + } + + busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena, + &busy_gen); + acur->busy |= busy; + if (busy) + acur->busy_gen = busy_gen; + /* deactivate a bnobt cursor outside of locality range */ + if (bnoa < args->min_agbno || bnoa > args->max_agbno) { + deactivate = isbnobt; + goto out; + } + if (lena < args->minlen) + goto out; + + args->len = XFS_EXTLEN_MIN(lena, args->maxlen); + xfs_alloc_fix_len(args); + ASSERT(args->len >= args->minlen); + if (args->len < acur->len) + goto out; + + /* + * We have an aligned record that satisfies minlen and beats or matches + * the candidate extent size. Compare locality for near allocation mode. + */ + ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); + diff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->datatype, + bnoa, lena, &bnew); + if (bnew == NULLAGBLOCK) + goto out; + + /* + * Deactivate a bnobt cursor with worse locality than the current best. + */ + if (diff > acur->diff) { + deactivate = isbnobt; + goto out; + } + + ASSERT(args->len > acur->len || + (args->len == acur->len && diff <= acur->diff)); + acur->rec_bno = bno; + acur->rec_len = len; + acur->bno = bnew; + acur->len = args->len; + acur->diff = diff; + *new = 1; + + /* + * We're done if we found a perfect allocation. This only deactivates + * the current cursor, but this is just an optimization to terminate a + * cntbt search that otherwise runs to the edge of the tree. + */ + if (acur->diff == 0 && acur->len == args->maxlen) + deactivate = true; +out: + if (deactivate) + cur->bc_private.a.priv.abt.active = false; + trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, + *new); + return 0; +} + +/* + * Complete an allocation of a candidate extent. Remove the extent from both + * trees and update the args structure. */ +STATIC int +xfs_alloc_cur_finish( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) +{ + int error; + + ASSERT(acur->cnt && acur->bnolt); + ASSERT(acur->bno >= acur->rec_bno); + ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); + ASSERT(acur->rec_bno + acur->rec_len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + + error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, + acur->rec_len, acur->bno, acur->len, 0); + if (error) + return error; + + args->agbno = acur->bno; + args->len = acur->len; + args->wasfromfl = 0; + + trace_xfs_alloc_cur(args); + return 0; +} + +/* + * Locality allocation lookup algorithm. This expects a cntbt cursor and uses + * bno optimized lookup to search for extents with ideal size and locality. + */ +STATIC int +xfs_alloc_cntbt_iter( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) +{ + struct xfs_btree_cur *cur = acur->cnt; + xfs_agblock_t bno; + xfs_extlen_t len, cur_len; + int error; + int i; + + if (!xfs_alloc_cur_active(cur)) + return 0; + + /* locality optimized lookup */ + cur_len = acur->cur_len; + error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i); + if (error) + return error; + if (i == 0) + return 0; + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (error) + return error; + + /* check the current record and update search length from it */ + error = xfs_alloc_cur_check(args, acur, cur, &i); + if (error) + return error; + ASSERT(len >= acur->cur_len); + acur->cur_len = len; + + /* + * We looked up the first record >= [agbno, len] above. The agbno is a + * secondary key and so the current record may lie just before or after + * agbno. 
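That secondary-key subtlety can be made concrete with a sorted-array model: in (len, bno) order, the record found by a >= lookup and its same-length predecessor straddle the locality hint, so both are worth comparing. A sketch, purely illustrative and not kernel code:

    #include <stddef.h>
    #include <stdint.h>

    struct rec { uint32_t len, bno; };      /* cntbt sort order: (len, bno) */

    static uint32_t dist(uint32_t a, uint32_t b)
    {
            return a > b ? a - b : b - a;
    }

    /* pick whichever of r[i] and a same-length r[i - 1] is nearer agbno */
    static size_t closer_of(const struct rec *r, size_t i, uint32_t agbno)
    {
            if (i == 0 || r[i - 1].len != r[i].len)
                    return i;
            return dist(r[i - 1].bno, agbno) < dist(r[i].bno, agbno) ? i - 1 : i;
    }
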
If it is past agbno, check the previous record too so long as + * the length matches as it may be closer. Don't check a smaller record + * because that could deactivate our cursor. + */ + if (bno > args->agbno) { + error = xfs_btree_decrement(cur, 0, &i); + if (!error && i) { + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (!error && i && len == acur->cur_len) + error = xfs_alloc_cur_check(args, acur, cur, + &i); + } + if (error) + return error; + } + + /* + * Increment the search key until we find at least one allocation + * candidate or if the extent we found was larger. Otherwise, double the + * search key to optimize the search. Efficiency is more important here + * than absolute best locality. + */ + cur_len <<= 1; + if (!acur->len || acur->cur_len >= cur_len) + acur->cur_len++; + else + acur->cur_len = cur_len; + + return error; +} /* * Deal with the case where only small freespaces remain. Either return the @@ -727,7 +1045,10 @@ xfs_alloc_ag_vextent_small( error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error; + } goto out; } @@ -744,23 +1065,26 @@ xfs_alloc_ag_vextent_small( goto out; xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, - xfs_alloc_allow_busy_reuse(args->datatype)); + (args->datatype & XFS_ALLOC_NOBUSY)); - if (xfs_alloc_is_userdata(args->datatype)) { + if (args->datatype & XFS_ALLOC_USERDATA) { struct xfs_buf *bp; - bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno); - if (!bp) { - error = -EFSCORRUPTED; + error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, + XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), + args->mp->m_bsize, 0, &bp); + if (error) goto error; - } xfs_trans_binval(args->tp, bp); } *fbnop = args->agbno = fbno; *flenp = args->len = 1; - XFS_WANT_CORRUPTED_GOTO(args->mp, - fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), - error); + if (XFS_IS_CORRUPT(args->mp, + fbno >= be32_to_cpu( + XFS_BUF_TO_AGF(args->agbp)->agf_length))) { + error = -EFSCORRUPTED; + goto error; + } args->wasfromfl = 1; trace_xfs_alloc_small_freelist(args); @@ -915,7 +1239,10 @@ xfs_alloc_ag_vextent_exact( error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } ASSERT(fbno <= args->agbno); /* @@ -984,98 +1311,243 @@ error0: } /* - * Search the btree in a given direction via the search cursor and compare - * the records found against the good extent we've already found. + * Search a given number of btree records in a given direction. Check each + * record against the good extent we've already found. 
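Backing up a step: the search-key advance at the tail of xfs_alloc_cntbt_iter() above distils to a small policy function (a model, not the kernel code):

    #include <stdint.h>

    /*
     * Creep the cntbt size key forward by one block while no candidate
     * exists, or while the record just seen already pushed the key past
     * doubling; otherwise double it so the search converges quickly.
     */
    static uint32_t next_search_len(uint32_t orig_len, uint32_t cur_len,
                                    int have_candidate)
    {
            uint32_t doubled = orig_len << 1;

            if (!have_candidate || cur_len >= doubled)
                    return cur_len + 1;
            return doubled;
    }
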
*/ STATIC int -xfs_alloc_find_best_extent( - struct xfs_alloc_arg *args, /* allocation argument structure */ - struct xfs_btree_cur **gcur, /* good cursor */ - struct xfs_btree_cur **scur, /* searching cursor */ - xfs_agblock_t gdiff, /* difference for search comparison */ - xfs_agblock_t *sbno, /* extent found by search */ - xfs_extlen_t *slen, /* extent length */ - xfs_agblock_t *sbnoa, /* aligned extent found by search */ - xfs_extlen_t *slena, /* aligned extent length */ - int dir) /* 0 = search right, 1 = search left */ +xfs_alloc_walk_iter( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + struct xfs_btree_cur *cur, + bool increment, + bool find_one, /* quit on first candidate */ + int count, /* rec count (-1 for infinite) */ + int *stat) { - xfs_agblock_t new; - xfs_agblock_t sdiff; int error; int i; - unsigned busy_gen; - /* The good extent is perfect, no need to search. */ - if (!gdiff) - goto out_use_good; + *stat = 0; /* - * Look until we find a better one, run out of space or run off the end. + * Search so long as the cursor is active or we find a better extent. + * The cursor is deactivated if it extends beyond the range of the + * current allocation candidate. */ - do { - error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + while (xfs_alloc_cur_active(cur) && count) { + error = xfs_alloc_cur_check(args, acur, cur, &i); if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, *sbno, *slen, - sbnoa, slena, &busy_gen); + return error; + if (i == 1) { + *stat = 1; + if (find_one) + break; + } + if (!xfs_alloc_cur_active(cur)) + break; + + if (increment) + error = xfs_btree_increment(cur, 0, &i); + else + error = xfs_btree_decrement(cur, 0, &i); + if (error) + return error; + if (i == 0) + cur->bc_private.a.priv.abt.active = false; + + if (count > 0) + count--; + } + + return 0; +} + +/* + * Search the by-bno and by-size btrees in parallel in search of an extent with + * ideal locality based on the NEAR mode ->agbno locality hint. + */ +STATIC int +xfs_alloc_ag_vextent_locality( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + int *stat) +{ + struct xfs_btree_cur *fbcur = NULL; + int error; + int i; + bool fbinc; + + ASSERT(acur->len == 0); + ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); + + *stat = 0; + + error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i); + if (error) + return error; + error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i); + if (error) + return error; + error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i); + if (error) + return error; + + /* + * Search the bnobt and cntbt in parallel. Search the bnobt left and + * right and lookup the closest extent to the locality hint for each + * extent size key in the cntbt. The entire search terminates + * immediately on a bnobt hit because that means we've found best case + * locality. Otherwise the search continues until the cntbt cursor runs + * off the end of the tree. If no allocation candidate is found at this + * point, give up on locality, walk backwards from the end of the cntbt + * and take the first available extent. + * + * The parallel tree searches balance each other out to provide fairly + * consistent performance for various situations. The bnobt search can + * have pathological behavior in the worst case scenario of larger + * allocation requests and fragmented free space. On the other hand, the + * bnobt is able to satisfy most smaller allocation requests much more + * quickly than the cntbt. 
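The two ways the locality search drives xfs_alloc_walk_iter() differ only in their termination arguments; schematically (a paraphrase of calls that appear in the following hunk, not new API):

    /* probe one record on a bnobt side: at most one step, stop on a hit */
    error = xfs_alloc_walk_iter(args, acur, acur->bnolt,
                                false /* decrement */, true /* find_one */,
                                1 /* count */, &i);

    /* exhaust the chosen fallback cursor: unbounded, first hit wins */
    error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc,
                                true /* find_one */, -1 /* count */, &i);
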
The cntbt search can sift through fragmented + * free space and sets of free extents for larger allocation requests + * more quickly than the bnobt. Since the locality hint is just a hint + * and we don't want to scan the entire bnobt for perfect locality, the + * cntbt search essentially bounds the bnobt search such that we can + * find good enough locality at reasonable performance in most cases. + */ + while (xfs_alloc_cur_active(acur->bnolt) || + xfs_alloc_cur_active(acur->bnogt) || + xfs_alloc_cur_active(acur->cnt)) { + + trace_xfs_alloc_cur_lookup(args); /* - * The good extent is closer than this one. + * Search the bnobt left and right. In the case of a hit, finish + * the search in the opposite direction and we're done. */ - if (!dir) { - if (*sbnoa > args->max_agbno) - goto out_use_good; - if (*sbnoa >= args->agbno + gdiff) - goto out_use_good; - } else { - if (*sbnoa < args->min_agbno) - goto out_use_good; - if (*sbnoa <= args->agbno - gdiff) - goto out_use_good; + error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false, + true, 1, &i); + if (error) + return error; + if (i == 1) { + trace_xfs_alloc_cur_left(args); + fbcur = acur->bnogt; + fbinc = true; + break; + } + error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true, + 1, &i); + if (error) + return error; + if (i == 1) { + trace_xfs_alloc_cur_right(args); + fbcur = acur->bnolt; + fbinc = false; + break; } /* - * Same distance, compare length and pick the best. + * Check the extent with best locality based on the current + * extent size search key and keep track of the best candidate. */ - if (*slena >= args->minlen) { - args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); - xfs_alloc_fix_len(args); - - sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, - args->datatype, *sbnoa, - *slena, &new); + error = xfs_alloc_cntbt_iter(args, acur); + if (error) + return error; + if (!xfs_alloc_cur_active(acur->cnt)) { + trace_xfs_alloc_cur_lookup_done(args); + break; + } + } - /* - * Choose closer size and invalidate other cursor. - */ - if (sdiff < gdiff) - goto out_use_search; - goto out_use_good; + /* + * If we failed to find anything due to busy extents, return empty + * handed so the caller can flush and retry. If no busy extents were + * found, walk backwards from the end of the cntbt as a last resort. + */ + if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) { + error = xfs_btree_decrement(acur->cnt, 0, &i); + if (error) + return error; + if (i) { + acur->cnt->bc_private.a.priv.abt.active = true; + fbcur = acur->cnt; + fbinc = false; } + } - if (!dir) - error = xfs_btree_increment(*scur, 0, &i); - else - error = xfs_btree_decrement(*scur, 0, &i); + /* + * Search in the opposite direction for a better entry in the case of + * a bnobt hit or walk backwards from the end of the cntbt. + */ + if (fbcur) { + error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1, + &i); if (error) - goto error0; - } while (i); + return error; + } -out_use_good: - xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); - *scur = NULL; - return 0; + if (acur->len) + *stat = 1; -out_use_search: - xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); - *gcur = NULL; return 0; +} -error0: - /* caller invalidates cursors */ - return error; +/* Check the last block of the cnt btree for allocations. 
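The endgame of the locality search above is a three-way fallback choice; distilled into a decision table (enum and function are illustrative, not kernel code):

    #include <stdbool.h>

    enum fallback { FB_NONE, FB_SEARCH_RIGHT, FB_SEARCH_LEFT, FB_CNT_REVERSE };

    /*
     * A hit walking left finishes to the right, and vice versa; if every
     * cursor is exhausted with no candidate and no busy extents were
     * skipped, walk the cntbt backwards from its end as a last resort.
     * FB_NONE means the caller flushes busy extents and retries.
     */
    static enum fallback pick_fallback(bool hit_left, bool hit_right,
                                       bool have_candidate, bool saw_busy)
    {
            if (hit_left)
                    return FB_SEARCH_RIGHT;
            if (hit_right)
                    return FB_SEARCH_LEFT;
            if (!have_candidate && !saw_busy)
                    return FB_CNT_REVERSE;
            return FB_NONE;
    }
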
*/ +static int +xfs_alloc_ag_vextent_lastblock( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + xfs_agblock_t *bno, + xfs_extlen_t *len, + bool *allocated) +{ + int error; + int i; + +#ifdef DEBUG + /* Randomly don't execute the first algorithm. */ + if (prandom_u32() & 1) + return 0; +#endif + + /* + * Start from the entry that lookup found, sequence through all larger + * free blocks. If we're actually pointing at a record smaller than + * maxlen, go to the start of this block, and skip all those smaller + * than minlen. + */ + if (len || args->alignment > 1) { + acur->cnt->bc_ptrs[0] = 1; + do { + error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); + if (error) + return error; + if (XFS_IS_CORRUPT(args->mp, i != 1)) + return -EFSCORRUPTED; + if (*len >= args->minlen) + break; + error = xfs_btree_increment(acur->cnt, 0, &i); + if (error) + return error; + } while (i); + ASSERT(*len >= args->minlen); + if (!i) + return 0; + } + + error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i); + if (error) + return error; + + /* + * It didn't work. We COULD be in a case where there's a good record + * somewhere, so try again. + */ + if (acur->len == 0) + return 0; + + trace_xfs_alloc_near_first(args); + *allocated = true; + return 0; } /* @@ -1084,41 +1556,17 @@ error0: * and of the form k * prod + mod unless there's nothing that large. * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. */ -STATIC int /* error */ +STATIC int xfs_alloc_ag_vextent_near( - xfs_alloc_arg_t *args) /* allocation argument structure */ + struct xfs_alloc_arg *args) { - xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ - xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ - xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ - xfs_agblock_t gtbno; /* start bno of right side entry */ - xfs_agblock_t gtbnoa; /* aligned ... */ - xfs_extlen_t gtdiff; /* difference to right side entry */ - xfs_extlen_t gtlen; /* length of right side entry */ - xfs_extlen_t gtlena; /* aligned ... */ - xfs_agblock_t gtnew; /* useful start bno of right side */ - int error; /* error code */ - int i; /* result code, temporary */ - int j; /* result code, temporary */ - xfs_agblock_t ltbno; /* start bno of left side entry */ - xfs_agblock_t ltbnoa; /* aligned ... */ - xfs_extlen_t ltdiff; /* difference to left side entry */ - xfs_extlen_t ltlen; /* length of left side entry */ - xfs_extlen_t ltlena; /* aligned ... */ - xfs_agblock_t ltnew; /* useful start bno of left side */ - xfs_extlen_t rlen; /* length of returned extent */ - bool busy; - unsigned busy_gen; -#ifdef DEBUG - /* - * Randomly don't execute the first algorithm. - */ - int dofirst; /* set to do first algorithm */ + struct xfs_alloc_cur acur = {}; + int error; /* error code */ + int i; /* result code, temporary */ + xfs_agblock_t bno; + xfs_extlen_t len; - dofirst = prandom_u32() & 1; -#endif - - /* handle unitialized agbno range so caller doesn't have to */ + /* handle uninitialized agbno range so caller doesn't have to */ if (!args->min_agbno && !args->max_agbno) args->max_agbno = args->mp->m_sb.sb_agblocks - 1; ASSERT(args->min_agbno <= args->max_agbno); @@ -1130,40 +1578,27 @@ xfs_alloc_ag_vextent_near( args->agbno = args->max_agbno; restart: - bno_cur_lt = NULL; - bno_cur_gt = NULL; - ltlen = 0; - gtlena = 0; - ltlena = 0; - busy = false; + len = 0; /* - * Get a cursor for the by-size btree. + * Set up cursors and see if there are any free extents as big as + * maxlen. 
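As the restart path here relies on, xfs_alloc_cur_setup() folds the availability probe into its initial cntbt lookup: -ENOSPC means no maxlen extent exists anywhere, so the caller tries a small allocation instead. Distilled (lookup_ge() is a stand-in for xfs_alloc_lookup_ge()):

    #include <errno.h>

    static int cur_setup(int (*lookup_ge)(unsigned int len, int *stat),
                         unsigned int maxlen)
    {
            int stat;
            int error = lookup_ge(maxlen, &stat);

            if (error)
                    return error;
            return stat == 1 ? 0 : -ENOSPC; /* caller falls back to _small() */
    }
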
If not, pick the last entry in the tree unless the tree is + * empty. */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); - - /* - * See if there are any free extents as big as maxlen. - */ - if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) - goto error0; - /* - * If none, then pick up the last entry in the tree unless the - * tree is empty. - */ - if (!i) { - if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, <bno, - <len, &i))) - goto error0; - if (i == 0 || ltlen == 0) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + error = xfs_alloc_cur_setup(args, &acur); + if (error == -ENOSPC) { + error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno, + &len, &i); + if (error) + goto out; + if (i == 0 || len == 0) { trace_xfs_alloc_near_noentry(args); - return 0; + goto out; } ASSERT(i == 1); + } else if (error) { + goto out; } - args->wasfromfl = 0; /* * First algorithm. @@ -1172,311 +1607,47 @@ restart: * near the right edge of the tree. If it's in the last btree leaf * block, then we just examine all the entries in that block * that are big enough, and pick the best one. - * This is written as a while loop so we can break out of it, - * but we never loop back to the top. */ - while (xfs_btree_islastblock(cnt_cur, 0)) { - xfs_extlen_t bdiff; - int besti=0; - xfs_extlen_t blen=0; - xfs_agblock_t bnew=0; + if (xfs_btree_islastblock(acur.cnt, 0)) { + bool allocated = false; -#ifdef DEBUG - if (dofirst) - break; -#endif - /* - * Start from the entry that lookup found, sequence through - * all larger free blocks. If we're actually pointing at a - * record smaller than maxlen, go to the start of this block, - * and skip all those smaller than minlen. - */ - if (ltlen || args->alignment > 1) { - cnt_cur->bc_ptrs[0] = 1; - do { - if ((error = xfs_alloc_get_rec(cnt_cur, <bno, - <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - if (ltlen >= args->minlen) - break; - if ((error = xfs_btree_increment(cnt_cur, 0, &i))) - goto error0; - } while (i); - ASSERT(ltlen >= args->minlen); - if (!i) - break; - } - i = cnt_cur->bc_ptrs[0]; - for (j = 1, blen = 0, bdiff = 0; - !error && j && (blen < args->maxlen || bdiff > 0); - error = xfs_btree_increment(cnt_cur, 0, &j)) { - /* - * For each entry, decide if it's better than - * the previous best entry. - */ - if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - busy = xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena, &busy_gen); - if (ltlena < args->minlen) - continue; - if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) - continue; - args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); - xfs_alloc_fix_len(args); - ASSERT(args->len >= args->minlen); - if (args->len < blen) - continue; - ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->datatype, ltbnoa, - ltlena, <new); - if (ltnew != NULLAGBLOCK && - (args->len > blen || ltdiff < bdiff)) { - bdiff = ltdiff; - bnew = ltnew; - blen = args->len; - besti = cnt_cur->bc_ptrs[0]; - } - } - /* - * It didn't work. We COULD be in a case where - * there's a good record somewhere, so try again. - */ - if (blen == 0) - break; - /* - * Point at the best entry, and retrieve it again. 
- */ - cnt_cur->bc_ptrs[0] = besti; - if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - args->len = blen; - - /* - * We are allocating starting at bnew for blen blocks. - */ - args->agbno = bnew; - ASSERT(bnew >= ltbno); - ASSERT(bnew + blen <= ltbno + ltlen); - /* - * Set up a cursor for the by-bno tree. - */ - bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO); - /* - * Fix up the btree entries. - */ - if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, - ltlen, bnew, blen, XFSA_FIXUP_CNT_OK))) - goto error0; - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - - trace_xfs_alloc_near_first(args); - return 0; - } - /* - * Second algorithm. - * Search in the by-bno tree to the left and to the right - * simultaneously, until in each case we find a space big enough, - * or run into the edge of the tree. When we run into the edge, - * we deallocate that cursor. - * If both searches succeed, we compare the two spaces and pick - * the better one. - * With alignment, it's possible for both to fail; the upper - * level algorithm that picks allocation groups for allocations - * is not supposed to do this. - */ - /* - * Allocate and initialize the cursor for the leftward search. - */ - bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); - /* - * Lookup <= bno to find the leftward search's starting point. - */ - if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i))) - goto error0; - if (!i) { - /* - * Didn't find anything; use this cursor for the rightward - * search. - */ - bno_cur_gt = bno_cur_lt; - bno_cur_lt = NULL; - } - /* - * Found something. Duplicate the cursor for the rightward search. - */ - else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt))) - goto error0; - /* - * Increment the cursor, so we will point at the entry just right - * of the leftward entry if any, or to the leftmost entry. - */ - if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) - goto error0; - if (!i) { - /* - * It failed, there are no rightward entries. - */ - xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); - bno_cur_gt = NULL; + error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len, + &allocated); + if (error) + goto out; + if (allocated) + goto alloc_finish; } - /* - * Loop going left with the leftward cursor, right with the - * rightward cursor, until either both directions give up or - * we find an entry at least as big as minlen. 
- */ - do { - if (bno_cur_lt) { - if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena, &busy_gen); - if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) - break; - if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) - goto error0; - if (!i || ltbnoa < args->min_agbno) { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - } - if (bno_cur_gt) { - if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen, - >bnoa, >lena, &busy_gen); - if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) - break; - if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) - goto error0; - if (!i || gtbnoa > args->max_agbno) { - xfs_btree_del_cursor(bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - } - } while (bno_cur_lt || bno_cur_gt); /* - * Got both cursors still active, need to find better entry. + * Second algorithm. Combined cntbt and bnobt search to find ideal + * locality. */ - if (bno_cur_lt && bno_cur_gt) { - if (ltlena >= args->minlen) { - /* - * Left side is good, look for a right side entry. - */ - args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); - xfs_alloc_fix_len(args); - ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->datatype, ltbnoa, - ltlena, <new); - - error = xfs_alloc_find_best_extent(args, - &bno_cur_lt, &bno_cur_gt, - ltdiff, >bno, >len, - >bnoa, >lena, - 0 /* search right */); - } else { - ASSERT(gtlena >= args->minlen); - - /* - * Right side is good, look for a left side entry. - */ - args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); - xfs_alloc_fix_len(args); - gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->datatype, gtbnoa, - gtlena, >new); - - error = xfs_alloc_find_best_extent(args, - &bno_cur_gt, &bno_cur_lt, - gtdiff, <bno, <len, - <bnoa, <lena, - 1 /* search left */); - } - - if (error) - goto error0; - } + error = xfs_alloc_ag_vextent_locality(args, &acur, &i); + if (error) + goto out; /* * If we couldn't get anything, give up. */ - if (bno_cur_lt == NULL && bno_cur_gt == NULL) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - - if (busy) { + if (!acur.len) { + if (acur.busy) { trace_xfs_alloc_near_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, busy_gen); + xfs_extent_busy_flush(args->mp, args->pag, + acur.busy_gen); goto restart; } trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; - return 0; + goto out; } - /* - * At this point we have selected a freespace entry, either to the - * left or to the right. If it's on the right, copy all the - * useful variables to the "left" set so we only have one - * copy of this code. - */ - if (bno_cur_gt) { - bno_cur_lt = bno_cur_gt; - bno_cur_gt = NULL; - ltbno = gtbno; - ltbnoa = gtbnoa; - ltlen = gtlen; - ltlena = gtlena; - j = 1; - } else - j = 0; +alloc_finish: + /* fix up btrees on a successful allocation */ + error = xfs_alloc_cur_finish(args, &acur); - /* - * Fix up the length and compute the useful address. 
- */ - args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, - args->datatype, ltbnoa, ltlena, <new); - ASSERT(ltnew >= ltbno); - ASSERT(ltnew + rlen <= ltbnoa + ltlena); - ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno); - args->agbno = ltnew; - - if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, - ltnew, rlen, XFSA_FIXUP_BNO_OK))) - goto error0; - - if (j) - trace_xfs_alloc_near_greater(args); - else - trace_xfs_alloc_near_lesser(args); - - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - return 0; - - error0: - trace_xfs_alloc_near_error(args); - if (cnt_cur != NULL) - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); - if (bno_cur_lt != NULL) - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR); - if (bno_cur_gt != NULL) - xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR); +out: + xfs_alloc_cur_close(&acur, error); return error; } @@ -1545,7 +1716,10 @@ restart: error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); @@ -1579,8 +1753,13 @@ restart: * This can't happen in the second case above. */ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || - (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (XFS_IS_CORRUPT(args->mp, + rlen != 0 && + (rlen > flen || + rbno + rlen > fbno + flen))) { + error = -EFSCORRUPTED; + goto error0; + } if (rlen < args->maxlen) { xfs_agblock_t bestfbno; xfs_extlen_t bestflen; @@ -1599,15 +1778,22 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if (flen < bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || - (rlen <= flen && rbno + rlen <= fbno + flen), - error0); + if (XFS_IS_CORRUPT(args->mp, + rlen != 0 && + (rlen > flen || + rbno + rlen > fbno + flen))) { + error = -EFSCORRUPTED; + goto error0; + } if (rlen > bestrlen) { bestrlen = rlen; bestrbno = rbno; @@ -1620,7 +1806,10 @@ restart: if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } rlen = bestrlen; rbno = bestrbno; flen = bestflen; @@ -1643,7 +1832,10 @@ restart: xfs_alloc_fix_len(args); rlen = args->len; - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); + if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Allocate and initialize a cursor for the by-block tree. 
*/ @@ -1657,10 +1849,13 @@ restart: cnt_cur = bno_cur = NULL; args->len = rlen; args->agbno = rbno; - XFS_WANT_CORRUPTED_GOTO(args->mp, - args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), - error0); + if (XFS_IS_CORRUPT(args->mp, + args->agbno + args->len > + be32_to_cpu( + XFS_BUF_TO_AGF(args->agbp)->agf_length))) { + error = -EFSCORRUPTED; + goto error0; + } trace_xfs_alloc_size_done(args); return 0; @@ -1732,7 +1927,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * It's not contiguous, though. */ @@ -1744,8 +1942,10 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(mp, - ltbno + ltlen <= bno, error0); + if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { + error = -EFSCORRUPTED; + goto error0; + } } } /* @@ -1760,7 +1960,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * It's not contiguous, though. */ @@ -1772,7 +1975,10 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0); + if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { + error = -EFSCORRUPTED; + goto error0; + } } } /* @@ -1789,31 +1995,49 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Delete the old by-size entry on the right. */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Delete the old by-block entry for the right block. */ if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Move the by-block cursor back to the left neighbor. 
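The coalescing bookkeeping in xfs_free_ag_extent() above reduces to simple interval arithmetic once the btree manipulation is stripped away; a model (struct ext is illustrative):

    #include <stdint.h>

    struct ext { uint32_t bno, len; };

    /*
     * A freed range absorbs a left neighbour that ends exactly at its
     * start and a right neighbour that starts exactly at its end; pass
     * NULL for a missing or non-contiguous neighbour.
     */
    static struct ext merge_free(struct ext freed, const struct ext *left,
                                 const struct ext *right)
    {
            if (left && left->bno + left->len == freed.bno) {
                    freed.bno = left->bno;
                    freed.len += left->len;
            }
            if (right && freed.bno + freed.len == right->bno)
                    freed.len += right->len;
            return freed;
    }
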
*/ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } #ifdef DEBUG /* * Check that this is the right record: delete didn't @@ -1826,9 +2050,13 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, - i == 1 && xxbno == ltbno && xxlen == ltlen, - error0); + if (XFS_IS_CORRUPT(mp, + i != 1 || + xxbno != ltbno || + xxlen != ltlen)) { + error = -EFSCORRUPTED; + goto error0; + } } #endif /* @@ -1849,17 +2077,26 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Back up the by-block cursor to the left neighbor, and * update its length. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } nbno = ltbno; nlen = len + ltlen; if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) @@ -1875,10 +2112,16 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Update the starting block and length of the right * neighbor in the by-block tree. @@ -1897,7 +2140,10 @@ xfs_free_ag_extent( nlen = len; if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; @@ -1906,10 +2152,16 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; @@ -1989,30 +2241,39 @@ xfs_alloc_longest_free_extent( * reservations and AGFL rules in place, we can return this extent. */ if (pag->pagf_longest > delta) - return pag->pagf_longest - delta; + return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, + pag->pagf_longest - delta); /* Otherwise, let the caller try for 1 block if there's space. */ return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +/* + * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, + * return the largest possible minimum length. + */ unsigned int xfs_alloc_min_freelist( struct xfs_mount *mp, struct xfs_perag *pag) { + /* AG btrees have at least 1 level. */ + static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; + const uint8_t *levels = pag ? 
pag->pagf_levels : fake_levels; unsigned int min_free; + ASSERT(mp->m_ag_maxlevels > 0); + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, mp->m_ag_maxlevels); /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - min_free += min_t(unsigned int, - pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } @@ -2086,9 +2347,11 @@ xfs_free_agfl_block( if (error) return error; - bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno); - if (!bp) - return -EFSCORRUPTED; + error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, + XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), + tp->t_mountp->m_bsize, 0, &bp); + if (error) + return error; xfs_trans_binval(tp, bp); return 0; @@ -2205,7 +2468,7 @@ xfs_defer_agfl_block( ASSERT(xfs_bmap_free_item_zone != NULL); ASSERT(oinfo != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); new->xefi_blockcount = 1; new->xefi_oinfo = *oinfo; @@ -2239,12 +2502,11 @@ xfs_alloc_fix_freelist( if (!pag->pagf_init) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); - if (error) + if (error) { + /* Couldn't lock the AGF so skip this AG. */ + if (error == -EAGAIN) + error = 0; goto out_no_agbp; - if (!pag->pagf_init) { - ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - goto out_agbp_relse; } } @@ -2253,7 +2515,7 @@ xfs_alloc_fix_freelist( * somewhere else if we are not being asked to try harder at this * point */ - if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) && + if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; @@ -2270,11 +2532,10 @@ xfs_alloc_fix_freelist( */ if (!agbp) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); - if (error) - goto out_no_agbp; - if (!agbp) { - ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + if (error) { + /* Couldn't lock the AGF so skip this AG. */ + if (error == -EAGAIN) + error = 0; goto out_no_agbp; } } @@ -2505,11 +2766,10 @@ xfs_alloc_pagf_init( xfs_buf_t *bp; int error; - if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) - return error; - if (bp) + error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp); + if (!error) xfs_trans_brelse(tp, bp); - return 0; + return error; } /* @@ -2695,14 +2955,11 @@ xfs_read_agf( trace_xfs_read_agf(mp, agno); ASSERT(agno != NULLAGNUMBER); - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; - if (!*bpp) - return 0; ASSERT(!(*bpp)->b_error); xfs_buf_set_ref(*bpp, XFS_AGF_REF); @@ -2726,14 +2983,15 @@ xfs_alloc_read_agf( trace_xfs_alloc_read_agf(mp, agno); + /* We don't support trylock when freeing. 
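The trylock convention changed by the hunks above is easy to misread, so spelled out: xfs_alloc_read_agf() now reports a failed XFS_ALLOC_FLAG_TRYLOCK as -EAGAIN instead of a NULL buffer, and xfs_alloc_fix_freelist() swallows that to mean "skip this AG". Schematically (read_agf() is a stand-in for the locked read):

    #include <errno.h>

    static int fix_freelist_step(int (*read_agf)(void))
    {
            int error = read_agf();

            if (error == -EAGAIN)
                    return 0;       /* couldn't lock the AGF; try another AG */
            return error;
    }
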
*/ + ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != + (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, bpp); if (error) return error; - if (!*bpp) - return 0; ASSERT(!(*bpp)->b_error); agf = XFS_BUF_TO_AGF(*bpp); @@ -2956,13 +3214,6 @@ xfs_alloc_vextent( args->len); #endif - /* Zero the extent if we were asked to do so */ - if (args->datatype & XFS_ALLOC_USERDATA_ZERO) { - error = xfs_zero_extent(args->ip, args->fsbno, args->len); - if (error) - goto error0; - } - } xfs_perag_put(args->pag); return 0; @@ -3038,12 +3289,18 @@ __xfs_free_extent( if (error) return error; - XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err); + if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { + error = -EFSCORRUPTED; + goto err; + } /* validate the extent size is legal now we have the agf locked */ - XFS_WANT_CORRUPTED_GOTO(mp, - agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), - err); + if (XFS_IS_CORRUPT(mp, + agbno + len > + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) { + error = -EFSCORRUPTED; + goto err; + } error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d6ed5d2c07c2..7380fbe4a3ff 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -54,7 +54,6 @@ typedef struct xfs_alloc_arg { struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for a.g. freelist header */ struct xfs_perag *pag; /* per-ag struct for this agno */ - struct xfs_inode *ip; /* for userdata zeroing method */ xfs_fsblock_t fsbno; /* file system block number */ xfs_agnumber_t agno; /* allocation group number */ xfs_agblock_t agbno; /* allocation group-relative block # */ @@ -83,20 +82,7 @@ typedef struct xfs_alloc_arg { */ #define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ - -static inline bool -xfs_alloc_is_userdata(int datatype) -{ - return (datatype & ~XFS_ALLOC_NOBUSY) != 0; -} - -static inline bool -xfs_alloc_allow_busy_reuse(int datatype) -{ - return (datatype & XFS_ALLOC_NOBUSY) == 0; -} +#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ /* freespace limit calculations */ #define XFS_ALLOC_AGFL_RESERVE 4 diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 2a94543857a1..279694d73e4e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -507,6 +507,7 @@ xfs_allocbt_init_cursor( cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; + cur->bc_private.a.priv.abt.active = false; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index d48fcf11cc35..e6149720ce02 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -62,6 +62,7 @@ xfs_attr_args_init( struct xfs_da_args *args, struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags) { @@ -74,7 +75,7 @@ xfs_attr_args_init( args->dp = dp; args->flags = flags; args->name = name; - args->namelen = strlen((const char *)name); + args->namelen = namelen; if (args->namelen >= MAXNAMELEN) return -EFAULT; /* match IRIX behaviour */ @@ 
-97,7 +98,10 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ -/* Retrieve an extended attribute and its value. Must have ilock. */ +/* + * Retrieve an extended attribute and its value. Must have ilock. + * Returns 0 on successful retrieval, otherwise an error. + */ int xfs_attr_get_ilocked( struct xfs_inode *ip, @@ -115,12 +119,29 @@ xfs_attr_get_ilocked( return xfs_attr_node_get(args); } -/* Retrieve an extended attribute by name, and its value. */ +/* + * Retrieve an extended attribute by name, and its value if requested. + * + * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, + * just an indication whether the attribute exists and the size of the value if + * it exists. The size is returned in @valuelenp, + * + * If the attribute is found, but exceeds the size limit set by the caller in + * @valuelenp, return -ERANGE with the size of the attribute that was found in + * @valuelenp. + * + * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after + * existence of the attribute has been determined. On success, return that + * buffer to the caller and leave them to free it. On failure, free any + * allocated buffer and ensure the buffer pointer returned to the caller is + * null. + */ int xfs_attr_get( struct xfs_inode *ip, const unsigned char *name, - unsigned char *value, + size_t namelen, + unsigned char **value, int *valuelenp, int flags) { @@ -128,26 +149,40 @@ xfs_attr_get( uint lock_mode; int error; + ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); + XFS_STATS_INC(ip->i_mount, xs_attr_get); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, ip, name, flags); + error = xfs_attr_args_init(&args, ip, name, namelen, flags); if (error) return error; - args.value = value; - args.valuelen = *valuelenp; /* Entirely possible to look up a name which doesn't exist */ args.op_flags = XFS_DA_OP_OKNOENT; + if (flags & ATTR_ALLOC) + args.op_flags |= XFS_DA_OP_ALLOCVAL; + else + args.value = *value; + args.valuelen = *valuelenp; lock_mode = xfs_ilock_attr_map_shared(ip); error = xfs_attr_get_ilocked(ip, &args); xfs_iunlock(ip, lock_mode); - *valuelenp = args.valuelen; - return error == -EEXIST ? 
0 : error; + + /* on error, we have to clean up allocated value buffers */ + if (error) { + if (flags & ATTR_ALLOC) { + kmem_free(args.value); + *value = NULL; + } + return error; + } + *value = args.value; + return 0; } /* @@ -305,6 +340,7 @@ int xfs_attr_set( struct xfs_inode *dp, const unsigned char *name, + size_t namelen, unsigned char *value, int valuelen, int flags) @@ -320,7 +356,7 @@ xfs_attr_set( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); + error = xfs_attr_args_init(&args, dp, name, namelen, flags); if (error) return error; @@ -409,6 +445,7 @@ int xfs_attr_remove( struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags) { struct xfs_mount *mp = dp->i_mount; @@ -420,7 +457,7 @@ xfs_attr_remove( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); + error = xfs_attr_args_init(&args, dp, name, namelen, flags); if (error) return error; @@ -556,7 +593,7 @@ xfs_attr_leaf_addname( */ dp = args->dp; args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -682,7 +719,7 @@ xfs_attr_leaf_addname( * remove the "old" attr from that block (neat, huh!) */ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - -1, &bp); + &bp); if (error) return error; @@ -736,7 +773,7 @@ xfs_attr_leaf_removename( */ dp = args->dp; args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -768,6 +805,8 @@ xfs_attr_leaf_removename( * * This leaf block cannot have a "remote" value, we only call this routine * if bmap_one_block() says there is only one block (ie: no remote blks). + * + * Returns 0 on successful retrieval, otherwise an error. */ STATIC int xfs_attr_leaf_get(xfs_da_args_t *args) @@ -778,7 +817,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -789,9 +828,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args) } error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); - if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { - error = xfs_attr_rmtval_get(args); - } return error; } @@ -975,7 +1011,7 @@ restart: * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. 
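With ATTR_ALLOC the get path allocates the value buffer itself once the attribute's size is known; on success the caller owns and frees it, and on failure xfs_attr_get() frees it and nulls the pointer, as the cleanup above guarantees. A hedged caller sketch, not a verbatim in-tree caller (kernel context assumed; the explicit length argument matches the new xfs_attr_args_init() signature, and passing XFS_XATTR_SIZE_MAX as the cap follows the ioctl callers):

    unsigned char *value = NULL;
    int valuelen = XFS_XATTR_SIZE_MAX;  // upper bound; -ERANGE if larger
    int error;

    error = xfs_attr_get(ip, name, strlen(name), &value, &valuelen,
                         ATTR_ALLOC | ATTR_ROOT);
    if (error)
            return error;       // value is guaranteed NULL on failure
    // ... use value[0 .. valuelen) ...
    kmem_free(value);           // caller owns the buffer on success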
*/ - args->flags |= XFS_ATTR_INCOMPLETE; + args->op_flags |= XFS_DA_OP_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; @@ -1141,7 +1177,7 @@ xfs_attr_node_removename( ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); if (error) goto out; @@ -1234,10 +1270,9 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da3_node_read(state->args->trans, - state->args->dp, - blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); if (error) return error; } else { @@ -1253,10 +1288,9 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da3_node_read(state->args->trans, - state->args->dp, - blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); if (error) return error; } else { @@ -1268,11 +1302,13 @@ xfs_attr_refillstate(xfs_da_state_t *state) } /* - * Look up a filename in a node attribute list. + * Retrieve the attribute data from a node attribute list. * * This routine gets called for any attribute fork that has more than one * block, ie: both true Btree attr lists and for single-leaf-blocks with * "remote" values taking up more blocks. + * + * Returns 0 on successful retrieval, otherwise an error. */ STATIC int xfs_attr_node_get(xfs_da_args_t *args) @@ -1294,24 +1330,21 @@ xfs_attr_node_get(xfs_da_args_t *args) error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; - } else if (retval == -EEXIST) { - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->bp != NULL); - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - - /* - * Get the value, local or "remote" - */ - retval = xfs_attr3_leaf_getvalue(blk->bp, args); - if (!retval && (args->rmtblkno > 0) - && !(args->flags & ATTR_KERNOVAL)) { - retval = xfs_attr_rmtval_get(args); - } + goto out_release; } + if (retval != -EEXIST) + goto out_release; + + /* + * Get the value, local or "remote" + */ + blk = &state->path.blk[state->path.active - 1]; + retval = xfs_attr3_leaf_getvalue(blk->bp, args); /* * If not in a transaction, we have to release all the buffers. 
*/ +out_release: for (i = 0; i < state->path.active; i++) { xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index ff28ebf3b635..4243b2272642 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -26,7 +26,7 @@ struct xfs_attr_list_context; *========================================================================*/ -#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */ #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ #define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ @@ -37,6 +37,10 @@ struct xfs_attr_list_context; #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ #define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ +#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */ + +#define ATTR_KERNEL_FLAGS \ + (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC) #define XFS_ATTR_FLAGS \ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ @@ -47,7 +51,8 @@ struct xfs_attr_list_context; { ATTR_REPLACE, "REPLACE" }, \ { ATTR_KERNOTIME, "KERNOTIME" }, \ { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" } + { ATTR_INCOMPLETE, "INCOMPLETE" }, \ + { ATTR_ALLOC, "ALLOC" } /* * The maximum size (into the kernel or returned from the kernel) of an @@ -143,11 +148,13 @@ int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char *value, int *valuelenp, int flags); + size_t namelen, unsigned char **value, int *valuelenp, + int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - unsigned char *value, int valuelen, int flags); + size_t namelen, unsigned char *value, int valuelen, int flags); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 70eb941d02e4..fed537a4353d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -233,6 +233,61 @@ xfs_attr3_leaf_hdr_to_disk( } static xfs_failaddr_t +xfs_attr3_leaf_verify_entry( + struct xfs_mount *mp, + char *buf_end, + struct xfs_attr_leafblock *leaf, + struct xfs_attr3_icleaf_hdr *leafhdr, + struct xfs_attr_leaf_entry *ent, + int idx, + __u32 *last_hashval) +{ + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + char *name_end; + unsigned int nameidx; + unsigned int namesize; + __u32 hashval; + + /* hash order check */ + hashval = be32_to_cpu(ent->hashval); + if (hashval < *last_hashval) + return __this_address; + *last_hashval = hashval; + + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr->firstused || nameidx >= mp->m_attr_geo->blksize) + return __this_address; + + /* + * Check the name information. The namelen fields are u8 so we can't + * possibly exceed the maximum name length of 255 bytes. 
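xfs_attr3_leaf_verify_entry() implements the hash-order check that the old "XXX" comments (removed below) had deferred: entries must carry non-decreasing hashvals, threaded through the verifier loop as last_hashval. The invariant in isolation (standalone sketch, not the kernel types):

    #include <stdint.h>
    #include <stdio.h>

    // Returns 0 if hashvals are non-decreasing, -1 at the first violation.
    static int check_hash_order(const uint32_t *hashval, int count)
    {
            uint32_t last = 0;

            for (int i = 0; i < count; i++) {
                    if (hashval[i] < last)
                            return -1;      // corrupt: out-of-order entry
                    last = hashval[i];
            }
            return 0;
    }

    int main(void)
    {
            uint32_t good[] = { 1, 7, 7, 42 };   // duplicates are legal
            uint32_t bad[]  = { 1, 7, 3 };

            printf("%d %d\n", check_hash_order(good, 4),
                              check_hash_order(bad, 3));   // 0 -1
            return 0;
    }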
+ */ + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, idx); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, + be16_to_cpu(lentry->valuelen)); + name_end = (char *)lentry + namesize; + if (lentry->namelen == 0) + return __this_address; + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, idx); + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + name_end = (char *)rentry + namesize; + if (rentry->namelen == 0) + return __this_address; + if (!(ent->flags & XFS_ATTR_INCOMPLETE) && + rentry->valueblk == 0) + return __this_address; + } + + if (name_end > buf_end) + return __this_address; + + return NULL; +} + +static xfs_failaddr_t xfs_attr3_leaf_verify( struct xfs_buf *bp) { @@ -240,7 +295,10 @@ xfs_attr3_leaf_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *ent; + char *buf_end; uint32_t end; /* must be 32bit - see below */ + __u32 last_hashval = 0; int i; xfs_failaddr_t fa; @@ -273,8 +331,13 @@ xfs_attr3_leaf_verify( (char *)bp->b_addr + ichdr.firstused) return __this_address; - /* XXX: need to range check rest of attr header values */ - /* XXX: hash order check? */ + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < ichdr.count; ent++, i++) { + fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr, + ent, i, &last_hashval); + if (fa) + return fa; + } /* * Quickly check the freemap information. Attribute data has to be @@ -367,13 +430,12 @@ xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mappedbno, struct xfs_buf **bpp) { int err; - err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); + err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK, + &xfs_attr3_leaf_buf_ops); if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); return err; @@ -393,6 +455,50 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); } +static int +xfs_attr_copy_value( + struct xfs_da_args *args, + unsigned char *value, + int valuelen) +{ + /* + * No copy if all we have to do is get the length + */ + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return 0; + } + + /* + * No copy if the length of the existing buffer is too small + */ + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return -ERANGE; + } + + if (args->op_flags & XFS_DA_OP_ALLOCVAL) { + args->value = kmem_alloc_large(valuelen, 0); + if (!args->value) + return -ENOMEM; + } + args->valuelen = valuelen; + + /* remote block xattr requires IO for copy-in */ + if (args->rmtblkno) + return xfs_attr_rmtval_get(args); + + /* + * This is to prevent a GCC warning because the remote xattr case + * doesn't have a value to pass in. In that case, we never reach here, + * but GCC can't work that out and so throws a "passing NULL to + * memcpy" warning. + */ + if (!value) + return -EINVAL; + memcpy(args->value, value, valuelen); + return 0; +} /*======================================================================== * External routines when attribute fork size < XFS_LITINO(mp). @@ -409,13 +515,15 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) * special case for dev/uuid inodes, they have fixed size data forks. 
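xfs_attr_copy_value() above funnels all three get-path outcomes through one helper: a pure size query (ATTR_KERNOVAL), an undersized caller buffer (-ERANGE, reporting the real size), and the copy itself, allocating first when XFS_DA_OP_ALLOCVAL is set (those callers pass a generous cap in valuelen, so the -ERANGE check still runs first). A standalone model of that ladder, with the remote-value I/O elided:

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    #define ATTR_KERNOVAL  0x2000   // values as in the patched headers
    #define OP_ALLOCVAL    0x0001   // stand-in for XFS_DA_OP_ALLOCVAL

    struct args { int flags, op_flags, valuelen; unsigned char *value; };

    static int copy_value(struct args *a, const unsigned char *v, int vlen)
    {
            if (a->flags & ATTR_KERNOVAL) {   // size query only
                    a->valuelen = vlen;
                    return 0;
            }
            if (a->valuelen < vlen) {         // report the real size
                    a->valuelen = vlen;
                    return -ERANGE;
            }
            if (a->op_flags & OP_ALLOCVAL) {
                    a->value = malloc(vlen);  // caller frees on success
                    if (!a->value)
                            return -ENOMEM;
            }
            a->valuelen = vlen;
            memcpy(a->value, v, vlen);        // remote case does I/O instead
            return 0;
    }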
*/ int -xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) +xfs_attr_shortform_bytesfit( + struct xfs_inode *dp, + int bytes) { - int offset; - int minforkoff; /* lower limit on valid forkoff locations */ - int maxforkoff; /* upper limit on valid forkoff locations */ - int dsize; - xfs_mount_t *mp = dp->i_mount; + struct xfs_mount *mp = dp->i_mount; + int64_t dsize; + int minforkoff; + int maxforkoff; + int offset; /* rounded down */ offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; @@ -481,7 +589,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. */ - minforkoff = max(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ @@ -720,15 +828,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) } /* - * Look up a name in a shortform attribute list structure. + * Retrieve the attribute value and length. + * + * If ATTR_KERNOVAL is specified, only the length needs to be returned. + * Unlike a lookup, we only return an error if the attribute does not + * exist or we can't retrieve the value. */ -/*ARGSUSED*/ int -xfs_attr_shortform_getvalue(xfs_da_args_t *args) +xfs_attr_shortform_getvalue( + struct xfs_da_args *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - int i; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int i; ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; @@ -741,18 +853,8 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) continue; if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = sfe->valuelen; - return -EEXIST; - } - if (args->valuelen < sfe->valuelen) { - args->valuelen = sfe->valuelen; - return -ERANGE; - } - args->valuelen = sfe->valuelen; - memcpy(args->value, &sfe->nameval[args->namelen], - args->valuelen); - return -EEXIST; + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], + sfe->valuelen); } return -ENOATTR; } @@ -782,38 +884,23 @@ xfs_attr_shortform_to_leaf( ifp = dp->i_afp; sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); - tmpbuffer = kmem_alloc(size, KM_SLEEP); + tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, ifp->if_u1.if_data, size); sf = (xfs_attr_shortform_t *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); - xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); + xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); bp = NULL; error = xfs_da_grow_inode(args, &blkno); - if (error) { - /* - * If we hit an IO error middle of the transaction inside - * grow_inode(), we may have inconsistent data. Bail out. 
- */ - if (error == -EIO) - goto out; - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ - memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + if (error) goto out; - } ASSERT(blkno == 0); error = xfs_attr3_leaf_create(args, blkno, &bp); - if (error) { - /* xfs_attr3_leaf_create may not have instantiated a block */ - if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0)) - goto out; - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ - memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + if (error) goto out; - } memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; @@ -901,7 +988,7 @@ xfs_attr_shortform_verify( char *endp; struct xfs_ifork *ifp; int i; - int size; + int64_t size; ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL); ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); @@ -985,7 +1072,7 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + tmpbuffer = kmem_alloc(args->geo->blksize, 0); if (!tmpbuffer) return -ENOMEM; @@ -1057,7 +1144,6 @@ xfs_attr3_leaf_to_node( struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr icleafhdr; struct xfs_attr_leaf_entry *entries; - struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr icnodehdr; struct xfs_da_intnode *node; struct xfs_inode *dp = args->dp; @@ -1072,11 +1158,11 @@ xfs_attr3_leaf_to_node( error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1); if (error) goto out; - error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK); + error = xfs_da_get_buf(args->trans, dp, blkno, &bp2, XFS_ATTR_FORK); if (error) goto out; @@ -1097,18 +1183,17 @@ xfs_attr3_leaf_to_node( if (error) goto out; node = bp1->b_addr; - dp->d_ops->node_hdr_from_disk(&icnodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(mp, &icnodehdr, node); leaf = bp2->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); /* both on-disk, don't endian-flip twice */ - btree[0].hashval = entries[icleafhdr.count - 1].hashval; - btree[0].before = cpu_to_be32(blkno); + icnodehdr.btree[0].hashval = entries[icleafhdr.count - 1].hashval; + icnodehdr.btree[0].before = cpu_to_be32(blkno); icnodehdr.count = 1; - dp->d_ops->node_hdr_to_disk(node, &icnodehdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &icnodehdr); xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1); error = 0; out: @@ -1138,7 +1223,7 @@ xfs_attr3_leaf_create( trace_xfs_attr_leaf_create(args); - error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, + error = xfs_da_get_buf(args->trans, args->dp, blkno, &bp, XFS_ATTR_FORK); if (error) return error; @@ -1424,7 +1509,9 @@ xfs_attr3_leaf_add_work( for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { if (ichdr->freemap[i].base == tmp) { ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); - ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= + min_t(uint16_t, ichdr->freemap[i].size, + sizeof(xfs_attr_leaf_entry_t)); } } ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); @@ -1448,7 +1535,7 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + tmpbuffer = kmem_alloc(args->geo->blksize, 0); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; @@ 
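The freemap fix in xfs_attr3_leaf_add_work() above clamps the subtraction with min_t() because freemap[i].size is a u16: subtracting sizeof(xfs_attr_leaf_entry_t) from a smaller remaining size used to wrap to a huge value instead of bottoming out at zero. The failure mode in isolation:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint16_t size = 4;       // remaining freemap bytes
            uint16_t entsize = 8;    // sizeof(xfs_attr_leaf_entry_t)

            uint16_t wrapped = size - entsize;   // old behaviour
            uint16_t clamped = size - (entsize < size ? entsize : size);

            printf("wrapped=%u clamped=%u\n", wrapped, clamped);  // 65532 0
            return 0;
    }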
-1908,7 +1995,7 @@ xfs_attr3_leaf_toosmall( if (blkno == 0) continue; error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, - blkno, -1, &bp); + blkno, &bp); if (error) return error; @@ -2167,7 +2254,7 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); + tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0); /* * Copy the header into the temp leaf so that all the stuff @@ -2258,8 +2345,10 @@ xfs_attr3_leaf_lookup_int( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - if (ichdr.count >= args->geo->blksize / 8) + if (ichdr.count >= args->geo->blksize / 8) { + xfs_buf_corruption_error(bp); return -EFSCORRUPTED; + } /* * Binary search. (note: small blocks will skip this loop) @@ -2275,10 +2364,14 @@ xfs_attr3_leaf_lookup_int( else break; } - if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) + if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { + xfs_buf_corruption_error(bp); return -EFSCORRUPTED; - if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) + } + if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { + xfs_buf_corruption_error(bp); return -EFSCORRUPTED; + } /* * Since we may have duplicate hashval's, find the first matching @@ -2310,8 +2403,8 @@ xfs_attr3_leaf_lookup_int( * If we are looking for INCOMPLETE entries, show only those. * If we are looking for complete entries, show only those. */ - if ((args->flags & XFS_ATTR_INCOMPLETE) != - (entry->flags & XFS_ATTR_INCOMPLETE)) { + if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != + !!(entry->flags & XFS_ATTR_INCOMPLETE)) { continue; } if (entry->flags & XFS_ATTR_LOCAL) { @@ -2350,6 +2443,10 @@ xfs_attr3_leaf_lookup_int( /* * Get the value associated with an attribute name from a leaf attribute * list structure. + * + * If ATTR_KERNOVAL is specified, only the length needs to be returned. + * Unlike a lookup, we only return an error if the attribute does not + * exist or we can't retrieve the value. 
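The lookup above now compares XFS_DA_OP_INCOMPLETE against XFS_ATTR_INCOMPLETE, two flags that live in different words at different bit positions, so both sides are collapsed to 0/1 with !! before comparing. Why a raw mask comparison breaks for distinct bits (standalone demo, bit positions illustrative only):

    #include <stdio.h>

    #define XFS_DA_OP_INCOMPLETE  (1u << 9)   // illustrative values
    #define XFS_ATTR_INCOMPLETE   (1u << 7)

    int main(void)
    {
            unsigned op_flags = XFS_DA_OP_INCOMPLETE;
            unsigned ent_flags = XFS_ATTR_INCOMPLETE;

            // Both entries are "incomplete", but the raw masks differ.
            printf("raw:  %d\n", (op_flags & XFS_DA_OP_INCOMPLETE) !=
                                 (ent_flags & XFS_ATTR_INCOMPLETE));   // 1

            // Normalised to booleans, the comparison is correct.
            printf("bool: %d\n", !!(op_flags & XFS_DA_OP_INCOMPLETE) !=
                                 !!(ent_flags & XFS_ATTR_INCOMPLETE)); // 0
            return 0;
    }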
*/ int xfs_attr3_leaf_getvalue( @@ -2361,7 +2458,6 @@ xfs_attr3_leaf_getvalue( struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_local *name_loc; struct xfs_attr_leaf_name_remote *name_rmt; - int valuelen; leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); @@ -2373,36 +2469,19 @@ xfs_attr3_leaf_getvalue( name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); - valuelen = be16_to_cpu(name_loc->valuelen); - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; - return 0; - } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; - return -ERANGE; - } - args->valuelen = valuelen; - memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); - } else { - name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); - ASSERT(name_rmt->namelen == args->namelen); - ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); - args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - args->rmtvaluelen); - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = args->rmtvaluelen; - return 0; - } - if (args->valuelen < args->rmtvaluelen) { - args->valuelen = args->rmtvaluelen; - return -ERANGE; - } - args->valuelen = args->rmtvaluelen; - } - return 0; + return xfs_attr_copy_value(args, + &name_loc->nameval[args->namelen], + be16_to_cpu(name_loc->valuelen)); + } + + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + args->rmtvaluelen); + return xfs_attr_copy_value(args, NULL, args->rmtvaluelen); } /*======================================================================== @@ -2652,7 +2731,7 @@ xfs_attr3_leaf_clearflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -2719,7 +2798,7 @@ xfs_attr3_leaf_setflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -2781,7 +2860,7 @@ xfs_attr3_leaf_flipflags( /* * Read the block containing the "old" attr */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1); if (error) return error; @@ -2790,7 +2869,7 @@ xfs_attr3_leaf_flipflags( */ if (args->blkno2 != args->blkno) { error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, - -1, &bp2); + &bp2); if (error) return error; } else { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 7b74e18becff..73615b1dd1a8 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -17,13 +17,27 @@ struct xfs_inode; struct xfs_trans; /* - * Used to keep a list of "remote value" extents when unlinking an inode. + * Incore version of the attribute leaf header. 
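The firstused widening in the incore header that follows exists because a 64k attribute block legitimately produces firstused == 65536 (an empty block), which a 16-bit field cannot represent; the incore field is 32-bit and, per the comment, the overflow is handled only at on-disk conversion. The arithmetic in isolation:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t blksize = 65536;                     // 64k attr geometry
            uint16_t disk_firstused = (uint16_t)blksize;  // wraps to 0
            uint32_t icore_firstused = blksize;           // intact

            printf("disk=%u incore=%u\n", disk_firstused, icore_firstused);
            return 0;
    }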
*/ -typedef struct xfs_attr_inactive_list { - xfs_dablk_t valueblk; /* block number of value bytes */ - int valuelen; /* number of bytes in value */ -} xfs_attr_inactive_list_t; - +struct xfs_attr3_icleaf_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t usedbytes; + /* + * Firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + uint32_t firstused; + __u8 holes; + struct { + uint16_t base; + uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; /*======================================================================== * Function prototypes for the kernel. @@ -67,8 +81,8 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -void xfs_attr3_leaf_list_int(struct xfs_buf *bp, - struct xfs_attr_list_context *context); +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, + struct xfs_attr_list_context *context); /* * Routines used for shrinking the Btree. @@ -85,8 +99,7 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp); + xfs_dablk_t bno, struct xfs_buf **bpp); void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4eb30d357045..8b7f74b3bea2 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -19,12 +19,30 @@ #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_attr.h" +#include "xfs_attr_remote.h" #include "xfs_trace.h" #include "xfs_error.h" #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ /* + * Remote Attribute Values + * ======================= + * + * Remote extended attribute values are conceptually simple -- they're written + * to data blocks mapped by an inode's attribute fork, and they have an upper + * size limit of 64k. Setting a value does not involve the XFS log. + * + * However, on a v5 filesystem, maximally sized remote attr values require one + * block more than 64k worth of space to hold both the remote attribute value + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; + * on a 64k block filesystem, this would be a 128k buffer. Note that the log + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). + * Therefore, we /must/ ensure that remote attribute value buffers never touch + * the logging system and therefore never have a log item. + */ + +/* * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ @@ -358,6 +376,8 @@ xfs_attr_rmtval_copyin( /* * Read the value associated with an attribute from the out-of-line buffer * that we stored it in. + * + * Returns 0 on successful retrieval, otherwise an error. 
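Each remote-value block spends part of itself on the per-block header, so a full 64k value always needs more than 64k of buffer space, which is why these buffers must never become log items (the log caps dirty buffers at XFS_MAX_BLOCKSIZE, 64k). A rough sizing model matching the 4k-block example in the comment above (the 64-byte header size is taken from that text):

    #include <stdio.h>

    int main(void)
    {
            int blksize = 4096, hdr = 64, value = 64 * 1024;
            int payload = blksize - hdr;                 // usable bytes/block
            int blocks = (value + payload - 1) / payload;

            // 17 blocks: a 68k buffer for a 64k value on 4k blocks.
            printf("%d blocks, %dk buffer\n", blocks,
                   blocks * blksize / 1024);
            return 0;
    }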
*/ int xfs_attr_rmtval_get( @@ -398,17 +418,15 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, args->trans, - mp->m_ddev_targp, - dblkno, dblkcnt, 0, &bp, - &xfs_attr3_rmt_buf_ops); + error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, + 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) return error; error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_trans_brelse(args->trans, bp); + xfs_buf_relse(bp); if (error) return error; @@ -527,9 +545,9 @@ xfs_attr_rmtval_set( dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt); - if (!bp) - return -ENOMEM; + error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, &bp); + if (error) + return error; bp->b_ops = &xfs_attr3_rmt_buf_ops; xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, @@ -549,6 +567,33 @@ xfs_attr_rmtval_set( return 0; } +/* Mark stale any incore buffers for the remote value. */ +int +xfs_attr_rmtval_stale( + struct xfs_inode *ip, + struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || + XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) + return -EFSCORRUPTED; + + bp = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map->br_startblock), + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + + return 0; +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. @@ -557,7 +602,6 @@ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; @@ -572,9 +616,6 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; - int dblkcnt; int nmap; /* @@ -585,22 +626,11 @@ xfs_attr_rmtval_remove( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. 
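xfs_attr_rmtval_stale() above extracts the "look up any incore buffer over this mapping, mark it stale, release it" idiom, and upgrades the old ASSERTs on delalloc/hole mappings into a hard -EFSCORRUPTED. The remove loop (below) then reduces to roughly this shape; an abbreviated sketch of the caller, consolidating code that the hunks split up, not new logic:

    // Per-mapping invalidation loop, as used by xfs_attr_rmtval_remove().
    while (blkcnt > 0) {
            nmap = 1;
            error = xfs_bmapi_read(args->dp, lblkno, blkcnt, &map, &nmap,
                                   XFS_BMAPI_ATTRFORK);
            if (error)
                    return error;
            if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
                    return -EFSCORRUPTED;
            error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
            if (error)
                    return error;
            lblkno += map.br_blockcount;
            blkcnt -= map.br_blockcount;
    }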
- */ - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } + if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); + if (error) + return error; lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9d20b66ad379..6fb4572845ce 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 7071ff98fdbc..40ce5f3094d1 100644 --- a/fs/xfs/libxfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c @@ -5,6 +5,7 @@ */ #include "xfs.h" #include "xfs_log_format.h" +#include "xfs_bit.h" /* * XFS bit manipulation routines, used in non-realtime code. diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 07aad70f3931..9a6d7a84689a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,6 +34,7 @@ #include "xfs_ag_resv.h" #include "xfs_refcount.h" #include "xfs_icache.h" +#include "xfs_iomap.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -383,8 +384,10 @@ xfs_bmap_check_leaf_extents( xfs_check_block(block, mp, 0, 0); pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(mp, - xfs_verify_fsbno(mp, bno), error0); + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) { + error = -EFSCORRUPTED; + goto error0; + } if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -553,7 +556,7 @@ __xfs_bmap_add_free( #endif ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); new->xefi_startblock = bno; new->xefi_blockcount = (xfs_extlen_t)len; if (oinfo) @@ -611,8 +614,8 @@ xfs_bmap_btree_to_extents( pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, - xfs_btree_check_lptr(cur, cbno, 1)); + if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) + return -EFSCORRUPTED; #endif error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); @@ -727,11 +730,11 @@ xfs_bmap_extents_to_btree( cur->bc_private.b.allocated++; ip->i_d.di_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno); - if (!abp) { - error = -EFSCORRUPTED; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, args.fsbno), + mp->m_bsize, 0, &abp); + if (error) goto out_unreserve_dquot; - } /* * Fill in the child block. 
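From here on the series mechanically converts XFS_WANT_CORRUPTED_GOTO()/_RETURN() sites into explicit XFS_IS_CORRUPT() checks: the corruption report stays inside the macro, while the -EFSCORRUPTED assignment and the goto become visible at the call site, and the sense of the test flips from "want this" to "is corrupt". A minimal standalone model of the macro's shape (the real one also records the mount and the failure address):

    #include <stdbool.h>
    #include <stdio.h>

    static void report_corruption(const char *expr, const char *file, int line)
    {
            fprintf(stderr, "corruption: %s at %s:%d\n", expr, file, line);
    }

    // Sketch: report and evaluate true when the corruption test fires.
    #define IS_CORRUPT(check) \
            ((check) ? (report_corruption(#check, __FILE__, __LINE__), true) \
                     : false)

    static int lookup_and_delete(int i)
    {
            int error = 0;

            if (IS_CORRUPT(i != 1)) {   // was: WANT_CORRUPTED_GOTO(i == 1)
                    error = -117;       // -EFSCORRUPTED
                    goto done;
            }
            // ... btree manipulation ...
    done:
            return error;
    }

    int main(void)
    {
            return lookup_and_delete(2) ? 1 : 0;
    }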
@@ -792,6 +795,7 @@ out_root_realloc: */ void xfs_bmap_local_to_extents_empty( + struct xfs_trans *tp, struct xfs_inode *ip, int whichfork) { @@ -808,6 +812,7 @@ xfs_bmap_local_to_extents_empty( ifp->if_u1.if_root = NULL; ifp->if_height = 0; XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -840,7 +845,7 @@ xfs_bmap_local_to_extents( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { - xfs_bmap_local_to_extents_empty(ip, whichfork); + xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags = XFS_ILOG_CORE; goto done; } @@ -873,7 +878,11 @@ xfs_bmap_local_to_extents( ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); tp->t_firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno); + error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, + XFS_FSB_TO_DADDR(args.mp, args.fsbno), + args.mp->m_bsize, 0, &bp); + if (error) + goto done; /* * Initialize the block, copy the data and log the remote buffer. @@ -887,7 +896,7 @@ xfs_bmap_local_to_extents( /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_bmap_local_to_extents_empty(ip, whichfork); + xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; ifp->if_u1.if_root = NULL; @@ -934,7 +943,10 @@ xfs_bmap_add_attrfork_btree( if (error) goto error0; /* must be at least one entry */ - XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); + if (XFS_IS_CORRUPT(mp, stat != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { @@ -1081,7 +1093,7 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; - if (ip->i_d.di_anextents != 0) { + if (XFS_IS_CORRUPT(mp, ip->i_d.di_anextents != 0)) { error = -EFSCORRUPTED; goto trans_cancel; } @@ -1099,7 +1111,7 @@ xfs_bmap_add_attrfork( if (error) goto trans_cancel; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; switch (ip->i_d.di_format) { @@ -1152,6 +1164,65 @@ trans_cancel: * Internal and external extent tree search functions. */ +struct xfs_iread_state { + struct xfs_iext_cursor icur; + xfs_extnum_t loaded; +}; + +/* Stuff every bmbt record from this block into the incore extent map. */ +static int +xfs_iread_bmbt_block( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xfs_iread_state *ir = priv; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_btree_block *block; + struct xfs_buf *bp; + struct xfs_bmbt_rec *frp; + xfs_extnum_t num_recs; + xfs_extnum_t j; + int whichfork = cur->bc_private.b.whichfork; + + block = xfs_btree_get_block(cur, level, &bp); + + /* Abort if we find more records than nextents. */ + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(ir->loaded + num_recs > + XFS_IFORK_NEXTENTS(ip, whichfork))) { + xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).", + (unsigned long long)ip->i_ino); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, + sizeof(*block), __this_address); + return -EFSCORRUPTED; + } + + /* Copy records into the incore cache. 
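The btree-walk callback above refuses to copy in more records than the inode core advertises, which bounds the incore extent list even when a corrupt leaf claims extra records. The guard in isolation:

    #include <stdio.h>

    // Returns 0 to accept a leaf's records, -1 if it would overrun nextents.
    static int accept_block(unsigned loaded, unsigned num_recs,
                            unsigned nextents)
    {
            if (loaded + num_recs > nextents)
                    return -1;   // corrupt dinode: too many btree records
            return 0;
    }

    int main(void)
    {
            printf("%d\n", accept_block(10, 5, 16));   //  0: fits
            printf("%d\n", accept_block(10, 7, 16));   // -1: overruns
            return 0;
    }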
*/ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + for (j = 0; j < num_recs; j++, frp++, ir->loaded++) { + struct xfs_bmbt_irec new; + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(frp, &new); + fa = xfs_bmap_validate_extent(ip, whichfork, &new); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iread_extents(2)", frp, + sizeof(*frp), fa); + return -EFSCORRUPTED; + } + xfs_iext_insert(ip, &ir->icur, &new, + xfs_bmap_fork_to_state(whichfork)); + trace_xfs_read_extent(ip, &ir->icur, + xfs_bmap_fork_to_state(whichfork), _THIS_IP_); + xfs_iext_next(XFS_IFORK_PTR(ip, whichfork), &ir->icur); + } + + return 0; +} + /* * Read in extents from a btree-format inode. */ @@ -1161,134 +1232,39 @@ xfs_iread_extents( struct xfs_inode *ip, int whichfork) { - struct xfs_mount *mp = ip->i_mount; - int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_iread_state ir; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_extnum_t nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - struct xfs_btree_block *block = ifp->if_broot; - struct xfs_iext_cursor icur; - struct xfs_bmbt_irec new; - xfs_fsblock_t bno; - struct xfs_buf *bp; - xfs_extnum_t i, j; - int level; - __be64 *pp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_cur *cur; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - if (unlikely(level == 0)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - error = xfs_btree_read_bufl(mp, tp, bno, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - goto out; - block = XFS_BUF_TO_BLOCK(bp); - if (level == 0) - break; - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(mp, - xfs_verify_fsbno(mp, bno), out_brelse); - xfs_trans_brelse(tp, bp); + if (XFS_IS_CORRUPT(mp, + XFS_IFORK_FORMAT(ip, whichfork) != + XFS_DINODE_FMT_BTREE)) { + error = -EFSCORRUPTED; + goto out; } - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - i = 0; - xfs_iext_first(ifp, &icur); - - /* - * Loop over all leaf nodes. Copy information to the extent records. - */ - for (;;) { - xfs_bmbt_rec_t *frp; - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - - num_recs = xfs_btree_get_numrecs(block); - if (unlikely(i + num_recs > nextents)) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, (btree extents).", - (unsigned long long) ip->i_ino); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, - __func__, block, sizeof(*block), - __this_address); - error = -EFSCORRUPTED; - goto out_brelse; - } - /* - * Read-ahead the next leaf block, if any. - */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1, - &xfs_bmbt_buf_ops); - /* - * Copy records into the extent records. 
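This rewrite drops the hand-rolled descent (find the leftmost leaf, walk the rightsib links, manage buffers by hand) in favour of xfs_btree_visit_blocks() with a per-block callback and XFS_BTREE_VISIT_RECORDS. The control-flow inversion in a standalone sketch, with a linked list standing in for the leaf chain:

    #include <stdio.h>

    struct leaf { int nrecs; struct leaf *rightsib; };

    // Generic iterator: calls fn for every leaf, stops on first error.
    static int visit_leaves(struct leaf *l, int (*fn)(struct leaf *, void *),
                            void *priv)
    {
            for (; l; l = l->rightsib) {
                    int error = fn(l, priv);
                    if (error)
                            return error;
            }
            return 0;
    }

    static int count_records(struct leaf *l, void *priv)
    {
            *(int *)priv += l->nrecs;   // the real callback copies extents
            return 0;
    }

    int main(void)
    {
            struct leaf c = { 3, NULL }, b = { 5, &c }, a = { 2, &b };
            int loaded = 0;

            visit_leaves(&a, count_records, &loaded);
            printf("%d\n", loaded);     // 10
            return 0;
    }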
- */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); - for (j = 0; j < num_recs; j++, frp++, i++) { - xfs_failaddr_t fa; - - xfs_bmbt_disk_get_all(frp, &new); - fa = xfs_bmap_validate_extent(ip, whichfork, &new); - if (fa) { - error = -EFSCORRUPTED; - xfs_inode_verifier_error(ip, error, - "xfs_iread_extents(2)", - frp, sizeof(*frp), fa); - goto out_brelse; - } - xfs_iext_insert(ip, &icur, &new, state); - trace_xfs_read_extent(ip, &icur, state, _THIS_IP_); - xfs_iext_next(ifp, &icur); - } - xfs_trans_brelse(tp, bp); - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; - error = xfs_btree_read_bufl(mp, tp, bno, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - goto out; - block = XFS_BUF_TO_BLOCK(bp); - } + ir.loaded = 0; + xfs_iext_first(ifp, &ir.icur); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + error = xfs_btree_visit_blocks(cur, xfs_iread_bmbt_block, + XFS_BTREE_VISIT_RECORDS, &ir); + xfs_btree_del_cursor(cur, error); + if (error) + goto out; - if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) { + if (XFS_IS_CORRUPT(mp, + ir.loaded != XFS_IFORK_NEXTENTS(ip, whichfork))) { error = -EFSCORRUPTED; goto out; } - ASSERT(i == xfs_iext_count(ifp)); + ASSERT(ir.loaded == xfs_iext_count(ifp)); ifp->if_flags |= XFS_IFEXTENTS; return 0; - -out_brelse: - xfs_trans_brelse(tp, bp); out: xfs_iext_destroy(ifp); return error; @@ -1315,8 +1291,7 @@ xfs_bmap_first_unused( xfs_fileoff_t lowest, max; int error; - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + ASSERT(xfs_ifork_has_extents(ip, whichfork) || XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { @@ -1372,7 +1347,8 @@ xfs_bmap_last_before( case XFS_DINODE_FMT_EXTENTS: break; default: - return -EIO; + ASSERT(0); + return -EFSCORRUPTED; } if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -1471,9 +1447,8 @@ xfs_bmap_last_offset( if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return -EIO; + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ip, whichfork))) + return -EFSCORRUPTED; error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); if (error || is_empty) @@ -1650,15 +1625,24 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; @@ -1684,7 +1668,10 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; @@ -1714,7 +1701,10 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; - 
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &PREV); if (error) goto done; @@ -1739,11 +1729,17 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; @@ -1774,7 +1770,10 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; @@ -1795,11 +1794,17 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -1840,7 +1845,10 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &RIGHT); if (error) goto done; @@ -1872,11 +1880,17 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -1952,11 +1966,17 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -1985,11 +2005,8 @@ xfs_bmap_add_extent_delay_real( } /* add reverse mapping unless caller opted out */ - if (!(bma->flags & XFS_BMAPI_NORMAP)) { - error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); - if (error) - goto done; - } + if (!(bma->flags & XFS_BMAPI_NORMAP)) + xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -2153,19 +2170,34 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } 
if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; @@ -2191,13 +2223,22 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &PREV, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; @@ -2226,13 +2267,22 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; @@ -2255,7 +2305,10 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; @@ -2285,7 +2338,10 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; @@ -2319,14 +2375,20 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; @@ -2353,7 +2415,10 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; @@ -2387,17 +2452,26 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 
1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; @@ -2431,7 +2505,10 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* new right extent - oldext */ error = xfs_bmbt_update(cur, &r[1]); if (error) @@ -2440,7 +2517,10 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = PREV; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after @@ -2449,11 +2529,17 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } /* new middle extent - newext */ if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; @@ -2471,9 +2557,7 @@ xfs_bmap_add_extent_unwritten_real( } /* update reverse mappings */ - error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); - if (error) - goto done; + xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -2738,15 +2822,24 @@ xfs_bmap_add_extent_hole_real( error = xfs_bmbt_lookup_eq(cur, &right, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &left); if (error) goto done; @@ -2772,7 +2865,10 @@ xfs_bmap_add_extent_hole_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &left); if (error) goto done; @@ -2799,7 +2895,10 @@ xfs_bmap_add_extent_hole_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &right); if (error) goto done; @@ -2822,21 +2921,24 @@ xfs_bmap_add_extent_hole_real( error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 
1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; } /* add reverse mapping unless caller opted out */ - if (!(flags & XFS_BMAPI_NORMAP)) { - error = xfs_rmap_map_extent(tp, ip, whichfork, new); - if (error) - goto done; - } + if (!(flags & XFS_BMAPI_NORMAP)) + xfs_rmap_map_extent(tp, ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -3064,7 +3166,7 @@ xfs_bmap_adjacent( mp = ap->ip->i_mount; nullfb = ap->tp->t_firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && - xfs_alloc_is_userdata(ap->datatype); + (ap->datatype & XFS_ALLOC_USERDATA); fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock); /* @@ -3209,11 +3311,12 @@ xfs_bmap_longest_free_extent( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); - if (error) - goto out; - - if (!pag->pagf_init) { - *notinit = 1; + if (error) { + /* Couldn't lock the AGF, so skip this AG. */ + if (error == -EAGAIN) { + *notinit = 1; + error = 0; + } goto out; } } @@ -3417,7 +3520,7 @@ xfs_bmap_btalloc( if (ap->flags & XFS_BMAPI_COWFORK) align = xfs_get_cowextsz_hint(ap->ip); - else if (xfs_alloc_is_userdata(ap->datatype)) + else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); if (align) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -3432,7 +3535,7 @@ xfs_bmap_btalloc( fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock); if (nullfb) { - if (xfs_alloc_is_userdata(ap->datatype) && + if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; @@ -3472,7 +3575,7 @@ xfs_bmap_btalloc( * enough for the request. If one isn't found, then adjust * the minimum allocation size to the largest space found. */ - if (xfs_alloc_is_userdata(ap->datatype) && + if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); else @@ -3506,13 +3609,11 @@ xfs_bmap_btalloc( args.mod = args.prod - args.mod; } /* - * If we are not low on available data blocks, and the - * underlying logical volume manager is a stripe, and - * the file offset is zero then try to allocate data - * blocks on stripe unit boundary. - * NOTE: ap->aeof is only set if the allocation length - * is >= the stripe unit and the allocation offset is - * at the end of file. + * If we are not low on available data blocks, and the underlying + * logical volume manager is a stripe, and the file offset is zero then + * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof + * is only set if the allocation length is >= the stripe unit and the + * allocation offset is at the end of file. */ if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) { if (!ap->offset) { @@ -3520,9 +3621,11 @@ xfs_bmap_btalloc( atype = args.type; isaligned = 1; /* - * Adjust for alignment + * Adjust minlen to try and preserve alignment if we + * can't guarantee an aligned maxlen extent. 
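The comment above describes the widened test just below: minlen is still relaxed when the longest free extent is only slightly larger than maxlen, because after alignment slop is carved off a full-length aligned extent may not fit, so minlen drops to blen - alignment rather than letting the aligned attempt fail outright. Illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned alignment = 8, maxlen = 64;
            unsigned blen = 68;   // longest free extent found in the AG

            int old_adjust = (blen > alignment && blen <= maxlen);
            int new_adjust = (blen > alignment &&
                              blen <= maxlen + alignment);

            printf("old=%d new=%d\n", old_adjust, new_adjust);   // 0 1
            if (new_adjust)
                    printf("minlen -> %u\n", blen - alignment);  // 60
            return 0;
    }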
*/ - if (blen > args.alignment && blen <= args.maxlen) + if (blen > args.alignment && + blen <= args.maxlen + args.alignment) args.minlen = blen - args.alignment; args.minalignslop = 0; } else { @@ -3560,8 +3663,6 @@ xfs_bmap_btalloc( args.wasdel = ap->wasdel; args.resv = XFS_AG_RESV_NONE; args.datatype = ap->datatype; - if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) - args.ip = ap->ip; error = xfs_alloc_vextent(&args); if (error) @@ -3646,20 +3747,6 @@ xfs_bmap_btalloc( return 0; } -/* - * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. - * It figures out where to ask the underlying allocator to put the new extent. - */ -STATIC int -xfs_bmap_alloc( - struct xfs_bmalloca *ap) /* bmap alloc argument struct */ -{ - if (XFS_IS_REALTIME_INODE(ap->ip) && - xfs_alloc_is_userdata(ap->datatype)) - return xfs_bmap_rtalloc(ap); - return xfs_bmap_btalloc(ap); -} - /* Trim extent to fit a logical block range. */ void xfs_trim_extent( @@ -3821,11 +3908,8 @@ xfs_bmapi_read( XFS_BMAPI_COWFORK))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4016,6 +4100,39 @@ out_unreserve_quota: } static int +xfs_bmap_alloc_userdata( + struct xfs_bmalloca *bma) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = xfs_bmapi_whichfork(bma->flags); + int error; + + /* + * Set the data type being allocated. For the data fork, the first data + * in the file is treated differently to all other allocations. For the + * attribute fork, we only need to ensure the allocated range is not on + * the busy list. + */ + bma->datatype = XFS_ALLOC_NOBUSY; + if (whichfork == XFS_DATA_FORK) { + bma->datatype |= XFS_ALLOC_USERDATA; + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + + if (mp->m_dalign && bma->length >= mp->m_dalign) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + + if (XFS_IS_REALTIME_INODE(bma->ip)) + return xfs_bmap_rtalloc(bma); + } + + return xfs_bmap_btalloc(bma); +} + +static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) { @@ -4034,7 +4151,8 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev); + if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) + bma->prev.br_startoff = NULLFILEOFF; } else { bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); if (!bma->eof) @@ -4042,43 +4160,24 @@ xfs_bmapi_allocate( bma->got.br_startoff - bma->offset); } - /* - * Set the data type being allocated. For the data fork, the first data - * in the file is treated differently to all other allocations. For the - * attribute fork, we only need to ensure the allocated range is not on - * the busy list. 
- */ - if (!(bma->flags & XFS_BMAPI_METADATA)) { - bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { - if (bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; - else - bma->datatype |= XFS_ALLOC_USERDATA; - } - if (bma->flags & XFS_BMAPI_ZERO) - bma->datatype |= XFS_ALLOC_USERDATA_ZERO; - } + if (bma->flags & XFS_BMAPI_CONTIG) + bma->minlen = bma->length; + else + bma->minlen = 1; - bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + if (bma->flags & XFS_BMAPI_METADATA) + error = xfs_bmap_btalloc(bma); + else + error = xfs_bmap_alloc_userdata(bma); + if (error || bma->blkno == NULLFSBLOCK) + return error; - /* - * Only want to do the alignment at the eof if it is userdata and - * allocation length is larger than a stripe unit. - */ - if (mp->m_dalign && bma->length >= mp->m_dalign && - !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { - error = xfs_bmap_isaeof(bma, whichfork); + if (bma->flags & XFS_BMAPI_ZERO) { + error = xfs_zero_extent(bma->ip, bma->blkno, bma->length); if (error) return error; } - error = xfs_bmap_alloc(bma); - if (error) - return error; - - if (bma->blkno == NULLFSBLOCK) - return 0; if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); /* @@ -4318,11 +4417,8 @@ xfs_bmapi_write( ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4401,12 +4497,9 @@ xfs_bmapi_write( * If this is a CoW allocation, record the data in * the refcount btree for orphan recovery. */ - if (whichfork == XFS_COW_FORK) { - error = xfs_refcount_alloc_cow_extent(tp, - bma.blkno, bma.length); - if (error) - goto error0; - } + if (whichfork == XFS_COW_FORK) + xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); } /* Deal with the allocated space we found. */ @@ -4465,16 +4558,21 @@ int xfs_bmapi_convert_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap, + xfs_off_t offset, + struct iomap *iomap, unsigned int *seq) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; + uint16_t flags = 0; struct xfs_trans *tp; int error; + if (whichfork == XFS_COW_FORK) + flags |= IOMAP_F_SHARED; + /* * Space for the extent and indirect blocks was reserved when the * delalloc extent was created so there's no need to do so here. @@ -4504,7 +4602,7 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. 
*/ if (!isnullstartblock(bma.got.br_startblock)) { - *imap = bma.got; + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4514,7 +4612,6 @@ xfs_bmapi_convert_delalloc( bma.wasdel = true; bma.offset = bma.got.br_startoff; bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); - bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); if (whichfork == XFS_COW_FORK) bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; @@ -4530,22 +4627,18 @@ xfs_bmapi_convert_delalloc( if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) goto out_finish; error = -EFSCORRUPTED; - if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) goto out_finish; XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - *imap = bma.got; + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); *seq = READ_ONCE(ifp->if_seq); - if (whichfork == XFS_COW_FORK) { - error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, - bma.length); - if (error) - goto out_finish; - } + if (whichfork == XFS_COW_FORK) + xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -4591,11 +4684,8 @@ xfs_bmapi_remap( ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5026,7 +5116,10 @@ xfs_bmap_del_extent_real( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } if (got.br_startoff == del->br_startoff) @@ -5050,7 +5143,10 @@ xfs_bmap_del_extent_real( } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } break; case BMAP_LEFT_FILLING: /* @@ -5121,7 +5217,10 @@ xfs_bmap_del_extent_real( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* * Update the btree record back * to the original value. @@ -5138,7 +5237,10 @@ xfs_bmap_del_extent_real( error = -ENOSPC; goto done; } - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } else flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, @@ -5149,18 +5251,14 @@ xfs_bmap_del_extent_real( } /* remove reverse mapping */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, del); - if (error) - goto done; + xfs_rmap_unmap_extent(tp, ip, whichfork, del); /* * If we need to, add to list of extents to delete. 
*/ if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { - error = xfs_refcount_decrease_extent(tp, del); - if (error) - goto done; + xfs_refcount_decrease_extent(tp, del); } else { __xfs_bmap_add_free(tp, del->br_startblock, del->br_blockcount, NULL, @@ -5209,7 +5307,7 @@ __xfs_bunmapi( int isrt; /* freeing in rt area */ int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ - struct xfs_mount *mp; /* mount structure */ + struct xfs_mount *mp = ip->i_mount; int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ @@ -5226,14 +5324,8 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - if (unlikely( - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, - ip->i_mount); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork))) return -EFSCORRUPTED; - } - mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -5317,7 +5409,7 @@ __xfs_bunmapi( * Make sure we don't touch multiple AGF headers out of order * in a single transaction, as that could cause AB-BA deadlocks. */ - if (!wasdel) { + if (!wasdel && !isrt) { agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); if (prev_agno != NULLAGNUMBER && prev_agno > agno) break; @@ -5393,16 +5485,17 @@ __xfs_bunmapi( } div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod); if (mod) { + xfs_extlen_t off = mp->m_sb.sb_rextsize - mod; + /* * Realtime extent is lined up at the end but not * at the front. We'll get rid of full extents if * we can. */ - mod = mp->m_sb.sb_rextsize - mod; - if (del.br_blockcount > mod) { - del.br_blockcount -= mod; - del.br_startoff += mod; - del.br_startblock += mod; + if (del.br_blockcount > off) { + del.br_blockcount -= off; + del.br_startoff += off; + del.br_startblock += off; } else if (del.br_startoff == start && (del.br_state == XFS_EXT_UNWRITTEN || tp->t_blk_res == 0)) { @@ -5420,6 +5513,7 @@ __xfs_bunmapi( continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { struct xfs_bmbt_irec prev; + xfs_fileoff_t unwrite_start; /* * This one is already unwritten. 
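[A minimal userspace sketch of the realtime-extent trimming arithmetic in the hunk above; struct rt_irec and the plain modulo are simplified stand-ins for the kernel's xfs_bmbt_irec and div_u64_rem, not the actual kernel API:]

#include <stdint.h>
#include <stdio.h>

struct rt_irec {
	uint64_t startoff;	/* file offset, in fs blocks */
	uint64_t startblock;	/* device block of the extent */
	uint64_t blockcount;	/* length, in fs blocks */
};

/*
 * Skip the front of a deletion forward so that it starts on a realtime
 * extent boundary, mirroring the "off = rextsize - mod" adjustment above.
 */
static void trim_front_to_rtext(struct rt_irec *del, uint64_t rextsize)
{
	uint64_t mod = del->startblock % rextsize;
	uint64_t off;

	if (mod == 0)
		return;			/* already aligned */
	off = rextsize - mod;		/* distance to the next boundary */
	if (del->blockcount > off) {
		del->blockcount -= off;
		del->startoff += off;
		del->startblock += off;
	}
}

int main(void)
{
	/* startblock 13 with rextsize 4: skip 3 blocks to the boundary at 16 */
	struct rt_irec del = { .startoff = 10, .startblock = 13,
			       .blockcount = 9 };

	trim_front_to_rtext(&del, 4);
	printf("startoff=%llu startblock=%llu blockcount=%llu\n",
	       (unsigned long long)del.startoff,
	       (unsigned long long)del.startblock,
	       (unsigned long long)del.blockcount);
	return 0;
}

[The front of the range is skipped forward by rextsize - mod blocks so that anything actually freed starts on an rt extent boundary; a range too short to reach the boundary falls through to the partial-extent cases instead.]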
@@ -5433,12 +5527,13 @@ __xfs_bunmapi( ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == prev.br_startblock + prev.br_blockcount); - if (prev.br_startoff < start) { - mod = start - prev.br_startoff; - prev.br_blockcount -= mod; - prev.br_startblock += mod; - prev.br_startoff = start; - } + unwrite_start = max3(start, + del.br_startoff - mod, + prev.br_startoff); + mod = unwrite_start - prev.br_startoff; + prev.br_startoff = unwrite_start; + prev.br_startblock += mod; + prev.br_blockcount -= mod; prev.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, whichfork, &icur, &cur, @@ -5627,23 +5722,31 @@ xfs_bmse_merge( error = xfs_bmbt_lookup_eq(cur, got, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; error = xfs_btree_delete(cur, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; error = xfs_bmbt_update(cur, &new); if (error) return error; + /* change to extent format if required after extent removal */ + error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork); + if (error) + return error; + done: xfs_iext_remove(ip, icur, 0); xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); @@ -5651,12 +5754,11 @@ done: &new); /* update reverse mapping. rmap functions merge the rmaps for us */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, got); - if (error) - return error; + xfs_rmap_unmap_extent(tp, ip, whichfork, got); memcpy(&new, got, sizeof(new)); new.br_startoff = left->br_startoff + left->br_blockcount; - return xfs_rmap_map_extent(tp, ip, whichfork, &new); + xfs_rmap_map_extent(tp, ip, whichfork, &new); + return 0; } static int @@ -5682,7 +5784,8 @@ xfs_bmap_shift_update_extent( error = xfs_bmbt_lookup_eq(cur, &prev, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; error = xfs_bmbt_update(cur, got); if (error) @@ -5695,10 +5798,9 @@ xfs_bmap_shift_update_extent( got); /* update reverse mapping */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); - if (error) - return error; - return xfs_rmap_map_extent(tp, ip, whichfork, got); + xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); + xfs_rmap_map_extent(tp, ip, whichfork, got); + return 0; } int @@ -5719,11 +5821,8 @@ xfs_bmap_collapse_extents( int error = 0; int logflags = 0; - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5747,8 +5846,10 @@ xfs_bmap_collapse_extents( *done = true; goto del_cursor; } - XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), - del_cursor); + if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + error = -EFSCORRUPTED; + goto del_cursor; + } new_startoff = got.br_startoff - offset_shift_fsb; if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) { @@ -5837,11 +5938,8 @@ xfs_bmap_insert_extents( int error = 0; int logflags = 0; - if 
(unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5874,11 +5972,13 @@ xfs_bmap_insert_extents( goto del_cursor; } } - XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), - del_cursor); + if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + error = -EFSCORRUPTED; + goto del_cursor; + } - if (stop_fsb >= got.br_startoff + got.br_blockcount) { - error = -EIO; + if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { + error = -EFSCORRUPTED; goto del_cursor; } @@ -5943,12 +6043,8 @@ xfs_bmap_split_extent_at( int logflags = 0; int i = 0; - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmap_split_extent_at", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5982,7 +6078,10 @@ xfs_bmap_split_extent_at( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto del_cursor; + } } got.br_blockcount = gotblkcnt; @@ -6007,11 +6106,17 @@ xfs_bmap_split_extent_at( error = xfs_bmbt_lookup_eq(cur, &new, &i); if (error) goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto del_cursor; + } error = xfs_btree_insert(cur, &i); if (error) goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto del_cursor; + } } /* @@ -6094,7 +6199,7 @@ __xfs_bmap_add( bmap->br_blockcount, bmap->br_state); - bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; @@ -6106,29 +6211,29 @@ __xfs_bmap_add( } /* Map an extent into a file. */ -int +void xfs_bmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *PREV) { if (!xfs_bmap_is_update_needed(PREV)) - return 0; + return; - return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); } /* Unmap an extent out of a file. */ -int +void xfs_bmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *PREV) { if (!xfs_bmap_is_update_needed(PREV)) - return 0; + return; - return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); } /* diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 8f597f9abdbe..14d25e0b7d9c 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -171,11 +171,19 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) !isnullstartblock(irec->br_startblock); } +/* + * Check the mapping for obviously garbage allocations that could trash the + * filesystem immediately. 
+ */ +#define xfs_valid_startblock(ip, startblock) \ + ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) + void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); -void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); +void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, bool skip_discard); @@ -220,8 +228,7 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, - unsigned int *seq); + xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, @@ -254,9 +261,9 @@ int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, enum xfs_bmap_intent_type type, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, xfs_filblks_t *blockcount, xfs_exntst_t state); -int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); static inline int xfs_bmap_fork_to_state(int whichfork) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fbb18ba5d905..ffe608d2a2d9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -400,8 +400,20 @@ xfs_bmbt_diff_two_keys( union xfs_btree_key *k1, union xfs_btree_key *k2) { - return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - - be64_to_cpu(k2->bmbt.br_startoff); + uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); + uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + + /* + * Note: This routine previously cast a and b to int64 and subtracted + * them to generate a result. This led to problems if b was the + * "maximum" key value (all ones) being signed incorrectly, hence this + * somewhat less efficient version.
+ */ + if (a > b) + return 1; + if (b > a) + return -1; + return 0; } static xfs_failaddr_t diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f1048efa4268..fd300dc93ca4 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -105,11 +105,10 @@ xfs_btree_check_lblock( xfs_failaddr_t fa; fa = __xfs_btree_check_lblock(cur, block, level, bp); - if (unlikely(XFS_TEST_ERROR(fa != NULL, mp, - XFS_ERRTAG_BTREE_CHECK_LBLOCK))) { + if (XFS_IS_CORRUPT(mp, fa != NULL) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } return 0; @@ -169,11 +168,10 @@ xfs_btree_check_sblock( xfs_failaddr_t fa; fa = __xfs_btree_check_sblock(cur, block, level, bp); - if (unlikely(XFS_TEST_ERROR(fa != NULL, mp, - XFS_ERRTAG_BTREE_CHECK_SBLOCK))) { + if (XFS_IS_CORRUPT(mp, fa != NULL) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } return 0; @@ -384,7 +382,7 @@ xfs_btree_del_cursor( /* * Free the cursor. */ - kmem_zone_free(xfs_btree_cur_zone, cur); + kmem_cache_free(xfs_btree_cur_zone, cur); } /* @@ -681,61 +679,6 @@ xfs_btree_get_block( } /* - * Get a buffer for the block, return it with no data read. - * Long-form addressing. - */ -xfs_buf_t * /* buffer for fsbno */ -xfs_btree_get_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno) /* file system block number */ -{ - xfs_daddr_t d; /* real disk block address */ - - ASSERT(fsbno != NULLFSBLOCK); - d = XFS_FSB_TO_DADDR(mp, fsbno); - return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0); -} - -/* - * Get a buffer for the block, return it with no data read. - * Short-form addressing. - */ -xfs_buf_t * /* buffer for agno/agbno */ -xfs_btree_get_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno) /* allocation group block number */ -{ - xfs_daddr_t d; /* real disk block address */ - - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0); -} - -/* - * Check for the cursor referring to the last block at the given level. - */ -int /* 1=is last block, 0=not last block */ -xfs_btree_islastblock( - xfs_btree_cur_t *cur, /* btree cursor */ - int level) /* level to check */ -{ - struct xfs_btree_block *block; /* generic btree block pointer */ - xfs_buf_t *bp; /* buffer containing block */ - - block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); - else - return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); -} - -/* * Change the cursor to point to the first record at the given level. * Other levels are unaffected. 
*/ @@ -1291,11 +1234,10 @@ xfs_btree_get_buf_block( error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - - if (!*bpp) - return -ENOMEM; + error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, + 0, bpp); + if (error) + return error; (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); @@ -1820,6 +1762,7 @@ xfs_btree_lookup_get_block( out_bad: *blkp = NULL; + xfs_buf_corruption_error(bp); xfs_trans_brelse(cur->bc_tp, bp); return -EFSCORRUPTED; } @@ -1867,7 +1810,7 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. */ - if (cur->bc_nlevels == 0) + if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) return -EFSCORRUPTED; block = NULL; @@ -1987,7 +1930,8 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; *stat = 1; return 0; } @@ -2408,8 +2352,6 @@ xfs_btree_lshift( XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); if (level > 0) { /* It's a nonleaf. operate on keys and ptrs */ - int i; /* loop index */ - for (i = 0; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); if (error) @@ -2442,7 +2384,10 @@ xfs_btree_lshift( if (error) goto error0; i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_decrement(tcur, level, &i); if (error) @@ -2609,7 +2554,10 @@ xfs_btree_rshift( if (error) goto error0; i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_increment(tcur, level, &i); if (error) @@ -3463,7 +3411,10 @@ xfs_btree_insert( goto error0; } - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } level++; /* @@ -3867,15 +3818,24 @@ xfs_btree_delrec( * Actually any entry but the first would suffice. */ i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_increment(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* Grab a pointer to the block. */ right = xfs_btree_get_block(tcur, level, &rbp); @@ -3919,12 +3879,18 @@ xfs_btree_delrec( rrecs = xfs_btree_get_numrecs(right); if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } } } @@ -3938,13 +3904,19 @@ xfs_btree_delrec( * previous block. 
*/ i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* Grab a pointer to the block. */ left = xfs_btree_get_block(tcur, level, &lbp); @@ -4286,6 +4258,7 @@ int xfs_btree_visit_blocks( struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, + unsigned int flags, void *data) { union xfs_btree_ptr lptr; @@ -4311,6 +4284,11 @@ xfs_btree_visit_blocks( /* save for the next iteration of the loop */ xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); + + if (!(flags & XFS_BTREE_VISIT_LEAVES)) + continue; + } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) { + continue; } /* for each buffer in the level */ @@ -4413,7 +4391,7 @@ xfs_btree_change_owner( bbcoi.buffer_list = buffer_list; return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, - &bbcoi); + XFS_BTREE_VISIT_ALL, &bbcoi); } /* Verify the v5 fields of a long-format btree block. */ @@ -4466,8 +4444,6 @@ xfs_btree_lblock_verify( * btree block * * @bp: buffer containing the btree block - * @max_recs: pointer to the m_*_mxr max records field in the xfs mount - * @pag_max_level: pointer to the per-ag max level field */ xfs_failaddr_t xfs_btree_sblock_v5hdr_verify( @@ -4600,7 +4576,7 @@ xfs_btree_simple_query_range( /* Callback */ error = fn(cur, recp, priv); - if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error) break; advloop: @@ -4702,8 +4678,7 @@ pop_up: */ if (ldiff >= 0 && hdiff >= 0) { error = fn(cur, recp, priv); - if (error < 0 || - error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error) break; } else if (hdiff < 0) { /* Record is larger than high key; pop. */ @@ -4774,8 +4749,7 @@ out: * Query a btree for all records overlapping a given interval of keys. The * supplied function will be called with each record found; return one of the * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error - * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a - * negative error code. + * code. This function returns -ECANCELED, zero, or a negative error code. */ int xfs_btree_query_range( @@ -4869,7 +4843,7 @@ xfs_btree_count_blocks( { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, - blocks); + XFS_BTREE_VISIT_ALL, blocks); } /* Compare two btree pointers. */ @@ -4891,7 +4865,7 @@ xfs_btree_has_record_helper( union xfs_btree_rec *rec, void *priv) { - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* Is there a record covering a given range of keys? 
*/ @@ -4906,7 +4880,7 @@ xfs_btree_has_record( error = xfs_btree_query_range(cur, low, high, &xfs_btree_has_record_helper, NULL); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + if (error == -ECANCELED) { *exists = true; return 0; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index fa3cd8ab9aba..3eff7c321d43 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -183,6 +183,9 @@ union xfs_btree_cur_private { unsigned long nr_ops; /* # record updates */ int shape_changes; /* # of extent splits */ } refc; + struct { + bool active; /* allocation cursor state */ + } abt; }; /* @@ -294,35 +297,6 @@ xfs_btree_dup_cursor( xfs_btree_cur_t **ncur);/* output cursor */ /* - * Get a buffer for the block, return it with no data read. - * Long-form addressing. - */ -struct xfs_buf * /* buffer for fsbno */ -xfs_btree_get_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno); /* file system block number */ - -/* - * Get a buffer for the block, return it with no data read. - * Short-form addressing. - */ -struct xfs_buf * /* buffer for agno/agbno */ -xfs_btree_get_bufs( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno); /* allocation group block number */ - -/* - * Check for the cursor referring to the last block at the given level. - */ -int /* 1=is last block, 0=not last block */ -xfs_btree_islastblock( - xfs_btree_cur_t *cur, /* btree cursor */ - int level); /* level to check */ - -/* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ @@ -464,9 +438,13 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); -/* return codes */ -#define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */ -#define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */ +/* + * Return codes for the query range iterator function are 0 to continue + * iterating, and non-zero to stop iterating. Any non-zero value will be + * passed up to the _query_range caller. The special value -ECANCELED can be + * used to stop iteration, because _query_range never generates that error + * code on its own. + */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv); @@ -478,8 +456,15 @@ int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, void *data); +/* Visit record blocks. */ +#define XFS_BTREE_VISIT_RECORDS (1 << 0) +/* Visit leaf blocks. */ +#define XFS_BTREE_VISIT_LEAVES (1 << 1) +/* Visit all blocks. 
*/ +#define XFS_BTREE_VISIT_ALL (XFS_BTREE_VISIT_RECORDS | \ + XFS_BTREE_VISIT_LEAVES) int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, - xfs_btree_visit_blocks_fn fn, void *data); + xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); @@ -510,4 +495,21 @@ int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); +/* Does this cursor point to the last block in the given level? */ +static inline bool +xfs_btree_islastblock( + xfs_btree_cur_t *cur, + int level) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, level, &bp); + ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); + return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); +} + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 0bf56e94bfe9..875e04f82541 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -12,9 +12,9 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" -#include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_attr_leaf.h" @@ -107,7 +107,66 @@ xfs_da_state_free(xfs_da_state_t *state) #ifdef DEBUG memset((char *)state, 0, sizeof(*state)); #endif /* DEBUG */ - kmem_zone_free(xfs_da_state_zone, state); + kmem_cache_free(xfs_da_state_zone, state); +} + +static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) +{ + if (whichfork == XFS_DATA_FORK) + return mp->m_dir_geo->fsbcount; + return mp->m_attr_geo->fsbcount; +} + +void +xfs_da3_node_hdr_from_disk( + struct xfs_mount *mp, + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from; + + to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); + to->back = be32_to_cpu(from3->hdr.info.hdr.back); + to->magic = be16_to_cpu(from3->hdr.info.hdr.magic); + to->count = be16_to_cpu(from3->hdr.__count); + to->level = be16_to_cpu(from3->hdr.__level); + to->btree = from3->__btree; + ASSERT(to->magic == XFS_DA3_NODE_MAGIC); + } else { + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); + to->btree = from->__btree; + ASSERT(to->magic == XFS_DA_NODE_MAGIC); + } +} + +void +xfs_da3_node_hdr_to_disk( + struct xfs_mount *mp, + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to; + + ASSERT(from->magic == XFS_DA3_NODE_MAGIC); + to3->hdr.info.hdr.forw = cpu_to_be32(from->forw); + to3->hdr.info.hdr.back = cpu_to_be32(from->back); + to3->hdr.info.hdr.magic = cpu_to_be16(from->magic); + to3->hdr.__count = cpu_to_be16(from->count); + to3->hdr.__level = cpu_to_be16(from->level); + } else { + ASSERT(from->magic == XFS_DA_NODE_MAGIC); + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = 
cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); + } } /* @@ -145,12 +204,9 @@ xfs_da3_node_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; - const struct xfs_dir_ops *ops; xfs_failaddr_t fa; - ops = xfs_dir_get_ops(mp, NULL); - - ops->node_hdr_from_disk(&ichdr, hdr); + xfs_da3_node_hdr_from_disk(mp, &ichdr, hdr); fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); if (fa) @@ -275,46 +331,76 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = { .verify_struct = xfs_da3_node_verify_struct, }; +static int +xfs_da3_node_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); + return 0; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_ATTR_LEAF_BUF); + return 0; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + return 0; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp, + info, sizeof(*info)); + xfs_trans_brelse(tp, bp); + return -EFSCORRUPTED; + } +} + int xfs_da3_node_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, + struct xfs_buf **bpp, + int whichfork) +{ + int error; + + error = xfs_da_read_buf(tp, dp, bno, 0, bpp, whichfork, + &xfs_da3_node_buf_ops); + if (error || !*bpp || !tp) + return error; + return xfs_da3_node_set_type(tp, *bpp); +} + +int +xfs_da3_node_read_mapped( + struct xfs_trans *tp, + struct xfs_inode *dp, xfs_daddr_t mappedbno, struct xfs_buf **bpp, - int which_fork) + int whichfork) { - int err; + struct xfs_mount *mp = dp->i_mount; + int error; - err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, &xfs_da3_node_buf_ops); - if (!err && tp && *bpp) { - struct xfs_da_blkinfo *info = (*bpp)->b_addr; - int type; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno, + XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0, + bpp, &xfs_da3_node_buf_ops); + if (error || !*bpp) + return error; - switch (be16_to_cpu(info->magic)) { - case XFS_DA_NODE_MAGIC: - case XFS_DA3_NODE_MAGIC: - type = XFS_BLFT_DA_NODE_BUF; - break; - case XFS_ATTR_LEAF_MAGIC: - case XFS_ATTR3_LEAF_MAGIC: - type = XFS_BLFT_ATTR_LEAF_BUF; - break; - case XFS_DIR2_LEAFN_MAGIC: - case XFS_DIR3_LEAFN_MAGIC: - type = XFS_BLFT_DIR_LEAFN_BUF; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - tp->t_mountp, info, sizeof(*info)); - xfs_trans_brelse(tp, *bpp); - *bpp = NULL; - return -EFSCORRUPTED; - } - xfs_trans_buf_set_type(tp, *bpp, type); - } - return err; + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(*bpp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(*bpp, XFS_DIR_BTREE_REF); + + if (!tp) + return 0; + return xfs_da3_node_set_type(tp, *bpp); } /*======================================================================== @@ -343,7 +429,7 @@ xfs_da3_node_create( trace_xfs_da_node_create(args); ASSERT(level <= XFS_DA_NODE_MAXDEPTH); - error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); + error = xfs_da_get_buf(tp, dp, blkno, &bp, whichfork); if (error) return error; bp->b_ops = &xfs_da3_node_buf_ops; @@ -363,9 +449,9 @@ xfs_da3_node_create( } ichdr.level = level; - dp->d_ops->node_hdr_to_disk(node, &ichdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &ichdr); xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, &node->hdr, 
dp->d_ops->node_hdr_size)); + XFS_DA_LOGRANGE(node, &node->hdr, args->geo->node_hdr_size)); *bpp = bp; return 0; @@ -504,6 +590,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { + xfs_buf_corruption_error(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -516,6 +603,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { + xfs_buf_corruption_error(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -568,7 +656,7 @@ xfs_da3_root_split( dp = args->dp; tp = args->trans; - error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); + error = xfs_da_get_buf(tp, dp, blkno, &bp, args->whichfork); if (error) return error; node = bp->b_addr; @@ -577,8 +665,8 @@ xfs_da3_root_split( oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { struct xfs_da3_icnode_hdr icnodehdr; - dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot); - btree = dp->d_ops->node_tree_p(oldroot); + xfs_da3_node_hdr_from_disk(dp->i_mount, &icnodehdr, oldroot); + btree = icnodehdr.btree; size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); level = icnodehdr.level; @@ -589,15 +677,14 @@ xfs_da3_root_split( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; leaf = (xfs_dir2_leaf_t *)oldroot; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); - size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + size = (int)((char *)&leafhdr.ents[leafhdr.count] - + (char *)leaf); level = 0; /* @@ -637,14 +724,14 @@ xfs_da3_root_split( return error; node = bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); + btree = nodehdr.btree; btree[0].hashval = cpu_to_be32(blk1->hashval); btree[0].before = cpu_to_be32(blk1->blkno); btree[1].hashval = cpu_to_be32(blk2->hashval); btree[1].before = cpu_to_be32(blk2->blkno); nodehdr.count = 2; - dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr); #ifdef DEBUG if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || @@ -686,7 +773,7 @@ xfs_da3_node_split( trace_xfs_da_node_split(state->args); node = oldblk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); /* * With V2 dirs the extra block is data or freespace. @@ -733,7 +820,7 @@ xfs_da3_node_split( * If we had double-split op below us, then add the extra block too. 
*/ node = oldblk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); if (oldblk->index <= nodehdr.count) { oldblk->index++; xfs_da3_node_add(state, oldblk, addblk); @@ -788,10 +875,10 @@ xfs_da3_node_rebalance( node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); - dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); - btree1 = dp->d_ops->node_tree_p(node1); - btree2 = dp->d_ops->node_tree_p(node2); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2); + btree1 = nodehdr1.btree; + btree2 = nodehdr2.btree; /* * Figure out how many entries need to move, and in which direction. @@ -804,10 +891,10 @@ xfs_da3_node_rebalance( tmpnode = node1; node1 = node2; node2 = tmpnode; - dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); - dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); - btree1 = dp->d_ops->node_tree_p(node1); - btree2 = dp->d_ops->node_tree_p(node2); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2); + btree1 = nodehdr1.btree; + btree2 = nodehdr2.btree; swap = 1; } @@ -869,14 +956,15 @@ xfs_da3_node_rebalance( /* * Log header of node 1 and all current bits of node 2. */ - dp->d_ops->node_hdr_to_disk(node1, &nodehdr1); + xfs_da3_node_hdr_to_disk(dp->i_mount, node1, &nodehdr1); xfs_trans_log_buf(tp, blk1->bp, - XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size)); + XFS_DA_LOGRANGE(node1, &node1->hdr, + state->args->geo->node_hdr_size)); - dp->d_ops->node_hdr_to_disk(node2, &nodehdr2); + xfs_da3_node_hdr_to_disk(dp->i_mount, node2, &nodehdr2); xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, - dp->d_ops->node_hdr_size + + state->args->geo->node_hdr_size + (sizeof(btree2[0]) * nodehdr2.count))); /* @@ -886,10 +974,10 @@ xfs_da3_node_rebalance( if (swap) { node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); - dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); - btree1 = dp->d_ops->node_tree_p(node1); - btree2 = dp->d_ops->node_tree_p(node2); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2); + btree1 = nodehdr1.btree; + btree2 = nodehdr2.btree; } blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); @@ -921,8 +1009,8 @@ xfs_da3_node_add( trace_xfs_da_node_add(state->args); node = oldblk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); + btree = nodehdr.btree; ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); @@ -945,9 +1033,10 @@ xfs_da3_node_add( tmp + sizeof(*btree))); nodehdr.count += 1; - dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + XFS_DA_LOGRANGE(node, &node->hdr, + state->args->geo->node_hdr_size)); /* * Copy the last hash value from the oldblk to propagate upwards. 
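[The hunks above replace the dp->d_ops->node_hdr_from_disk/_to_disk indirection with direct xfs_da3_node_hdr_from_disk/_to_disk calls that translate between the big-endian on-disk node header and a native-endian in-core copy. A toy sketch of that decode/encode pattern follows, with a made-up two-byte field layout and a little-endian host assumed; none of this is the real xfs_da3_icnode_hdr:]

#include <stdint.h>
#include <stdio.h>

/* Toy on-disk header: fixed-width big-endian fields, like the da node hdr. */
struct disk_hdr {
	uint16_t magic;
	uint16_t count;
	uint16_t level;
};

/* In-core header: decoded once into native endianness, then used freely. */
struct icore_hdr {
	unsigned int magic;
	unsigned int count;
	unsigned int level;
};

/* Byte swap; assumes a little-endian host for brevity. Swapping is its own
 * inverse, so the same helper serves both directions. */
static uint16_t be16_swap(uint16_t v)
{
	return (uint16_t)((v >> 8) | (v << 8));
}

static void hdr_from_disk(struct icore_hdr *to, const struct disk_hdr *from)
{
	to->magic = be16_swap(from->magic);
	to->count = be16_swap(from->count);
	to->level = be16_swap(from->level);
}

static void hdr_to_disk(struct disk_hdr *to, const struct icore_hdr *from)
{
	to->magic = be16_swap((uint16_t)from->magic);
	to->count = be16_swap((uint16_t)from->count);
	to->level = be16_swap((uint16_t)from->level);
}

int main(void)
{
	struct icore_hdr src = { .magic = 0x3ebe, .count = 2, .level = 1 };
	struct disk_hdr d;
	struct icore_hdr ic;

	hdr_to_disk(&d, &src);		/* encode to the on-disk form ... */
	hdr_from_disk(&ic, &d);		/* ... and back; the trip is lossless */
	printf("magic=%#x count=%u level=%u\n", ic.magic, ic.count, ic.level);
	return 0;
}

[Decoding once into an in-core struct keeps the byte-order conversions in one obvious place instead of scattering them through every caller.]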
@@ -1082,7 +1171,6 @@ xfs_da3_root_join( xfs_dablk_t child; struct xfs_buf *bp; struct xfs_da3_icnode_hdr oldroothdr; - struct xfs_da_node_entry *btree; int error; struct xfs_inode *dp = state->args->dp; @@ -1092,7 +1180,7 @@ xfs_da3_root_join( args = state->args; oldroot = root_blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot); + xfs_da3_node_hdr_from_disk(dp->i_mount, &oldroothdr, oldroot); ASSERT(oldroothdr.forw == 0); ASSERT(oldroothdr.back == 0); @@ -1106,11 +1194,9 @@ xfs_da3_root_join( * Read in the (only) child block, then copy those bytes into * the root block's buffer and free the original child block. */ - btree = dp->d_ops->node_tree_p(oldroot); - child = be32_to_cpu(btree[0].before); + child = be32_to_cpu(oldroothdr.btree[0].before); ASSERT(child != 0); - error = xfs_da3_node_read(args->trans, dp, child, -1, &bp, - args->whichfork); + error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork); if (error) return error; xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); @@ -1172,7 +1258,7 @@ xfs_da3_node_toosmall( blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->b_addr; node = (xfs_da_intnode_t *)info; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); if (nodehdr.count > (state->args->geo->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return 0; /* blk over 50%, don't try to join */ @@ -1224,13 +1310,13 @@ xfs_da3_node_toosmall( blkno = nodehdr.back; if (blkno == 0) continue; - error = xfs_da3_node_read(state->args->trans, dp, - blkno, -1, &bp, state->args->whichfork); + error = xfs_da3_node_read(state->args->trans, dp, blkno, &bp, + state->args->whichfork); if (error) return error; node = bp->b_addr; - dp->d_ops->node_hdr_from_disk(&thdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node); xfs_trans_brelse(state->args->trans, bp); if (count - thdr.count >= 0) @@ -1272,18 +1358,14 @@ xfs_da3_node_lasthash( struct xfs_buf *bp, int *count) { - struct xfs_da_intnode *node; - struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; - node = bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr); if (count) *count = nodehdr.count; if (!nodehdr.count) return 0; - btree = dp->d_ops->node_tree_p(node); - return be32_to_cpu(btree[nodehdr.count - 1].hashval); + return be32_to_cpu(nodehdr.btree[nodehdr.count - 1].hashval); } /* @@ -1328,8 +1410,8 @@ xfs_da3_fixhashpath( struct xfs_da3_icnode_hdr nodehdr; node = blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); + btree = nodehdr.btree; if (be32_to_cpu(btree[blk->index].hashval) == lasthash) break; blk->hashval = lasthash; @@ -1360,7 +1442,7 @@ xfs_da3_node_remove( trace_xfs_da_node_remove(state->args); node = drop_blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); ASSERT(drop_blk->index < nodehdr.count); ASSERT(drop_blk->index >= 0); @@ -1368,7 +1450,7 @@ xfs_da3_node_remove( * Copy over the offending entry, or just zero it out. 
*/ index = drop_blk->index; - btree = dp->d_ops->node_tree_p(node); + btree = nodehdr.btree; if (index < nodehdr.count - 1) { tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); @@ -1381,9 +1463,9 @@ xfs_da3_node_remove( xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); nodehdr.count -= 1; - dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + XFS_DA_LOGRANGE(node, &node->hdr, state->args->geo->node_hdr_size)); /* * Copy the last hash value from the block to propagate upwards. @@ -1416,10 +1498,10 @@ xfs_da3_node_unbalance( drop_node = drop_blk->bp->b_addr; save_node = save_blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node); - dp->d_ops->node_hdr_from_disk(&save_hdr, save_node); - drop_btree = dp->d_ops->node_tree_p(drop_node); - save_btree = dp->d_ops->node_tree_p(save_node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &drop_hdr, drop_node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &save_hdr, save_node); + drop_btree = drop_hdr.btree; + save_btree = save_hdr.btree; tp = state->args->trans; /* @@ -1453,10 +1535,10 @@ xfs_da3_node_unbalance( memcpy(&save_btree[sindex], &drop_btree[0], tmp); save_hdr.count += drop_hdr.count; - dp->d_ops->node_hdr_to_disk(save_node, &save_hdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, save_node, &save_hdr); xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, - dp->d_ops->node_hdr_size)); + state->args->geo->node_hdr_size)); /* * Save the last hashval in the remaining block for upward propagation. @@ -1517,7 +1599,7 @@ xfs_da3_node_lookup_int( */ blk->blkno = blkno; error = xfs_da3_node_read(args->trans, args->dp, blkno, - -1, &blk->bp, args->whichfork); + &blk->bp, args->whichfork); if (error) { blk->blkno = 0; state->path.active--; @@ -1541,8 +1623,10 @@ xfs_da3_node_lookup_int( break; } - if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) + if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { + xfs_buf_corruption_error(blk->bp); return -EFSCORRUPTED; + } blk->magic = XFS_DA_NODE_MAGIC; @@ -1550,19 +1634,22 @@ xfs_da3_node_lookup_int( * Search an intermediate node for a match. */ node = blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); + btree = nodehdr.btree; /* Tree taller than we can handle; bail out! */ - if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { + xfs_buf_corruption_error(blk->bp); return -EFSCORRUPTED; + } /* Check the level from the root. */ if (blkno == args->geo->leafblk) expected_level = nodehdr.level - 1; - else if (expected_level != nodehdr.level) + else if (expected_level != nodehdr.level) { + xfs_buf_corruption_error(blk->bp); return -EFSCORRUPTED; - else + } else expected_level--; max = nodehdr.count; @@ -1612,11 +1699,11 @@ xfs_da3_node_lookup_int( } /* We can't point back to the root. 
*/ - if (blkno == args->geo->leafblk) + if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) return -EFSCORRUPTED; } - if (expected_level != 0) + if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) return -EFSCORRUPTED; /* @@ -1678,10 +1765,10 @@ xfs_da3_node_order( node1 = node1_bp->b_addr; node2 = node2_bp->b_addr; - dp->d_ops->node_hdr_from_disk(&node1hdr, node1); - dp->d_ops->node_hdr_from_disk(&node2hdr, node2); - btree1 = dp->d_ops->node_tree_p(node1); - btree2 = dp->d_ops->node_tree_p(node2); + xfs_da3_node_hdr_from_disk(dp->i_mount, &node1hdr, node1); + xfs_da3_node_hdr_from_disk(dp->i_mount, &node2hdr, node2); + btree1 = node1hdr.btree; + btree2 = node2hdr.btree; if (node1hdr.count > 0 && node2hdr.count > 0 && ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || @@ -1746,7 +1833,7 @@ xfs_da3_blk_link( if (old_info->back) { error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->back), - -1, &bp, args->whichfork); + &bp, args->whichfork); if (error) return error; ASSERT(bp != NULL); @@ -1767,7 +1854,7 @@ xfs_da3_blk_link( if (old_info->forw) { error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->forw), - -1, &bp, args->whichfork); + &bp, args->whichfork); if (error) return error; ASSERT(bp != NULL); @@ -1826,7 +1913,7 @@ xfs_da3_blk_unlink( if (drop_info->back) { error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), - -1, &bp, args->whichfork); + &bp, args->whichfork); if (error) return error; ASSERT(bp != NULL); @@ -1843,7 +1930,7 @@ xfs_da3_blk_unlink( if (drop_info->forw) { error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), - -1, &bp, args->whichfork); + &bp, args->whichfork); if (error) return error; ASSERT(bp != NULL); @@ -1878,7 +1965,6 @@ xfs_da3_path_shift( { struct xfs_da_state_blk *blk; struct xfs_da_blkinfo *info; - struct xfs_da_intnode *node; struct xfs_da_args *args; struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; @@ -1901,17 +1987,16 @@ xfs_da3_path_shift( ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { - node = blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, + blk->bp->b_addr); if (forward && (blk->index < nodehdr.count - 1)) { blk->index++; - blkno = be32_to_cpu(btree[blk->index].before); + blkno = be32_to_cpu(nodehdr.btree[blk->index].before); break; } else if (!forward && (blk->index > 0)) { blk->index--; - blkno = be32_to_cpu(btree[blk->index].before); + blkno = be32_to_cpu(nodehdr.btree[blk->index].before); break; } } @@ -1929,7 +2014,7 @@ xfs_da3_path_shift( /* * Read the next child block into a local buffer. 
*/ - error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp, + error = xfs_da3_node_read(args->trans, dp, blkno, &bp, args->whichfork); if (error) return error; @@ -1962,9 +2047,9 @@ xfs_da3_path_shift( case XFS_DA_NODE_MAGIC: case XFS_DA3_NODE_MAGIC: blk->magic = XFS_DA_NODE_MAGIC; - node = (xfs_da_intnode_t *)info; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, + bp->b_addr); + btree = nodehdr.btree; blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); if (forward) blk->index = 0; @@ -2044,18 +2129,6 @@ xfs_da_compname( XFS_CMP_EXACT : XFS_CMP_DIFFERENT; } -static xfs_dahash_t -xfs_default_hashname( - struct xfs_name *name) -{ - return xfs_da_hashname(name->name, name->len); -} - -const struct xfs_nameops xfs_default_nameops = { - .hashname = xfs_default_hashname, - .compname = xfs_da_compname -}; - int xfs_da_grow_inode_int( struct xfs_da_args *args, @@ -2098,7 +2171,7 @@ xfs_da_grow_inode_int( * If we didn't get it and the block might work if fragmented, * try without the CONTIG flag. Loop until we get it all. */ - mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); @@ -2213,16 +2286,13 @@ xfs_da3_swap_lastblock( error = xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; - if (unlikely(lastoff == 0)) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, - mp); + if (XFS_IS_CORRUPT(mp, lastoff == 0)) return -EFSCORRUPTED; - } /* * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount; - error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w); + error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w); if (error) return error; /* @@ -2240,16 +2310,17 @@ xfs_da3_swap_lastblock( struct xfs_dir2_leaf_entry *ents; dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2); - ents = dp->d_ops->leaf_ents_p(dead_leaf2); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, + dead_leaf2); + ents = leafhdr.ents; dead_level = 0; dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); } else { struct xfs_da3_icnode_hdr deadhdr; dead_node = (xfs_da_intnode_t *)dead_info; - dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node); - btree = dp->d_ops->node_tree_p(dead_node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &deadhdr, dead_node); + btree = deadhdr.btree; dead_level = deadhdr.level; dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); } @@ -2258,15 +2329,13 @@ xfs_da3_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; - if (unlikely( - be32_to_cpu(sib_info->forw) != last_blkno || - sib_info->magic != dead_info->magic)) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, + be32_to_cpu(sib_info->forw) != last_blkno || + sib_info->magic != dead_info->magic)) { error = -EFSCORRUPTED; goto done; } @@ -2280,15 +2349,13 @@ xfs_da3_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. 
*/ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; - if (unlikely( - be32_to_cpu(sib_info->back) != last_blkno || - sib_info->magic != dead_info->magic)) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, + be32_to_cpu(sib_info->back) != last_blkno || + sib_info->magic != dead_info->magic)) { error = -EFSCORRUPTED; goto done; } @@ -2304,27 +2371,24 @@ xfs_da3_swap_lastblock( * Walk down the tree looking for the parent of the moved block. */ for (;;) { - error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); - if (level >= 0 && level != par_hdr.level + 1) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", - XFS_ERRLEVEL_LOW, mp); + xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); + if (XFS_IS_CORRUPT(mp, + level >= 0 && level != par_hdr.level + 1)) { error = -EFSCORRUPTED; goto done; } level = par_hdr.level; - btree = dp->d_ops->node_tree_p(par_node); + btree = par_hdr.btree; for (entno = 0; entno < par_hdr.count && be32_to_cpu(btree[entno].hashval) < dead_hash; entno++) continue; - if (entno == par_hdr.count) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) { error = -EFSCORRUPTED; goto done; } @@ -2349,24 +2413,20 @@ xfs_da3_swap_lastblock( par_blkno = par_hdr.forw; xfs_trans_brelse(tp, par_buf); par_buf = NULL; - if (unlikely(par_blkno == 0)) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, par_blkno == 0)) { error = -EFSCORRUPTED; goto done; } - error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); - if (par_hdr.level != level) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", - XFS_ERRLEVEL_LOW, mp); + xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); + if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) { error = -EFSCORRUPTED; goto done; } - btree = dp->d_ops->node_tree_p(par_node); + btree = par_hdr.btree; entno = 0; } /* @@ -2429,159 +2489,84 @@ xfs_da_shrink_inode( return error; } -/* - * See if the mapping(s) for this btree block are valid, i.e. - * don't contain holes, are logically contiguous, and cover the whole range. - */ -STATIC int -xfs_da_map_covers_blocks( - int nmap, - xfs_bmbt_irec_t *mapp, - xfs_dablk_t bno, - int count) -{ - int i; - xfs_fileoff_t off; - - for (i = 0, off = bno; i < nmap; i++) { - if (mapp[i].br_startblock == HOLESTARTBLOCK || - mapp[i].br_startblock == DELAYSTARTBLOCK) { - return 0; - } - if (off != mapp[i].br_startoff) { - return 0; - } - off += mapp[i].br_blockcount; - } - return off == bno + count; -} - -/* - * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map. - * - * For the single map case, it is assumed that the caller has provided a pointer - * to a valid xfs_buf_map. For the multiple map case, this function will - * allocate the xfs_buf_map to hold all the maps and replace the caller's single - * map pointer with the allocated map. 
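Note: the xfs_da_map_covers_blocks() and xfs_buf_map_from_irec() helpers deleted here are folded into the rewritten xfs_dabuf_map() that follows. The invariant they enforced is unchanged: the extent records backing a dabuf must contain no holes or delalloc reservations, must be logically contiguous, and must cover the whole da block. A distilled sketch of that check, for reference while reading the merged function:

	for (i = 0, off = bno; i < nmap; i++) {
		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
		    mapp[i].br_startblock == DELAYSTARTBLOCK)
			return false;		/* hole or delalloc */
		if (off != mapp[i].br_startoff)
			return false;		/* discontiguous */
		off += mapp[i].br_blockcount;
	}
	return off == bno + count;		/* full coverage */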
- */ static int -xfs_buf_map_from_irec( - struct xfs_mount *mp, +xfs_dabuf_map( + struct xfs_inode *dp, + xfs_dablk_t bno, + unsigned int flags, + int whichfork, struct xfs_buf_map **mapp, - int *nmaps, - struct xfs_bmbt_irec *irecs, - int nirecs) + int *nmaps) { - struct xfs_buf_map *map; - int i; - - ASSERT(*nmaps == 1); - ASSERT(nirecs >= 1); + struct xfs_mount *mp = dp->i_mount; + int nfsb = xfs_dabuf_nfsb(mp, whichfork); + struct xfs_bmbt_irec irec, *irecs = &irec; + struct xfs_buf_map *map = *mapp; + xfs_fileoff_t off = bno; + int error = 0, nirecs, i; + + if (nfsb > 1) + irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS); + + nirecs = nfsb; + error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs, + xfs_bmapi_aflag(whichfork)); + if (error) + goto out_free_irecs; + /* + * Use the caller provided map for the single map case, else allocate a + * larger one that needs to be freed by the caller. + */ if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), - KM_SLEEP | KM_NOFS); + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); if (!map) - return -ENOMEM; + goto out_free_irecs; *mapp = map; } - *nmaps = nirecs; - map = *mapp; - for (i = 0; i < *nmaps; i++) { - ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK && - irecs[i].br_startblock != HOLESTARTBLOCK); + for (i = 0; i < nirecs; i++) { + if (irecs[i].br_startblock == HOLESTARTBLOCK || + irecs[i].br_startblock == DELAYSTARTBLOCK) + goto invalid_mapping; + if (off != irecs[i].br_startoff) + goto invalid_mapping; + map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock); map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount); + off += irecs[i].br_blockcount; } - return 0; -} -/* - * Map the block we are given ready for reading. There are three possible return - * values: - * -1 - will be returned if we land in a hole and mappedbno == -2 so the - * caller knows not to execute a subsequent read. - * 0 - if we mapped the block successfully - * >0 - positive error number if there was an error. - */ -static int -xfs_dabuf_map( - struct xfs_inode *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - int whichfork, - struct xfs_buf_map **map, - int *nmaps) -{ - struct xfs_mount *mp = dp->i_mount; - int nfsb; - int error = 0; - struct xfs_bmbt_irec irec; - struct xfs_bmbt_irec *irecs = &irec; - int nirecs; + if (off != bno + nfsb) + goto invalid_mapping; - ASSERT(map && *map); - ASSERT(*nmaps == 1); - - if (whichfork == XFS_DATA_FORK) - nfsb = mp->m_dir_geo->fsbcount; - else - nfsb = mp->m_attr_geo->fsbcount; - - /* - * Caller doesn't have a mapping. -2 means don't complain - * if we land in a hole. - */ - if (mappedbno == -1 || mappedbno == -2) { - /* - * Optimize the one-block case. - */ - if (nfsb != 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, - KM_SLEEP | KM_NOFS); + *nmaps = nirecs; +out_free_irecs: + if (irecs != &irec) + kmem_free(irecs); + return error; - nirecs = nfsb; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, - &nirecs, xfs_bmapi_aflag(whichfork)); - if (error) - goto out; - } else { - irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); - irecs->br_startoff = (xfs_fileoff_t)bno; - irecs->br_blockcount = nfsb; - irecs->br_state = 0; - nirecs = 1; - } +invalid_mapping: + /* Caller ok with no mapping. 
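Note: the magic return values of the old interface (-1 for "landed in a hole with mappedbno == -2") are gone. A hole is now reported by returning 0 with *nmaps set to 0, and only when the caller opted in via the new XFS_DABUF_MAP_HOLE_OK flag; without the flag it is treated as corruption. Callers therefore reduce to the pattern used by xfs_da_read_buf() below:

	error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
	if (error || !nmap)	/* hard error, or a permitted hole */
		goto out_free;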
*/ + if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) { + error = -EFSCORRUPTED; + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "%s: bno %u inode %llu", + __func__, bno, dp->i_ino); - if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { - error = mappedbno == -2 ? -1 : -EFSCORRUPTED; - if (unlikely(error == -EFSCORRUPTED)) { - if (xfs_error_level >= XFS_ERRLEVEL_LOW) { - int i; - xfs_alert(mp, "%s: bno %lld dir: inode %lld", - __func__, (long long)bno, - (long long)dp->i_ino); - for (i = 0; i < *nmaps; i++) { - xfs_alert(mp, + for (i = 0; i < nirecs; i++) { + xfs_alert(mp, "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", - i, - (long long)irecs[i].br_startoff, - (long long)irecs[i].br_startblock, - (long long)irecs[i].br_blockcount, - irecs[i].br_state); - } + i, irecs[i].br_startoff, + irecs[i].br_startblock, + irecs[i].br_blockcount, + irecs[i].br_state); } - XFS_ERROR_REPORT("xfs_da_do_buf(1)", - XFS_ERRLEVEL_LOW, mp); } - goto out; + } else { + *nmaps = 0; } - error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs); -out: - if (irecs != &irec) - kmem_free(irecs); - return error; + goto out_free_irecs; } /* @@ -2589,39 +2574,26 @@ out: */ int xfs_da_get_buf( - struct xfs_trans *trans, + struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork) { + struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; - struct xfs_buf_map map; - struct xfs_buf_map *mapp; - int nmap; + struct xfs_buf_map map, *mapp = &map; + int nmap = 1; int error; *bpp = NULL; - mapp = &map; - nmap = 1; - error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, - &mapp, &nmap); - if (error) { - /* mapping a hole is not an error, but we don't continue */ - if (error == -1) - error = 0; + error = xfs_dabuf_map(dp, bno, 0, whichfork, &mapp, &nmap); + if (error || nmap == 0) goto out_free; - } - bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0); - error = bp ? 
bp->b_error : -EIO; - if (error) { - if (bp) - xfs_trans_brelse(trans, bp); + error = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0, &bp); + if (error) goto out_free; - } *bpp = bp; @@ -2637,35 +2609,27 @@ out_free: */ int xfs_da_read_buf( - struct xfs_trans *trans, + struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mappedbno, + unsigned int flags, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops) { + struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; - struct xfs_buf_map map; - struct xfs_buf_map *mapp; - int nmap; + struct xfs_buf_map map, *mapp = &map; + int nmap = 1; int error; *bpp = NULL; - mapp = &map; - nmap = 1; - error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, - &mapp, &nmap); - if (error) { - /* mapping a hole is not an error, but we don't continue */ - if (error == -1) - error = 0; + error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap); + if (error || !nmap) goto out_free; - } - error = xfs_trans_read_buf_map(dp->i_mount, trans, - dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp, ops); + error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0, + &bp, ops); if (error) goto out_free; @@ -2688,7 +2652,7 @@ int xfs_da_reada_buf( struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mappedbno, + unsigned int flags, int whichfork, const struct xfs_buf_ops *ops) { @@ -2699,16 +2663,10 @@ xfs_da_reada_buf( mapp = &map; nmap = 1; - error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, - &mapp, &nmap); - if (error) { - /* mapping a hole is not an error, but we don't continue */ - if (error == -1) - error = 0; + error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap); + if (error || !nmap) goto out_free; - } - - mappedbno = mapp[0].bm_bn; xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); out_free: diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 84dd865b6c3d..0f4fbb0889ff 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -10,7 +10,6 @@ struct xfs_inode; struct xfs_trans; struct zone; -struct xfs_dir_ops; /* * Directory/attribute geometry information. There will be one of these for each @@ -18,15 +17,23 @@ struct xfs_dir_ops; * structures will be attached to the xfs_mount. 
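Note: the geometry structure below is the other half of the ops-vector removal. Sizes and offsets that used to hide behind per-format function pointers (node/leaf/free header sizes, maximum entry counts, the fixed "." and ".." offsets) become plain fields, computed once in xfs_da_mount() later in this patch. Access therefore changes from an indirect call to a field read; sketching the call sites (not all visible in this hunk):

	ents = dp->d_ops->leaf_max_ents(args->geo);	/* old: virtual call */
	ents = args->geo->leaf_max_ents;		/* new: precomputed */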
*/ struct xfs_da_geometry { - int blksize; /* da block size in bytes */ - int fsbcount; /* da block size in filesystem blocks */ + unsigned int blksize; /* da block size in bytes */ + unsigned int fsbcount; /* da block size in filesystem blocks */ uint8_t fsblog; /* log2 of _filesystem_ block size */ uint8_t blklog; /* log2 of da block size */ - uint node_ents; /* # of entries in a danode */ - int magicpct; /* 37% of block size in bytes */ + unsigned int node_hdr_size; /* danode header size in bytes */ + unsigned int node_ents; /* # of entries in a danode */ + unsigned int magicpct; /* 37% of block size in bytes */ xfs_dablk_t datablk; /* blockno of dir data v2 */ + unsigned int leaf_hdr_size; /* dir2 leaf header size */ + unsigned int leaf_max_ents; /* # of entries in dir2 leaf */ xfs_dablk_t leafblk; /* blockno of leaf data v2 */ + unsigned int free_hdr_size; /* dir2 free header size */ + unsigned int free_max_bests; /* # of bests entries in dir2 free */ xfs_dablk_t freeblk; /* blockno of free data v2 */ + + xfs_dir2_data_aoff_t data_first_offset; + size_t data_entry_offset; }; /*======================================================================== @@ -81,13 +88,17 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ +#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ +#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ { XFS_DA_OP_RENAME, "RENAME" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ - { XFS_DA_OP_CILOOKUP, "CILOOKUP" } + { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ + { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } /* * Storage for holding state during Btree searches and split/join ops. @@ -123,6 +134,25 @@ typedef struct xfs_da_state { } xfs_da_state_t; /* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t level; + + /* + * Pointer to the on-disk format entries, which are behind the + * variable size (v4 vs v5) header in the on-disk block. + */ + struct xfs_da_node_entry *btree; +}; + +/* * Utility macros to aid in logging changed structure fields. */ #define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE)) @@ -130,16 +160,6 @@ typedef struct xfs_da_state { (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) -/* - * Name ops for directory and/or attr name operations - */ -struct xfs_nameops { - xfs_dahash_t (*hashname)(struct xfs_name *); - enum xfs_dacmp (*compname)(struct xfs_da_args *, - const unsigned char *, int); -}; - - /*======================================================================== * Function prototypes. 
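Note: compare the relocated xfs_da3_icnode_hdr above with the version deleted from xfs_da_format.h later in this diff: the btree member is new, and it is what lets callers drop the separate node_tree_p() indirection. The conversion pattern, as seen in the xfs_da_btree.c hunks earlier:

	/* old: two virtual calls to unpack the header and find the array */
	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
	btree = dp->d_ops->node_tree_p(node);

	/* new: one direct call; the unpacked header carries the pointer */
	xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
	btree = nodehdr.btree;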
*========================================================================*/ @@ -170,25 +190,28 @@ int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp, int which_fork); + xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork); +int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_daddr_t mappedbno, struct xfs_buf **bpp, + int whichfork); /* * Utility routines. */ + +#define XFS_DABUF_MAP_HOLE_OK (1 << 0) + int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, int count); int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bp, int whichfork); + xfs_dablk_t bno, struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp, int whichfork, - const struct xfs_buf_ops *ops); + xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp, + int whichfork, const struct xfs_buf_ops *ops); int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mapped_bno, int whichfork, - const struct xfs_buf_ops *ops); + unsigned int flags, int whichfork, + const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); @@ -200,7 +223,11 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, + struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); +void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp, + struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); + extern struct kmem_zone *xfs_da_state_zone; -extern const struct xfs_nameops xfs_default_nameops; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c deleted file mode 100644 index b1ae572496b6..000000000000 --- a/fs/xfs/libxfs/xfs_da_format.c +++ /dev/null @@ -1,888 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. - * Copyright (c) 2013 Red Hat, Inc. - * All Rights Reserved. - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_shared.h" -#include "xfs_format.h" -#include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_inode.h" -#include "xfs_dir2.h" - -/* - * Shortform directory ops - */ -static int -xfs_dir2_sf_entsize( - struct xfs_dir2_sf_hdr *hdr, - int len) -{ - int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ - - count += len; /* name */ - count += hdr->i8count ? 
XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ - return count; -} - -static int -xfs_dir3_sf_entsize( - struct xfs_dir2_sf_hdr *hdr, - int len) -{ - return xfs_dir2_sf_entsize(hdr, len) + sizeof(uint8_t); -} - -static struct xfs_dir2_sf_entry * -xfs_dir2_sf_nextentry( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return (struct xfs_dir2_sf_entry *) - ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); -} - -static struct xfs_dir2_sf_entry * -xfs_dir3_sf_nextentry( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return (struct xfs_dir2_sf_entry *) - ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen)); -} - - -/* - * For filetype enabled shortform directories, the file type field is stored at - * the end of the name. Because it's only a single byte, endian conversion is - * not necessary. For non-filetype enable directories, the type is always - * unknown and we never store the value. - */ -static uint8_t -xfs_dir2_sfe_get_ftype( - struct xfs_dir2_sf_entry *sfep) -{ - return XFS_DIR3_FT_UNKNOWN; -} - -static void -xfs_dir2_sfe_put_ftype( - struct xfs_dir2_sf_entry *sfep, - uint8_t ftype) -{ - ASSERT(ftype < XFS_DIR3_FT_MAX); -} - -static uint8_t -xfs_dir3_sfe_get_ftype( - struct xfs_dir2_sf_entry *sfep) -{ - uint8_t ftype; - - ftype = sfep->name[sfep->namelen]; - if (ftype >= XFS_DIR3_FT_MAX) - return XFS_DIR3_FT_UNKNOWN; - return ftype; -} - -static void -xfs_dir3_sfe_put_ftype( - struct xfs_dir2_sf_entry *sfep, - uint8_t ftype) -{ - ASSERT(ftype < XFS_DIR3_FT_MAX); - - sfep->name[sfep->namelen] = ftype; -} - -/* - * Inode numbers in short-form directories can come in two versions, - * either 4 bytes or 8 bytes wide. These helpers deal with the - * two forms transparently by looking at the headers i8count field. - * - * For 64-bit inode number the most significant byte must be zero. - */ -static xfs_ino_t -xfs_dir2_sf_get_ino( - struct xfs_dir2_sf_hdr *hdr, - uint8_t *from) -{ - if (hdr->i8count) - return get_unaligned_be64(from) & 0x00ffffffffffffffULL; - else - return get_unaligned_be32(from); -} - -static void -xfs_dir2_sf_put_ino( - struct xfs_dir2_sf_hdr *hdr, - uint8_t *to, - xfs_ino_t ino) -{ - ASSERT((ino & 0xff00000000000000ULL) == 0); - - if (hdr->i8count) - put_unaligned_be64(ino, to); - else - put_unaligned_be32(ino, to); -} - -static xfs_ino_t -xfs_dir2_sf_get_parent_ino( - struct xfs_dir2_sf_hdr *hdr) -{ - return xfs_dir2_sf_get_ino(hdr, hdr->parent); -} - -static void -xfs_dir2_sf_put_parent_ino( - struct xfs_dir2_sf_hdr *hdr, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, hdr->parent, ino); -} - -/* - * In short-form directory entries the inode numbers are stored at variable - * offset behind the entry name. If the entry stores a filetype value, then it - * sits between the name and the inode number. Hence the inode numbers may only - * be accessed through the helpers below. 
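Note: the shortform accessors being deleted here encode a variable-offset layout: each entry stores namelen, an offset, the name bytes, an optional filetype byte (filetype-enabled directories only), and then the inode number, packed as 4 or 8 bytes depending on the header's i8count. Distilled from the removed helpers:

	/* the inode number moves by one byte when ftype is enabled */
	ino = xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);	/* v2 */
	ino = xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);	/* v3 */

	/* width chosen by i8count; a 64-bit ino must have a zero top byte */
	if (hdr->i8count)
		return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
	return get_unaligned_be32(from);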
- */ -static xfs_ino_t -xfs_dir2_sfe_get_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]); -} - -static void -xfs_dir2_sfe_put_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino); -} - -static xfs_ino_t -xfs_dir3_sfe_get_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]); -} - -static void -xfs_dir3_sfe_put_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino); -} - - -/* - * Directory data block operations - */ - -/* - * For special situations, the dirent size ends up fixed because we always know - * what the size of the entry is. That's true for the "." and "..", and - * therefore we know that they are a fixed size and hence their offsets are - * constant, as is the first entry. - * - * Hence, this calculation is written as a macro to be able to be calculated at - * compile time and so certain offsets can be calculated directly in the - * structure initaliser via the macro. There are two macros - one for dirents - * with ftype and without so there are no unresolvable conditionals in the - * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power - * of 2 and the compiler doesn't reject it (unlike roundup()). - */ -#define XFS_DIR2_DATA_ENTSIZE(n) \ - round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ - sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN) - -#define XFS_DIR3_DATA_ENTSIZE(n) \ - round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ - sizeof(xfs_dir2_data_off_t) + sizeof(uint8_t)), \ - XFS_DIR2_DATA_ALIGN) - -static int -xfs_dir2_data_entsize( - int n) -{ - return XFS_DIR2_DATA_ENTSIZE(n); -} - -static int -xfs_dir3_data_entsize( - int n) -{ - return XFS_DIR3_DATA_ENTSIZE(n); -} - -static uint8_t -xfs_dir2_data_get_ftype( - struct xfs_dir2_data_entry *dep) -{ - return XFS_DIR3_FT_UNKNOWN; -} - -static void -xfs_dir2_data_put_ftype( - struct xfs_dir2_data_entry *dep, - uint8_t ftype) -{ - ASSERT(ftype < XFS_DIR3_FT_MAX); -} - -static uint8_t -xfs_dir3_data_get_ftype( - struct xfs_dir2_data_entry *dep) -{ - uint8_t ftype = dep->name[dep->namelen]; - - if (ftype >= XFS_DIR3_FT_MAX) - return XFS_DIR3_FT_UNKNOWN; - return ftype; -} - -static void -xfs_dir3_data_put_ftype( - struct xfs_dir2_data_entry *dep, - uint8_t type) -{ - ASSERT(type < XFS_DIR3_FT_MAX); - ASSERT(dep->namelen != 0); - - dep->name[dep->namelen] = type; -} - -/* - * Pointer to an entry's tag word. - */ -static __be16 * -xfs_dir2_data_entry_tag_p( - struct xfs_dir2_data_entry *dep) -{ - return (__be16 *)((char *)dep + - xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); -} - -static __be16 * -xfs_dir3_data_entry_tag_p( - struct xfs_dir2_data_entry *dep) -{ - return (__be16 *)((char *)dep + - xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16)); -} - -/* - * location of . and .. 
in data space (always block 0) - */ -static struct xfs_dir2_data_entry * -xfs_dir2_data_dot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); -} - -static struct xfs_dir2_data_entry * -xfs_dir2_data_dotdot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR2_DATA_ENTSIZE(1)); -} - -static struct xfs_dir2_data_entry * -xfs_dir2_data_first_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR2_DATA_ENTSIZE(1) + - XFS_DIR2_DATA_ENTSIZE(2)); -} - -static struct xfs_dir2_data_entry * -xfs_dir2_ftype_data_dotdot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1)); -} - -static struct xfs_dir2_data_entry * -xfs_dir2_ftype_data_first_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1) + - XFS_DIR3_DATA_ENTSIZE(2)); -} - -static struct xfs_dir2_data_entry * -xfs_dir3_data_dot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); -} - -static struct xfs_dir2_data_entry * -xfs_dir3_data_dotdot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1)); -} - -static struct xfs_dir2_data_entry * -xfs_dir3_data_first_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1) + - XFS_DIR3_DATA_ENTSIZE(2)); -} - -static struct xfs_dir2_data_free * -xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) -{ - return hdr->bestfree; -} - -static struct xfs_dir2_data_free * -xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) -{ - return ((struct xfs_dir3_data_hdr *)hdr)->best_free; -} - -static struct xfs_dir2_data_entry * -xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); -} - -static struct xfs_dir2_data_unused * -xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_unused *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); -} - -static struct xfs_dir2_data_entry * -xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); -} - -static struct xfs_dir2_data_unused * -xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_unused *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); -} - - -/* - * Directory Leaf block operations - */ -static int -xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo) -{ - return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) / - (uint)sizeof(struct xfs_dir2_leaf_entry); -} - -static struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp) -{ - return lp->__ents; -} - -static int -xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo) -{ - return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) / - (uint)sizeof(struct xfs_dir2_leaf_entry); -} - -static struct xfs_dir2_leaf_entry * -xfs_dir3_leaf_ents_p(struct 
xfs_dir2_leaf *lp) -{ - return ((struct xfs_dir3_leaf *)lp)->__ents; -} - -static void -xfs_dir2_leaf_hdr_from_disk( - struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from) -{ - to->forw = be32_to_cpu(from->hdr.info.forw); - to->back = be32_to_cpu(from->hdr.info.back); - to->magic = be16_to_cpu(from->hdr.info.magic); - to->count = be16_to_cpu(from->hdr.count); - to->stale = be16_to_cpu(from->hdr.stale); - - ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || - to->magic == XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir2_leaf_hdr_to_disk( - struct xfs_dir2_leaf *to, - struct xfs_dir3_icleaf_hdr *from) -{ - ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || - from->magic == XFS_DIR2_LEAFN_MAGIC); - - to->hdr.info.forw = cpu_to_be32(from->forw); - to->hdr.info.back = cpu_to_be32(from->back); - to->hdr.info.magic = cpu_to_be16(from->magic); - to->hdr.count = cpu_to_be16(from->count); - to->hdr.stale = cpu_to_be16(from->stale); -} - -static void -xfs_dir3_leaf_hdr_from_disk( - struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from) -{ - struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; - - to->forw = be32_to_cpu(hdr3->info.hdr.forw); - to->back = be32_to_cpu(hdr3->info.hdr.back); - to->magic = be16_to_cpu(hdr3->info.hdr.magic); - to->count = be16_to_cpu(hdr3->count); - to->stale = be16_to_cpu(hdr3->stale); - - ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || - to->magic == XFS_DIR3_LEAFN_MAGIC); -} - -static void -xfs_dir3_leaf_hdr_to_disk( - struct xfs_dir2_leaf *to, - struct xfs_dir3_icleaf_hdr *from) -{ - struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; - - ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || - from->magic == XFS_DIR3_LEAFN_MAGIC); - - hdr3->info.hdr.forw = cpu_to_be32(from->forw); - hdr3->info.hdr.back = cpu_to_be32(from->back); - hdr3->info.hdr.magic = cpu_to_be16(from->magic); - hdr3->count = cpu_to_be16(from->count); - hdr3->stale = cpu_to_be16(from->stale); -} - - -/* - * Directory/Attribute Node block operations - */ -static struct xfs_da_node_entry * -xfs_da2_node_tree_p(struct xfs_da_intnode *dap) -{ - return dap->__btree; -} - -static struct xfs_da_node_entry * -xfs_da3_node_tree_p(struct xfs_da_intnode *dap) -{ - return ((struct xfs_da3_intnode *)dap)->__btree; -} - -static void -xfs_da2_node_hdr_from_disk( - struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from) -{ - ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - to->forw = be32_to_cpu(from->hdr.info.forw); - to->back = be32_to_cpu(from->hdr.info.back); - to->magic = be16_to_cpu(from->hdr.info.magic); - to->count = be16_to_cpu(from->hdr.__count); - to->level = be16_to_cpu(from->hdr.__level); -} - -static void -xfs_da2_node_hdr_to_disk( - struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from) -{ - ASSERT(from->magic == XFS_DA_NODE_MAGIC); - to->hdr.info.forw = cpu_to_be32(from->forw); - to->hdr.info.back = cpu_to_be32(from->back); - to->hdr.info.magic = cpu_to_be16(from->magic); - to->hdr.__count = cpu_to_be16(from->count); - to->hdr.__level = cpu_to_be16(from->level); -} - -static void -xfs_da3_node_hdr_from_disk( - struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from) -{ - struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; - - ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); - to->forw = be32_to_cpu(hdr3->info.hdr.forw); - to->back = be32_to_cpu(hdr3->info.hdr.back); - to->magic = be16_to_cpu(hdr3->info.hdr.magic); - to->count = be16_to_cpu(hdr3->__count); - to->level = be16_to_cpu(hdr3->__level); -} - -static void 
-xfs_da3_node_hdr_to_disk( - struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from) -{ - struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; - - ASSERT(from->magic == XFS_DA3_NODE_MAGIC); - hdr3->info.hdr.forw = cpu_to_be32(from->forw); - hdr3->info.hdr.back = cpu_to_be32(from->back); - hdr3->info.hdr.magic = cpu_to_be16(from->magic); - hdr3->__count = cpu_to_be16(from->count); - hdr3->__level = cpu_to_be16(from->level); -} - - -/* - * Directory free space block operations - */ -static int -xfs_dir2_free_max_bests(struct xfs_da_geometry *geo) -{ - return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) / - sizeof(xfs_dir2_data_off_t); -} - -static __be16 * -xfs_dir2_free_bests_p(struct xfs_dir2_free *free) -{ - return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr)); -} - -/* - * Convert data space db to the corresponding free db. - */ -static xfs_dir2_db_t -xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + - (db / xfs_dir2_free_max_bests(geo)); -} - -/* - * Convert data space db to the corresponding index in a free db. - */ -static int -xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return db % xfs_dir2_free_max_bests(geo); -} - -static int -xfs_dir3_free_max_bests(struct xfs_da_geometry *geo) -{ - return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) / - sizeof(xfs_dir2_data_off_t); -} - -static __be16 * -xfs_dir3_free_bests_p(struct xfs_dir2_free *free) -{ - return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr)); -} - -/* - * Convert data space db to the corresponding free db. - */ -static xfs_dir2_db_t -xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + - (db / xfs_dir3_free_max_bests(geo)); -} - -/* - * Convert data space db to the corresponding index in a free db. 
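Note: the db_to_fdb()/db_to_fdindex() pair implements a simple radix split: each free block tracks max_bests data blocks, so a data block number decomposes into (free block, index) by division and remainder. For illustration, if a geometry allowed 500 bests per free block (a made-up figure), data block 1234 would land in the third free block, at index 234:

	fdb = xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
			db / max_bests;		/* 1234 / 500 = 2 */
	fdx = db % max_bests;			/* 1234 % 500 = 234 */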
- */ -static int -xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return db % xfs_dir3_free_max_bests(geo); -} - -static void -xfs_dir2_free_hdr_from_disk( - struct xfs_dir3_icfree_hdr *to, - struct xfs_dir2_free *from) -{ - to->magic = be32_to_cpu(from->hdr.magic); - to->firstdb = be32_to_cpu(from->hdr.firstdb); - to->nvalid = be32_to_cpu(from->hdr.nvalid); - to->nused = be32_to_cpu(from->hdr.nused); - ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); -} - -static void -xfs_dir2_free_hdr_to_disk( - struct xfs_dir2_free *to, - struct xfs_dir3_icfree_hdr *from) -{ - ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); - - to->hdr.magic = cpu_to_be32(from->magic); - to->hdr.firstdb = cpu_to_be32(from->firstdb); - to->hdr.nvalid = cpu_to_be32(from->nvalid); - to->hdr.nused = cpu_to_be32(from->nused); -} - -static void -xfs_dir3_free_hdr_from_disk( - struct xfs_dir3_icfree_hdr *to, - struct xfs_dir2_free *from) -{ - struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; - - to->magic = be32_to_cpu(hdr3->hdr.magic); - to->firstdb = be32_to_cpu(hdr3->firstdb); - to->nvalid = be32_to_cpu(hdr3->nvalid); - to->nused = be32_to_cpu(hdr3->nused); - - ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); -} - -static void -xfs_dir3_free_hdr_to_disk( - struct xfs_dir2_free *to, - struct xfs_dir3_icfree_hdr *from) -{ - struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; - - ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); - - hdr3->hdr.magic = cpu_to_be32(from->magic); - hdr3->firstdb = cpu_to_be32(from->firstdb); - hdr3->nvalid = cpu_to_be32(from->nvalid); - hdr3->nused = cpu_to_be32(from->nused); -} - -static const struct xfs_dir_ops xfs_dir2_ops = { - .sf_entsize = xfs_dir2_sf_entsize, - .sf_nextentry = xfs_dir2_sf_nextentry, - .sf_get_ftype = xfs_dir2_sfe_get_ftype, - .sf_put_ftype = xfs_dir2_sfe_put_ftype, - .sf_get_ino = xfs_dir2_sfe_get_ino, - .sf_put_ino = xfs_dir2_sfe_put_ino, - .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, - .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, - - .data_entsize = xfs_dir2_data_entsize, - .data_get_ftype = xfs_dir2_data_get_ftype, - .data_put_ftype = xfs_dir2_data_put_ftype, - .data_entry_tag_p = xfs_dir2_data_entry_tag_p, - .data_bestfree_p = xfs_dir2_data_bestfree_p, - - .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), - .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR2_DATA_ENTSIZE(1), - .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR2_DATA_ENTSIZE(1) + - XFS_DIR2_DATA_ENTSIZE(2), - .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), - - .data_dot_entry_p = xfs_dir2_data_dot_entry_p, - .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p, - .data_first_entry_p = xfs_dir2_data_first_entry_p, - .data_entry_p = xfs_dir2_data_entry_p, - .data_unused_p = xfs_dir2_data_unused_p, - - .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), - .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, - .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, - .leaf_max_ents = xfs_dir2_max_leaf_ents, - .leaf_ents_p = xfs_dir2_leaf_ents_p, - - .node_hdr_size = sizeof(struct xfs_da_node_hdr), - .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, - .node_tree_p = xfs_da2_node_tree_p, - - .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), - .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, - .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, - .free_max_bests = xfs_dir2_free_max_bests, - .free_bests_p = xfs_dir2_free_bests_p, - .db_to_fdb = xfs_dir2_db_to_fdb, - .db_to_fdindex = 
xfs_dir2_db_to_fdindex, -}; - -static const struct xfs_dir_ops xfs_dir2_ftype_ops = { - .sf_entsize = xfs_dir3_sf_entsize, - .sf_nextentry = xfs_dir3_sf_nextentry, - .sf_get_ftype = xfs_dir3_sfe_get_ftype, - .sf_put_ftype = xfs_dir3_sfe_put_ftype, - .sf_get_ino = xfs_dir3_sfe_get_ino, - .sf_put_ino = xfs_dir3_sfe_put_ino, - .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, - .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, - - .data_entsize = xfs_dir3_data_entsize, - .data_get_ftype = xfs_dir3_data_get_ftype, - .data_put_ftype = xfs_dir3_data_put_ftype, - .data_entry_tag_p = xfs_dir3_data_entry_tag_p, - .data_bestfree_p = xfs_dir2_data_bestfree_p, - - .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), - .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1), - .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1) + - XFS_DIR3_DATA_ENTSIZE(2), - .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), - - .data_dot_entry_p = xfs_dir2_data_dot_entry_p, - .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p, - .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p, - .data_entry_p = xfs_dir2_data_entry_p, - .data_unused_p = xfs_dir2_data_unused_p, - - .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), - .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, - .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, - .leaf_max_ents = xfs_dir2_max_leaf_ents, - .leaf_ents_p = xfs_dir2_leaf_ents_p, - - .node_hdr_size = sizeof(struct xfs_da_node_hdr), - .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, - .node_tree_p = xfs_da2_node_tree_p, - - .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), - .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, - .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, - .free_max_bests = xfs_dir2_free_max_bests, - .free_bests_p = xfs_dir2_free_bests_p, - .db_to_fdb = xfs_dir2_db_to_fdb, - .db_to_fdindex = xfs_dir2_db_to_fdindex, -}; - -static const struct xfs_dir_ops xfs_dir3_ops = { - .sf_entsize = xfs_dir3_sf_entsize, - .sf_nextentry = xfs_dir3_sf_nextentry, - .sf_get_ftype = xfs_dir3_sfe_get_ftype, - .sf_put_ftype = xfs_dir3_sfe_put_ftype, - .sf_get_ino = xfs_dir3_sfe_get_ino, - .sf_put_ino = xfs_dir3_sfe_put_ino, - .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, - .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, - - .data_entsize = xfs_dir3_data_entsize, - .data_get_ftype = xfs_dir3_data_get_ftype, - .data_put_ftype = xfs_dir3_data_put_ftype, - .data_entry_tag_p = xfs_dir3_data_entry_tag_p, - .data_bestfree_p = xfs_dir3_data_bestfree_p, - - .data_dot_offset = sizeof(struct xfs_dir3_data_hdr), - .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1), - .data_first_offset = sizeof(struct xfs_dir3_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1) + - XFS_DIR3_DATA_ENTSIZE(2), - .data_entry_offset = sizeof(struct xfs_dir3_data_hdr), - - .data_dot_entry_p = xfs_dir3_data_dot_entry_p, - .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p, - .data_first_entry_p = xfs_dir3_data_first_entry_p, - .data_entry_p = xfs_dir3_data_entry_p, - .data_unused_p = xfs_dir3_data_unused_p, - - .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr), - .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk, - .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk, - .leaf_max_ents = xfs_dir3_max_leaf_ents, - .leaf_ents_p = xfs_dir3_leaf_ents_p, - - .node_hdr_size = sizeof(struct xfs_da3_node_hdr), - .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, 
- .node_tree_p = xfs_da3_node_tree_p, - - .free_hdr_size = sizeof(struct xfs_dir3_free_hdr), - .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk, - .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk, - .free_max_bests = xfs_dir3_free_max_bests, - .free_bests_p = xfs_dir3_free_bests_p, - .db_to_fdb = xfs_dir3_db_to_fdb, - .db_to_fdindex = xfs_dir3_db_to_fdindex, -}; - -static const struct xfs_dir_ops xfs_dir2_nondir_ops = { - .node_hdr_size = sizeof(struct xfs_da_node_hdr), - .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, - .node_tree_p = xfs_da2_node_tree_p, -}; - -static const struct xfs_dir_ops xfs_dir3_nondir_ops = { - .node_hdr_size = sizeof(struct xfs_da3_node_hdr), - .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, - .node_tree_p = xfs_da3_node_tree_p, -}; - -/* - * Return the ops structure according to the current config. If we are passed - * an inode, then that overrides the default config we use which is based on - * feature bits. - */ -const struct xfs_dir_ops * -xfs_dir_get_ops( - struct xfs_mount *mp, - struct xfs_inode *dp) -{ - if (dp) - return dp->d_ops; - if (mp->m_dir_inode_ops) - return mp->m_dir_inode_ops; - if (xfs_sb_version_hascrc(&mp->m_sb)) - return &xfs_dir3_ops; - if (xfs_sb_version_hasftype(&mp->m_sb)) - return &xfs_dir2_ftype_ops; - return &xfs_dir2_ops; -} - -const struct xfs_dir_ops * -xfs_nondir_get_ops( - struct xfs_mount *mp, - struct xfs_inode *dp) -{ - if (dp) - return dp->d_ops; - if (mp->m_nondir_inode_ops) - return mp->m_nondir_inode_ops; - if (xfs_sb_version_hascrc(&mp->m_sb)) - return &xfs_dir3_nondir_ops; - return &xfs_dir2_nondir_ops; -} diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index ae654e06b2fb..734837a9b51a 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -94,19 +94,6 @@ struct xfs_da3_intnode { }; /* - * In-core version of the node header to abstract the differences in the v2 and - * v3 disk format of the headers. Callers need to convert to/from disk format as - * appropriate. - */ -struct xfs_da3_icnode_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t level; -}; - -/* * Directory version 2. * * There are 4 possible formats: @@ -230,7 +217,7 @@ typedef struct xfs_dir2_sf_entry { * A 64-bit or 32-bit inode number follows here, at a variable offset * after the name. */ -} xfs_dir2_sf_entry_t; +} __packed xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { @@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr { __be32 pad; /* 64 bit alignment */ }; -struct xfs_dir3_icleaf_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t stale; -}; - /* * Leaf block entry. */ @@ -482,7 +461,7 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) } /* - * Free space block defintions for the node format. + * Free space block definitions for the node format. */ /* @@ -521,19 +500,6 @@ struct xfs_dir3_free { #define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) /* - * In core version of the free block header, abstracted away from on-disk format - * differences. Use this in the code, and convert to/from the disk version using - * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. - */ -struct xfs_dir3_icfree_hdr { - uint32_t magic; - uint32_t firstdb; - uint32_t nvalid; - uint32_t nused; - -}; - -/* * Single block format. 
* * The single block format looks like the following drawing on disk: @@ -710,29 +676,6 @@ struct xfs_attr3_leafblock { }; /* - * incore, neutral version of the attribute leaf header - */ -struct xfs_attr3_icleaf_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t usedbytes; - /* - * firstused is 32-bit here instead of 16-bit like the on-disk variant - * to support maximum fsb size of 64k without overflow issues throughout - * the attr code. Instead, the overflow condition is handled on - * conversion to/from disk. - */ - uint32_t firstused; - __u8 holes; - struct { - uint16_t base; - uint16_t size; - } freemap[XFS_ATTR_LEAF_MAPSIZE]; -}; - -/* * Special value to represent fs block size in the leaf header firstused field. * Only used when block size overflows the 2-bytes available on disk. */ @@ -740,8 +683,6 @@ struct xfs_attr3_icleaf_hdr { /* * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. */ #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index eb2be2a6a25a..22557527cfdb 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -517,7 +517,7 @@ xfs_defer_add( } if (!dfp) { dfp = kmem_alloc(sizeof(struct xfs_defer_pending), - KM_SLEEP | KM_NOFS); + KM_NOFS); dfp->dfp_type = type; dfp->dfp_intent = NULL; dfp->dfp_done = NULL; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 67840723edbb..dd6fcaaea318 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -52,7 +52,7 @@ xfs_mode_to_ftype( * ASCII case-insensitive (ie. A-Z) support for directories that was * used in IRIX. 
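Note: this removes the last consumer of the struct xfs_nameops vector. The ASCII-CI hash and compare routines stop being static, and the choice between them and the defaults is made per call on the superblock feature bit, via the new xfs_dir2_hashname()/xfs_dir2_compname() helpers added at the end of this file. Call sites change accordingly:

	args->hashval = dp->i_mount->m_dirnameops->hashname(name);	/* old */
	args->hashval = xfs_dir2_hashname(dp->i_mount, name);		/* new */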
*/ -STATIC xfs_dahash_t +xfs_dahash_t xfs_ascii_ci_hashname( struct xfs_name *name) { @@ -65,14 +65,14 @@ xfs_ascii_ci_hashname( return hash; } -STATIC enum xfs_dacmp +enum xfs_dacmp xfs_ascii_ci_compname( - struct xfs_da_args *args, - const unsigned char *name, - int len) + struct xfs_da_args *args, + const unsigned char *name, + int len) { - enum xfs_dacmp result; - int i; + enum xfs_dacmp result; + int i; if (args->namelen != len) return XFS_CMP_DIFFERENT; @@ -89,30 +89,20 @@ xfs_ascii_ci_compname( return result; } -static const struct xfs_nameops xfs_ascii_ci_nameops = { - .hashname = xfs_ascii_ci_hashname, - .compname = xfs_ascii_ci_compname, -}; - int xfs_da_mount( struct xfs_mount *mp) { struct xfs_da_geometry *dageo; - int nodehdr_size; ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); - mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); - mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); - - nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { kmem_free(mp->m_dir_geo); kmem_free(mp->m_attr_geo); @@ -125,6 +115,27 @@ xfs_da_mount( dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr); + dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr); + dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr); + dageo->data_entry_offset = + sizeof(struct xfs_dir3_data_hdr); + } else { + dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr); + dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr); + dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr); + dageo->data_entry_offset = + sizeof(struct xfs_dir2_data_hdr); + } + dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) / + sizeof(struct xfs_dir2_leaf_entry); + dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) / + sizeof(xfs_dir2_data_off_t); + + dageo->data_first_offset = dageo->data_entry_offset + + xfs_dir2_data_entsize(mp, 1) + + xfs_dir2_data_entsize(mp, 2); /* * Now we've set up the block conversion variables, we can calculate the @@ -133,7 +144,7 @@ xfs_da_mount( dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); - dageo->node_ents = (dageo->blksize - nodehdr_size) / + dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); dageo->magicpct = (dageo->blksize * 37) / 100; @@ -143,15 +154,10 @@ xfs_da_mount( dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = 1 << dageo->blklog; dageo->fsbcount = 1; - dageo->node_ents = (dageo->blksize - nodehdr_size) / + dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; + dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); dageo->magicpct = (dageo->blksize * 37) / 100; - - if (xfs_sb_version_hasasciici(&mp->m_sb)) - mp->m_dirnameops = &xfs_ascii_ci_nameops; - else - mp->m_dirnameops = &xfs_default_nameops; - return 0; } @@ -191,10 +197,10 @@ xfs_dir_ino_validate( { bool ino_ok = xfs_verify_dir_ino(mp, ino); - if (unlikely(XFS_TEST_ERROR(!ino_ok, 
mp, XFS_ERRTAG_DIR_INO_VALIDATE))) { + if (XFS_IS_CORRUPT(mp, !ino_ok) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); - XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } return 0; @@ -217,7 +223,7 @@ xfs_dir_init( if (error) return error; - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -254,7 +260,7 @@ xfs_dir_createname( XFS_STATS_INC(dp->i_mount, xs_dir_create); } - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -262,7 +268,7 @@ xfs_dir_createname( args->name = name->name; args->namelen = name->len; args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; @@ -353,12 +359,12 @@ xfs_dir_lookup( * lockdep Doing this avoids having to add a bunch of lockdep class * annotations into the reclaim path for the ilock. */ - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->dp = dp; args->whichfork = XFS_DATA_FORK; args->trans = tp; @@ -422,7 +428,7 @@ xfs_dir_removename( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -430,7 +436,7 @@ xfs_dir_removename( args->name = name->name; args->namelen = name->len; args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = ino; args->dp = dp; args->total = total; @@ -483,7 +489,7 @@ xfs_dir_replace( if (rval) return rval; - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -491,7 +497,7 @@ xfs_dir_replace( args->name = name->name; args->namelen = name->len; args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; @@ -600,7 +606,9 @@ xfs_dir2_isblock( if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize) + if (XFS_IS_CORRUPT(args->dp->i_mount, + rval != 0 && + args->dp->i_d.di_size != args->geo->blksize)) return -EFSCORRUPTED; *vp = rval; return 0; @@ -716,3 +724,24 @@ xfs_dir2_namecheck( /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } + +xfs_dahash_t +xfs_dir2_hashname( + struct xfs_mount *mp, + struct xfs_name *name) +{ + if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + return xfs_ascii_ci_hashname(name); + return xfs_da_hashname(name->name, name->len); +} + +enum xfs_dacmp +xfs_dir2_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if 
(unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + return xfs_ascii_ci_compname(args, name, len); + return xfs_da_compname(args, name, len); +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index f54244779492..033777e282f2 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry; struct xfs_dir2_data_hdr; struct xfs_dir2_data_entry; struct xfs_dir2_data_unused; +struct xfs_dir3_icfree_hdr; +struct xfs_dir3_icleaf_hdr; extern struct xfs_name xfs_name_dotdot; @@ -27,85 +29,6 @@ extern struct xfs_name xfs_name_dotdot; extern unsigned char xfs_mode_to_ftype(int mode); /* - * directory operations vector for encode/decode routines - */ -struct xfs_dir_ops { - int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len); - struct xfs_dir2_sf_entry * - (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep); - uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); - void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep, - uint8_t ftype); - xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep); - void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino); - xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr); - void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr, - xfs_ino_t ino); - - int (*data_entsize)(int len); - uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); - void (*data_put_ftype)(struct xfs_dir2_data_entry *dep, - uint8_t ftype); - __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep); - struct xfs_dir2_data_free * - (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr); - - xfs_dir2_data_aoff_t data_dot_offset; - xfs_dir2_data_aoff_t data_dotdot_offset; - xfs_dir2_data_aoff_t data_first_offset; - size_t data_entry_offset; - - struct xfs_dir2_data_entry * - (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr); - struct xfs_dir2_data_entry * - (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr); - struct xfs_dir2_data_entry * - (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr); - struct xfs_dir2_data_entry * - (*data_entry_p)(struct xfs_dir2_data_hdr *hdr); - struct xfs_dir2_data_unused * - (*data_unused_p)(struct xfs_dir2_data_hdr *hdr); - - int leaf_hdr_size; - void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to, - struct xfs_dir3_icleaf_hdr *from); - void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from); - int (*leaf_max_ents)(struct xfs_da_geometry *geo); - struct xfs_dir2_leaf_entry * - (*leaf_ents_p)(struct xfs_dir2_leaf *lp); - - int node_hdr_size; - void (*node_hdr_to_disk)(struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from); - void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from); - struct xfs_da_node_entry * - (*node_tree_p)(struct xfs_da_intnode *dap); - - int free_hdr_size; - void (*free_hdr_to_disk)(struct xfs_dir2_free *to, - struct xfs_dir3_icfree_hdr *from); - void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to, - struct xfs_dir2_free *from); - int (*free_max_bests)(struct xfs_da_geometry *geo); - __be16 * (*free_bests_p)(struct xfs_dir2_free *free); - xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo, - xfs_dir2_db_t db); - int (*db_to_fdindex)(struct xfs_da_geometry *geo, - xfs_dir2_db_t db); -}; - -extern const struct xfs_dir_ops * - xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); -extern const struct xfs_dir_ops * - xfs_nondir_get_ops(struct xfs_mount *mp, struct 
xfs_inode *dp); - -/* * Generic directory interface routines */ extern void xfs_dir_startup(void); @@ -124,6 +47,8 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); +extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp, + xfs_ino_t inum); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); @@ -143,10 +68,7 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); -extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo, - const struct xfs_dir_ops *ops, - struct xfs_dir2_data_hdr *hdr, int *loghead); -extern void xfs_dir2_data_freescan(struct xfs_inode *dp, +extern void xfs_dir2_data_freescan(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); @@ -324,7 +246,7 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) #define XFS_READDIR_BUFSIZE (32768) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); -void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, +unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); bool xfs_dir2_namecheck(const void *name, size_t length); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index a6fb0cc2085e..d6ced59b9567 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -123,7 +123,7 @@ xfs_dir3_block_read( struct xfs_mount *mp = dp->i_mount; int err; - err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); @@ -172,7 +172,7 @@ xfs_dir2_block_need_space( struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); /* * If there are stale entries we'll use one for the leaf. @@ -311,7 +311,7 @@ xfs_dir2_block_compact( * This needs to happen before the next call to use_free. */ if (needscan) - xfs_dir2_data_freescan(args->dp, hdr, needlog); + xfs_dir2_data_freescan(args->dp->i_mount, hdr, needlog); } /* @@ -355,7 +355,7 @@ xfs_dir2_block_addname( if (error) return error; - len = dp->d_ops->data_entsize(args->namelen); + len = xfs_dir2_data_entsize(dp->i_mount, args->namelen); /* * Set up pointers to parts of the block. @@ -458,7 +458,7 @@ xfs_dir2_block_addname( * This needs to happen before the next call to use_free. */ if (needscan) { - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); needscan = 0; } /* @@ -541,14 +541,14 @@ xfs_dir2_block_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); - dp->d_ops->data_put_ftype(dep, args->filetype); - tagp = dp->d_ops->data_entry_tag_p(dep); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); + tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. 
*/ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, bp); xfs_dir2_block_log_tail(tp, bp); @@ -633,7 +633,7 @@ xfs_dir2_block_lookup( * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); - args->filetype = dp->d_ops->data_get_ftype(dep); + args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(args->trans, bp); return error; @@ -660,13 +660,11 @@ xfs_dir2_block_lookup_int( int high; /* binary search high index */ int low; /* binary search low index */ int mid; /* binary search current idx */ - xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ dp = args->dp; tp = args->trans; - mp = dp->i_mount; error = xfs_dir3_block_read(tp, dp, &bp); if (error) @@ -718,7 +716,7 @@ xfs_dir2_block_lookup_int( * and buffer. If it's the first case-insensitive match, store * the index and buffer and continue looking for an exact match. */ - cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + cmp = xfs_dir2_compname(args, dep->name, dep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; *bpp = bp; @@ -791,7 +789,8 @@ xfs_dir2_block_removename( needlog = needscan = 0; xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog, + &needscan); /* * Fix up the block tail. */ @@ -806,7 +805,7 @@ xfs_dir2_block_removename( * Fix up bestfree, log the header if necessary. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, bp); xfs_dir3_data_check(dp, bp); @@ -864,7 +863,7 @@ xfs_dir2_block_replace( * Change the inode number to the new value. */ dep->inumber = cpu_to_be64(args->inumber); - dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; @@ -914,7 +913,6 @@ xfs_dir2_leaf_to_block( __be16 *tagp; /* end of entry (tag) */ int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_to_block(args); @@ -923,8 +921,7 @@ xfs_dir2_leaf_to_block( tp = args->trans; mp = dp->i_mount; leaf = lbp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || @@ -938,7 +935,7 @@ xfs_dir2_leaf_to_block( while (dp->i_d.di_size > args->geo->blksize) { int hdrsz; - hdrsz = dp->d_ops->data_entry_offset; + hdrsz = args->geo->data_entry_offset; bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == args->geo->blksize - hdrsz) { @@ -953,7 +950,7 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. 
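* (geo->datablk is the directory's first data block; a block-format directory keeps all of its entries in that single block.)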
*/ if (!dbp) { - error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp); + error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp); if (error) return error; } @@ -1004,9 +1001,10 @@ xfs_dir2_leaf_to_block( */ lep = xfs_dir2_block_leaf_p(btp); for (from = to = 0; from < leafhdr.count; from++) { - if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (leafhdr.ents[from].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; - lep[to++] = ents[from]; + lep[to++] = leafhdr.ents[from]; } ASSERT(to == be32_to_cpu(btp->count)); xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); @@ -1014,7 +1012,7 @@ xfs_dir2_leaf_to_block( * Scan the bestfree if we need it and log the data block header. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); /* @@ -1039,47 +1037,38 @@ xfs_dir2_leaf_to_block( */ int /* error */ xfs_dir2_sf_to_block( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) { + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_da_geometry *geo = args->geo; xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ - xfs_inode_t *dp; /* incore directory inode */ int dummy; /* trash */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ int endoffset; /* end of data objects */ int error; /* error return value */ int i; /* index */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log block header */ int needscan; /* need to scan block freespc */ int newoffset; /* offset from current entry */ - int offset; /* target block offset */ + unsigned int offset = geo->data_entry_offset; xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ - xfs_trans_t *tp; /* transaction pointer */ struct xfs_name name; - struct xfs_ifork *ifp; trace_xfs_dir2_sf_to_block(args); - dp = args->dp; - tp = args->trans; - mp = dp->i_mount; - ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); ASSERT(ifp->if_flags & XFS_IFINLINE); - /* - * Bomb out if the shortform directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - return -EIO; - } + ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; @@ -1092,11 +1081,11 @@ xfs_dir2_sf_to_block( * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ - sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); + sfp = kmem_alloc(ifp->if_bytes, 0); memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); - xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK); + xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK); dp->i_d.di_size = 0; /* @@ -1123,7 +1112,7 @@ xfs_dir2_sf_to_block( * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. 
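* (xfs_dir2_data_use_free below carves the block tail and the leaf entry array out of the single unused entry that xfs_dir3_data_init created.)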
*/ - dup = dp->d_ops->data_unused_p(hdr); + dup = bp->b_addr + offset; needlog = needscan = 0; error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, i, &needlog, &needscan); @@ -1146,35 +1135,37 @@ xfs_dir2_sf_to_block( be16_to_cpu(dup->length), &needlog, &needscan); if (error) goto out_free; + /* * Create entry for . */ - dep = dp->d_ops->data_dot_entry_p(hdr); + dep = bp->b_addr + offset; dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; - dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); - tagp = dp->d_ops->data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); + tagp = xfs_dir2_data_entry_tag_p(mp, dep); + *tagp = cpu_to_be16(offset); xfs_dir2_data_log_entry(args, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); - blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( - (char *)dep - (char *)hdr)); + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset)); + offset += xfs_dir2_data_entsize(mp, dep->namelen); + /* * Create entry for .. */ - dep = dp->d_ops->data_dotdot_entry_p(hdr); - dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp)); + dep = bp->b_addr + offset; + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; - dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); - tagp = dp->d_ops->data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); + tagp = xfs_dir2_data_entry_tag_p(mp, dep); + *tagp = cpu_to_be16(offset); xfs_dir2_data_log_entry(args, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); - blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( - (char *)dep - (char *)hdr)); - offset = dp->d_ops->data_first_offset; + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset)); + offset += xfs_dir2_data_entsize(mp, dep->namelen); + /* * Loop over existing entries, stuff them in. */ @@ -1183,6 +1174,7 @@ xfs_dir2_sf_to_block( sfep = NULL; else sfep = xfs_dir2_sf_firstentry(sfp); + /* * Need to preserve the existing offset values in the sf directory. * Insert holes (unused entries) where necessary. @@ -1199,40 +1191,39 @@ xfs_dir2_sf_to_block( * There should be a hole here, make one. */ if (offset < newoffset) { - dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); + dup = bp->b_addr + offset; dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); dup->length = cpu_to_be16(newoffset - offset); - *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( - ((char *)dup - (char *)hdr)); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(offset); xfs_dir2_data_log_unused(args, bp, dup); xfs_dir2_data_freeinsert(hdr, - dp->d_ops->data_bestfree_p(hdr), - dup, &dummy); + xfs_dir2_data_bestfree_p(mp, hdr), + dup, &dummy); offset += be16_to_cpu(dup->length); continue; } /* * Copy a real entry. 
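* (Each shortform entry expands into a full data entry: inode number, name, the optional file type byte, and a tag recording the entry's offset within the block.)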
*/ - dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); - dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep)); + dep = bp->b_addr + newoffset; + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_ino(mp, sfp, sfep)); dep->namelen = sfep->namelen; - dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep)); + xfs_dir2_data_put_ftype(mp, dep, + xfs_dir2_sf_get_ftype(mp, sfep)); memcpy(dep->name, sfep->name, dep->namelen); - tagp = dp->d_ops->data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)hdr); + tagp = xfs_dir2_data_entry_tag_p(mp, dep); + *tagp = cpu_to_be16(newoffset); xfs_dir2_data_log_entry(args, bp, dep); name.name = sfep->name; name.len = sfep->namelen; - blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> - hashname(&name)); - blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( - (char *)dep - (char *)hdr)); + blp[2 + i].hashval = cpu_to_be32(xfs_dir2_hashname(mp, &name)); + blp[2 + i].address = + cpu_to_be32(xfs_dir2_byte_to_dataptr(newoffset)); offset = (int)((char *)(tagp + 1) - (char *)hdr); if (++i == sfp->count) sfep = NULL; else - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } /* Done with the temporary buffer */ kmem_free(sfp); diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 2c79be4c3153..b9eba8213180 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -13,6 +13,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_dir2.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" @@ -23,6 +24,71 @@ static xfs_failaddr_t xfs_dir2_data_freefind_verify( struct xfs_dir2_data_unused *dup, struct xfs_dir2_data_free **bf_ent); +struct xfs_dir2_data_free * +xfs_dir2_data_bestfree_p( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return ((struct xfs_dir3_data_hdr *)hdr)->best_free; + return hdr->bestfree; +} + +/* + * Pointer to an entry's tag word. + */ +__be16 * +xfs_dir2_data_entry_tag_p( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(mp, dep->namelen) - sizeof(__be16)); +} + +uint8_t +xfs_dir2_data_get_ftype( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep) +{ + if (xfs_sb_version_hasftype(&mp->m_sb)) { + uint8_t ftype = dep->name[dep->namelen]; + + if (likely(ftype < XFS_DIR3_FT_MAX)) + return ftype; + } + + return XFS_DIR3_FT_UNKNOWN; +} + +void +xfs_dir2_data_put_ftype( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep, + uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + ASSERT(dep->namelen != 0); + + if (xfs_sb_version_hasftype(&mp->m_sb)) + dep->name[dep->namelen] = ftype; +} + +/* + * The number of leaf entries is limited by the size of the block and the amount + * of space used by the data entries. We don't know how much space is used by + * the data entries yet, so just ensure that the count falls somewhere inside + * the block right now. + */ +static inline unsigned int +xfs_dir2_data_max_leaf_entries( + struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_block_tail) - + geo->data_entry_offset) / + sizeof(struct xfs_dir2_leaf_entry); +} + /* * Check the consistency of the data block. * The input can also be a block-format directory. 
@@ -38,40 +104,27 @@ __xfs_dir3_data_check( xfs_dir2_block_tail_t *btp=NULL; /* block tail */ int count; /* count of entries found */ xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_free_t *dfp; /* bestfree entry */ - xfs_dir2_data_unused_t *dup; /* unused entry */ - char *endp; /* end of useful data */ int freeseen; /* mask of bestfrees seen */ xfs_dahash_t hash; /* hash of current name */ int i; /* leaf index */ int lastfree; /* last entry was unused */ xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */ struct xfs_mount *mp = bp->b_mount; - char *p; /* current data position */ int stale; /* count of stale leaves */ struct xfs_name name; - const struct xfs_dir_ops *ops; - struct xfs_da_geometry *geo; - - geo = mp->m_dir_geo; + unsigned int offset; + unsigned int end; + struct xfs_da_geometry *geo = mp->m_dir_geo; /* - * We can be passed a null dp here from a verifier, so we need to go the - * hard way to get them. + * If this isn't a directory, something is seriously wrong. Bail out. */ - ops = xfs_dir_get_ops(mp, dp); - - /* - * If this isn't a directory, or we don't get handed the dir ops, - * something is seriously wrong. Bail out. - */ - if ((dp && !S_ISDIR(VFS_I(dp)->i_mode)) || - ops != xfs_dir_get_ops(mp, NULL)) + if (dp && !S_ISDIR(VFS_I(dp)->i_mode)) return __this_address; hdr = bp->b_addr; - p = (char *)ops->data_entry_p(hdr); + offset = geo->data_entry_offset; switch (hdr->magic) { case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): @@ -79,15 +132,8 @@ __xfs_dir3_data_check( btp = xfs_dir2_block_tail_p(geo, hdr); lep = xfs_dir2_block_leaf_p(btp); - /* - * The number of leaf entries is limited by the size of the - * block and the amount of space used by the data entries. - * We don't know how much space is used by the data entries yet, - * so just ensure that the count falls somewhere inside the - * block right now. - */ if (be32_to_cpu(btp->count) >= - ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)) + xfs_dir2_data_max_leaf_entries(geo)) return __this_address; break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): @@ -96,14 +142,14 @@ __xfs_dir3_data_check( default: return __this_address; } - endp = xfs_dir3_data_endp(geo, hdr); - if (!endp) + end = xfs_dir3_data_end_offset(geo, hdr); + if (!end) return __this_address; /* * Account for zero bestfree entries. */ - bf = ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(mp, hdr); count = lastfree = freeseen = 0; if (!bf[0].length) { if (bf[0].offset) @@ -128,8 +174,10 @@ __xfs_dir3_data_check( /* * Loop over the data/unused entries. */ - while (p < endp) { - dup = (xfs_dir2_data_unused_t *)p; + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + /* * If it's unused, look for the space in the bestfree table. 
* If we find it, account for that, else make sure it @@ -140,10 +188,10 @@ __xfs_dir3_data_check( if (lastfree != 0) return __this_address; - if (endp < p + be16_to_cpu(dup->length)) + if (offset + be16_to_cpu(dup->length) > end) return __this_address; if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) != - (char *)dup - (char *)hdr) + offset) return __this_address; fa = xfs_dir2_data_freefind_verify(hdr, bf, dup, &dfp); if (fa) @@ -158,7 +206,7 @@ __xfs_dir3_data_check( be16_to_cpu(bf[2].length)) return __this_address; } - p += be16_to_cpu(dup->length); + offset += be16_to_cpu(dup->length); lastfree = 1; continue; } @@ -168,17 +216,15 @@ __xfs_dir3_data_check( * in the leaf section of the block. * The linear search is crude but this is DEBUG code. */ - dep = (xfs_dir2_data_entry_t *)p; if (dep->namelen == 0) return __this_address; if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))) return __this_address; - if (endp < p + ops->data_entsize(dep->namelen)) + if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end) return __this_address; - if (be16_to_cpu(*ops->data_entry_tag_p(dep)) != - (char *)dep - (char *)hdr) + if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset) return __this_address; - if (ops->data_get_ftype(dep) >= XFS_DIR3_FT_MAX) + if (xfs_dir2_data_get_ftype(mp, dep) >= XFS_DIR3_FT_MAX) return __this_address; count++; lastfree = 0; @@ -189,7 +235,7 @@ __xfs_dir3_data_check( ((char *)dep - (char *)hdr)); name.name = dep->name; name.len = dep->namelen; - hash = mp->m_dirnameops->hashname(&name); + hash = xfs_dir2_hashname(mp, &name); for (i = 0; i < be32_to_cpu(btp->count); i++) { if (be32_to_cpu(lep[i].address) == addr && be32_to_cpu(lep[i].hashval) == hash) @@ -198,7 +244,7 @@ __xfs_dir3_data_check( if (i >= be32_to_cpu(btp->count)) return __this_address; } - p += ops->data_entsize(dep->namelen); + offset += xfs_dir2_data_entsize(mp, dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. @@ -354,13 +400,13 @@ xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mapped_bno, + unsigned int flags, struct xfs_buf **bpp) { int err; - err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, &xfs_dir3_data_buf_ops); + err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK, + &xfs_dir3_data_buf_ops); if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; @@ -370,10 +416,10 @@ int xfs_dir3_data_readahead( struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mapped_bno) + unsigned int flags) { - return xfs_da_reada_buf(dp, bno, mapped_bno, - XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); + return xfs_da_reada_buf(dp, bno, flags, XFS_DATA_FORK, + &xfs_dir3_data_reada_buf_ops); } /* @@ -561,17 +607,16 @@ xfs_dir2_data_freeremove( * Given a data block, reconstruct its bestfree map. 
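* (The table is cleared first, then every entry from the first data offset to the end of the block is walked and each unused region is inserted back into it.)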
*/ void -xfs_dir2_data_freescan_int( - struct xfs_da_geometry *geo, - const struct xfs_dir_ops *ops, - struct xfs_dir2_data_hdr *hdr, - int *loghead) +xfs_dir2_data_freescan( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) { - xfs_dir2_data_entry_t *dep; /* active data entry */ - xfs_dir2_data_unused_t *dup; /* unused data entry */ - struct xfs_dir2_data_free *bf; - char *endp; /* end of block's data */ - char *p; /* current entry pointer */ + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_dir2_data_free *bf = xfs_dir2_data_bestfree_p(mp, hdr); + void *addr = hdr; + unsigned int offset = geo->data_entry_offset; + unsigned int end; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -581,79 +626,60 @@ xfs_dir2_data_freescan_int( /* * Start by clearing the table. */ - bf = ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; - /* - * Set up pointers. - */ - p = (char *)ops->data_entry_p(hdr); - endp = xfs_dir3_data_endp(geo, hdr); - /* - * Loop over the block's entries. - */ - while (p < endp) { - dup = (xfs_dir2_data_unused_t *)p; + + end = xfs_dir3_data_end_offset(geo, addr); + while (offset < end) { + struct xfs_dir2_data_unused *dup = addr + offset; + struct xfs_dir2_data_entry *dep = addr + offset; + /* * If it's a free entry, insert it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT((char *)dup - (char *)hdr == + ASSERT(offset == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); xfs_dir2_data_freeinsert(hdr, bf, dup, loghead); - p += be16_to_cpu(dup->length); + offset += be16_to_cpu(dup->length); + continue; } + /* * For active entries, check their tags and skip them. */ - else { - dep = (xfs_dir2_data_entry_t *)p; - ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*ops->data_entry_tag_p(dep))); - p += ops->data_entsize(dep->namelen); - } + ASSERT(offset == + be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep))); + offset += xfs_dir2_data_entsize(mp, dep->namelen); } } -void -xfs_dir2_data_freescan( - struct xfs_inode *dp, - struct xfs_dir2_data_hdr *hdr, - int *loghead) -{ - return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops, - hdr, loghead); -} - /* * Initialize a data block at the given block number in the directory. * Give back the buffer for the created block. 
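* (A fresh block is just a header followed by one unused entry covering the remainder of the block; bestfree[0] records that single free region.)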
*/ int /* error */ xfs_dir3_data_init( - xfs_da_args_t *args, /* directory operation args */ - xfs_dir2_db_t blkno, /* logical dir block number */ - struct xfs_buf **bpp) /* output block buffer */ + struct xfs_da_args *args, /* directory operation args */ + xfs_dir2_db_t blkno, /* logical dir block number */ + struct xfs_buf **bpp) /* output block buffer */ { - struct xfs_buf *bp; /* block buffer */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* unused entry pointer */ - struct xfs_dir2_data_free *bf; - int error; /* error return value */ - int i; /* bestfree index */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ - int t; /* temp */ - - dp = args->dp; - mp = dp->i_mount; - tp = args->trans; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = args->geo; + struct xfs_buf *bp; + struct xfs_dir2_data_hdr *hdr; + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_free *bf; + int error; + int i; + /* * Get the buffer set up for the block. */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno), - -1, &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK); if (error) return error; bp->b_ops = &xfs_dir3_data_buf_ops; @@ -675,8 +701,9 @@ xfs_dir3_data_init( } else hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - bf = dp->d_ops->data_bestfree_p(hdr); - bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset); + bf = xfs_dir2_data_bestfree_p(mp, hdr); + bf[0].offset = cpu_to_be16(geo->data_entry_offset); + bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { bf[i].length = 0; bf[i].offset = 0; @@ -685,13 +712,11 @@ xfs_dir3_data_init( /* * Set up an unused entry for the block's body. */ - dup = dp->d_ops->data_unused_p(hdr); + dup = bp->b_addr + geo->data_entry_offset; dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - - t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset; - bf[0].length = cpu_to_be16(t); - dup->length = cpu_to_be16(t); + dup->length = bf[0].length; *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); + /* * Log it and return it. 
*/ @@ -710,6 +735,7 @@ xfs_dir2_data_log_entry( struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { + struct xfs_mount *mp = bp->b_mount; struct xfs_dir2_data_hdr *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || @@ -718,7 +744,7 @@ xfs_dir2_data_log_entry( hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr), - (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) - + (uint)((char *)(xfs_dir2_data_entry_tag_p(mp, dep) + 1) - (char *)hdr - 1)); } @@ -739,8 +765,7 @@ xfs_dir2_data_log_header( hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); #endif - xfs_trans_log_buf(args->trans, bp, 0, - args->dp->d_ops->data_entry_offset - 1); + xfs_trans_log_buf(args->trans, bp, 0, args->geo->data_entry_offset - 1); } /* @@ -789,11 +814,11 @@ xfs_dir2_data_make_free( { xfs_dir2_data_hdr_t *hdr; /* data block pointer */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ - char *endptr; /* end of data area */ int needscan; /* need to regen bestfree */ xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + unsigned int end; struct xfs_dir2_data_free *bf; hdr = bp->b_addr; @@ -801,14 +826,14 @@ xfs_dir2_data_make_free( /* * Figure out where the end of the data area is. */ - endptr = xfs_dir3_data_endp(args->geo, hdr); - ASSERT(endptr != NULL); + end = xfs_dir3_data_end_offset(args->geo, hdr); + ASSERT(end != 0); /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > args->dp->d_ops->data_entry_offset) { + if (offset > args->geo->data_entry_offset) { __be16 *tagp; /* tag just before us */ tagp = (__be16 *)((char *)hdr + offset) - 1; @@ -821,7 +846,7 @@ xfs_dir2_data_make_free( * If this isn't the end of the block, see if the entry after * us is free. */ - if ((char *)hdr + offset + len < endptr) { + if (offset + len < end) { postdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG) @@ -834,7 +859,7 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ - bf = args->dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ @@ -1025,7 +1050,7 @@ xfs_dir2_data_use_free( * Look up the entry in the bestfree table. */ oldlen = be16_to_cpu(dup->length); - bf = args->dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr); dfp = xfs_dir2_data_freefind(hdr, bf, dup); ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* @@ -1149,19 +1174,22 @@ corrupt: } /* Find the end of the entry data in a data/block format dir block. 
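* (Block format ends where the leaf table hanging off the block tail begins; data format runs to the end of the block, hence the magic-number switch below.)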
*/ -void * -xfs_dir3_data_endp( +unsigned int +xfs_dir3_data_end_offset( struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) { + void *p; + switch (hdr->magic) { case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - return xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr)); + p = xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr)); + return p - (void *)hdr; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - return (char *)hdr + geo->blksize; + return geo->blksize; default: - return NULL; + return 0; } } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a53e4585a2f3..a131b520aac7 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -24,12 +24,73 @@ * Local function declarations. */ static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, - int *indexp, struct xfs_buf **dbpp); + int *indexp, struct xfs_buf **dbpp, + struct xfs_dir3_icleaf_hdr *leafhdr); static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args, struct xfs_buf *bp, int first, int last); static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, struct xfs_buf *bp); +void +xfs_dir2_leaf_hdr_from_disk( + struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from; + + to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); + to->back = be32_to_cpu(from3->hdr.info.hdr.back); + to->magic = be16_to_cpu(from3->hdr.info.hdr.magic); + to->count = be16_to_cpu(from3->hdr.count); + to->stale = be16_to_cpu(from3->hdr.stale); + to->ents = from3->__ents; + + ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); + } else { + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + to->ents = from->__ents; + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC); + } +} + +void +xfs_dir2_leaf_hdr_to_disk( + struct xfs_mount *mp, + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to; + + ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + to3->hdr.info.hdr.forw = cpu_to_be32(from->forw); + to3->hdr.info.hdr.back = cpu_to_be32(from->back); + to3->hdr.info.hdr.magic = cpu_to_be16(from->magic); + to3->hdr.count = cpu_to_be16(from->count); + to3->hdr.stale = cpu_to_be16(from->stale); + } else { + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC); + + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); + } +} + /* * Check the internal consistency of a leaf1 block. * Pop an assert if something is wrong. 
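* (A leaf1 block is the single-leaf directory form: sorted hash entries plus the bests table kept in the block tail.)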
@@ -43,7 +104,7 @@ xfs_dir3_leaf1_check( struct xfs_dir2_leaf *leaf = bp->b_addr; struct xfs_dir3_icleaf_hdr leafhdr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; @@ -52,7 +113,7 @@ xfs_dir3_leaf1_check( } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) return __this_address; - return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf); } static inline void @@ -76,31 +137,15 @@ xfs_dir3_leaf_check( xfs_failaddr_t xfs_dir3_leaf_check_int( - struct xfs_mount *mp, - struct xfs_inode *dp, - struct xfs_dir3_icleaf_hdr *hdr, - struct xfs_dir2_leaf *leaf) + struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) { - struct xfs_dir2_leaf_entry *ents; - xfs_dir2_leaf_tail_t *ltp; - int stale; - int i; - const struct xfs_dir_ops *ops; - struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_da_geometry *geo = mp->m_dir_geo; - - /* - * we can be passed a null dp here from a verifier, so we need to go the - * hard way to get them. - */ - ops = xfs_dir_get_ops(mp, dp); + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; - if (!hdr) { - ops->leaf_hdr_from_disk(&leafhdr, leaf); - hdr = &leafhdr; - } - - ents = ops->leaf_ents_p(leaf); ltp = xfs_dir2_leaf_tail_p(geo, leaf); /* @@ -108,23 +153,23 @@ xfs_dir3_leaf_check_int( * Should factor in the size of the bests table as well. * We can deduce a value for that from di_size. */ - if (hdr->count > ops->leaf_max_ents(geo)) + if (hdr->count > geo->leaf_max_ents) return __this_address; /* Leaves and bests don't overlap in leaf format. */ if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || hdr->magic == XFS_DIR3_LEAF1_MAGIC) && - (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address; /* Check hash value order, count stale entries. 
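* (Entries must stay sorted by hashval so lookups can binary search; stale entries are tombstones whose address is XFS_DIR2_NULL_DATAPTR.)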
*/ for (i = stale = 0; i < hdr->count; i++) { if (i + 1 < hdr->count) { - if (be32_to_cpu(ents[i].hashval) > - be32_to_cpu(ents[i + 1].hashval)) + if (be32_to_cpu(hdr->ents[i].hashval) > + be32_to_cpu(hdr->ents[i + 1].hashval)) return __this_address; } - if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } if (hdr->stale != stale) @@ -139,17 +184,18 @@ xfs_dir3_leaf_check_int( */ static xfs_failaddr_t xfs_dir3_leaf_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_mount; - struct xfs_dir2_leaf *leaf = bp->b_addr; - xfs_failaddr_t fa; + struct xfs_mount *mp = bp->b_mount; + struct xfs_dir3_icleaf_hdr leafhdr; + xfs_failaddr_t fa; fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); if (fa) return fa; - return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr); + return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr); } static void @@ -216,13 +262,12 @@ xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, - xfs_daddr_t mappedbno, struct xfs_buf **bpp) { int err; - err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, + &xfs_dir3_leaf1_buf_ops); if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); return err; @@ -233,13 +278,12 @@ xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, - xfs_daddr_t mappedbno, struct xfs_buf **bpp) { int err; - err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, + &xfs_dir3_leafn_buf_ops); if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); return err; @@ -311,7 +355,7 @@ xfs_dir3_leaf_get_buf( bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno), - -1, &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK); if (error) return error; @@ -346,7 +390,6 @@ xfs_dir2_block_to_leaf( int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir2_data_free *bf; - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_block_to_leaf(args); @@ -375,24 +418,24 @@ xfs_dir2_block_to_leaf( xfs_dir3_data_check(dp, dbp); btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); - bf = dp->d_ops->data_bestfree_p(hdr); - ents = dp->d_ops->leaf_ents_p(leaf); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); /* * Set the counts in the leaf header. */ - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); leafhdr.count = be32_to_cpu(btp->count); leafhdr.stale = be32_to_cpu(btp->stale); - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, lbp); /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. 
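* (Stale entries are copied across as-is; the stale count written into the leaf header above keeps the block consistent either way.)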
*/ - memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1); + memcpy(leafhdr.ents, blp, + be32_to_cpu(btp->count) * sizeof(struct xfs_dir2_leaf_entry)); + xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* @@ -415,7 +458,7 @@ xfs_dir2_block_to_leaf( hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); /* * Set up leaf tail and bests table. */ @@ -594,7 +637,7 @@ xfs_dir2_leaf_addname( trace_xfs_dir2_leaf_addname(args); - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); if (error) return error; @@ -607,10 +650,10 @@ xfs_dir2_leaf_addname( index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); + ents = leafhdr.ents; bestsp = xfs_dir2_leaf_bests_p(ltp); - length = dp->d_ops->data_entsize(args->namelen); + length = xfs_dir2_data_entsize(dp->i_mount, args->namelen); /* * See if there are any entries with the same hash value @@ -773,7 +816,7 @@ xfs_dir2_leaf_addname( else xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); bestsp[use_block] = bf[0].length; grown = 1; } else { @@ -783,13 +826,13 @@ xfs_dir2_leaf_addname( */ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, use_block), - -1, &dbp); + 0, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; } hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); grown = 0; } /* @@ -815,14 +858,14 @@ xfs_dir2_leaf_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - dp->d_ops->data_put_ftype(dep, args->filetype); - tagp = dp->d_ops->data_entry_tag_p(dep); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); + tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Need to scan fix up the bestfree table. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); /* * Need to log the data block's header. */ @@ -852,9 +895,9 @@ xfs_dir2_leaf_addname( /* * Log the leaf fields and give up the buffers. */ - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, lbp); - xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, lfloglow, lfloghigh); xfs_dir3_leaf_check(dp, lbp); xfs_dir3_data_check(dp, dbp); return 0; @@ -874,7 +917,6 @@ xfs_dir3_leaf_compact( xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ - struct xfs_dir2_leaf_entry *ents; struct xfs_inode *dp = args->dp; leaf = bp->b_addr; @@ -884,9 +926,9 @@ xfs_dir3_leaf_compact( /* * Compress out the stale entries in place. 
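* (Two-index walk: 'from' visits every entry, 'to' tracks the next live slot, and entries are only copied once the two indices diverge.)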
*/ - ents = dp->d_ops->leaf_ents_p(leaf); for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { - if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (leafhdr->ents[from].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; /* * Only actually copy the entries that are different. @@ -894,7 +936,7 @@ xfs_dir3_leaf_compact( if (from > to) { if (loglow == -1) loglow = to; - ents[to] = ents[from]; + leafhdr->ents[to] = leafhdr->ents[from]; } to++; } @@ -905,10 +947,10 @@ xfs_dir3_leaf_compact( leafhdr->count -= leafhdr->stale; leafhdr->stale = 0; - dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, leafhdr); xfs_dir3_leaf_log_header(args, bp); if (loglow != -1) - xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args, leafhdr, bp, loglow, to - 1); } /* @@ -1037,6 +1079,7 @@ xfs_dir3_leaf_log_bests( void xfs_dir3_leaf_log_ents( struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_buf *bp, int first, int last) @@ -1044,16 +1087,14 @@ xfs_dir3_leaf_log_ents( xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ struct xfs_dir2_leaf *leaf = bp->b_addr; - struct xfs_dir2_leaf_entry *ents; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - ents = args->dp->d_ops->leaf_ents_p(leaf); - firstlep = &ents[first]; - lastlep = &ents[last]; + firstlep = &hdr->ents[first]; + lastlep = &hdr->ents[last]; xfs_trans_log_buf(args->trans, bp, (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); @@ -1076,7 +1117,7 @@ xfs_dir3_leaf_log_header( xfs_trans_log_buf(args->trans, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - args->dp->d_ops->leaf_hdr_size - 1); + args->geo->leaf_hdr_size - 1); } /* @@ -1115,28 +1156,27 @@ xfs_dir2_leaf_lookup( int error; /* error return code */ int index; /* found entry index */ struct xfs_buf *lbp; /* leaf buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ - struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_lookup(args); /* * Look up name in the leaf block, returning both buffers and index. */ - if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr); + if (error) return error; - } + tp = args->trans; dp = args->dp; xfs_dir3_leaf_check(dp, lbp); - leaf = lbp->b_addr; - ents = dp->d_ops->leaf_ents_p(leaf); + /* * Get to the leaf entry and contained data entry address. */ - lep = &ents[index]; + lep = &leafhdr.ents[index]; /* * Point to the data entry. 
@@ -1148,7 +1188,7 @@ xfs_dir2_leaf_lookup( * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); - args->filetype = dp->d_ops->data_get_ftype(dep); + args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(tp, dbp); xfs_trans_brelse(tp, lbp); @@ -1166,7 +1206,8 @@ xfs_dir2_leaf_lookup_int( xfs_da_args_t *args, /* operation arguments */ struct xfs_buf **lbpp, /* out: leaf buffer */ int *indexp, /* out: index in leaf block */ - struct xfs_buf **dbpp) /* out: data buffer */ + struct xfs_buf **dbpp, /* out: data buffer */ + struct xfs_dir3_icleaf_hdr *leafhdr) { xfs_dir2_db_t curdb = -1; /* current data block number */ struct xfs_buf *dbp = NULL; /* data buffer */ @@ -1182,22 +1223,19 @@ xfs_dir2_leaf_lookup_int( xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t cidb = -1; /* case match data block no. */ enum xfs_dacmp cmp; /* name compare result */ - struct xfs_dir2_leaf_entry *ents; - struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); if (error) return error; *lbpp = lbp; leaf = lbp->b_addr; xfs_dir3_leaf_check(dp, lbp); - ents = dp->d_ops->leaf_ents_p(leaf); - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(mp, leafhdr, leaf); /* * Look for the first leaf entry with our hash value. @@ -1207,8 +1245,9 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &ents[index]; - index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + for (lep = &leafhdr->ents[index]; + index < leafhdr->count && + be32_to_cpu(lep->hashval) == args->hashval; lep++, index++) { /* * Skip over stale leaf entries. @@ -1229,7 +1268,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, newdb), - -1, &dbp); + 0, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1247,7 +1286,7 @@ xfs_dir2_leaf_lookup_int( * and buffer. If it's the first case-insensitive match, store * the index and buffer and continue looking for an exact match. */ - cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + cmp = xfs_dir2_compname(args, dep->name, dep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; *indexp = index; @@ -1271,7 +1310,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, cidb), - -1, &dbp); + 0, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1297,6 +1336,7 @@ int /* error */ xfs_dir2_leaf_removename( xfs_da_args_t *args) /* operation arguments */ { + struct xfs_da_geometry *geo = args->geo; __be16 *bestsp; /* leaf block best freespace */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ @@ -1314,7 +1354,6 @@ xfs_dir2_leaf_removename( int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ struct xfs_dir2_data_free *bf; /* bestfree table */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_removename(args); @@ -1322,51 +1361,54 @@ xfs_dir2_leaf_removename( /* * Lookup the leaf entry, get the leaf and data blocks read in. 
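* (On success both buffers are held: lbp for the leaf block and dbp for the data block containing the matching entry.)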
*/ - if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr); + if (error) return error; - } + dp = args->dp; leaf = lbp->b_addr; hdr = dbp->b_addr; xfs_dir3_data_check(dp, dbp); - bf = dp->d_ops->data_bestfree_p(hdr); - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); + /* * Point to the leaf entry, use that to point to the data entry. */ - lep = &ents[index]; - db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + lep = &leafhdr.ents[index]; + db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address)); dep = (xfs_dir2_data_entry_t *)((char *)hdr + - xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address))); needscan = needlog = 0; oldbest = be16_to_cpu(bf[0].length); - ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); - if (be16_to_cpu(bestsp[db]) != oldbest) + if (be16_to_cpu(bestsp[db]) != oldbest) { + xfs_buf_corruption_error(lbp); return -EFSCORRUPTED; + } /* * Mark the former data entry unused. */ xfs_dir2_data_make_free(args, dbp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog, + &needscan); /* * We just mark the leaf entry stale by putting a null in it. */ leafhdr.stale++; - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, lbp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir3_leaf_log_ents(args, lbp, index, index); + xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, index, index); /* * Scan the freespace in the data block again if necessary, * log the data block header if necessary. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); /* @@ -1382,8 +1424,8 @@ xfs_dir2_leaf_removename( * If the data block is now empty then get rid of the data block. */ if (be16_to_cpu(bf[0].length) == - args->geo->blksize - dp->d_ops->data_entry_offset) { - ASSERT(db != args->geo->datablk); + geo->blksize - geo->data_entry_offset) { + ASSERT(db != geo->datablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* * Nope, can't get rid of it because it caused @@ -1425,7 +1467,7 @@ xfs_dir2_leaf_removename( /* * If the data block was not the first one, drop it. */ - else if (db != args->geo->datablk) + else if (db != geo->datablk) dbp = NULL; xfs_dir3_leaf_check(dp, lbp); @@ -1448,26 +1490,24 @@ xfs_dir2_leaf_replace( int error; /* error return code */ int index; /* index of leaf entry */ struct xfs_buf *lbp; /* leaf buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ - struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_replace(args); /* * Look up the entry. */ - if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr); + if (error) return error; - } + dp = args->dp; - leaf = lbp->b_addr; - ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the leaf entry, get data address from it. 
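* (The leaf entry's address field encodes the data block number and the byte offset of the name entry within that block.)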
*/ - lep = &ents[index]; + lep = &leafhdr.ents[index]; /* * Point to the data entry. */ @@ -1479,7 +1519,7 @@ xfs_dir2_leaf_replace( * Put the new inode number in, log it. */ dep->inumber = cpu_to_be64(args->inumber); - dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); tp = args->trans; xfs_dir2_data_log_entry(args, dbp, dep); xfs_dir3_leaf_check(dp, lbp); @@ -1501,21 +1541,17 @@ xfs_dir2_leaf_search_hash( xfs_dahash_t hashwant; /* hash value looking for */ int high; /* high leaf index */ int low; /* low leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; - leaf = lbp->b_addr; - ents = args->dp->d_ops->leaf_ents_p(leaf); - args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(args->dp->i_mount, &leafhdr, lbp->b_addr); /* * Note, the table cannot be empty, so we have to go through the loop. * Binary search the leaf entries looking for our hash value. */ - for (lep = ents, low = 0, high = leafhdr.count - 1, + for (lep = leafhdr.ents, low = 0, high = leafhdr.count - 1, hashwant = args->hashval; low <= high; ) { mid = (low + high) >> 1; @@ -1552,6 +1588,7 @@ xfs_dir2_leaf_trim_data( struct xfs_buf *lbp, /* leaf buffer */ xfs_dir2_db_t db) /* data block number */ { + struct xfs_da_geometry *geo = args->geo; __be16 *bestsp; /* leaf bests table */ struct xfs_buf *dbp; /* data block buffer */ xfs_inode_t *dp; /* incore directory inode */ @@ -1565,23 +1602,23 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db), - -1, &dbp); + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp); if (error) return error; leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); #ifdef DEBUG { struct xfs_dir2_data_hdr *hdr = dbp->b_addr; - struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr); + struct xfs_dir2_data_free *bf = + xfs_dir2_data_bestfree_p(dp->i_mount, hdr); ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); ASSERT(be16_to_cpu(bf[0].length) == - args->geo->blksize - dp->d_ops->data_entry_offset); + geo->blksize - geo->data_entry_offset); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); } #endif @@ -1639,7 +1676,6 @@ xfs_dir2_node_to_leaf( int error; /* error return code */ struct xfs_buf *fbp; /* buffer for freespace block */ xfs_fileoff_t fo; /* freespace file offset */ - xfs_dir2_free_t *free; /* freespace structure */ struct xfs_buf *lbp; /* buffer for leaf block */ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1697,7 +1733,7 @@ xfs_dir2_node_to_leaf( return 0; lbp = state->path.blk[0].bp; leaf = lbp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); @@ -1708,8 +1744,7 @@ xfs_dir2_node_to_leaf( error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); if (error) return error; - free = fbp->b_addr; - dp->d_ops->free_hdr_from_disk(&freehdr, free); + xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr); ASSERT(!freehdr.firstdb); @@ -1743,10 +1778,10 @@ xfs_dir2_node_to_leaf( /* * Set up the leaf bests table. 
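* (Each bests slot caches the longest free region in one data block, so writers can choose a block without reading it first.)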
*/ - memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free), + memcpy(xfs_dir2_leaf_bests_p(ltp), freehdr.bests, freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(mp, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, lbp); xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); xfs_dir3_leaf_log_tail(args, lbp); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 1fc44efc344d..a0cc5e240306 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -32,8 +32,25 @@ static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, int index, xfs_da_state_blk_t *dblk, int *rval); -static int xfs_dir2_node_addname_int(xfs_da_args_t *args, - xfs_da_state_blk_t *fblk); + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / geo->free_max_bests); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % geo->free_max_bests; +} /* * Check internal consistency of a leafn block. @@ -47,7 +64,7 @@ xfs_dir3_leafn_check( struct xfs_dir2_leaf *leaf = bp->b_addr; struct xfs_dir3_icleaf_hdr leafhdr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; @@ -56,7 +73,7 @@ xfs_dir3_leafn_check( } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) return __this_address; - return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf); } static inline void @@ -162,10 +179,9 @@ xfs_dir3_free_header_check( struct xfs_buf *bp) { struct xfs_mount *mp = dp->i_mount; + int maxbests = mp->m_dir_geo->free_max_bests; unsigned int firstdb; - int maxbests; - maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo); firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * maxbests; @@ -196,14 +212,14 @@ __xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, - xfs_daddr_t mappedbno, + unsigned int flags, struct xfs_buf **bpp) { xfs_failaddr_t fa; int err; - err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + err = xfs_da_read_buf(tp, dp, fbno, flags, bpp, XFS_DATA_FORK, + &xfs_dir3_free_buf_ops); if (err || !*bpp) return err; @@ -222,6 +238,58 @@ __xfs_dir3_free_read( return 0; } +void +xfs_dir2_free_hdr_from_disk( + struct xfs_mount *mp, + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from; + + to->magic = be32_to_cpu(from3->hdr.hdr.magic); + to->firstdb = be32_to_cpu(from3->hdr.firstdb); + to->nvalid = be32_to_cpu(from3->hdr.nvalid); + to->nused = be32_to_cpu(from3->hdr.nused); + to->bests = from3->bests; + + ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); + } else { + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + to->bests = from->bests; + + 
ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); + } +} + +static void +xfs_dir2_free_hdr_to_disk( + struct xfs_mount *mp, + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to; + + ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); + + to3->hdr.hdr.magic = cpu_to_be32(from->magic); + to3->hdr.firstdb = cpu_to_be32(from->firstdb); + to3->hdr.nvalid = cpu_to_be32(from->nvalid); + to3->hdr.nused = cpu_to_be32(from->nused); + } else { + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); + + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); + } +} + int xfs_dir2_free_read( struct xfs_trans *tp, @@ -229,7 +297,7 @@ xfs_dir2_free_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp); } static int @@ -239,7 +307,7 @@ xfs_dir2_free_try_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp); } static int @@ -256,7 +324,7 @@ xfs_dir3_free_get_buf( struct xfs_dir3_icfree_hdr hdr; error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), - -1, &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK); if (error) return error; @@ -280,7 +348,7 @@ xfs_dir3_free_get_buf( uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; - dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); + xfs_dir2_free_hdr_to_disk(mp, bp->b_addr, &hdr); *bpp = bp; return 0; } @@ -291,21 +359,19 @@ xfs_dir3_free_get_buf( STATIC void xfs_dir2_free_log_bests( struct xfs_da_args *args, + struct xfs_dir3_icfree_hdr *hdr, struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ { - xfs_dir2_free_t *free; /* freespace structure */ - __be16 *bests; + struct xfs_dir2_free *free = bp->b_addr; - free = bp->b_addr; - bests = args->dp->d_ops->free_bests_p(free); ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); xfs_trans_log_buf(args->trans, bp, - (uint)((char *)&bests[first] - (char *)free), - (uint)((char *)&bests[last] - (char *)free + - sizeof(bests[0]) - 1)); + (char *)&hdr->bests[first] - (char *)free, + (char *)&hdr->bests[last] - (char *)free + + sizeof(hdr->bests[0]) - 1); } /* @@ -324,7 +390,7 @@ xfs_dir2_free_log_header( free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); #endif xfs_trans_log_buf(args->trans, bp, 0, - args->dp->d_ops->free_hdr_size - 1); + args->geo->free_hdr_size - 1); } /* @@ -341,14 +407,12 @@ xfs_dir2_leaf_to_node( int error; /* error return value */ struct xfs_buf *fbp; /* freespace buffer */ xfs_dir2_db_t fdb; /* freespace block number */ - xfs_dir2_free_t *free; /* freespace structure */ __be16 *from; /* pointer to freespace entry */ int i; /* leaf freespace index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ int n; /* count of live freespc ents */ xfs_dir2_data_off_t off; /* freespace entry value */ - __be16 *to; /* pointer to freespace entry */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir3_icfree_hdr freehdr; @@ -370,24 +434,25 @@ xfs_dir2_leaf_to_node( if (error) return error; - free = fbp->b_addr; - dp->d_ops->free_hdr_from_disk(&freehdr, free); + 
xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, fbp->b_addr); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); if (be32_to_cpu(ltp->bestcount) > - (uint)dp->i_d.di_size / args->geo->blksize) + (uint)dp->i_d.di_size / args->geo->blksize) { + xfs_buf_corruption_error(lbp); return -EFSCORRUPTED; + } /* * Copy freespace entries from the leaf block to the new block. * Count active entries. */ from = xfs_dir2_leaf_bests_p(ltp); - to = dp->d_ops->free_bests_p(free); - for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { - if ((off = be16_to_cpu(*from)) != NULLDATAOFF) + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++) { + off = be16_to_cpu(*from); + if (off != NULLDATAOFF) n++; - *to = cpu_to_be16(off); + freehdr.bests[i] = cpu_to_be16(off); } /* @@ -396,8 +461,8 @@ xfs_dir2_leaf_to_node( freehdr.nused = n; freehdr.nvalid = be32_to_cpu(ltp->bestcount); - dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1); + xfs_dir2_free_hdr_to_disk(dp->i_mount, fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(args, &freehdr, fbp, 0, freehdr.nvalid - 1); xfs_dir2_free_log_header(args, fbp); /* @@ -440,15 +505,17 @@ xfs_dir2_leafn_add( trace_xfs_dir2_leafn_add(args, index); - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); + ents = leafhdr.ents; /* * Quick check just to make sure we are not going to index * into other peoples memory */ - if (index < 0) + if (index < 0) { + xfs_buf_corruption_error(bp); return -EFSCORRUPTED; + } /* * If there are already the maximum number of leaf entries in @@ -457,7 +524,7 @@ xfs_dir2_leafn_add( * a compact. */ - if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { + if (leafhdr.count == args->geo->leaf_max_ents) { if (!leafhdr.stale) return -ENOSPC; compact = leafhdr.stale > 1; @@ -495,9 +562,9 @@ xfs_dir2_leafn_add( lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo, args->blkno, args->index)); - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, bp); - xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_log_ents(args, &leafhdr, bp, lfloglow, lfloghigh); xfs_dir3_leaf_check(dp, bp); return 0; } @@ -511,10 +578,9 @@ xfs_dir2_free_hdr_check( { struct xfs_dir3_icfree_hdr hdr; - dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr); + xfs_dir2_free_hdr_from_disk(dp->i_mount, &hdr, bp->b_addr); - ASSERT((hdr.firstdb % - dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0); + ASSERT((hdr.firstdb % dp->i_mount->m_dir_geo->free_max_bests) == 0); ASSERT(hdr.firstdb <= db); ASSERT(db < hdr.firstdb + hdr.nvalid); } @@ -532,11 +598,9 @@ xfs_dir2_leaf_lasthash( struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { - struct xfs_dir2_leaf *leaf = bp->b_addr; - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, bp->b_addr); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC || @@ -547,9 +611,7 @@ xfs_dir2_leaf_lasthash( *count = leafhdr.count; if (!leafhdr.count) return 0; - - ents = dp->d_ops->leaf_ents_p(leaf); - return be32_to_cpu(ents[leafhdr.count - 1].hashval); + return be32_to_cpu(leafhdr.ents[leafhdr.count - 1].hashval); } /* @@ -578,15 +640,13 @@ xfs_dir2_leafn_lookup_for_addname( 
xfs_dir2_db_t newdb; /* new data block number */ xfs_dir2_db_t newfdb; /* new free block number */ xfs_trans_t *tp; /* transaction pointer */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); xfs_dir3_leaf_check(dp, bp); ASSERT(leafhdr.count > 0); @@ -606,11 +666,11 @@ xfs_dir2_leafn_lookup_for_addname( ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } - length = dp->d_ops->data_entsize(args->namelen); + length = xfs_dir2_data_entsize(mp, args->namelen); /* * Loop over leaf entries with the right hash value. */ - for (lep = &ents[index]; + for (lep = &leafhdr.ents[index]; index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; lep++, index++) { /* @@ -632,14 +692,14 @@ xfs_dir2_leafn_lookup_for_addname( * in hand, take a look at it. */ if (newdb != curdb) { - __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; curdb = newdb; /* * Convert the data block to the free block * holding its freespace information. */ - newfdb = dp->d_ops->db_to_fdb(args->geo, newdb); + newfdb = xfs_dir2_db_to_fdb(args->geo, newdb); /* * If it's not the one we have in hand, read it in. */ @@ -663,20 +723,20 @@ xfs_dir2_leafn_lookup_for_addname( /* * Get the index for our entry. */ - fi = dp->d_ops->db_to_fdindex(args->geo, curdb); + fi = xfs_dir2_db_to_fdindex(args->geo, curdb); /* * If it has room, return it. */ - bests = dp->d_ops->free_bests_p(free); - if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { - XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", - XFS_ERRLEVEL_LOW, mp); + xfs_dir2_free_hdr_from_disk(mp, &freehdr, free); + if (XFS_IS_CORRUPT(mp, + freehdr.bests[fi] == + cpu_to_be16(NULLDATAOFF))) { if (curfdb != newfdb) xfs_trans_brelse(tp, curbp); return -EFSCORRUPTED; } curfdb = newfdb; - if (be16_to_cpu(bests[fi]) >= length) + if (be16_to_cpu(freehdr.bests[fi]) >= length) goto out; } } @@ -730,19 +790,19 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); xfs_dir3_leaf_check(dp, bp); - if (leafhdr.count <= 0) + if (leafhdr.count <= 0) { + xfs_buf_corruption_error(bp); return -EFSCORRUPTED; + } /* * Look up the hash value in the leaf entries. @@ -758,7 +818,7 @@ xfs_dir2_leafn_lookup_for_entry( /* * Loop over leaf entries with the right hash value. */ - for (lep = &ents[index]; + for (lep = &leafhdr.ents[index]; index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; lep++, index++) { /* @@ -797,7 +857,7 @@ xfs_dir2_leafn_lookup_for_entry( error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, newdb), - -1, &curbp); + 0, &curbp); if (error) return error; } @@ -815,7 +875,7 @@ xfs_dir2_leafn_lookup_for_entry( * EEXIST immediately. If it's the first case-insensitive * match, store the block & inode number and continue looking. 
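	 * (An exact match returns -EEXIST on the spot; the remembered CI
	 * candidate is only used if no exact match turns up later in the
	 * scan.)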
*/ - cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + cmp = xfs_dir2_compname(args, dep->name, dep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { /* If there is a CI match block, drop it */ if (args->cmpresult != XFS_CMP_DIFFERENT && @@ -823,7 +883,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_trans_brelse(tp, state->extrablk.bp); args->cmpresult = cmp; args->inumber = be64_to_cpu(dep->inumber); - args->filetype = dp->d_ops->data_get_ftype(dep); + args->filetype = xfs_dir2_data_get_ftype(mp, dep); *indexp = index; state->extravalid = 1; state->extrablk.bp = curbp; @@ -913,7 +973,7 @@ xfs_dir3_leafn_moveents( if (start_d < dhdr->count) { memmove(&dents[start_d + count], &dents[start_d], (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(args, bp_d, start_d + count, + xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d + count, count + dhdr->count - 1); } /* @@ -935,7 +995,7 @@ xfs_dir3_leafn_moveents( */ memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d, start_d + count - 1); /* * If there are source entries after the ones we copied, @@ -944,7 +1004,8 @@ xfs_dir3_leafn_moveents( if (start_s + count < shdr->count) { memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(args, shdr, bp_s, start_s, + start_s + count - 1); } /* @@ -973,10 +1034,10 @@ xfs_dir2_leafn_order( struct xfs_dir3_icleaf_hdr hdr1; struct xfs_dir3_icleaf_hdr hdr2; - dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); - dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); - ents1 = dp->d_ops->leaf_ents_p(leaf1); - ents2 = dp->d_ops->leaf_ents_p(leaf2); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2); + ents1 = hdr1.ents; + ents2 = hdr2.ents; if (hdr1.count > 0 && hdr2.count > 0 && (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || @@ -1026,10 +1087,10 @@ xfs_dir2_leafn_rebalance( leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); - dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); - ents1 = dp->d_ops->leaf_ents_p(leaf1); - ents2 = dp->d_ops->leaf_ents_p(leaf2); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2); + ents1 = hdr1.ents; + ents2 = hdr2.ents; oldsum = hdr1.count + hdr2.count; #if defined(DEBUG) || defined(XFS_WARN) @@ -1075,8 +1136,8 @@ xfs_dir2_leafn_rebalance( ASSERT(hdr1.stale + hdr2.stale == oldstale); /* log the changes made when moving the entries */ - dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1); - dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf1, &hdr1); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf2, &hdr2); xfs_dir3_leaf_log_header(args, blk1->bp); xfs_dir3_leaf_log_header(args, blk2->bp); @@ -1122,19 +1183,17 @@ xfs_dir3_data_block_free( int longest) { int logfree = 0; - __be16 *bests; struct xfs_dir3_icfree_hdr freehdr; struct xfs_inode *dp = args->dp; - dp->d_ops->free_hdr_from_disk(&freehdr, free); - bests = dp->d_ops->free_bests_p(free); + xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free); if (hdr) { /* * Data block is not empty, just set the free entry to the new * value. 
*/ - bests[findex] = cpu_to_be16(longest); - xfs_dir2_free_log_bests(args, fbp, findex, findex); + freehdr.bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex); return 0; } @@ -1150,18 +1209,18 @@ xfs_dir3_data_block_free( int i; /* free entry index */ for (i = findex - 1; i >= 0; i--) { - if (bests[i] != cpu_to_be16(NULLDATAOFF)) + if (freehdr.bests[i] != cpu_to_be16(NULLDATAOFF)) break; } freehdr.nvalid = i + 1; logfree = 0; } else { /* Not the last entry, just punch it out. */ - bests[findex] = cpu_to_be16(NULLDATAOFF); + freehdr.bests[findex] = cpu_to_be16(NULLDATAOFF); logfree = 1; } - dp->d_ops->free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_hdr_to_disk(dp->i_mount, free, &freehdr); xfs_dir2_free_log_header(args, fbp); /* @@ -1186,7 +1245,7 @@ xfs_dir3_data_block_free( /* Log the free entry that changed, unless we got rid of it. */ if (logfree) - xfs_dir2_free_log_bests(args, fbp, findex, findex); + xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex); return 0; } @@ -1203,6 +1262,7 @@ xfs_dir2_leafn_remove( xfs_da_state_blk_t *dblk, /* data block */ int *rval) /* resulting block needs join */ { + struct xfs_da_geometry *geo = args->geo; xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ struct xfs_buf *dbp; /* data block buffer */ @@ -1217,27 +1277,25 @@ xfs_dir2_leafn_remove( xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir2_data_free *bf; /* bestfree table */ struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_remove(args, index); dp = args->dp; tp = args->trans; leaf = bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); /* * Point to the entry we're removing. */ - lep = &ents[index]; + lep = &leafhdr.ents[index]; /* * Extract the data block and offset from the entry. */ - db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address)); ASSERT(dblk->blkno == db); - off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)); + off = xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); /* @@ -1245,11 +1303,11 @@ xfs_dir2_leafn_remove( * Log the leaf block changes. */ leafhdr.stale++; - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, bp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir3_leaf_log_ents(args, bp, index, index); + xfs_dir3_leaf_log_ents(args, &leafhdr, bp, index, index); /* * Make the data entry free. Keep track of the longest freespace @@ -1258,17 +1316,18 @@ xfs_dir2_leafn_remove( dbp = dblk->bp; hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(args, dbp, off, - dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog, + &needscan); /* * Rescan the data block freespaces for bestfree. * Log the data block header if needed. 
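	 * (The bestfree table caches only the three largest free regions in
	 * the block, so freeing space may force a full rescan to rebuild it.)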
*/ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); xfs_dir3_data_check(dp, dbp); @@ -1287,9 +1346,8 @@ xfs_dir2_leafn_remove( * Convert the data block number to a free block, * read in the free block. */ - fdb = dp->d_ops->db_to_fdb(args->geo, db); - error = xfs_dir2_free_read(tp, dp, - xfs_dir2_db_to_da(args->geo, fdb), + fdb = xfs_dir2_db_to_fdb(geo, db); + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb), &fbp); if (error) return error; @@ -1297,23 +1355,22 @@ xfs_dir2_leafn_remove( #ifdef DEBUG { struct xfs_dir3_icfree_hdr freehdr; - dp->d_ops->free_hdr_from_disk(&freehdr, free); - ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) * - (fdb - xfs_dir2_byte_to_db(args->geo, - XFS_DIR2_FREE_OFFSET))); + + xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free); + ASSERT(freehdr.firstdb == geo->free_max_bests * + (fdb - xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET))); } #endif /* * Calculate which entry we need to fix. */ - findex = dp->d_ops->db_to_fdindex(args->geo, db); + findex = xfs_dir2_db_to_fdindex(geo, db); longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == args->geo->blksize - - dp->d_ops->data_entry_offset) { + if (longest == geo->blksize - geo->data_entry_offset) { /* * Try to punch out the data block. */ @@ -1345,9 +1402,9 @@ xfs_dir2_leafn_remove( * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. */ - *rval = (dp->d_ops->leaf_hdr_size + - (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < - args->geo->magicpct; + *rval = (geo->leaf_hdr_size + + (uint)sizeof(leafhdr.ents) * (leafhdr.count - leafhdr.stale)) < + geo->magicpct; return 0; } @@ -1446,12 +1503,12 @@ xfs_dir2_leafn_toosmall( */ blk = &state->path.blk[state->path.active - 1]; leaf = blk->bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); + ents = leafhdr.ents; xfs_dir3_leaf_check(dp, blk->bp); count = leafhdr.count - leafhdr.stale; - bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]); + bytes = state->args->geo->leaf_hdr_size + count * sizeof(ents[0]); if (bytes > (state->args->geo->blksize >> 1)) { /* * Blk over 50%, don't try to join. @@ -1496,8 +1553,7 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. 
*/ - error = xfs_dir3_leafn_read(state->args->trans, dp, - blkno, -1, &bp); + error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp); if (error) return error; @@ -1509,8 +1565,8 @@ xfs_dir2_leafn_toosmall( (state->args->geo->blksize >> 2); leaf = bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf); + ents = hdr2.ents; count += hdr2.count - hdr2.stale; bytes -= count * sizeof(ents[0]); @@ -1572,10 +1628,10 @@ xfs_dir2_leafn_unbalance( drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf); - dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf); - sents = dp->d_ops->leaf_ents_p(save_leaf); - dents = dp->d_ops->leaf_ents_p(drop_leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &savehdr, save_leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &drophdr, drop_leaf); + sents = savehdr.ents; + dents = drophdr.ents; /* * If there are any stale leaf entries, take this opportunity @@ -1601,8 +1657,8 @@ xfs_dir2_leafn_unbalance( save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); /* log the changes made when moving the entries */ - dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr); - dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, save_leaf, &savehdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, drop_leaf, &drophdr); xfs_dir3_leaf_log_header(args, save_blk->bp); xfs_dir3_leaf_log_header(args, drop_blk->bp); @@ -1611,113 +1667,143 @@ xfs_dir2_leafn_unbalance( } /* - * Top-level node form directory addname routine. + * Add a new data block to the directory at the free space index that the caller + * has specified. */ -int /* error */ -xfs_dir2_node_addname( - xfs_da_args_t *args) /* operation arguments */ +static int +xfs_dir2_node_add_datablk( + struct xfs_da_args *args, + struct xfs_da_state_blk *fblk, + xfs_dir2_db_t *dbno, + struct xfs_buf **dbpp, + struct xfs_buf **fbpp, + struct xfs_dir3_icfree_hdr *hdr, + int *findex) { - xfs_da_state_blk_t *blk; /* leaf block for insert */ - int error; /* error return value */ - int rval; /* sub-return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_data_free *bf; + xfs_dir2_db_t fbno; + struct xfs_buf *fbp; + struct xfs_buf *dbp; + int error; - trace_xfs_dir2_node_addname(args); + /* Not allowed to allocate, return failure. */ + if (args->total == 0) + return -ENOSPC; + + /* Allocate and initialize the new data block. */ + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno); + if (error) + return error; + error = xfs_dir3_data_init(args, *dbno, &dbp); + if (error) + return error; /* - * Allocate and initialize the state (btree cursor). - */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; - /* - * Look up the name. We're not supposed to find it, but - * this gives us the insertion point. + * Get the freespace block corresponding to the data block + * that was just allocated. */ - error = xfs_da3_node_lookup_int(state, &rval); + fbno = xfs_dir2_db_to_fdb(args->geo, *dbno); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) - rval = error; - if (rval != -ENOENT) { - goto done; - } + return error; + /* - * Add the data entry to a data block. - * Extravalid is set to a freeblock found by lookup. 
+ * If there wasn't a freespace block, the read will + * return a NULL fbp. Allocate and initialize a new one. */ - rval = xfs_dir2_node_addname_int(args, - state->extravalid ? &state->extrablk : NULL); - if (rval) { - goto done; + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno); + if (error) + return error; + + if (XFS_IS_CORRUPT(mp, + xfs_dir2_db_to_fdb(args->geo, *dbno) != + fbno)) { + xfs_alert(mp, +"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld", + __func__, (unsigned long long)dp->i_ino, + (long long)xfs_dir2_db_to_fdb(args->geo, *dbno), + (long long)*dbno, (long long)fbno); + if (fblk) { + xfs_alert(mp, + " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", + fblk, (unsigned long long)fblk->blkno, + fblk->index, fblk->magic); + } else { + xfs_alert(mp, " ... fblk is NULL"); + } + return -EFSCORRUPTED; + } + + /* Get a buffer for the new block. */ + error = xfs_dir3_free_get_buf(args, fbno, &fbp); + if (error) + return error; + xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr); + + /* Remember the first slot as our empty slot. */ + hdr->firstdb = (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + args->geo->free_max_bests; + } else { + xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr); } - blk = &state->path.blk[state->path.active - 1]; - ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + + /* Set the freespace block index from the data block number. */ + *findex = xfs_dir2_db_to_fdindex(args->geo, *dbno); + + /* Extend the freespace table if the new data block is off the end. */ + if (*findex >= hdr->nvalid) { + ASSERT(*findex < args->geo->free_max_bests); + hdr->nvalid = *findex + 1; + hdr->bests[*findex] = cpu_to_be16(NULLDATAOFF); + } + /* - * Add the new leaf entry. + * If this entry was for an empty data block (this should always be + * true) then update the header. */ - rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); - if (rval == 0) { - /* - * It worked, fix the hash values up the btree. - */ - if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) - xfs_da3_fixhashpath(state, &state->path); - } else { - /* - * It didn't work, we need to split the leaf block. - */ - if (args->total == 0) { - ASSERT(rval == -ENOSPC); - goto done; - } - /* - * Split the leaf block and insert the new entry. - */ - rval = xfs_da3_split(state); + if (hdr->bests[*findex] == cpu_to_be16(NULLDATAOFF)) { + hdr->nused++; + xfs_dir2_free_hdr_to_disk(mp, fbp->b_addr, hdr); + xfs_dir2_free_log_header(args, fbp); } -done: - xfs_da_state_free(state); - return rval; + + /* Update the freespace value for the new block in the table. */ + bf = xfs_dir2_data_bestfree_p(mp, dbp->b_addr); + hdr->bests[*findex] = bf[0].length; + + *dbpp = dbp; + *fbpp = fbp; + return 0; } -/* - * Add the data entry for a node-format directory name addition. - * The leaf entry is added in xfs_dir2_leafn_add. - * We may enter with a freespace block that the lookup found. 
- */ -static int /* error */ -xfs_dir2_node_addname_int( - xfs_da_args_t *args, /* operation arguments */ - xfs_da_state_blk_t *fblk) /* optional freespace block */ +static int +xfs_dir2_node_find_freeblk( + struct xfs_da_args *args, + struct xfs_da_state_blk *fblk, + xfs_dir2_db_t *dbnop, + struct xfs_buf **fbpp, + struct xfs_dir3_icfree_hdr *hdr, + int *findexp, + int length) { - xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dir2_db_t dbno; /* data block number */ - struct xfs_buf *dbp; /* data block buffer */ - xfs_dir2_data_entry_t *dep; /* data entry pointer */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ - int error; /* error return value */ - xfs_dir2_db_t fbno; /* freespace block number */ - struct xfs_buf *fbp; /* freespace buffer */ - int findex; /* freespace entry index */ - xfs_dir2_free_t *free=NULL; /* freespace block structure */ - xfs_dir2_db_t ifbno; /* initial freespace block no */ - xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ - int length; /* length of the new entry */ - int logfree; /* need to log free entry */ - xfs_mount_t *mp; /* filesystem mount point */ - int needlog; /* need to log data header */ - int needscan; /* need to rescan data frees */ - __be16 *tagp; /* data entry tag pointer */ - xfs_trans_t *tp; /* transaction pointer */ - __be16 *bests; - struct xfs_dir3_icfree_hdr freehdr; - struct xfs_dir2_data_free *bf; - xfs_dir2_data_aoff_t aoff; + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_buf *fbp = NULL; + xfs_dir2_db_t firstfbno; + xfs_dir2_db_t lastfbno; + xfs_dir2_db_t ifbno = -1; + xfs_dir2_db_t dbno = -1; + xfs_dir2_db_t fbno; + xfs_fileoff_t fo; + int findex = 0; + int error; - dp = args->dp; - mp = dp->i_mount; - tp = args->trans; - length = dp->d_ops->data_entsize(args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. This is the freespace @@ -1725,288 +1811,150 @@ xfs_dir2_node_addname_int( */ if (fblk) { fbp = fblk->bp; - /* - * Remember initial freespace block number. - */ - ifbno = fblk->blkno; - free = fbp->b_addr; findex = fblk->index; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - - /* - * This means the free entry showed that the data block had - * space for our entry, so we remembered it. - * Use that data block. - */ + xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr); if (findex >= 0) { - ASSERT(findex < freehdr.nvalid); - ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); - ASSERT(be16_to_cpu(bests[findex]) >= length); - dbno = freehdr.firstdb + findex; - } else { - /* - * The data block looked at didn't have enough room. - * We'll start at the beginning of the freespace entries. - */ - dbno = -1; - findex = 0; + /* caller already found the freespace for us. */ + ASSERT(findex < hdr->nvalid); + ASSERT(be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(hdr->bests[findex]) >= length); + dbno = hdr->firstdb + findex; + goto found_block; } - } else { + /* - * Didn't come in with a freespace block, so no data block. + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. */ - ifbno = dbno = -1; + ifbno = fblk->blkno; + xfs_trans_brelse(tp, fbp); fbp = NULL; - findex = 0; + fblk->bp = NULL; } /* - * If we don't have a data block yet, we're going to scan the - * freespace blocks looking for one. 
Figure out what the - * highest freespace block number is. - */ - if (dbno == -1) { - xfs_fileoff_t fo; /* freespace block number */ - - if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) - return error; - lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); - fbno = ifbno; - } - /* - * While we haven't identified a data block, search the freeblock - * data for a good data block. If we find a null freeblock entry, - * indicating a hole in the data blocks, remember that. - */ - while (dbno == -1) { - /* - * If we don't have a freeblock in hand, get the next one. - */ - if (fbp == NULL) { - /* - * Happens the first time through unless lookup gave - * us a freespace block to start with. - */ - if (++fbno == 0) - fbno = xfs_dir2_byte_to_db(args->geo, - XFS_DIR2_FREE_OFFSET); - /* - * If it's ifbno we already looked at it. - */ - if (fbno == ifbno) - fbno++; - /* - * If it's off the end we're done. - */ - if (fbno >= lastfbno) - break; - /* - * Read the block. There can be holes in the - * freespace blocks, so this might not succeed. - * This should be really rare, so there's no reason - * to avoid it. - */ - error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(args->geo, fbno), - &fbp); - if (error) - return error; - if (!fbp) - continue; - free = fbp->b_addr; - findex = 0; - } - /* - * Look at the current free entry. Is it good enough? - * - * The bests initialisation should be where the bufer is read in - * the above branch. But gcc is too stupid to realise that bests - * and the freehdr are actually initialised if they are placed - * there, so we have to do it here to avoid warnings. Blech. - */ - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - if (be16_to_cpu(bests[findex]) != NULLDATAOFF && - be16_to_cpu(bests[findex]) >= length) - dbno = freehdr.firstdb + findex; - else { - /* - * Are we done with the freeblock? - */ - if (++findex == freehdr.nvalid) { - /* - * Drop the block. - */ - xfs_trans_brelse(tp, fbp); - fbp = NULL; - if (fblk && fblk->bp) - fblk->bp = NULL; - } - } - } - /* - * If we don't have a data block, we need to allocate one and make - * the freespace entries refer to it. + * If we don't have a data block yet, we're going to scan the freespace + * data for a data block with enough free space in it. */ - if (unlikely(dbno == -1)) { - /* - * Not allowed to allocate, return failure. - */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) - return -ENOSPC; - - /* - * Allocate and initialize the new data block. - */ - if (unlikely((error = xfs_dir2_grow_inode(args, - XFS_DIR2_DATA_SPACE, - &dbno)) || - (error = xfs_dir3_data_init(args, dbno, &dbp)))) - return error; + error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK); + if (error) + return error; + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); + firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET); - /* - * If (somehow) we have a freespace block, get rid of it. - */ - if (fbp) - xfs_trans_brelse(tp, fbp); - if (fblk && fblk->bp) - fblk->bp = NULL; + for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) { + /* If it's ifbno we already looked at it. */ + if (fbno == ifbno) + continue; /* - * Get the freespace block corresponding to the data block - * that was just allocated. + * Read the block. There can be holes in the freespace blocks, + * so this might not succeed. This should be really rare, so + * there's no reason to avoid it. 
*/ - fbno = dp->d_ops->db_to_fdb(args->geo, dbno); error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(args->geo, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; + if (!fbp) + continue; - /* - * If there wasn't a freespace block, the read will - * return a NULL fbp. Allocate and initialize a new one. - */ - if (!fbp) { - error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, - &fbno); - if (error) - return error; + xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr); - if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { - xfs_alert(mp, -"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", - __func__, (unsigned long long)dp->i_ino, - (long long)dp->d_ops->db_to_fdb( - args->geo, dbno), - (long long)dbno, (long long)fbno, - (unsigned long long)ifbno, lastfbno); - if (fblk) { - xfs_alert(mp, - " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", - fblk, - (unsigned long long)fblk->blkno, - fblk->index, - fblk->magic); - } else { - xfs_alert(mp, " ... fblk is NULL"); - } - XFS_ERROR_REPORT("xfs_dir2_node_addname_int", - XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; + /* Scan the free entry array for a large enough free space. */ + for (findex = hdr->nvalid - 1; findex >= 0; findex--) { + if (be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF && + be16_to_cpu(hdr->bests[findex]) >= length) { + dbno = hdr->firstdb + findex; + goto found_block; } - - /* - * Get a buffer for the new block. - */ - error = xfs_dir3_free_get_buf(args, fbno, &fbp); - if (error) - return error; - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - - /* - * Remember the first slot as our empty slot. - */ - freehdr.firstdb = - (fbno - xfs_dir2_byte_to_db(args->geo, - XFS_DIR2_FREE_OFFSET)) * - dp->d_ops->free_max_bests(args->geo); - } else { - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); } - /* - * Set the freespace block index from the data block number. - */ - findex = dp->d_ops->db_to_fdindex(args->geo, dbno); - /* - * If it's after the end of the current entries in the - * freespace block, extend that table. - */ - if (findex >= freehdr.nvalid) { - ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); - freehdr.nvalid = findex + 1; - /* - * Tag new entry so nused will go up. - */ - bests[findex] = cpu_to_be16(NULLDATAOFF); - } - /* - * If this entry was for an empty data block - * (this should always be true) then update the header. - */ - if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { - freehdr.nused++; - dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_header(args, fbp); - } - /* - * Update the real value in the table. - * We haven't allocated the data entry yet so this will - * change again. - */ - hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); - bests[findex] = bf[0].length; - logfree = 1; + /* Didn't find free space, go on to next free block */ + xfs_trans_brelse(tp, fbp); } + +found_block: + *dbnop = dbno; + *fbpp = fbp; + *findexp = findex; + return 0; +} + +/* + * Add the data entry for a node-format directory name addition. + * The leaf entry is added in xfs_dir2_leafn_add. + * We may enter with a freespace block that the lookup found. 
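+ *
+ * The work is split in two: xfs_dir2_node_find_freeblk() scans the
+ * freespace blocks for a data block with enough room, and
+ * xfs_dir2_node_add_datablk() allocates a fresh data block (and, if
+ * necessary, its freespace block) when that scan comes up empty.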
+ */ +static int +xfs_dir2_node_addname_int( + struct xfs_da_args *args, /* operation arguments */ + struct xfs_da_state_blk *fblk) /* optional freespace block */ +{ + struct xfs_dir2_data_unused *dup; /* data unused entry pointer */ + struct xfs_dir2_data_entry *dep; /* data entry pointer */ + struct xfs_dir2_data_hdr *hdr; /* data block header */ + struct xfs_dir2_data_free *bf; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_buf *dbp; /* data block buffer */ + struct xfs_buf *fbp; /* freespace buffer */ + xfs_dir2_data_aoff_t aoff; + xfs_dir2_db_t dbno; /* data block number */ + int error; /* error return value */ + int findex; /* freespace entry index */ + int length; /* length of the new entry */ + int logfree = 0; /* need to log free entry */ + int needlog = 0; /* need to log data header */ + int needscan = 0; /* need to rescan data frees */ + __be16 *tagp; /* data entry tag pointer */ + + length = xfs_dir2_data_entsize(dp->i_mount, args->namelen); + error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &freehdr, + &findex, length); + if (error) + return error; + /* - * We had a data block so we don't have to make a new one. + * Now we know if we must allocate blocks, so if we are checking whether + * we can insert without allocation then we can return now. */ - else { - /* - * If just checking, we succeeded. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - return 0; + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + if (dbno == -1) + return -ENOSPC; + return 0; + } - /* - * Read the data block in. - */ + /* + * If we don't have a data block, we need to allocate one and make + * the freespace entries refer to it. + */ + if (dbno == -1) { + /* we're going to have to log the free block index later */ + logfree = 1; + error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp, + &freehdr, &findex); + } else { + /* Read the data block in. */ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, dbno), - -1, &dbp); - if (error) - return error; - hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); - logfree = 0; + 0, &dbp); } + if (error) + return error; + + /* setup for data block up now */ + hdr = dbp->b_addr; + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); ASSERT(be16_to_cpu(bf[0].length) >= length); - /* - * Point to the existing unused space. - */ + + /* Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[0].offset)); - needscan = needlog = 0; - /* - * Mark the first part of the unused space, inuse for us. - */ + + /* Mark the first part of the unused space, inuse for us. */ aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, &needlog, &needscan); @@ -2014,46 +1962,106 @@ xfs_dir2_node_addname_int( xfs_trans_brelse(tp, dbp); return error; } - /* - * Fill in the new entry and log it. - */ + + /* Fill in the new entry and log it. */ dep = (xfs_dir2_data_entry_t *)dup; dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - dp->d_ops->data_put_ftype(dep, args->filetype); - tagp = dp->d_ops->data_entry_tag_p(dep); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); + tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(args, dbp, dep); - /* - * Rescan the block for bestfree if needed. 
- */ + + /* Rescan the freespace and log the data block if needed. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); - /* - * Log the data block header if needed. - */ + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); + + /* If the freespace block entry is now wrong, update it. */ + if (freehdr.bests[findex] != bf[0].length) { + freehdr.bests[findex] = bf[0].length; + logfree = 1; + } + + /* Log the freespace entry if needed. */ + if (logfree) + xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex); + + /* Return the data block and offset in args. */ + args->blkno = (xfs_dablk_t)dbno; + args->index = be16_to_cpu(*tagp); + return 0; +} + +/* + * Top-level node form directory addname routine. + */ +int /* error */ +xfs_dir2_node_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block for insert */ + int error; /* error return value */ + int rval; /* sub-return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_addname(args); + /* - * If the freespace entry is now wrong, update it. + * Allocate and initialize the state (btree cursor). */ - bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ - if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { - bests[findex] = bf[0].length; - logfree = 1; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + /* + * Look up the name. We're not supposed to find it, but + * this gives us the insertion point. + */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + rval = error; + if (rval != -ENOENT) { + goto done; } /* - * Log the freespace entry if needed. + * Add the data entry to a data block. + * Extravalid is set to a freeblock found by lookup. */ - if (logfree) - xfs_dir2_free_log_bests(args, fbp, findex, findex); + rval = xfs_dir2_node_addname_int(args, + state->extravalid ? &state->extrablk : NULL); + if (rval) { + goto done; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); /* - * Return the data block and offset in args, then drop the data block. + * Add the new leaf entry. */ - args->blkno = (xfs_dablk_t)dbno; - args->index = be16_to_cpu(*tagp); - return 0; + rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); + if (rval == 0) { + /* + * It worked, fix the hash values up the btree. + */ + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) + xfs_da3_fixhashpath(state, &state->path); + } else { + /* + * It didn't work, we need to split the leaf block. + */ + if (args->total == 0) { + ASSERT(rval == -ENOSPC); + goto done; + } + /* + * Split the leaf block and insert the new entry. + */ + rval = xfs_da3_split(state); + } +done: + xfs_da_state_free(state); + return rval; } /* @@ -2187,8 +2195,6 @@ xfs_dir2_node_replace( int i; /* btree level */ xfs_ino_t inum; /* new inode number */ int ftype; /* new file type */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ int rval; /* internal return value */ xfs_da_state_t *state; /* btree cursor */ @@ -2220,16 +2226,17 @@ xfs_dir2_node_replace( * and locked it. But paranoia is good. */ if (rval == -EEXIST) { - struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + /* * Find the leaf entry. 
*/ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); - leaf = blk->bp->b_addr; - ents = args->dp->d_ops->leaf_ents_p(leaf); - lep = &ents[blk->index]; ASSERT(state->extravalid); + + xfs_dir2_leaf_hdr_from_disk(state->mp, &leafhdr, + blk->bp->b_addr); /* * Point to the data entry. */ @@ -2239,13 +2246,13 @@ xfs_dir2_node_replace( dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, - be32_to_cpu(lep->address))); + be32_to_cpu(leafhdr.ents[blk->index].address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); - args->dp->d_ops->data_put_ftype(dep, ftype); + xfs_dir2_data_put_ftype(state->mp, dep, ftype); xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } @@ -2302,7 +2309,7 @@ xfs_dir2_node_trim_free( if (!bp) return 0; free = bp->b_addr; - dp->d_ops->free_hdr_from_disk(&freehdr, free); + xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free); /* * If there are used entries, there's nothing to do. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 59f9fb2241a5..01ee0b926572 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -8,7 +8,41 @@ struct dir_context; +/* + * In-core version of the leaf and free block headers to abstract the + * differences in the v2 and v3 disk format of the headers. + */ +struct xfs_dir3_icleaf_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t stale; + + /* + * Pointer to the on-disk format entries, which are behind the + * variable size (v4 vs v5) header in the on-disk block. + */ + struct xfs_dir2_leaf_entry *ents; +}; + +struct xfs_dir3_icfree_hdr { + uint32_t magic; + uint32_t firstdb; + uint32_t nvalid; + uint32_t nused; + + /* + * Pointer to the on-disk format entries, which are behind the + * variable size (v4 vs v5) header in the on-disk block. 
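+	 *
+	 * The entries themselves stay in big-endian (__be16) disk format;
+	 * only the header fields above are byte-swapped into CPU order.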
+ */ + __be16 *bests; +}; + /* xfs_dir2.c */ +xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name); +enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args, + const unsigned char *name, int len); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, @@ -26,6 +60,15 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, struct xfs_buf *lbp, struct xfs_buf *dbp); /* xfs_dir2_data.c */ +struct xfs_dir2_data_free *xfs_dir2_data_bestfree_p(struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr); +__be16 *xfs_dir2_data_entry_tag_p(struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep); +uint8_t xfs_dir2_data_get_ftype(struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep); +void xfs_dir2_data_put_ftype(struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep, uint8_t ftype); + #ifdef DEBUG extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); #else @@ -34,10 +77,10 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); -extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); -extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mapped_bno); +int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp); +int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, + unsigned int flags); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, @@ -47,10 +90,14 @@ extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); /* xfs_dir2_leaf.c */ -extern int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); +void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from); +void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); +int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); +int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); @@ -62,7 +109,8 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, struct xfs_buf **bpp, uint16_t magic); extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, - struct xfs_buf *bp, int first, int last); + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_buf *bp, int first, + int last); extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); @@ -79,10 +127,11 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp, - struct xfs_inode *dp, struct 
xfs_dir3_icleaf_hdr *hdr, - struct xfs_dir2_leaf *leaf); + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); /* xfs_dir2_node.c */ +void xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp, + struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from); extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp, @@ -108,6 +157,14 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ +xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); +xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *hdr); +void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino); +uint8_t xfs_dir2_sf_get_ftype(struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep); +struct xfs_dir2_sf_entry *xfs_dir2_sf_nextentry(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep); extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, @@ -118,9 +175,33 @@ extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +int xfs_dir2_sf_entsize(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, int len); +void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino); +void xfs_dir2_sf_put_ftype(struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep, uint8_t ftype); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); +static inline unsigned int +xfs_dir2_data_entsize( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int len; + + len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen + + sizeof(xfs_dir2_data_off_t) /* tag */; + if (xfs_sb_version_hasftype(&mp->m_sb)) + len += sizeof(uint8_t); + return round_up(len, XFS_DIR2_DATA_ALIGN); +} + +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, + const unsigned char *name, int len); + #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 033589257f54..7b7f6fb2ea3b 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -37,6 +37,126 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args); static void xfs_dir2_sf_toino4(xfs_da_args_t *args); static void xfs_dir2_sf_toino8(xfs_da_args_t *args); +int +xfs_dir2_sf_entsize( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + int count = len; + + count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ + count += hdr->i8count ? 
XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ + + if (xfs_sb_version_hasftype(&mp->m_sb)) + count += sizeof(uint8_t); + return count; +} + +struct xfs_dir2_sf_entry * +xfs_dir2_sf_nextentry( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (void *)sfep + xfs_dir2_sf_entsize(mp, hdr, sfep->namelen); +} + +/* + * In short-form directory entries the inode numbers are stored at variable + * offset behind the entry name. If the entry stores a filetype value, then it + * sits between the name and the inode number. The actual inode numbers can + * come in two formats as well, either 4 bytes or 8 bytes wide. + */ +xfs_ino_t +xfs_dir2_sf_get_ino( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + uint8_t *from = sfep->name + sfep->namelen; + + if (xfs_sb_version_hasftype(&mp->m_sb)) + from++; + + if (!hdr->i8count) + return get_unaligned_be32(from); + return get_unaligned_be64(from) & XFS_MAXINUMBER; +} + +void +xfs_dir2_sf_put_ino( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + uint8_t *to = sfep->name + sfep->namelen; + + ASSERT(ino <= XFS_MAXINUMBER); + + if (xfs_sb_version_hasftype(&mp->m_sb)) + to++; + + if (hdr->i8count) + put_unaligned_be64(ino, to); + else + put_unaligned_be32(ino, to); +} + +xfs_ino_t +xfs_dir2_sf_get_parent_ino( + struct xfs_dir2_sf_hdr *hdr) +{ + if (!hdr->i8count) + return get_unaligned_be32(hdr->parent); + return get_unaligned_be64(hdr->parent) & XFS_MAXINUMBER; +} + +void +xfs_dir2_sf_put_parent_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino) +{ + ASSERT(ino <= XFS_MAXINUMBER); + + if (hdr->i8count) + put_unaligned_be64(ino, hdr->parent); + else + put_unaligned_be32(ino, hdr->parent); +} + +/* + * The file type field is stored at the end of the name for filetype enabled + * shortform directories, or not at all otherwise. 
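+ *
+ * The resulting layout of a single shortform entry on disk is:
+ *
+ *   [namelen][offset][name bytes][ftype (optional)][inumber, 4 or 8 bytes]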
+ */ +uint8_t +xfs_dir2_sf_get_ftype( + struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep) +{ + if (xfs_sb_version_hasftype(&mp->m_sb)) { + uint8_t ftype = sfep->name[sfep->namelen]; + + if (ftype < XFS_DIR3_FT_MAX) + return ftype; + } + + return XFS_DIR3_FT_UNKNOWN; +} + +void +xfs_dir2_sf_put_ftype( + struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep, + uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + + if (xfs_sb_version_hasftype(&mp->m_sb)) + sfep->name[sfep->namelen] = ftype; +} + /* * Given a block directory (dp/block), calculate its size as a shortform (sf) * directory and a header for the sf directory, if it will fit it the @@ -125,7 +245,7 @@ xfs_dir2_block_sfsize( */ sfhp->count = count; sfhp->i8count = i8count; - dp->d_ops->sf_put_parent_ino(sfhp, parent); + xfs_dir2_sf_put_parent_ino(sfhp, parent); return size; } @@ -135,64 +255,48 @@ xfs_dir2_block_sfsize( */ int /* error */ xfs_dir2_block_to_sf( - xfs_da_args_t *args, /* operation arguments */ + struct xfs_da_args *args, /* operation arguments */ struct xfs_buf *bp, int size, /* shortform directory size */ - xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ + struct xfs_dir2_sf_hdr *sfhp) /* shortform directory hdr */ { - xfs_dir2_data_hdr_t *hdr; /* block header */ - xfs_dir2_data_entry_t *dep; /* data entry pointer */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* unused data pointer */ - char *endptr; /* end of data entries */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int error; /* error return value */ int logflags; /* inode logging flags */ - xfs_mount_t *mp; /* filesystem mount point */ - char *ptr; /* current data pointer */ - xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ - xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ + struct xfs_dir2_sf_entry *sfep; /* shortform entry */ + struct xfs_dir2_sf_hdr *sfp; /* shortform directory header */ + unsigned int offset = args->geo->data_entry_offset; + unsigned int end; trace_xfs_dir2_block_to_sf(args); - dp = args->dp; - mp = dp->i_mount; - - /* - * allocate a temporary destination buffer the size of the inode - * to format the data into. Once we have formatted the data, we - * can free the block and copy the formatted data into the inode literal - * area. - */ - dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); - hdr = bp->b_addr; - /* - * Copy the header into the newly allocate local space. + * Allocate a temporary destination buffer the size of the inode to + * format the data into. Once we have formatted the data, we can free + * the block and copy the formatted data into the inode literal area. */ - sfp = (xfs_dir2_sf_hdr_t *)dst; + sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0); memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); /* - * Set up to loop over the block's entries. + * Loop over the active and unused entries. Stop when we reach the + * leaf/tail portion of the block. */ - ptr = (char *)dp->d_ops->data_entry_p(hdr); - endptr = xfs_dir3_data_endp(args->geo, hdr); + end = xfs_dir3_data_end_offset(args->geo, bp->b_addr); sfep = xfs_dir2_sf_firstentry(sfp); - /* - * Loop over the active and unused entries. - * Stop when we reach the leaf/tail portion of the block. - */ - while (ptr < endptr) { + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + /* * If it's unused, just skip over it. 
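	 * (Each unused entry records its own length in dup->length, so the
	 * scan can step straight over it.)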
*/ - dup = (xfs_dir2_data_unused_t *)ptr; if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ptr += be16_to_cpu(dup->length); + offset += be16_to_cpu(dup->length); continue; } - dep = (xfs_dir2_data_entry_t *)ptr; + /* * Skip . */ @@ -204,24 +308,22 @@ xfs_dir2_block_to_sf( else if (dep->namelen == 2 && dep->name[0] == '.' && dep->name[1] == '.') ASSERT(be64_to_cpu(dep->inumber) == - dp->d_ops->sf_get_parent_ino(sfp)); + xfs_dir2_sf_get_parent_ino(sfp)); /* * Normal entry, copy it into shortform. */ else { sfep->namelen = dep->namelen; - xfs_dir2_sf_put_offset(sfep, - (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)hdr)); + xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, dep->name, dep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, + xfs_dir2_sf_put_ino(mp, sfp, sfep, be64_to_cpu(dep->inumber)); - dp->d_ops->sf_put_ftype(sfep, - dp->d_ops->data_get_ftype(dep)); + xfs_dir2_sf_put_ftype(mp, sfep, + xfs_dir2_data_get_ftype(mp, dep)); - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } - ptr += dp->d_ops->data_entsize(dep->namelen); + offset += xfs_dir2_data_entsize(mp, dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); @@ -240,7 +342,7 @@ xfs_dir2_block_to_sf( * Convert the inode to local format and copy the data in. */ ASSERT(dp->i_df.if_bytes == 0); - xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size); + xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size); dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; dp->i_d.di_size = size; @@ -248,7 +350,7 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(dst); + kmem_free(sfp); return error; } @@ -277,13 +379,7 @@ xfs_dir2_sf_addname( ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Make sure the shortform value has some of its header. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } + ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -291,7 +387,7 @@ xfs_dir2_sf_addname( /* * Compute entry (and change in) size. */ - incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); + incr_isize = xfs_dir2_sf_entsize(dp->i_mount, sfp, args->namelen); objchange = 0; /* @@ -364,18 +460,17 @@ xfs_dir2_sf_addname_easy( xfs_dir2_data_aoff_t offset, /* offset to use for new ent */ int new_isize) /* new directory size */ { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int byteoff; /* byte offset in sf dir */ - xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - dp = args->dp; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen), + xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. 
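A side note on the entry-size arithmetic that xfs_dir2_sf_entsize() and the hunks above depend on: it can be checked in isolation. The sketch below is a minimal userspace model, not kernel code; sf_entsize_demo, INO32_SIZE and INO64_SIZE are illustrative stand-ins for xfs_dir2_sf_entsize() and the kernel's XFS_INO32_SIZE/XFS_INO64_SIZE constants, and the fixed-width fields mirror the namelen/offset prefix described earlier.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define INO32_SIZE	4	/* packed 32-bit inode number */
#define INO64_SIZE	8	/* packed 64-bit inode number */

/* Fixed prefix of a shortform entry: namelen byte plus 2-byte offset. */
struct sf_entry_fixed {
	uint8_t		namelen;
	uint8_t		offset[2];
};

/*
 * Mirrors the logic of xfs_dir2_sf_entsize(): fixed prefix, then the
 * name, an optional ftype byte, and the packed inode number whose width
 * depends on whether the directory carries any 64-bit inode numbers.
 */
static size_t
sf_entsize_demo(int namelen, int i8count, int has_ftype)
{
	size_t count = sizeof(struct sf_entry_fixed) + namelen;

	count += i8count ? INO64_SIZE : INO32_SIZE;
	if (has_ftype)
		count += 1;
	return count;
}

int
main(void)
{
	/* "foo" with ftype and 32-bit inodes: 3 + 3 + 1 + 4 = 11 bytes. */
	assert(sf_entsize_demo(3, 0, 1) == 11);
	/* The same entry with 64-bit inode numbers grows by four bytes. */
	assert(sf_entsize_demo(3, 1, 1) == 15);
	return 0;
}

That four-byte difference per entry is why the code tracks i8count and bothers converting whole directories between the 4- and 8-byte forms in xfs_dir2_sf_toino4()/xfs_dir2_sf_toino8().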
@@ -388,8 +483,8 @@ xfs_dir2_sf_addname_easy( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); - dp->d_ops->sf_put_ftype(sfep, args->filetype); + xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber); + xfs_dir2_sf_put_ftype(mp, sfep, args->filetype); /* * Update the header and inode. @@ -416,9 +511,10 @@ xfs_dir2_sf_addname_hard( int objchange, /* changing inode number size */ int new_isize) /* new directory size */ { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int add_datasize; /* data size need for new ent */ char *buf; /* buffer for old */ - xfs_inode_t *dp; /* incore directory inode */ int eof; /* reached end of old dir */ int nbytes; /* temp for byte copies */ xfs_dir2_data_aoff_t new_offset; /* next offset value */ @@ -432,11 +528,9 @@ xfs_dir2_sf_addname_hard( /* * Copy the old directory to the stack buffer. */ - dp = args->dp; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; - buf = kmem_alloc(old_isize, KM_SLEEP); + buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, sfp, old_isize); /* @@ -444,13 +538,13 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = dp->d_ops->data_first_offset, + for (offset = args->geo->data_first_offset, oldsfep = xfs_dir2_sf_firstentry(oldsfp), - add_datasize = dp->d_ops->data_entsize(args->namelen), + add_datasize = xfs_dir2_data_entsize(mp, args->namelen), eof = (char *)oldsfep == &buf[old_isize]; !eof; - offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen), - oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep), + offset = new_offset + xfs_dir2_data_entsize(mp, oldsfep->namelen), + oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep), eof = (char *)oldsfep == &buf[old_isize]) { new_offset = xfs_dir2_sf_get_offset(oldsfep); if (offset + add_datasize <= new_offset) @@ -479,8 +573,8 @@ xfs_dir2_sf_addname_hard( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); - dp->d_ops->sf_put_ftype(sfep, args->filetype); + xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber); + xfs_dir2_sf_put_ftype(mp, sfep, args->filetype); sfp->count++; if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) sfp->i8count++; @@ -488,7 +582,7 @@ xfs_dir2_sf_addname_hard( * If there's more left to copy, do that. 
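One line in addname_easy is worth a gloss: after xfs_idata_realloc() grows the inline fork, sfp and sfep are recomputed from the new base because the underlying allocation may have moved. A minimal demonstration of why an interior pointer must be rebuilt from a saved byte offset:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *buf = malloc(16);
	if (!buf)
		return 1;
	strcpy(buf, "hello");

	char *sfep = buf + 5;			/* interior pointer */
	ptrdiff_t byteoff = sfep - buf;		/* keep the offset instead */

	char *nbuf = realloc(buf, 4096);	/* may move the block */
	if (!nbuf) {
		free(buf);
		return 1;
	}
	buf = nbuf;
	sfep = buf + byteoff;			/* re-derive from new base */
	printf("%c\n", sfep[-1]);		/* 'o' */
	free(buf);
	return 0;
}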
*/ if (!eof) { - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } kmem_free(buf); @@ -510,7 +604,8 @@ xfs_dir2_sf_addname_pick( xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */ xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int holefit; /* found hole it will fit in */ int i; /* entry number */ xfs_dir2_data_aoff_t offset; /* data block offset */ @@ -519,11 +614,9 @@ xfs_dir2_sf_addname_pick( int size; /* entry's data size */ int used; /* data bytes used */ - dp = args->dp; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - size = dp->d_ops->data_entsize(args->namelen); - offset = dp->d_ops->data_first_offset; + size = xfs_dir2_data_entsize(mp, args->namelen); + offset = args->geo->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -535,8 +628,8 @@ xfs_dir2_sf_addname_pick( if (!holefit) holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); offset = xfs_dir2_sf_get_offset(sfep) + - dp->d_ops->data_entsize(sfep->namelen); - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + xfs_dir2_data_entsize(mp, sfep->namelen); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } /* * Calculate data bytes used excluding the new entry, if this @@ -578,7 +671,8 @@ static void xfs_dir2_sf_check( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int i; /* entry number */ int i8count; /* number of big inode#s */ xfs_ino_t ino; /* entry inode number */ @@ -586,23 +680,21 @@ xfs_dir2_sf_check( xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - dp = args->dp; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = dp->d_ops->data_first_offset; - ino = dp->d_ops->sf_get_parent_ino(sfp); + offset = args->geo->data_first_offset; + ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) { ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); - ino = dp->d_ops->sf_get_ino(sfp, sfep); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = xfs_dir2_sf_get_offset(sfep) + - dp->d_ops->data_entsize(sfep->namelen); - ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); + xfs_dir2_data_entsize(mp, sfep->namelen); + ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX); } ASSERT(i8count == sfp->i8count); ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); @@ -622,22 +714,16 @@ xfs_dir2_sf_verify( struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; - const struct xfs_dir_ops *dops; struct xfs_ifork *ifp; xfs_ino_t ino; int i; int i8count; int offset; - int size; + int64_t size; int error; uint8_t filetype; ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); - /* - * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops, - * so we can only trust the mountpoint to have the right pointer. - */ - dops = xfs_dir_get_ops(mp, NULL); ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; @@ -653,12 +739,12 @@ xfs_dir2_sf_verify( endp = (char *)sfp + size; /* Check .. 
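The holefit scan in xfs_dir2_sf_addname_pick() above replays the data-block layout entry by entry: a gap fits when the running offset plus the new entry's size stays at or below the next entry's recorded offset. The same scan in miniature, with an invented entry table:

#include <stdio.h>

/* Each existing entry occupies [off, off + len) in the data block. */
struct ent { int off; int len; };

int main(void)
{
	struct ent ents[] = { { 16, 12 }, { 48, 20 } };	/* offset order */
	int n = 2, need = 12;		/* size of the entry to place */
	int off = 16;			/* first usable data offset */
	int holefit = 0;

	for (int i = 0; i < n; i++) {
		if (!holefit)	/* gap before this entry big enough? */
			holefit = off + need <= ents[i].off;
		off = ents[i].off + ents[i].len;  /* end of this entry */
	}
	/* With no hole, the new entry would go at 'off', at the end. */
	printf("holefit=%d tail=%d\n", holefit, off);	/* 1 68 */
	return 0;
}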
entry */ - ino = dops->sf_get_parent_ino(sfp); + ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; error = xfs_dir_ino_validate(mp, ino); if (error) return __this_address; - offset = dops->data_first_offset; + offset = mp->m_dir_geo->data_first_offset; /* Check all reported entries */ sfep = xfs_dir2_sf_firstentry(sfp); @@ -680,7 +766,7 @@ xfs_dir2_sf_verify( * within the data buffer. The next entry starts after the * name component, so nextentry is an acceptable test. */ - next_sfep = dops->sf_nextentry(sfp, sfep); + next_sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); if (endp < (char *)next_sfep) return __this_address; @@ -689,19 +775,19 @@ xfs_dir2_sf_verify( return __this_address; /* Check the inode number. */ - ino = dops->sf_get_ino(sfp, sfep); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; error = xfs_dir_ino_validate(mp, ino); if (error) return __this_address; /* Check the file type. */ - filetype = dops->sf_get_ftype(sfep); + filetype = xfs_dir2_sf_get_ftype(mp, sfep); if (filetype >= XFS_DIR3_FT_MAX) return __this_address; offset = xfs_dir2_sf_get_offset(sfep) + - dops->data_entsize(sfep->namelen); + xfs_dir2_data_entsize(mp, sfep->namelen); sfep = next_sfep; } @@ -763,7 +849,7 @@ xfs_dir2_sf_create( /* * Now can put in the inode number, since i8count is set. */ - dp->d_ops->sf_put_parent_ino(sfp, pino); + xfs_dir2_sf_put_parent_ino(sfp, pino); sfp->count = 0; dp->i_d.di_size = size; xfs_dir2_sf_check(args); @@ -779,7 +865,8 @@ int /* error */ xfs_dir2_sf_lookup( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ @@ -790,16 +877,9 @@ xfs_dir2_sf_lookup( trace_xfs_dir2_sf_lookup(args); xfs_dir2_sf_check(args); - dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Bail out if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } + ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -818,7 +898,7 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - args->inumber = dp->d_ops->sf_get_parent_ino(sfp); + args->inumber = xfs_dir2_sf_get_parent_ino(sfp); args->cmpresult = XFS_CMP_EXACT; args->filetype = XFS_DIR3_FT_DIR; return -EEXIST; @@ -828,18 +908,17 @@ xfs_dir2_sf_lookup( */ ci_sfep = NULL; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) { /* * Compare name and if it's an exact match, return the inode * number. If it's the first case-insensitive match, store the * inode number and continue looking for an exact match. 
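A detail of xfs_dir2_sf_verify() above deserves emphasis: next_sfep is computed and checked against endp before any variable-length field is trusted, so a corrupt namelen cannot carry the parser past the end of the fork. The same defensive shape on an invented record format with a 1-byte length header:

#include <stdint.h>
#include <stdio.h>

static int verify(const uint8_t *buf, size_t size)
{
	const uint8_t *p = buf, *endp = buf + size;

	while (p < endp) {
		/* The loop condition guarantees the length byte exists;
		 * bounds-check the payload before reading any of it. */
		const uint8_t *next = p + 1 + p[0];

		if (next > endp)
			return -1;
		p = next;
	}
	return 0;
}

int main(void)
{
	uint8_t good[] = { 2, 'h', 'i', 0 };
	uint8_t bad[] = { 9, 'h', 'i' };	/* claims more than exists */

	printf("%d %d\n", verify(good, sizeof(good)),
	       verify(bad, sizeof(bad)));	/* 0 -1 */
	return 0;
}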
*/ - cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, - sfep->namelen); + cmp = xfs_dir2_compname(args, sfep->name, sfep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; - args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); - args->filetype = dp->d_ops->sf_get_ftype(sfep); + args->inumber = xfs_dir2_sf_get_ino(mp, sfp, sfep); + args->filetype = xfs_dir2_sf_get_ftype(mp, sfep); if (cmp == XFS_CMP_EXACT) return -EEXIST; ci_sfep = sfep; @@ -864,8 +943,9 @@ int /* error */ xfs_dir2_sf_removename( xfs_da_args_t *args) { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int byteoff; /* offset of removed entry */ - xfs_inode_t *dp; /* incore directory inode */ int entsize; /* this entry's size */ int i; /* shortform entry index */ int newsize; /* new inode size */ @@ -875,17 +955,9 @@ xfs_dir2_sf_removename( trace_xfs_dir2_sf_removename(args); - dp = args->dp; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); oldsize = (int)dp->i_d.di_size; - /* - * Bail out if the directory is way too short. - */ - if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } + ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -895,10 +967,10 @@ xfs_dir2_sf_removename( * Find the one we're deleting. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { - ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) == + ASSERT(xfs_dir2_sf_get_ino(mp, sfp, sfep) == args->inumber); break; } @@ -912,7 +984,7 @@ xfs_dir2_sf_removename( * Calculate sizes. */ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = dp->d_ops->sf_entsize(sfp, args->namelen); + entsize = xfs_dir2_sf_entsize(mp, sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -945,13 +1017,35 @@ xfs_dir2_sf_removename( } /* + * Check whether the sf dir replace operation need more blocks. + */ +bool +xfs_dir2_sf_replace_needblock( + struct xfs_inode *dp, + xfs_ino_t inum) +{ + int newsize; + struct xfs_dir2_sf_hdr *sfp; + + if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return false; + + sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; + + return inum > XFS_DIR2_MAX_SHORT_INUM && + sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp); +} + +/* * Replace the inode number of an entry in a shortform directory. */ int /* error */ xfs_dir2_sf_replace( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ xfs_ino_t ino=0; /* entry old inode number */ int i8elevated; /* sf_toino8 set i8count=1 */ @@ -960,16 +1054,8 @@ xfs_dir2_sf_replace( trace_xfs_dir2_sf_replace(args); - dp = args->dp; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Bail out if the shortform directory is way too small. 
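The lookup loop above preserves a subtle convention: an exact match returns -EEXIST at once, while the first case-insensitive match is only remembered so the scan can keep hunting for an exact one. A simplified stand-in built on strcasecmp; the enum and helper are invented for the demo:

#include <stdio.h>
#include <string.h>
#include <strings.h>

enum cmp { DIFFERENT, CASE_MATCH, EXACT };

static enum cmp compname(const char *want, const char *have)
{
	if (strcmp(want, have) == 0)
		return EXACT;
	if (strcasecmp(want, have) == 0)
		return CASE_MATCH;
	return DIFFERENT;
}

int main(void)
{
	const char *ents[] = { "README", "Readme", "readme" };
	const char *want = "readme";
	int ci = -1;

	for (int i = 0; i < 3; i++) {
		enum cmp c = compname(want, ents[i]);

		if (c == EXACT) {	/* exact match wins immediately */
			printf("exact at %d\n", i);
			return 0;
		}
		if (c == CASE_MATCH && ci < 0)
			ci = i;		/* remember first ci match only */
	}
	printf("ci fallback at %d\n", ci);
	return 0;
}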
- */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } + ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -980,17 +1066,14 @@ xfs_dir2_sf_replace( */ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { int error; /* error return value */ - int newsize; /* new inode size */ - newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; /* * Won't fit as shortform, convert to block then do replace. */ - if (newsize > XFS_IFORK_DSIZE(dp)) { + if (xfs_dir2_sf_replace_needblock(dp, args->inumber)) { error = xfs_dir2_sf_to_block(args); - if (error) { + if (error) return error; - } return xfs_dir2_block_replace(args); } /* @@ -1008,22 +1091,23 @@ xfs_dir2_sf_replace( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - ino = dp->d_ops->sf_get_parent_ino(sfp); + ino = xfs_dir2_sf_get_parent_ino(sfp); ASSERT(args->inumber != ino); - dp->d_ops->sf_put_parent_ino(sfp, args->inumber); + xfs_dir2_sf_put_parent_ino(sfp, args->inumber); } /* * Normal entry, look for the name. */ else { for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { - ino = dp->d_ops->sf_get_ino(sfp, sfep); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); ASSERT(args->inumber != ino); - dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); - dp->d_ops->sf_put_ftype(sfep, args->filetype); + xfs_dir2_sf_put_ino(mp, sfp, sfep, + args->inumber); + xfs_dir2_sf_put_ftype(mp, sfep, args->filetype); break; } } @@ -1076,8 +1160,9 @@ static void xfs_dir2_sf_toino4( xfs_da_args_t *args) /* operation arguments */ { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; char *buf; /* old dir's buffer */ - xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ @@ -1088,15 +1173,13 @@ xfs_dir2_sf_toino4( trace_xfs_dir2_sf_toino4(args); - dp = args->dp; - /* * Copy the old directory to the buffer. * Then nuke it from the inode, and add the new buffer to the inode. * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, KM_SLEEP); + buf = kmem_alloc(oldsize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); @@ -1116,21 +1199,22 @@ xfs_dir2_sf_toino4( */ sfp->count = oldsfp->count; sfp->i8count = 0; - dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); + xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. 
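The new xfs_dir2_sf_replace_needblock() above is pure arithmetic: widening every inode number from 4 to 8 bytes grows each of the count entries, plus the parent pointer in the header, by XFS_INO64_DIFF bytes; if the result outgrows the inline fork, the replace must convert to block form first. Worked numbers, all hypothetical:

#include <stdio.h>

#define INO64_DIFF 4	/* 8-byte minus 4-byte inode number */

int main(void)
{
	int if_bytes = 200;	/* current shortform size */
	int count = 10;		/* entries; +1 below is the parent */
	int fork_dsize = 240;	/* assumed inline data fork capacity */

	int newsize = if_bytes + (count + 1) * INO64_DIFF;

	printf("newsize=%d -> %s\n", newsize,
	       newsize > fork_dsize ? "convert to block" : "stays inline");
	return 0;
}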
*/ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), - oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, - dp->d_ops->sf_get_ino(oldsfp, oldsfep)); - dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); + xfs_dir2_sf_put_ino(mp, sfp, sfep, + xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep)); + xfs_dir2_sf_put_ftype(mp, sfep, + xfs_dir2_sf_get_ftype(mp, oldsfep)); } /* * Clean up the inode. @@ -1149,8 +1233,9 @@ static void xfs_dir2_sf_toino8( xfs_da_args_t *args) /* operation arguments */ { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; char *buf; /* old dir's buffer */ - xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ @@ -1161,15 +1246,13 @@ xfs_dir2_sf_toino8( trace_xfs_dir2_sf_toino8(args); - dp = args->dp; - /* * Copy the old directory to the buffer. * Then nuke it from the inode, and add the new buffer to the inode. * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, KM_SLEEP); + buf = kmem_alloc(oldsize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); @@ -1189,21 +1272,22 @@ xfs_dir2_sf_toino8( */ sfp->count = oldsfp->count; sfp->i8count = 1; - dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); + xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), - oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, - dp->d_ops->sf_get_ino(oldsfp, oldsfep)); - dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); + xfs_dir2_sf_put_ino(mp, sfp, sfep, + xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep)); + xfs_dir2_sf_put_ftype(mp, sfep, + xfs_dir2_sf_get_ftype(mp, oldsfep)); } /* * Clean up the inode. 
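Both toino4 and toino8 above stage the old directory through a temporary buffer and rebuild it entry by entry; in-place conversion is impossible because source and destination entries overlap once the inode-number width changes. The core idea reduced to widening a packed u32 array:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	uint32_t dir[] = { 100, 200, 300 };	/* old 4-byte "inodes" */
	size_t i, n = sizeof(dir) / sizeof(dir[0]);

	uint32_t *old = malloc(sizeof(dir));	/* snapshot, like 'buf' */
	uint64_t *new8 = malloc(n * sizeof(*new8));

	if (!old || !new8) {
		free(old);
		free(new8);
		return 1;
	}
	memcpy(old, dir, sizeof(dir));

	for (i = 0; i < n; i++)		/* rebuild field by field */
		new8[i] = old[i];

	for (i = 0; i < n; i++)
		printf("%llu\n", (unsigned long long)new8[i]);
	free(old);
	free(new8);
	return 0;
}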
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index e8bd688a4073..bedc1e752b60 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -35,10 +35,10 @@ xfs_calc_dquots_per_chunk( xfs_failaddr_t xfs_dquot_verify( - struct xfs_mount *mp, - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + struct xfs_mount *mp, + struct xfs_disk_dquot *ddq, + xfs_dqid_t id, + uint type) /* used only during quotacheck */ { /* * We can encounter an uninitialized dquot buffer for 2 reasons: diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index c968b60cee15..77e9fa385980 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -920,13 +920,13 @@ static inline uint xfs_dinode_size(int version) * This enum is used in string mapping in xfs_trace.h; please keep the * TRACE_DEFINE_ENUMs for it up to date. */ -typedef enum xfs_dinode_fmt { +enum xfs_dinode_fmt { XFS_DINODE_FMT_DEV, /* xfs_dev_t */ XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ XFS_DINODE_FMT_UUID /* added long ago, but never used */ -} xfs_dinode_fmt_t; +}; #define XFS_INODE_FORMAT_STR \ { XFS_DINODE_FMT_DEV, "dev" }, \ @@ -1144,11 +1144,11 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) /* * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the xfs_dquot_t that + * information for a user. This is the q_core of the struct xfs_dquot that * is kept in kernel memory. We pad this with some more expansion room * to construct the on disk structure. */ -typedef struct xfs_disk_dquot { +struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ __u8 d_version; /* dquot version */ __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ @@ -1171,15 +1171,15 @@ typedef struct xfs_disk_dquot { __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ __be16 d_pad; -} xfs_disk_dquot_t; +}; /* * This is what goes on disk. This is separated from the xfs_disk_dquot because * carrying the unnecessary padding would be a waste of memory. */ typedef struct xfs_dqblk { - xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[4]; /* filling for posterity */ + struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */ + char dd_fill[4];/* filling for posterity */ /* * These two are only present on filesystems with the CRC bits set. @@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block { #define BMBT_BLOCKCOUNT_BITLEN 21 #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) + +/* + * bmbt records have a file offset (block) field that is 54 bits wide, so this + * is the largest xfs_fileoff_t that we ever expect to see. 
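The new XFS_MAX_FILEOFF above is plain mask arithmetic over the bmbt record layout quoted with it: a 54-bit startoff plus a 21-bit blockcount. Evaluating it:

#include <stdint.h>
#include <stdio.h>

#define STARTOFF_BITLEN		54
#define BLOCKCOUNT_BITLEN	21
#define STARTOFF_MASK		((1ULL << STARTOFF_BITLEN) - 1)
#define BLOCKCOUNT_MASK		((1ULL << BLOCKCOUNT_BITLEN) - 1)

int main(void)
{
	/* A mapping can start at the largest representable offset and
	 * still carry a maximal block count, so the two masks sum. */
	uint64_t max_fileoff = STARTOFF_MASK + BLOCKCOUNT_MASK;

	printf("XFS_MAX_FILEOFF = 0x%llx\n",
	       (unsigned long long)max_fileoff);
	return 0;
}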
+ */ +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) typedef struct xfs_bmbt_rec { __be64 l0, l1; diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 52d03a3a02a4..ef95ca07d084 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -287,7 +287,7 @@ struct xfs_ag_geometry { uint32_t ag_ifree; /* o: inodes free */ uint32_t ag_sick; /* o: sick things in ag */ uint32_t ag_checked; /* o: checked metadata in ag */ - uint32_t ag_reserved32; /* o: zero */ + uint32_t ag_flags; /* i/o: flags for this ag */ uint64_t ag_reserved[12];/* o: zero */ }; #define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ @@ -324,7 +324,7 @@ typedef struct xfs_growfs_rt { * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE */ typedef struct xfs_bstime { - time_t tv_sec; /* seconds */ + __kernel_long_t tv_sec; /* seconds */ __s32 tv_nsec; /* and nanoseconds */ } xfs_bstime_t; @@ -366,11 +366,11 @@ struct xfs_bulkstat { uint64_t bs_blocks; /* number of blocks */ uint64_t bs_xflags; /* extended flags */ - uint64_t bs_atime; /* access time, seconds */ - uint64_t bs_mtime; /* modify time, seconds */ + int64_t bs_atime; /* access time, seconds */ + int64_t bs_mtime; /* modify time, seconds */ - uint64_t bs_ctime; /* inode change time, seconds */ - uint64_t bs_btime; /* creation time, seconds */ + int64_t bs_ctime; /* inode change time, seconds */ + int64_t bs_btime; /* creation time, seconds */ uint32_t bs_gen; /* generation count */ uint32_t bs_uid; /* user id */ @@ -416,7 +416,7 @@ struct xfs_bulkstat { /* * Project quota id helpers (previously projid was 16bit only - * and using two 16bit values to hold new 32bit projid was choosen + * and using two 16bit values to hold new 32bit projid was chosen * to retain compatibility with "old" filesystems). */ static inline uint32_t diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 04377ab75863..bf161e930f1d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -276,6 +276,7 @@ xfs_ialloc_inode_init( int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; + int error; /* * Loop over the new block(s), filling in the inodes. For small block @@ -327,12 +328,11 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * M_IGEO(mp)->blocks_per_cluster)); - fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * - M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED); - if (!fbuf) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, + XBF_UNMAPPED, &fbuf); + if (error) + return error; /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; @@ -544,7 +544,10 @@ xfs_inobt_insert_sprec( nrec->ir_free, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error; + } goto out; } @@ -557,17 +560,23 @@ xfs_inobt_insert_sprec( error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); - XFS_WANT_CORRUPTED_GOTO(mp, - rec.ir_startino == nrec->ir_startino, - error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error; + } + if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { + error = -EFSCORRUPTED; + goto error; + } /* * This should never fail. If we have coexisting records that * cannot merge, something is seriously wrong. 
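Much of the series from here on mechanically replaces XFS_WANT_CORRUPTED_GOTO/RETURN with open-coded XFS_IS_CORRUPT tests, so the -EFSCORRUPTED assignment and the jump are visible at each call site instead of hidden in a macro. A stand-alone sketch of the resulting shape; IS_CORRUPT here is a stub, whereas the real XFS_IS_CORRUPT also reports the failure:

#include <stdio.h>

#define EFSCORRUPTED 117	/* EUCLEAN, as the kernel defines it */

/* Hypothetical stand-in: log the condition, then evaluate it. */
#define IS_CORRUPT(cond) \
	((cond) ? (fprintf(stderr, "corruption: %s\n", #cond), 1) : 0)

static int lookup_stat(void) { return 0; }	/* pretend btree miss */

int main(void)
{
	int i = lookup_stat();

	if (IS_CORRUPT(i != 1)) {
		int error = -EFSCORRUPTED;

		printf("error=%d\n", error);
		return 1;
	}
	return 0;
}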
*/ - XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), - error); + if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { + error = -EFSCORRUPTED; + goto error; + } trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, rec.ir_holemask, nrec->ir_startino, @@ -1057,7 +1066,8 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; } return 0; @@ -1081,7 +1091,8 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; } return 0; @@ -1161,12 +1172,18 @@ xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0); + if (XFS_IS_CORRUPT(mp, j != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if (rec.ir_freecount > 0) { /* @@ -1321,19 +1338,28 @@ xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } for (;;) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if (rec.ir_freecount > 0) break; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } } alloc_inode: @@ -1393,7 +1419,8 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) + return -EFSCORRUPTED; /* * See if we've landed in the parent inode record. The finobt @@ -1416,10 +1443,16 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; - XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur); + if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { + error = -EFSCORRUPTED; + goto error_rcur; + } } - XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur); + if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { + error = -EFSCORRUPTED; + goto error_rcur; + } if (i == 1 && j == 1) { /* * Both the left and right records are valid. 
Choose the closer @@ -1472,7 +1505,8 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; return 0; } } @@ -1483,12 +1517,14 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; return 0; } @@ -1510,20 +1546,24 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) && - (rec.ir_freecount == frec->ir_freecount)); + if (XFS_IS_CORRUPT(cur->bc_mp, + rec.ir_free != frec->ir_free || + rec.ir_freecount != frec->ir_freecount)) + return -EFSCORRUPTED; return xfs_inobt_update(cur, &rec); } @@ -1933,14 +1973,20 @@ xfs_difree_inobt( __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) { xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Get the offset in the inode chunk. */ @@ -2052,7 +2098,10 @@ xfs_difree_finobt( * freed an inode in a previously fully allocated chunk. If not, * something is out of sync. */ - XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); + if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) { + error = -EFSCORRUPTED; + goto error; + } error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, ibtrec->ir_count, @@ -2075,14 +2124,20 @@ xfs_difree_finobt( error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error; + } rec.ir_free |= XFS_INOBT_MASK(offset); rec.ir_freecount++; - XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) && - (rec.ir_freecount == ibtrec->ir_freecount), - error); + if (XFS_IS_CORRUPT(mp, + rec.ir_free != ibtrec->ir_free || + rec.ir_freecount != ibtrec->ir_freecount)) { + error = -EFSCORRUPTED; + goto error; + } /* * The content of inobt records should always match between the inobt @@ -2787,8 +2842,13 @@ xfs_ialloc_setup_geometry( igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, inodes); - /* Set the maximum inode count for this filesystem. */ - if (sbp->sb_imax_pct) { + /* + * Set the maximum inode count for this filesystem, being careful not + * to use obviously garbage sb_inopblog/sb_inopblock values. 
Regular + * users should never get here due to failing sb verification, but + * certain users (xfs_db) need to be usable even with corrupt metadata. + */ + if (sbp->sb_imax_pct && igeo->ialloc_blks) { /* * Make sure the maximum inode count is a multiple * of the units we allocate inodes in. @@ -2849,3 +2909,67 @@ xfs_ialloc_setup_geometry( else igeo->ialloc_align = 0; } + +/* Compute the location of the root directory inode that is laid out by mkfs. */ +xfs_ino_t +xfs_ialloc_calc_rootino( + struct xfs_mount *mp, + int sunit) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t first_bno; + + /* + * Pre-calculate the geometry of AG 0. We know what it looks like + * because libxfs knows how to create allocation groups now. + * + * first_bno is the first block in which mkfs could possibly have + * allocated the root directory inode, once we factor in the metadata + * that mkfs formats before it. Namely, the four AG headers... + */ + first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); + + /* ...the two free space btree roots... */ + first_bno += 2; + + /* ...the inode btree root... */ + first_bno += 1; + + /* ...the initial AGFL... */ + first_bno += xfs_alloc_min_freelist(mp, NULL); + + /* ...the free inode btree root... */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + first_bno++; + + /* ...the reverse mapping btree root... */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + first_bno++; + + /* ...the reference count btree... */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + first_bno++; + + /* + * ...and the log, if it is allocated in the first allocation group. + * + * This can happen with filesystems that only have a single + * allocation group, or very odd geometries created by old mkfs + * versions on very small filesystems. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + first_bno += mp->m_sb.sb_logblocks; + + /* + * Now round first_bno up to whatever allocation alignment is given + * by the filesystem or was passed in. + */ + if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + first_bno = roundup(first_bno, sunit); + else if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt > 1) + first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); + + return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 323592d563d5..72b3468b97b1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); +xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 27aa3f2bc4bc..52451809c478 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -596,7 +596,7 @@ xfs_iext_realloc_root( struct xfs_ifork *ifp, struct xfs_iext_cursor *cur) { - size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec); + int64_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec); void *new; /* account for the prev/next pointers */ @@ -616,7 +616,7 @@ xfs_iext_realloc_root( * sequence counter is seen before the modifications to the extent tree itself * take effect. 
*/ -static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) +static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) { WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } @@ -633,7 +633,7 @@ xfs_iext_insert( struct xfs_iext_leaf *new = NULL; int nr_entries, i; - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); if (ifp->if_height == 0) xfs_iext_alloc_root(ifp, cur); @@ -875,7 +875,7 @@ xfs_iext_remove( ASSERT(ifp->if_u1.if_root != NULL); ASSERT(xfs_iext_valid(ifp, cur)); - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; for (i = cur->pos; i < nr_entries; i++) @@ -983,7 +983,7 @@ xfs_iext_update_extent( { struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); if (cur->pos == 0) { struct xfs_bmbt_irec old; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 28ab3c5255e1..8afacfe4be0a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -213,13 +213,12 @@ xfs_inode_from_disk( to->di_version = from->di_version; if (to->di_version == 1) { set_nlink(inode, be16_to_cpu(from->di_onlink)); - to->di_projid_lo = 0; - to->di_projid_hi = 0; + to->di_projid = 0; to->di_version = 2; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); - to->di_projid_lo = be16_to_cpu(from->di_projid_lo); - to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | + be16_to_cpu(from->di_projid_lo); } to->di_format = from->di_format; @@ -256,8 +255,8 @@ xfs_inode_from_disk( if (to->di_version == 3) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); - to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } @@ -279,8 +278,8 @@ xfs_inode_to_disk( to->di_format = from->di_format; to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); + to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); @@ -306,8 +305,8 @@ xfs_inode_to_disk( if (from->di_version == 3) { to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); @@ -632,8 +631,6 @@ xfs_iread( if ((iget_flags & XFS_IGET_CREATE) && xfs_sb_version_hascrc(&mp->m_sb) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { - /* initialise the on-disk inode core */ - memset(&ip->i_d, 0, sizeof(ip->i_d)); VFS_I(ip)->i_generation = prandom_u32(); ip->i_d.di_version = 3; return 0; diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index ab0f84165317..fd94b1078722 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ 
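On the if_seq change above: the counter is bumped with WRITE_ONCE before the extent tree is modified, so a lockless reader that samples it before and after a walk can tell it raced with a writer and retry. A single-threaded user-space sketch of that retry protocol in C11 atomics; the memory-ordering choices are illustrative, not a claim about the kernel's:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int seq;
static int tree_value;		/* stands in for the extent tree */

static void writer_update(int v)
{
	/* Bump first, so readers see the change coming. */
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);
	tree_value = v;
}

static int reader(void)
{
	unsigned int s1, s2;
	int v;

	do {
		s1 = atomic_load_explicit(&seq, memory_order_acquire);
		v = tree_value;
		s2 = atomic_load_explicit(&seq, memory_order_acquire);
	} while (s1 != s2);	/* a writer ran; sample again */
	return v;
}

int main(void)
{
	writer_update(42);
	printf("%d\n", reader());
	return 0;
}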
b/fs/xfs/libxfs/xfs_inode_buf.h @@ -21,8 +21,7 @@ struct xfs_icdinode { uint16_t di_flushiter; /* incremented on flush */ uint32_t di_uid; /* owner's user id */ uint32_t di_gid; /* owner's group id */ - uint16_t di_projid_lo; /* lower part of owner's project id */ - uint16_t di_projid_hi; /* higher part of owner's project id */ + uint32_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ @@ -37,7 +36,7 @@ struct xfs_icdinode { uint64_t di_flags2; /* more random flags */ uint32_t di_cowextsize; /* basic cow extent size for file */ - xfs_ictimestamp_t di_crtime; /* time created */ + struct timespec64 di_crtime; /* time created */ }; /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bf3e04018246..ad2b9c313fd2 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -75,11 +75,15 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); break; default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); return -EFSCORRUPTED; } break; default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); return -EFSCORRUPTED; } if (error) @@ -94,7 +98,7 @@ xfs_iformat_fork( return 0; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: @@ -110,14 +114,16 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); break; default: + xfs_inode_verifier_error(ip, error, __func__, dip, + sizeof(*dip), __this_address); error = -EFSCORRUPTED; break; } if (error) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; if (ip->i_cowfp) - kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); ip->i_cowfp = NULL; xfs_idestroy_fork(ip, XFS_DATA_FORK); } @@ -129,7 +135,7 @@ xfs_init_local_fork( struct xfs_inode *ip, int whichfork, const void *data, - int size) + int64_t size) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); int mem_size = size, real_size = 0; @@ -147,7 +153,7 @@ xfs_init_local_fork( if (size) { real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -302,7 +308,7 @@ xfs_iformat_btree( } ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ifp->if_broot = kmem_alloc(size, KM_NOFS); ASSERT(ifp->if_broot != NULL); /* * Copy and convert from the on-disk structure @@ -367,7 +373,7 @@ xfs_iroot_realloc( */ if (ifp->if_broot_bytes == 0) { new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot = kmem_alloc(new_size, KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; } @@ -382,7 +388,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - KM_SLEEP | KM_NOFS); + KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -408,7 +414,7 
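The di_projid unification above keeps the on-disk format split into two 16-bit halves, a compatibility holdover, and merges them only incore. The round trip in isolation:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t projid = 0x00030001;	/* incore 32-bit project id */

	/* Split for the on-disk format... */
	uint16_t lo = projid & 0xffff;
	uint16_t hi = projid >> 16;

	/* ...and recombine when the inode is read back in. */
	uint32_t again = (uint32_t)hi << 16 | lo;

	printf("lo=%#x hi=%#x recombined=%#x\n", lo, hi, again);
	return 0;
}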
@@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + new_broot = kmem_alloc(new_size, KM_NOFS); /* * First copy over the btree block header. */ @@ -467,11 +473,11 @@ xfs_iroot_realloc( void xfs_idata_realloc( struct xfs_inode *ip, - int byte_diff, + int64_t byte_diff, int whichfork) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int new_size = (int)ifp->if_bytes + byte_diff; + int64_t new_size = ifp->if_bytes + byte_diff; ASSERT(new_size >= 0); ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork)); @@ -492,7 +498,7 @@ xfs_idata_realloc( * We enforce that here. */ ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, - roundup(new_size, 4), KM_SLEEP | KM_NOFS); + roundup(new_size, 4), KM_NOFS); ifp->if_bytes = new_size; } @@ -525,10 +531,10 @@ xfs_idestroy_fork( } if (whichfork == XFS_ATTR_FORK) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; } else if (whichfork == XFS_COW_FORK) { - kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); ip->i_cowfp = NULL; } } @@ -552,7 +558,7 @@ xfs_iextents_copy( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_iext_cursor icur; struct xfs_bmbt_irec rec; - int copied = 0; + int64_t copied = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); @@ -683,7 +689,7 @@ xfs_ifork_init_cow( return; ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, - KM_SLEEP | KM_NOFS); + KM_NOFS); ip->i_cowfp->if_flags = XFS_IFEXTENTS; ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 00c62ce170d0..500333d0101e 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -13,16 +13,16 @@ struct xfs_dinode; * File incore extent information, present for each of data & attr forks. 
*/ struct xfs_ifork { - int if_bytes; /* bytes in if_u1 */ - unsigned int if_seq; /* fork mod counter */ + int64_t if_bytes; /* bytes in if_u1 */ struct xfs_btree_block *if_broot; /* file's incore btree root */ - short if_broot_bytes; /* bytes allocated for root */ - unsigned char if_flags; /* per-fork flags */ + unsigned int if_seq; /* fork mod counter */ int if_height; /* height of the extent tree */ union { void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ }; /* @@ -87,18 +87,24 @@ struct xfs_ifork { #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) +#define xfs_ifork_has_extents(ip, w) \ + (XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_EXTENTS || \ + XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_BTREE) + struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); -void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, + int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); -void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); +void xfs_init_local_fork(struct xfs_inode *ip, int whichfork, + const void *data, int64_t size); xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e5f97c69b320..9bac0d2e56dc 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -432,9 +432,9 @@ static inline uint xfs_log_dinode_size(int version) } /* - * Buffer Log Format defintions + * Buffer Log Format definitions * - * These are the physical dirty bitmap defintions for the log format structure. + * These are the physical dirty bitmap definitions for the log format structure. */ #define XFS_BLF_CHUNK 128 #define XFS_BLF_SHIFT 7 @@ -462,11 +462,20 @@ static inline uint xfs_log_dinode_size(int version) #define XFS_BLF_GDQUOT_BUF (1<<4) /* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - */ -#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + * This is the structure used to lay out a buf log item in the log. The data + * map describes which 128 byte chunks of the buffer have been logged. + * + * The placement of blf_map_size causes blf_data_map to start at an odd + * multiple of sizeof(unsigned int) offset within the struct. Because the data + * bitmap size will always be an even number, the end of the data_map (and + * therefore the structure) will also be at an odd multiple of sizeof(unsigned + * int). Some 64-bit compilers will insert padding at the end of the struct to + * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore, + * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and + * keep the structure size consistent between 32-bit and 64-bit platforms. 
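The xfs_ifork reshuffle above is a packing fix to go with widening if_bytes to int64_t: the short and char members move to the tail so they share one pad region instead of splitting two. A sizeof demonstration; field names are shortened and the exact numbers (typically 40 vs 32) depend on the ABI:

#include <stdint.h>
#include <stdio.h>

struct interleaved {		/* narrow fields between wide ones */
	int64_t bytes;
	short broot_bytes;	/* ~6 pad bytes follow on LP64 */
	void *broot;
	unsigned char flags;	/* ~7 pad bytes follow on LP64 */
	void *root;
};

struct tail_packed {		/* narrow fields grouped at the end */
	int64_t bytes;
	void *broot;
	void *root;
	short broot_bytes;
	unsigned char flags;	/* one shared tail pad */
};

int main(void)
{
	printf("%zu vs %zu\n", sizeof(struct interleaved),
	       sizeof(struct tail_packed));
	return 0;
}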
+ */ +#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) +#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index f3d18eaecebb..3bf671637a91 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -30,14 +30,14 @@ typedef struct xlog_recover_item { xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ } xlog_recover_item_t; -typedef struct xlog_recover { +struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ xfs_trans_header_t r_theader; /* trans header for partial */ int r_state; /* not needed */ xfs_lsn_t r_lsn; /* xact lsn */ struct list_head r_itemq; /* q for items */ -} xlog_recover_t; +}; #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 51bb9bdb0e84..6e1665f2cb67 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -200,7 +200,10 @@ xfs_refcount_insert( error = xfs_btree_insert(cur, i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } out_error: if (error) @@ -227,10 +230,16 @@ xfs_refcount_delete( error = xfs_refcount_get_rec(cur, &irec, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); error = xfs_btree_delete(cur, i); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (error) goto out_error; error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); @@ -349,7 +358,10 @@ xfs_refcount_split_extent( error = xfs_refcount_get_rec(cur, &rcext, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) return 0; @@ -371,7 +383,10 @@ xfs_refcount_split_extent( error = xfs_refcount_insert(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } return error; out_error: @@ -410,19 +425,27 @@ xfs_refcount_merge_center_extents( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (center->rc_refcount > 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* Enlarge the left extent. 
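The +1 on XFS_BLF_DATAMAP_SIZE above can be checked by hand. With four leading 16-bit fields, the 64-bit blf_blkno and blf_map_size, an even bitmap word count leaves a size that 64-bit ABIs pad up to 8-byte alignment while i386, which aligns 64-bit integers to 4, does not; an odd count lands on the boundary for both. Modeling the two ABIs arithmetically:

#include <stdio.h>

int main(void)
{
	int shorts = 4 * 2, blkno = 8, mapsize = 4;	/* bytes */

	for (int words = 16; words <= 17; words++) {
		int payload = shorts + blkno + mapsize + words * 4;
		int sz64 = (payload + 7) & ~7;	/* 8-byte struct align */
		int sz32 = payload;		/* 4-byte align: no pad */

		printf("%d words: 32-bit=%d 64-bit=%d%s\n", words,
		       sz32, sz64, sz32 == sz64 ? "" : "  <- mismatch");
	}
	return 0;
}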
*/ @@ -430,7 +453,10 @@ xfs_refcount_merge_center_extents( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } left->rc_blockcount = extlen; error = xfs_refcount_update(cur, left); @@ -469,14 +495,18 @@ xfs_refcount_merge_left_extent( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* Enlarge the left extent. */ @@ -484,7 +514,10 @@ xfs_refcount_merge_left_extent( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } left->rc_blockcount += cleft->rc_blockcount; error = xfs_refcount_update(cur, left); @@ -526,14 +559,18 @@ xfs_refcount_merge_right_extent( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* Enlarge the right extent. */ @@ -541,7 +578,10 @@ xfs_refcount_merge_right_extent( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } right->rc_startblock -= cright->rc_blockcount; right->rc_blockcount += cright->rc_blockcount; @@ -587,7 +627,10 @@ xfs_refcount_find_left_extents( error = xfs_refcount_get_rec(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (xfs_refc_next(&tmp) != agbno) return 0; @@ -605,8 +648,10 @@ xfs_refcount_find_left_extents( error = xfs_refcount_get_rec(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } /* if tmp starts at the end of our range, just use that */ if (tmp.rc_startblock == agbno) @@ -671,7 +716,10 @@ xfs_refcount_find_right_extents( error = xfs_refcount_get_rec(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (tmp.rc_startblock != agbno + aglen) return 0; @@ -689,8 +737,10 @@ xfs_refcount_find_right_extents( error = xfs_refcount_get_rec(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } /* if tmp ends at the end of our range, just use that */ if 
(xfs_refc_next(&tmp) == agbno + aglen) @@ -913,8 +963,11 @@ xfs_refcount_adjust_extents( &found_tmp); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - found_tmp == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, + found_tmp != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } cur->bc_private.a.priv.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, @@ -955,8 +1008,10 @@ xfs_refcount_adjust_extents( error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } cur->bc_private.a.priv.refc.nr_ops++; goto advloop; } else { @@ -1122,8 +1177,6 @@ xfs_refcount_finish_one( XFS_ALLOC_FLAG_FREEING, &agbp); if (error) return error; - if (!agbp) - return -EFSCORRUPTED; rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); if (!rcur) { @@ -1174,7 +1227,7 @@ out_cur: /* * Record a refcount intent for later processing. */ -static int +static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, @@ -1189,44 +1242,43 @@ __xfs_refcount_add( blockcount); ri = kmem_alloc(sizeof(struct xfs_refcount_intent), - KM_SLEEP | KM_NOFS); + KM_NOFS); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); - return 0; } /* * Increase the reference count of the blocks backing a file's extent. */ -int +void xfs_refcount_increase_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) - return 0; + return; - return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, - PREV->br_startblock, PREV->br_blockcount); + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, + PREV->br_blockcount); } /* * Decrease the reference count of the blocks backing a file's extent. 
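The int-to-void conversions above fall out of the allocator semantics: __xfs_refcount_add() allocates its intent without KM_MAYFAIL, so the allocation sleeps until it succeeds, queueing can never fail, and the error plumbing in every caller was dead code. A sketch of that API shape; the list and allocator are stand-ins:

#include <stdio.h>
#include <stdlib.h>

struct intent {
	struct intent *next;
	int type, startblock, blockcount;
};

static struct intent *pending;

static void *alloc_nofail(size_t sz)
{
	void *p;

	while (!(p = malloc(sz)))	/* model of a sleeping alloc */
		;
	return p;
}

/* Returns void: with a can't-fail allocation there is no error. */
static void refcount_add(int type, int startblock, int blockcount)
{
	struct intent *ri = alloc_nofail(sizeof(*ri));

	ri->type = type;
	ri->startblock = startblock;
	ri->blockcount = blockcount;
	ri->next = pending;	/* defer; processed at commit time */
	pending = ri;
}

int main(void)
{
	refcount_add(1, 1000, 8);	/* e.g. an INCREASE intent */
	printf("queued type=%d len=%d\n", pending->type,
	       pending->blockcount);
	free(pending);
	return 0;
}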
*/ -int +void xfs_refcount_decrease_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) - return 0; + return; - return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, - PREV->br_startblock, PREV->br_blockcount); + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, + PREV->br_blockcount); } /* @@ -1273,7 +1325,10 @@ xfs_refcount_find_shared( error = xfs_refcount_get_rec(cur, &tmp, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } /* If the extent ends before the start, look at the next one */ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { @@ -1285,7 +1340,10 @@ xfs_refcount_find_shared( error = xfs_refcount_get_rec(cur, &tmp, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* If the extent starts after the range we want, bail out */ @@ -1313,7 +1371,10 @@ xfs_refcount_find_shared( error = xfs_refcount_get_rec(cur, &tmp, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (tmp.rc_startblock >= agbno + aglen || tmp.rc_startblock != *fbno + *flen) break; @@ -1414,8 +1475,11 @@ xfs_refcount_adjust_cow_extents( switch (adj) { case XFS_REFCOUNT_ADJUST_COW_ALLOC: /* Adding a CoW reservation, there should be nothing here. */ - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - ext.rc_startblock >= agbno + aglen, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, + agbno + aglen > ext.rc_startblock)) { + error = -EFSCORRUPTED; + goto out_error; + } tmp.rc_startblock = agbno; tmp.rc_blockcount = aglen; @@ -1427,17 +1491,25 @@ xfs_refcount_adjust_cow_extents( &found_tmp); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - found_tmp == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } break; case XFS_REFCOUNT_ADJUST_COW_FREE: /* Removing a CoW reservation, there should be one extent. */ - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - ext.rc_startblock == agbno, out_error); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - ext.rc_blockcount == aglen, out_error); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - ext.rc_refcount == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) { + error = -EFSCORRUPTED; + goto out_error; + } + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) { + error = -EFSCORRUPTED; + goto out_error; + } + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } ext.rc_refcount = 0; trace_xfs_refcount_modify_extent(cur->bc_mp, @@ -1445,8 +1517,10 @@ xfs_refcount_adjust_cow_extents( error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } break; default: ASSERT(0); @@ -1541,47 +1615,40 @@ __xfs_refcount_cow_free( } /* Record a CoW staging extent in the refcount btree. 
*/ -int +void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len) { struct xfs_mount *mp = tp->t_mountp; - int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return 0; + return; - error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); - if (error) - return error; + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); /* Add rmap entry */ - return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), + xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ -int +void xfs_refcount_free_cow_extent( struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len) { struct xfs_mount *mp = tp->t_mountp; - int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return 0; + return; /* Remove rmap entry */ - error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), + xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); - if (error) - return error; - - return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); } struct xfs_refcount_recovery { @@ -1592,17 +1659,18 @@ struct xfs_refcount_recovery { /* Stuff an extent on the recovery list. */ STATIC int xfs_refcount_recover_extent( - struct xfs_btree_cur *cur, + struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv) { struct list_head *debris = priv; struct xfs_refcount_recovery *rr; - if (be32_to_cpu(rec->refc.rc_refcount) != 1) + if (XFS_IS_CORRUPT(cur->bc_mp, + be32_to_cpu(rec->refc.rc_refcount) != 1)) return -EFSCORRUPTED; - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); list_add_tail(&rr->rr_list, debris); @@ -1648,10 +1716,6 @@ xfs_refcount_recover_cow_leftovers( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) goto out_trans; - if (!agbp) { - error = -ENOMEM; - goto out_trans; - } cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); /* Find all the leftover CoW staging extents. */ @@ -1679,10 +1743,8 @@ xfs_refcount_recover_cow_leftovers( /* Free the orphan record */ agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; fsb = XFS_AGB_TO_FSB(mp, agno, agbno); - error = xfs_refcount_free_cow_extent(tp, fsb, + xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); - if (error) - goto out_trans; /* Free the block. 
*/ xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 1d9c518575e7..209795539c8d 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -29,9 +29,9 @@ struct xfs_refcount_intent { xfs_extlen_t ri_blockcount; }; -extern int xfs_refcount_increase_extent(struct xfs_trans *tp, +void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); -extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, +void xfs_refcount_decrease_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, @@ -45,10 +45,10 @@ extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared); -extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, - xfs_fsblock_t fsb, xfs_extlen_t len); -extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, - xfs_fsblock_t fsb, xfs_extlen_t len); +void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, + xfs_extlen_t len); +void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, + xfs_extlen_t len); extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, xfs_agnumber_t agno); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index e6aeb390b2fb..ff9412f113c4 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -113,7 +113,10 @@ xfs_rmap_insert( error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 0, done); + if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } rcur->bc_rec.r.rm_startblock = agbno; rcur->bc_rec.r.rm_blockcount = len; @@ -123,7 +126,10 @@ xfs_rmap_insert( error = xfs_btree_insert(rcur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } done: if (error) trace_xfs_rmap_insert_error(rcur->bc_mp, @@ -149,12 +155,18 @@ xfs_rmap_delete( error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_delete(rcur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } done: if (error) trace_xfs_rmap_delete_error(rcur->bc_mp, @@ -168,7 +180,6 @@ xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, struct xfs_rmap_irec *irec) { - irec->rm_flags = 0; irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); @@ -254,15 +265,15 @@ xfs_rmap_find_left_neighbor_helper( rec->rm_flags); if (rec->rm_owner != info->high.rm_owner) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; *info->irec = *rec; *info->stat = 1; - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -305,7 +316,7 @@ xfs_rmap_find_left_neighbor( error = 
xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, @@ -330,16 +341,16 @@ xfs_rmap_lookup_le_range_helper( rec->rm_flags); if (rec->rm_owner != info->high.rm_owner) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && (rec->rm_offset > info->high.rm_offset || rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; *info->irec = *rec; *info->stat = 1; - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -377,7 +388,7 @@ xfs_rmap_lookup_le_range( cur->bc_private.a.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, @@ -407,24 +418,39 @@ xfs_rmap_free_check_owner( return 0; /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (rec->rm_flags & XFS_RMAP_UNWRITTEN), out); + if (XFS_IS_CORRUPT(mp, + (flags & XFS_RMAP_UNWRITTEN) != + (rec->rm_flags & XFS_RMAP_UNWRITTEN))) { + error = -EFSCORRUPTED; + goto out; + } /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out); + if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) { + error = -EFSCORRUPTED; + goto out; + } /* Check the offset, if necessary. */ if (XFS_RMAP_NON_INODE_OWNER(owner)) goto out; if (flags & XFS_RMAP_BMBT_BLOCK) { - XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK, - out); + if (XFS_IS_CORRUPT(mp, + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) { + error = -EFSCORRUPTED; + goto out; + } } else { - XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out); - XFS_WANT_CORRUPTED_GOTO(mp, - ltoff + rec->rm_blockcount >= offset + len, - out); + if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) { + error = -EFSCORRUPTED; + goto out; + } + if (XFS_IS_CORRUPT(mp, + offset + len > ltoff + rec->rm_blockcount)) { + error = -EFSCORRUPTED; + goto out; + } } out: @@ -483,12 +509,18 @@ xfs_rmap_unmap( error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_rmap_get_rec(cur, &ltrec, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_private.a.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -503,8 +535,12 @@ xfs_rmap_unmap( * be the case that the "left" extent goes all the way to EOFS.
*/ if (owner == XFS_RMAP_OWN_NULL) { - XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock + - ltrec.rm_blockcount, out_error); + if (XFS_IS_CORRUPT(mp, + bno < + ltrec.rm_startblock + ltrec.rm_blockcount)) { + error = -EFSCORRUPTED; + goto out_error; + } goto out_done; } @@ -527,15 +563,22 @@ xfs_rmap_unmap( error = xfs_rmap_get_rec(cur, &rtrec, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (rtrec.rm_startblock >= bno + len) goto out_done; } /* Make sure the extent we found covers the entire freeing range. */ - XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); + if (XFS_IS_CORRUPT(mp, + ltrec.rm_startblock > bno || + ltrec.rm_startblock + ltrec.rm_blockcount < + bno + len)) { + error = -EFSCORRUPTED; + goto out_error; + } /* Check owner information. */ error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner, @@ -552,7 +595,10 @@ xfs_rmap_unmap( error = xfs_btree_delete(cur, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } else if (ltrec.rm_startblock == bno) { /* * overlap left hand side of extent: move the start, trim the @@ -744,7 +790,10 @@ xfs_rmap_map( error = xfs_rmap_get_rec(cur, &ltrec, &have_lt); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error); + if (XFS_IS_CORRUPT(mp, have_lt != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_private.a.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -754,9 +803,12 @@ xfs_rmap_map( have_lt = 0; } - XFS_WANT_CORRUPTED_GOTO(mp, - have_lt == 0 || - ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error); + if (XFS_IS_CORRUPT(mp, + have_lt != 0 && + ltrec.rm_startblock + ltrec.rm_blockcount > bno)) { + error = -EFSCORRUPTED; + goto out_error; + } /* * Increment the cursor to see if we have a right-adjacent record to our @@ -770,9 +822,14 @@ xfs_rmap_map( error = xfs_rmap_get_rec(cur, &gtrec, &have_gt); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock, - out_error); + if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } + if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, cur->bc_private.a.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -822,7 +879,10 @@ xfs_rmap_map( error = xfs_btree_delete(cur, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* point the cursor back to the left record and update */ @@ -866,7 +926,10 @@ xfs_rmap_map( error = xfs_btree_insert(cur, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, @@ -958,12 +1021,18 @@ xfs_rmap_convert( error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + }
error = xfs_rmap_get_rec(cur, &PREV, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_private.a.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, @@ -996,10 +1065,16 @@ xfs_rmap_convert( error = xfs_rmap_get_rec(cur, &LEFT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - XFS_WANT_CORRUPTED_GOTO(mp, - LEFT.rm_startblock + LEFT.rm_blockcount <= bno, - done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + if (XFS_IS_CORRUPT(mp, + LEFT.rm_startblock + LEFT.rm_blockcount > + bno)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, cur->bc_private.a.agno, LEFT.rm_startblock, LEFT.rm_blockcount, LEFT.rm_owner, @@ -1018,7 +1093,10 @@ xfs_rmap_convert( error = xfs_btree_increment(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_increment(cur, 0, &i); if (error) goto done; @@ -1027,9 +1105,14 @@ xfs_rmap_convert( error = xfs_rmap_get_rec(cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, - done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, cur->bc_private.a.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, @@ -1056,7 +1139,10 @@ xfs_rmap_convert( error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* * Switch out based on the FILLING and CONTIG state bits. 
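The "switch out" below keys off a bitmask describing how the converted range relates to its neighbors: FILLING bits say the range reaches the left/right edge of the record being converted, CONTIG bits say a mergeable neighbor abuts it. A rough sketch of how such a case mask drives the dispatch (the EX_* names are illustrative stand-ins for the RMAP_LEFT/RIGHT_FILLING and RMAP_LEFT/RIGHT_CONTIG flags used in the cases that follow):

/* Illustrative only: mirrors the shape of the state-bit switch. */
#define EX_LEFT_FILLING		(1 << 0)
#define EX_RIGHT_FILLING	(1 << 1)
#define EX_LEFT_CONTIG		(1 << 2)
#define EX_RIGHT_CONTIG		(1 << 3)

static unsigned int
example_convert_state(
	bool	fills_left,	/* range starts at the record's start */
	bool	fills_right,	/* range ends at the record's end */
	bool	merge_left,	/* left neighbor exists and is mergeable */
	bool	merge_right)	/* right neighbor exists and is mergeable */
{
	unsigned int	state = 0;

	if (fills_left)
		state |= EX_LEFT_FILLING;
	if (fills_right)
		state |= EX_RIGHT_FILLING;
	if (merge_left)
		state |= EX_LEFT_CONTIG;
	if (merge_right)
		state |= EX_RIGHT_CONTIG;

	/*
	 * All four bits set merges three records into one; state == 0
	 * splits out a new middle extent; the other cases fall between.
	 */
	return state;
}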
@@ -1072,7 +1158,10 @@ xfs_rmap_convert( error = xfs_btree_increment(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, @@ -1080,11 +1169,17 @@ xfs_rmap_convert( error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, @@ -1092,11 +1187,17 @@ xfs_rmap_convert( error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW = LEFT; NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; error = xfs_rmap_update(cur, &NEW); @@ -1116,11 +1217,17 @@ xfs_rmap_convert( error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW = LEFT; NEW.rm_blockcount += PREV.rm_blockcount; error = xfs_rmap_update(cur, &NEW); @@ -1136,7 +1243,10 @@ xfs_rmap_convert( error = xfs_btree_increment(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, @@ -1144,11 +1254,17 @@ xfs_rmap_convert( error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW = PREV; NEW.rm_blockcount = len + RIGHT.rm_blockcount; NEW.rm_flags = newext; @@ -1215,7 +1331,10 @@ xfs_rmap_convert( error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } break; case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: @@ -1254,7 +1373,10 @@ xfs_rmap_convert( oldext, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_startblock = bno; NEW.rm_owner = owner; NEW.rm_offset = offset; @@ -1266,7 +1388,10 @@ xfs_rmap_convert( error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } break; case 0: @@ -1296,7 +1421,10 @@ 
xfs_rmap_convert( error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after @@ -1306,7 +1434,10 @@ xfs_rmap_convert( oldext, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; @@ -1315,7 +1446,10 @@ xfs_rmap_convert( error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } break; case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: @@ -1384,7 +1518,10 @@ xfs_rmap_convert_shared( &PREV, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } ASSERT(PREV.rm_offset <= offset); ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); @@ -1407,9 +1544,12 @@ xfs_rmap_convert_shared( goto done; if (i) { state |= RMAP_LEFT_VALID; - XFS_WANT_CORRUPTED_GOTO(mp, - LEFT.rm_startblock + LEFT.rm_blockcount <= bno, - done); + if (XFS_IS_CORRUPT(mp, + LEFT.rm_startblock + LEFT.rm_blockcount > + bno)) { + error = -EFSCORRUPTED; + goto done; + } if (xfs_rmap_is_mergeable(&LEFT, owner, newext)) state |= RMAP_LEFT_CONTIG; } @@ -1424,9 +1564,14 @@ xfs_rmap_convert_shared( error = xfs_rmap_get_rec(cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, - done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, cur->bc_private.a.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, @@ -1473,7 +1618,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1496,7 +1644,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount += PREV.rm_blockcount; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1519,7 +1670,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount += RIGHT.rm_blockcount; NEW.rm_flags = RIGHT.rm_flags; error = xfs_rmap_update(cur, &NEW); @@ -1539,7 +1693,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_flags = newext; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1571,7 +1728,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - 
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount += len; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1613,7 +1773,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount = offset - NEW.rm_offset; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1645,7 +1808,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount -= len; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1680,7 +1846,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount = offset - NEW.rm_offset; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1766,25 +1935,44 @@ xfs_rmap_unmap_shared( &ltrec, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } ltoff = ltrec.rm_offset; /* Make sure the extent we found covers the entire freeing range. */ - XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); + if (XFS_IS_CORRUPT(mp, + ltrec.rm_startblock > bno || + ltrec.rm_startblock + ltrec.rm_blockcount < + bno + len)) { + error = -EFSCORRUPTED; + goto out_error; + } /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error); + if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) { + error = -EFSCORRUPTED; + goto out_error; + } /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + if (XFS_IS_CORRUPT(mp, + (flags & XFS_RMAP_UNWRITTEN) != + (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) { + error = -EFSCORRUPTED; + goto out_error; + } /* Check the offset. */ - XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount, - out_error); + if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) { + error = -EFSCORRUPTED; + goto out_error; + } + if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) { + error = -EFSCORRUPTED; + goto out_error; + } if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* Exact match, simply remove the record from rmap tree.
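As in the non-shared path, xfs_rmap_unmap_shared() then branches on how [bno, bno + len) overlaps the record it found: exact match (delete), left edge (move the start), right edge (trim the length, the hunk that follows), or interior (trim and insert a new record for the tail). A compact sketch of that classification (the enum and helper are illustrative; the comparisons mirror the checks in the surrounding code):

/* Illustrative only. */
enum example_unmap_case {
	EX_EXACT,	/* remove the record outright */
	EX_LEFT_EDGE,	/* bump rm_startblock, shrink rm_blockcount */
	EX_RIGHT_EDGE,	/* shrink rm_blockcount only */
	EX_MIDDLE,	/* shrink, then insert a record for the tail */
};

static enum example_unmap_case
example_classify_unmap(
	xfs_agblock_t	rec_start,
	xfs_extlen_t	rec_len,
	xfs_agblock_t	bno,
	xfs_extlen_t	len)
{
	if (rec_start == bno && rec_len == len)
		return EX_EXACT;
	if (rec_start == bno)
		return EX_LEFT_EDGE;
	if (rec_start + rec_len == bno + len)
		return EX_RIGHT_EDGE;
	return EX_MIDDLE;
}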
*/ @@ -1837,7 +2025,10 @@ xfs_rmap_unmap_shared( ltrec.rm_offset, ltrec.rm_flags, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } ltrec.rm_blockcount -= len; error = xfs_rmap_update(cur, &ltrec); if (error) goto out_error; @@ -1863,7 +2054,10 @@ xfs_rmap_unmap_shared( ltrec.rm_offset, ltrec.rm_flags, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } ltrec.rm_blockcount = bno - ltrec.rm_startblock; error = xfs_rmap_update(cur, &ltrec); if (error) @@ -1939,7 +2133,10 @@ xfs_rmap_map_shared( error = xfs_rmap_get_rec(cur, &gtrec, &have_gt); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); + if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, cur->bc_private.a.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -1988,7 +2185,10 @@ xfs_rmap_map_shared( ltrec.rm_offset, ltrec.rm_flags, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_rmap_update(cur, &ltrec); if (error) @@ -2200,7 +2400,7 @@ xfs_rmap_finish_one( error = xfs_free_extent_fix_freelist(tp, agno, &agbp); if (error) return error; - if (!agbp) + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) return -EFSCORRUPTED; rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); @@ -2268,7 +2468,7 @@ xfs_rmap_update_is_needed( * Record a rmap intent; the list is kept sorted first by AG and then by * increasing age. */ -static int +static void __xfs_rmap_add( struct xfs_trans *tp, enum xfs_rmap_intent_type type, @@ -2287,7 +2487,7 @@ __xfs_rmap_add( bmap->br_blockcount, bmap->br_state); - ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); + ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; @@ -2295,11 +2495,10 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); - return 0; } /* Map an extent into a file. */ -int +void xfs_rmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, @@ -2307,15 +2506,15 @@ xfs_rmap_map_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, whichfork, PREV); } /* Unmap an extent out of a file. */ -int +void xfs_rmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, @@ -2323,9 +2522,9 @@ xfs_rmap_unmap_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, whichfork, PREV); } @@ -2336,7 +2535,7 @@ xfs_rmap_unmap_extent( * Note that tp can be NULL here as no transaction is used for COW fork * unwritten conversion. */ -int +void xfs_rmap_convert_extent( struct xfs_mount *mp, struct xfs_trans *tp, @@ -2345,15 +2544,15 @@ xfs_rmap_convert_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(mp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+ __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ -int +void xfs_rmap_alloc_extent( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -2364,18 +2563,18 @@ xfs_rmap_alloc_extent( struct xfs_bmbt_irec bmap; if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) - return 0; + return; bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); } /* Schedule the deletion of an rmap for non-file data. */ -int +void xfs_rmap_free_extent( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -2386,14 +2585,14 @@ xfs_rmap_free_extent( struct xfs_bmbt_irec bmap; if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) - return 0; + return; bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); } /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ @@ -2511,7 +2710,7 @@ xfs_rmap_has_other_keys_helper( ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) return 0; rks->has_rmap = true; - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -2540,8 +2739,11 @@ xfs_rmap_has_other_keys( error = xfs_rmap_query_range(cur, &low, &high, xfs_rmap_has_other_keys_helper, &rks); + if (error < 0) + return error; + *has_rmap = rks.has_rmap; - return error; + return 0; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index e21ed0294e5c..abe633403fd1 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -68,6 +68,7 @@ xfs_rmap_irec_offset_unpack( if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) return -EFSCORRUPTED; irec->rm_offset = XFS_RMAP_OFF(offset); + irec->rm_flags = 0; if (offset & XFS_RMAP_OFF_ATTR_FORK) irec->rm_flags |= XFS_RMAP_ATTR_FORK; if (offset & XFS_RMAP_OFF_BMBT_BLOCK) @@ -161,16 +162,16 @@ struct xfs_rmap_intent { }; /* functions for updating the rmapbt based on bmbt map/unmap operations */ -int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, +void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); -int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, +void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 
8ea1efc97b41..f42c74cb8be5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -15,7 +15,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_rtalloc.h" - +#include "xfs_error.h" /* * Realtime allocator bitmap functions shared with userspace. @@ -70,7 +70,7 @@ xfs_rtbuf_get( if (error) return error; - if (nmap == 0 || !xfs_bmap_is_real_extent(&map)) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map))) return -EFSCORRUPTED; ASSERT(map.br_startblock != NULLFSBLOCK); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a08dd8f40346..2f60fc3c99a0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" @@ -928,7 +929,7 @@ xfs_log_sb( xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); - xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* @@ -984,9 +985,9 @@ xfs_update_secondary_sbs( for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { struct xfs_buf *bp; - bp = xfs_buf_get(mp->m_ddev_targp, + error = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), - XFS_FSS_TO_BB(mp, 1)); + XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, * continue. xfs_repair chooses the "best" superblock based @@ -994,12 +995,12 @@ xfs_update_secondary_sbs( * superblocks un-updated than updated, and xfs_repair may * pick them over the properly-updated primary. */ - if (!bp) { + if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", agno); if (!saved_error) - saved_error = -ENOMEM; + saved_error = error; continue; } @@ -1184,13 +1185,14 @@ xfs_sb_get_secondary( struct xfs_buf **bpp) { struct xfs_buf *bp; + int error; ASSERT(agno != 0 && agno != NULLAGNUMBER); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (!bp) - return -ENOMEM; + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) + return error; bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); *bpp = bp; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index e0641b7337b3..c45acbd3add9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -177,10 +177,4 @@ struct xfs_ino_geometry { unsigned int agino_log; /* #bits for agino in inum */ }; -/* Keep iterating the data structure. */ -#define XFS_ITER_CONTINUE (0) - -/* Stop iterating the data structure. 
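With XFS_ITER_ABORT and XFS_BTREE_QUERY_RANGE_ABORT both gone, every iterator in the tree now follows one convention: a callback returns 0 to keep walking and -ECANCELED to stop early, and a caller that requested the early stop squashes -ECANCELED back to 0. A sketch of the convention (the walker, callback, and rec_matches() predicate are hypothetical; the -ECANCELED handling is exactly what the hunks above add):

/* Illustrative only. */
static int
example_visit_rec(
	void	*rec,
	void	*priv)
{
	bool	*found = priv;

	if (rec_matches(rec)) {		/* hypothetical predicate */
		*found = true;
		return -ECANCELED;	/* stop the walk; not a failure */
	}
	return 0;			/* keep iterating */
}

static int
example_find_match(
	bool	*found)
{
	int	error;

	*found = false;
	error = example_walk_recs(example_visit_rec, found); /* hypothetical */
	if (error == -ECANCELED)
		error = 0;		/* expected early termination */
	return error;
}

Using a real errno means an aborted walk propagates cleanly through layers that only understand "negative means stop", without each layer knowing about a private magic value.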
*/ -#define XFS_ITER_ABORT (1) - #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index a9ad90926b87..2b8ccb5b975d 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -55,7 +55,7 @@ xfs_trans_ichgtime( int flags) { struct inode *inode = VFS_I(ip); - struct timespec64 tv; + struct timespec64 tv; ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -66,10 +66,8 @@ xfs_trans_ichgtime( inode->i_mtime = tv; if (flags & XFS_ICHGTIME_CHG) inode->i_ctime = tv; - if (flags & XFS_ICHGTIME_CREATE) { - ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; - ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; - } + if (flags & XFS_ICHGTIME_CREATE) + ip->i_d.di_crtime = tv; } /* diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index d12bbd526e7c..7a9c04920505 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -197,6 +197,24 @@ xfs_calc_inode_chunk_res( } /* + * Per-extent log reservation for the btree changes involved in freeing or + * allocating a realtime extent. We have to be able to log as many rtbitmap + * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, + * as well as the realtime summary block. + */ +static unsigned int +xfs_rtalloc_log_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int rtbmp_bytes; + + rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; +} + +/* * Various log reservation values. * * These are based on the size of the file system block because that is what @@ -218,13 +236,21 @@ xfs_calc_inode_chunk_res( /* * In a write transaction we can allocate a maximum of 2 - * extents. This gives: + * extents. 
This gives (t1): * the inode getting the new extents: inode size * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: + * Or, if we're writing to a realtime file (t2): + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 1 block + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join (t3): * the agfs of the ags containing the blocks: 2 * sector size * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size @@ -234,40 +260,72 @@ STATIC uint xfs_calc_write_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), - XFS_FSB_TO_B(mp, 1)) + + blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + } else { + t2 = 0; + } + + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* - * In truncating a file we free up to two extents at once. We can modify: + * In truncating a file we free up to two extents at once. 
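A rough worked example of the new realtime term, under assumed geometry (4096-byte blocks, sb_rextsize of 1, NBBY of 8, and the usual 21-bit MAXEXTLEN of 2097151 blocks; none of these numbers appear in the patch itself):

/*
 * rtbmp_bytes = (2097151 / 1) / 8 = 262143 bytes of rtbitmap
 * howmany(262143, 4096)           = 64 rtbitmap blocks
 * + 1 rtsummary block             = 65 buffers per operation
 *
 * so xfs_rtalloc_log_count(mp, 1) contributes roughly 65 buffers'
 * worth of xfs_calc_buf_res() reservation to t2, on top of the inode,
 * bmbt, sector, and allocbt terms, and max3(t1, t2, t3) then picks
 * whichever of the data, realtime, or bmap-finish legs is largest.
 */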
We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: + * And the bmap_finish transaction can free the blocks and bmap blocks (t2): * the agf for each of the ags: 4 * sector size * the agfl for each of the ags: 4 * sector size * the super block to reflect the freed blocks: sector size * worst case split in allocation btrees per extent assuming 4 extents: * 4 exts * 2 trees * (2 * max depth - 1) * block size + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size */ STATIC uint xfs_calc_itruncate_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), - XFS_FSB_TO_B(mp, 1)))); + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); + + t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + } else { + t3 = 0; + } + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* @@ -718,7 +776,7 @@ xfs_calc_clear_agi_bucket_reservation( /* * Adjusting quota limits. - * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + * the disk quota buffer: sizeof(struct xfs_disk_dquot) */ STATIC uint xfs_calc_qm_setqlim_reservation(void) @@ -742,7 +800,7 @@ xfs_calc_qm_dqalloc_reservation( /* * Turning off quotas. - * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 * the superblock for the quota flags: sector size */ STATIC uint @@ -755,7 +813,7 @@ xfs_calc_qm_quotaoff_reservation( /* * End of turning off quotas. 
- * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 */ STATIC uint xfs_calc_qm_quotaoff_end_reservation(void) diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 802b34cd10fe..397d94775440 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -21,7 +21,6 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ -typedef int32_t xfs_tid_t; /* transaction identifier */ typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ @@ -33,7 +32,6 @@ typedef uint64_t xfs_fileoff_t; /* block number in a file */ typedef uint64_t xfs_filblks_t; /* number of blocks in a file */ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ -typedef int64_t xfs_sfiloff_t; /* signed block number in a file */ /* * New verifiers will return the instruction address of the failing check. @@ -169,6 +167,14 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +/* per-AG block reservation types */ +enum xfs_ag_resv_type { + XFS_AG_RESV_NONE = 0, + XFS_AG_RESV_AGFL, + XFS_AG_RESV_METADATA, + XFS_AG_RESV_RMAPBT, +}; + /* * Type verifier functions */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 16b09b941441..ba0f747c82e8 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -639,7 +639,7 @@ xchk_agfl_block( xchk_agfl_block_xref(sc, agbno); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return XFS_ITER_ABORT; + return -ECANCELED; return 0; } @@ -730,7 +730,7 @@ xchk_agfl( /* Check the blocks in the AGFL. */ error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), sc->sa.agfl_bp, xchk_agfl_block, &sai); - if (error == XFS_ITER_ABORT) { + if (error == -ECANCELED) { error = 0; goto out_free; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 7a1a38b636a9..d5e6db9af434 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -659,8 +659,6 @@ xrep_agfl( error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; /* * Make sure we have the AGFL buffer, as scrub might have decided it @@ -735,8 +733,6 @@ xrep_agi_find_btrees( error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; /* Find the btree roots. */ error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL); diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index a43d1813c4ff..5533e48e605d 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -97,7 +97,6 @@ xchk_allocbt_rec( xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t bno; xfs_extlen_t len; - int error = 0; bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); @@ -109,7 +108,7 @@ xchk_allocbt_rec( xchk_allocbt_xref(bs->sc, bno, len); - return error; + return 0; } /* Scrub the freespace btrees for some AG. */ diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 1afc58bf71dd..d9f0dd444b80 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -80,7 +80,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. 
*/ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0); if (error) return error; } @@ -163,8 +163,6 @@ xchk_xattr_listent( args.valuelen = valuelen; error = xfs_attr_get_ilocked(context->dp, &args); - if (error == -EEXIST) - error = 0; if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, &error)) goto fail_xref; @@ -173,7 +171,7 @@ xchk_xattr_listent( args.blkno); fail_xref: if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - context->seen_enough = XFS_ITER_ABORT; + context->seen_enough = 1; return; } @@ -400,15 +398,14 @@ out: STATIC int xchk_xattr_rec( struct xchk_da_btree *ds, - int level, - void *rec) + int level) { struct xfs_mount *mp = ds->state->mp; - struct xfs_attr_leaf_entry *ent = rec; - struct xfs_da_state_blk *blk; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; struct xfs_attr_leaf_name_local *lentry; struct xfs_attr_leaf_name_remote *rentry; struct xfs_buf *bp; + struct xfs_attr_leaf_entry *ent; xfs_dahash_t calc_hash; xfs_dahash_t hash; int nameidx; @@ -416,7 +413,9 @@ xchk_xattr_rec( unsigned int badflags; int error; - blk = &ds->state->path.blk[level]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + ent = xfs_attr3_leaf_entryp(blk->bp->b_addr) + blk->index; /* Check the whole block, if necessary. */ error = xchk_xattr_block(ds, level); diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 3d47d111be5a..18a684e18a69 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -294,5 +294,6 @@ xfs_bitmap_set_btblocks( struct xfs_bitmap *bitmap, struct xfs_btree_cur *cur) { - return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock, bitmap); + return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock, + XFS_BTREE_VISIT_ALL, bitmap); } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 1bd29fdc2ab5..fa6ea6407992 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -75,6 +75,7 @@ struct xchk_bmap_info { xfs_fileoff_t lastoff; bool is_rt; bool is_shared; + bool was_loaded; int whichfork; }; @@ -213,25 +214,20 @@ xchk_bmap_xref_rmap( /* Cross-reference a single rtdev extent record. */ STATIC void -xchk_bmap_rt_extent_xref( - struct xchk_bmap_info *info, +xchk_bmap_rt_iextent_xref( struct xfs_inode *ip, - struct xfs_btree_cur *cur, + struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return; - xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, irec->br_blockcount); } /* Cross-reference a single datadev extent record. */ STATIC void -xchk_bmap_extent_xref( - struct xchk_bmap_info *info, +xchk_bmap_iextent_xref( struct xfs_inode *ip, - struct xfs_btree_cur *cur, + struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; @@ -240,9 +236,6 @@ xchk_bmap_extent_xref( xfs_extlen_t len; int error; - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return; - agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); len = irec->br_blockcount; @@ -300,20 +293,15 @@ xchk_bmap_dirattr_extent( /* Scrub a single extent record. 
*/ STATIC int -xchk_bmap_extent( +xchk_bmap_iextent( struct xfs_inode *ip, - struct xfs_btree_cur *cur, struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; - struct xfs_buf *bp = NULL; xfs_filblks_t end; int error = 0; - if (cur) - xfs_btree_get_block(cur, 0, &bp); - /* * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. @@ -364,10 +352,13 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + if (info->is_rt) - xchk_bmap_rt_extent_xref(info, ip, cur, irec); + xchk_bmap_rt_iextent_xref(ip, info, irec); else - xchk_bmap_extent_xref(info, ip, cur, irec); + xchk_bmap_iextent_xref(ip, info, irec); info->lastoff = irec->br_startoff + irec->br_blockcount; return error; @@ -380,10 +371,13 @@ xchk_bmapbt_rec( union xfs_btree_rec *rec) { struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec iext_irec; + struct xfs_iext_cursor icur; struct xchk_bmap_info *info = bs->private; struct xfs_inode *ip = bs->cur->bc_private.b.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); uint64_t owner; int i; @@ -402,9 +396,26 @@ xchk_bmapbt_rec( } } - /* Set up the in-core record and scrub it. */ + /* + * Check that the incore extent tree contains an extent that matches + * this one exactly. We validate those cached bmaps later, so we don't + * need to check them here. If the incore extent tree was just loaded + * from disk by the scrubber, we assume that its contents match what's + * on disk (we still hold the ILOCK) and skip the equivalence check. + */ + if (!info->was_loaded) + return 0; + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); - return xchk_bmap_extent(ip, bs->cur, info, &irec); + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, + &iext_irec) || + irec.br_startoff != iext_irec.br_startoff || + irec.br_startblock != iext_irec.br_startblock || + irec.br_blockcount != iext_irec.br_blockcount || + irec.br_state != iext_irec.br_state) + xchk_fblock_set_corrupt(bs->sc, info->whichfork, + irec.br_startoff); + return 0; } /* Scan the btree records. */ @@ -415,15 +426,26 @@ xchk_bmap_btree( struct xchk_bmap_info *info) { struct xfs_owner_info oinfo; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; struct xfs_btree_cur *cur; int error; + /* Load the incore bmap cache if it's not loaded. */ + info->was_loaded = ifp->if_flags & XFS_IFEXTENTS; + if (!info->was_loaded) { + error = xfs_iread_extents(sc->tp, ip, whichfork); + if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) + goto out; + } + + /* Check the btree structure. 
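xchk_bmapbt_rec() above compares each on-disk bmbt record against the incore extent cache only when that cache was populated before the scrub began; if the btree walk itself just loaded it, the two copies are trivially identical and the check is skipped. The comparison reduces to a lookup plus field-by-field equality, roughly (helper name hypothetical; the lookup call is the one used above):

/* Illustrative only. */
static bool
example_iext_matches(
	struct xfs_inode	*ip,
	struct xfs_ifork	*ifp,
	struct xfs_bmbt_irec	*irec)	/* decoded from the disk record */
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	cached;

	if (!xfs_iext_lookup_extent(ip, ifp, irec->br_startoff, &icur,
			&cached))
		return false;
	return cached.br_startoff == irec->br_startoff &&
	       cached.br_startblock == irec->br_startblock &&
	       cached.br_blockcount == irec->br_blockcount &&
	       cached.br_state == irec->br_state;
}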
*/ cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); xfs_btree_del_cursor(cur, error); +out: return error; } @@ -500,7 +522,7 @@ xchk_bmap_check_rmap( out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; return 0; } @@ -529,7 +551,7 @@ xchk_bmap_check_ag_rmaps( sbcri.sc = sc; sbcri.whichfork = whichfork; error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; xfs_btree_del_cursor(cur, error); @@ -671,13 +693,6 @@ xchk_bmap( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; - /* Now try to scrub the in-memory extent list. */ - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(sc->tp, ip, whichfork); - if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) - goto out; - } - /* Find the offset of the last extent in the mapping. */ error = xfs_bmap_last_offset(ip, &endoff, whichfork); if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) @@ -689,7 +704,7 @@ xchk_bmap( for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - break; + goto out; if (isnullstartblock(irec.br_startblock)) continue; if (irec.br_startoff >= endoff) { @@ -697,7 +712,7 @@ xchk_bmap( irec.br_startoff); goto out; } - error = xchk_bmap_extent(ip, NULL, &info, &irec); + error = xchk_bmap_iextent(ip, &info, &irec); if (error) goto out; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 003a772cd26c..2e50d146105d 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -14,8 +14,15 @@ static inline bool xchk_should_terminate( struct xfs_scrub *sc, - int *error) + int *error) { + /* + * If preemption is disabled, we need to yield to the scheduler every + * few seconds so that we don't run afoul of the soft lockup watchdog + * or RCU stall detector. + */ + cond_resched(); + if (fatal_signal_pending(current)) { if (*error == 0) *error = -EAGAIN; diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 77ff9f97bcda..97a15b6f2865 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -77,40 +77,18 @@ xchk_da_set_corrupt( __return_address); } -/* Find an entry at a certain level in a da btree. */ -STATIC void * -xchk_da_btree_entry( - struct xchk_da_btree *ds, - int level, - int rec) +static struct xfs_da_node_entry * +xchk_da_btree_node_entry( + struct xchk_da_btree *ds, + int level) { - char *ents; - struct xfs_da_state_blk *blk; - void *baddr; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; + struct xfs_da3_icnode_hdr hdr; - /* Dispatch the entry finding function. 
*/ - blk = &ds->state->path.blk[level]; - baddr = blk->bp->b_addr; - switch (blk->magic) { - case XFS_ATTR_LEAF_MAGIC: - case XFS_ATTR3_LEAF_MAGIC: - ents = (char *)xfs_attr3_leaf_entryp(baddr); - return ents + (rec * sizeof(struct xfs_attr_leaf_entry)); - case XFS_DIR2_LEAFN_MAGIC: - case XFS_DIR3_LEAFN_MAGIC: - ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr); - return ents + (rec * sizeof(struct xfs_dir2_leaf_entry)); - case XFS_DIR2_LEAF1_MAGIC: - case XFS_DIR3_LEAF1_MAGIC: - ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr); - return ents + (rec * sizeof(struct xfs_dir2_leaf_entry)); - case XFS_DA_NODE_MAGIC: - case XFS_DA3_NODE_MAGIC: - ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr); - return ents + (rec * sizeof(struct xfs_da_node_entry)); - } + ASSERT(blk->magic == XFS_DA_NODE_MAGIC); - return NULL; + xfs_da3_node_hdr_from_disk(ds->sc->mp, &hdr, blk->bp->b_addr); + return hdr.btree + blk->index; } /* Scrub a da btree hash (key). */ @@ -120,7 +98,6 @@ xchk_da_btree_hash( int level, __be32 *hashp) { - struct xfs_da_state_blk *blks; struct xfs_da_node_entry *entry; xfs_dahash_t hash; xfs_dahash_t parent_hash; @@ -135,8 +112,7 @@ xchk_da_btree_hash( return 0; /* Is this hash no larger than the parent hash? */ - blks = ds->state->path.blk; - entry = xchk_da_btree_entry(ds, level - 1, blks[level - 1].index); + entry = xchk_da_btree_node_entry(ds, level - 1); parent_hash = be32_to_cpu(entry->hashval); if (parent_hash < hash) xchk_da_set_corrupt(ds, level); @@ -355,8 +331,8 @@ xchk_da_btree_block( goto out_nobuf; /* Read the buffer. */ - error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2, - &blk->bp, dargs->whichfork, + error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, + XFS_DABUF_MAP_HOLE_OK, &blk->bp, dargs->whichfork, &xchk_da_btree_buf_ops); if (!xchk_da_process_error(ds, level, &error)) goto out_nobuf; @@ -433,8 +409,8 @@ xchk_da_btree_block( XFS_BLFT_DA_NODE_BUF); blk->magic = XFS_DA_NODE_MAGIC; node = blk->bp->b_addr; - ip->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = ip->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(ip->i_mount, &nodehdr, node); + btree = nodehdr.btree; *pmaxrecs = nodehdr.count; blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval); if (level == 0) { @@ -479,14 +455,12 @@ xchk_da_btree( struct xfs_mount *mp = sc->mp; struct xfs_da_state_blk *blks; struct xfs_da_node_entry *key; - void *rec; xfs_dablk_t blkno; int level; int error; /* Skip short format data structures; no btree to scan. */ - if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE) + if (!xfs_ifork_has_extents(sc->ip, whichfork)) return 0; /* Set up initial da state. */ @@ -538,9 +512,7 @@ xchk_da_btree( } /* Dispatch record scrubbing. */ - rec = xchk_da_btree_entry(&ds, level, - blks[level].index); - error = scrub_fn(&ds, level, rec); + error = scrub_fn(&ds, level); if (error) break; if (xchk_should_terminate(sc, &error) || @@ -562,7 +534,7 @@ xchk_da_btree( } /* Hashes in order for scrub? 
*/ - key = xchk_da_btree_entry(&ds, level, blks[level].index); + key = xchk_da_btree_node_entry(&ds, level); error = xchk_da_btree_hash(&ds, level, &key->hashval); if (error) goto out; diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index cb3f0003245b..1f3515c6d5a8 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -28,8 +28,7 @@ struct xchk_da_btree { int tree_level; }; -typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds, - int level, void *rec); +typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds, int level); /* Check for da btree operation errors. */ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 1e2e11721eb9..266da4e4bde6 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -113,6 +113,9 @@ xchk_dir_actor( offset = xfs_dir2_db_to_da(mp->m_dir_geo, xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); + if (xchk_should_terminate(sdc->sc, &error)) + return error; + /* Does this inode number make sense? */ if (!xfs_verify_dir_ino(mp, ino)) { xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); @@ -179,15 +182,17 @@ out: STATIC int xchk_dir_rec( struct xchk_da_btree *ds, - int level, - void *rec) + int level) { + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; struct xfs_mount *mp = ds->state->mp; - struct xfs_dir2_leaf_entry *ent = rec; struct xfs_inode *dp = ds->dargs.dp; + struct xfs_da_geometry *geo = mp->m_dir_geo; struct xfs_dir2_data_entry *dent; struct xfs_buf *bp; - char *p, *endp; + struct xfs_dir2_leaf_entry *ent; + unsigned int end; + unsigned int iter_off; xfs_ino_t ino; xfs_dablk_t rec_bno; xfs_dir2_db_t db; @@ -195,9 +200,16 @@ xchk_dir_rec( xfs_dir2_dataptr_t ptr; xfs_dahash_t calc_hash; xfs_dahash_t hash; + struct xfs_dir3_icleaf_hdr hdr; unsigned int tag; int error; + ASSERT(blk->magic == XFS_DIR2_LEAF1_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC); + + xfs_dir2_leaf_hdr_from_disk(mp, &hdr, blk->bp->b_addr); + ent = hdr.ents + blk->index; + /* Check the hash of the entry. */ error = xchk_da_btree_hash(ds, level, &ent->hashval); if (error) @@ -209,15 +221,16 @@ xchk_dir_rec( return 0; /* Find the directory entry's location. */ - db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr); - off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr); - rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db); + db = xfs_dir2_dataptr_to_db(geo, ptr); + off = xfs_dir2_dataptr_to_off(geo, ptr); + rec_bno = xfs_dir2_db_to_da(geo, db); - if (rec_bno >= mp->m_dir_geo->leafblk) { + if (rec_bno >= geo->leafblk) { xchk_da_set_corrupt(ds, level); goto out; } - error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp); + error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, + XFS_DABUF_MAP_HOLE_OK, &bp); if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, &error)) goto out; @@ -230,38 +243,37 @@ xchk_dir_rec( if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out_relse; - dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); + dent = bp->b_addr + off; /* Make sure we got a real directory entry. 
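The rewritten entry search in the next hunk stops chasing pointers through the block and instead tracks a byte offset from geo->data_entry_offset up to xfs_dir3_data_end_offset(), advancing past free space by its recorded length and past real entries by their computed size. The walk in isolation looks roughly like this (buffer address and bounds are assumed inputs; entsize() stands in for xfs_dir2_data_entsize()):

/* Illustrative only. */
static void
example_walk_data_block(
	char		*addr,	/* verified dir data block */
	unsigned int	start,	/* geo->data_entry_offset */
	unsigned int	end)	/* xfs_dir3_data_end_offset() result */
{
	unsigned int	off = start;

	while (off < end) {
		struct xfs_dir2_data_unused	*dup = (void *)(addr + off);
		struct xfs_dir2_data_entry	*dep = (void *)(addr + off);

		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
			off += be16_to_cpu(dup->length);	/* free span */
			continue;
		}
		/* real entry: visit dep here, then skip over it */
		off += entsize(dep->namelen);
	}
}

Offsets also make the bounds checks cheap: comparing an unsigned offset against end replaces the old pointer arithmetic against endptr.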
*/ - p = (char *)mp->m_dir_inode_ops->data_entry_p(bp->b_addr); - endp = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr); - if (!endp) { + iter_off = geo->data_entry_offset; + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + if (!end) { xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); goto out_relse; } - while (p < endp) { - struct xfs_dir2_data_entry *dep; - struct xfs_dir2_data_unused *dup; + for (;;) { + struct xfs_dir2_data_entry *dep = bp->b_addr + iter_off; + struct xfs_dir2_data_unused *dup = bp->b_addr + iter_off; + + if (iter_off >= end) { + xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } - dup = (struct xfs_dir2_data_unused *)p; if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - p += be16_to_cpu(dup->length); + iter_off += be16_to_cpu(dup->length); continue; } - dep = (struct xfs_dir2_data_entry *)p; if (dep == dent) break; - p += mp->m_dir_inode_ops->data_entsize(dep->namelen); - } - if (p >= endp) { - xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); - goto out_relse; + iter_off += xfs_dir2_data_entsize(mp, dep->namelen); } /* Retrieve the entry, sanity check it, and compare hashes. */ ino = be64_to_cpu(dent->inumber); hash = be32_to_cpu(ent->hashval); - tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent)); + tag = be16_to_cpup(xfs_dir2_data_entry_tag_p(mp, dent)); if (!xfs_verify_dir_ino(mp, ino) || tag != off) xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); if (dent->namelen == 0) { @@ -319,19 +331,15 @@ xchk_directory_data_bestfree( struct xfs_buf *bp; struct xfs_dir2_data_free *bf; struct xfs_mount *mp = sc->mp; - const struct xfs_dir_ops *d_ops; - char *ptr; - char *endptr; u16 tag; unsigned int nr_bestfrees = 0; unsigned int nr_frees = 0; unsigned int smallest_bestfree; int newlen; - int offset; + unsigned int offset; + unsigned int end; int error; - d_ops = sc->ip->d_ops; - if (is_block) { /* dir block format */ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) @@ -339,7 +347,7 @@ xchk_directory_data_bestfree( error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); } else { /* dir data format */ - error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp); + error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp); } if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -351,7 +359,7 @@ xchk_directory_data_bestfree( goto out_buf; /* Do the bestfrees correspond to actual free space? */ - bf = d_ops->data_bestfree_p(bp->b_addr); + bf = xfs_dir2_data_bestfree_p(mp, bp->b_addr); smallest_bestfree = UINT_MAX; for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { offset = be16_to_cpu(dfp->offset); @@ -361,13 +369,13 @@ xchk_directory_data_bestfree( xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } - dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset); + dup = bp->b_addr + offset; tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); /* bestfree doesn't match the entry it points at? */ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) || be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) || - tag != ((char *)dup - (char *)bp->b_addr)) { + tag != offset) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } @@ -383,30 +391,30 @@ xchk_directory_data_bestfree( } /* Make sure the bestfrees are actually the best free spaces. 
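The conversion above from char pointers (p/endp) to block-relative offsets (iter_off/end) is worth a closer look: once everything is an offset, the walk can be bounds-checked with plain integer comparisons. Here is a compilable userspace sketch of the same walk over a toy record stream; the record layout, DEMO_FREE_TAG, and demo_* names are invented for illustration and do not match the real on-disk format.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define DEMO_FREE_TAG   0xffff

struct demo_rec {
        uint16_t freetag;       /* DEMO_FREE_TAG marks free space */
        uint16_t length;        /* total bytes covered by this record */
};

/*
 * Walk a data block by offset; return true iff target_off lands exactly
 * on a used record. memcpy() avoids unaligned loads from the raw buffer.
 */
static bool demo_find_entry(const uint8_t *blk, unsigned int end,
                            unsigned int target_off)
{
        unsigned int off = 0;

        while (off < end) {
                struct demo_rec rec;

                memcpy(&rec, blk + off, sizeof(rec));
                if (rec.length < sizeof(rec))
                        return false;           /* corrupt: would loop */
                if (rec.freetag != DEMO_FREE_TAG && off == target_off)
                        return true;            /* found a live entry */
                off += rec.length;
        }
        return false;   /* walked off the end without a match */
}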
*/ - ptr = (char *)d_ops->data_entry_p(bp->b_addr); - endptr = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr); + offset = mp->m_dir_geo->data_entry_offset; + end = xfs_dir3_data_end_offset(mp->m_dir_geo, bp->b_addr); /* Iterate the entries, stopping when we hit or go past the end. */ - while (ptr < endptr) { - dup = (struct xfs_dir2_data_unused *)ptr; + while (offset < end) { + dup = bp->b_addr + offset; + /* Skip real entries */ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) { - struct xfs_dir2_data_entry *dep; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; - dep = (struct xfs_dir2_data_entry *)ptr; - newlen = d_ops->data_entsize(dep->namelen); + newlen = xfs_dir2_data_entsize(mp, dep->namelen); if (newlen <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } - ptr += newlen; + offset += newlen; continue; } /* Spot check this free entry */ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); - if (tag != ((char *)dup - (char *)bp->b_addr)) { + if (tag != offset) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } @@ -425,13 +433,13 @@ xchk_directory_data_bestfree( xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } - ptr += newlen; - if (ptr <= endptr) + offset += newlen; + if (offset <= end) nr_frees++; } /* We're required to fill all the space. */ - if (ptr != endptr) + if (offset != end) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); /* Did we see at least as many free slots as there are bestfrees? */ @@ -458,7 +466,7 @@ xchk_directory_check_freesp( { struct xfs_dir2_data_free *dfp; - dfp = sc->ip->d_ops->data_bestfree_p(dbp->b_addr); + dfp = xfs_dir2_data_bestfree_p(sc->mp, dbp->b_addr); if (len != be16_to_cpu(dfp->length)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); @@ -475,12 +483,10 @@ xchk_directory_leaf1_bestfree( xfs_dablk_t lblk) { struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; struct xfs_dir2_leaf_tail *ltp; struct xfs_dir2_leaf *leaf; struct xfs_buf *dbp; struct xfs_buf *bp; - const struct xfs_dir_ops *d_ops = sc->ip->d_ops; struct xfs_da_geometry *geo = sc->mp->m_dir_geo; __be16 *bestp; __u16 best; @@ -492,14 +498,13 @@ xchk_directory_leaf1_bestfree( int error; /* Read the free space block. */ - error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp); + error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; xchk_buffer_recheck(sc, bp); leaf = bp->b_addr; - d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(sc->ip->i_mount, &leafhdr, leaf); ltp = xfs_dir2_leaf_tail_p(geo, leaf); bestcount = be32_to_cpu(ltp->bestcount); bestp = xfs_dir2_leaf_bests_p(ltp); @@ -521,24 +526,25 @@ xchk_directory_leaf1_bestfree( } /* Is the leaf count even remotely sane? */ - if (leafhdr.count > d_ops->leaf_max_ents(geo)) { + if (leafhdr.count > geo->leaf_max_ents) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } /* Leaves and bests don't overlap in leaf format. */ - if ((char *)&ents[leafhdr.count] > (char *)bestp) { + if ((char *)&leafhdr.ents[leafhdr.count] > (char *)bestp) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } /* Check hash value order, count stale entries. 
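One detail of the bestfree checks above: each unused record carries a trailing tag that must point back at the record's own offset within the block, and with offset-based iteration that check becomes a direct integer compare (tag != offset). A small sketch of the idea, assuming a toy layout in which the tag occupies the last two bytes of the record and ignoring the on-disk big-endian encoding:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/*
 * Verify the self-referencing tag of a free-space record: the last two
 * bytes of the record must hold the record's own starting offset.
 */
static bool demo_tag_ok(const uint8_t *blk, uint16_t off, uint16_t length)
{
        uint16_t tag;

        if (length < sizeof(tag))
                return false;   /* record too short to carry a tag */
        memcpy(&tag, blk + off + length - sizeof(tag), sizeof(tag));
        return tag == off;      /* back-pointer must match our offset */
}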
*/ for (i = 0; i < leafhdr.count; i++) { - hash = be32_to_cpu(ents[i].hashval); + hash = be32_to_cpu(leafhdr.ents[i].hashval); if (i > 0 && lasthash > hash) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); lasthash = hash; - if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (leafhdr.ents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } if (leafhdr.stale != stale) @@ -552,7 +558,7 @@ xchk_directory_leaf1_bestfree( if (best == NULLDATAOFF) continue; error = xfs_dir3_data_read(sc->tp, sc->ip, - i * args->geo->fsbcount, -1, &dbp); + i * args->geo->fsbcount, 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) break; @@ -575,7 +581,6 @@ xchk_directory_free_bestfree( struct xfs_dir3_icfree_hdr freehdr; struct xfs_buf *dbp; struct xfs_buf *bp; - __be16 *bestp; __u16 best; unsigned int stale = 0; int i; @@ -595,17 +600,16 @@ xchk_directory_free_bestfree( } /* Check all the entries. */ - sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr); - bestp = sc->ip->d_ops->free_bests_p(bp->b_addr); - for (i = 0; i < freehdr.nvalid; i++, bestp++) { - best = be16_to_cpu(*bestp); + xfs_dir2_free_hdr_from_disk(sc->ip->i_mount, &freehdr, bp->b_addr); + for (i = 0; i < freehdr.nvalid; i++) { + best = be16_to_cpu(freehdr.bests[i]); if (best == NULLDATAOFF) { stale++; continue; } error = xfs_dir3_data_read(sc->tp, sc->ip, (freehdr.firstdb + i) * args->geo->fsbcount, - -1, &dbp); + 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) break; diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index fc3f510c9034..ec2064ed3c30 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -83,9 +83,6 @@ xchk_fscount_warmup( error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); if (error) break; - error = -ENOMEM; - if (!agf_bp || !agi_bp) - break; /* * These are supposed to be initialized by the header read @@ -104,7 +101,7 @@ next_loop_perag: pag = NULL; error = 0; - if (fatal_signal_pending(current)) + if (xchk_should_terminate(sc, &error)) break; } @@ -125,7 +122,7 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); + sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); if (!sc->buf) return -ENOMEM; fsc = sc->buf; @@ -163,6 +160,7 @@ xchk_fscount_aggregate_agcounts( uint64_t delayed; xfs_agnumber_t agno; int tries = 8; + int error = 0; retry: fsc->icount = 0; @@ -196,10 +194,13 @@ retry: xfs_perag_put(pag); - if (fatal_signal_pending(current)) + if (xchk_should_terminate(sc, &error)) break; } + if (error) + return error; + /* * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index b2f602811e9d..83d27cdf579b 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -11,6 +11,7 @@ #include "xfs_sb.h" #include "xfs_health.h" #include "scrub/scrub.h" +#include "scrub/health.h" /* * Scrub and In-Core Filesystem Health Assessments diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index c962bd534690..5705adc43a75 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -32,8 +32,10 @@ xchk_setup_parent( struct xchk_parent_ctx { struct dir_context dc; + struct xfs_scrub *sc; xfs_ino_t ino; xfs_nlink_t nlink; + bool cancelled; }; /* Look for a single entry in a directory pointing to an inode. 
*/ @@ -47,11 +49,21 @@ xchk_parent_actor( unsigned type) { struct xchk_parent_ctx *spc; + int error = 0; spc = container_of(dc, struct xchk_parent_ctx, dc); if (spc->ino == ino) spc->nlink++; - return 0; + + /* + * If we're facing a fatal signal, bail out. Store the cancellation + * status separately because the VFS readdir code squashes error codes + * into short directory reads. + */ + if (xchk_should_terminate(spc->sc, &error)) + spc->cancelled = true; + + return error; } /* Count the number of dentries in the parent dir that point to this inode. */ @@ -62,10 +74,9 @@ xchk_parent_count_parent_dentries( xfs_nlink_t *nlink) { struct xchk_parent_ctx spc = { - .dc.actor = xchk_parent_actor, - .dc.pos = 0, - .ino = sc->ip->i_ino, - .nlink = 0, + .dc.actor = xchk_parent_actor, + .ino = sc->ip->i_ino, + .sc = sc, }; size_t bufsize; loff_t oldpos; @@ -80,7 +91,7 @@ xchk_parent_count_parent_dentries( */ lock_mode = xfs_ilock_data_map_shared(parent); if (parent->i_d.di_nextents > 0) - error = xfs_dir3_data_readahead(parent, 0, -1); + error = xfs_dir3_data_readahead(parent, 0, 0); xfs_iunlock(parent, lock_mode); if (error) return error; @@ -97,6 +108,10 @@ xchk_parent_count_parent_dentries( error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize); if (error) goto out; + if (spc.cancelled) { + error = -EAGAIN; + goto out; + } if (oldpos == spc.dc.pos) break; oldpos = spc.dc.pos; diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 0a33b4421c32..905a34558361 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -93,6 +93,10 @@ xchk_quota_item( unsigned long long rcount; xfs_ino_t fs_icount; xfs_dqid_t id = be32_to_cpu(d->d_id); + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; /* * Except for the root dquot, the actual dquot we got must either have @@ -178,6 +182,9 @@ xchk_quota_item( if (id != 0 && rhard != 0 && rcount > rhard) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -EFSCORRUPTED; + return 0; } diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 93b3793bc5b3..0cab11a5d390 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -341,7 +341,6 @@ xchk_refcountbt_rec( xfs_extlen_t len; xfs_nlink_t refcount; bool has_cowflag; - int error = 0; bno = be32_to_cpu(rec->refc.rc_startblock); len = be32_to_cpu(rec->refc.rc_blockcount); @@ -366,7 +365,7 @@ xchk_refcountbt_rec( xchk_refcountbt_xref(bs->sc, bno, len, refcount); - return error; + return 0; } /* Make sure we have as many refc blocks as the rmap says. 
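The xchk_parent_actor() change above illustrates a useful pattern: the parent scrubber cannot rely on the actor's error return because, as the new comment notes, the VFS readdir code squashes actor errors into short directory reads, so cancellation is latched in the context and the caller converts it to -EAGAIN itself. A userspace sketch of the pattern with a generic callback-driven walker; the demo_* names and the stand-in for fatal_signal_pending() are illustrative:

#include <errno.h>
#include <stdbool.h>

struct demo_iter_ctx {
        int matches;            /* the walk's real work product */
        bool cancelled;         /* latched when a "signal" arrives */
};

/* Stand-in for fatal_signal_pending()/xchk_should_terminate(). */
static bool demo_should_terminate(void)
{
        return false;
}

/* Per-item callback; the walker may discard its return value. */
static int demo_actor(struct demo_iter_ctx *ctx, int item, int wanted)
{
        if (item == wanted)
                ctx->matches++;
        if (demo_should_terminate()) {
                ctx->cancelled = true;  /* survives a swallowed errno */
                return -EINTR;
        }
        return 0;
}

static int demo_walk(const int *items, int n, int wanted, int *matches)
{
        struct demo_iter_ctx ctx = { 0 };

        for (int i = 0; i < n; i++)
                (void)demo_actor(&ctx, items[i], wanted); /* rc dropped */
        if (ctx.cancelled)
                return -EAGAIN; /* report the real reason ourselves */
        *matches = ctx.matches;
        return 0;
}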
*/ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 4cfeec57fb05..e489d7a8446a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -341,17 +341,21 @@ xrep_init_btblock( struct xfs_trans *tp = sc->tp; struct xfs_mount *mp = sc->mp; struct xfs_buf *bp; + int error; trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), btnum); ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb), - XFS_FSB_TO_BB(mp, 1), 0); + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0, + &bp); + if (error) + return error; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); - xfs_trans_log_buf(tp, bp, 0, bp->b_length); + xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); bp->b_ops = ops; *bpp = bp; @@ -542,8 +546,6 @@ xrep_reap_block( error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; } else { agf_bp = sc->sa.agf_bp; } @@ -664,7 +666,7 @@ xrep_findroot_agfl_walk( { xfs_agblock_t *agbno = priv; - return (*agbno == bno) ? XFS_ITER_ABORT : 0; + return (*agbno == bno) ? -ECANCELED : 0; } /* Does this block match the btree information passed in? */ @@ -694,7 +696,7 @@ xrep_findroot_block( if (owner == XFS_RMAP_OWN_AG) { error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, xrep_findroot_agfl_walk, &agbno); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) return 0; if (error) return error; diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60c61d7052a8..c3422403b169 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -75,7 +75,6 @@ static inline xfs_extlen_t xrep_calc_ag_resblks( struct xfs_scrub *sc) { - ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); return 0; } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 15c8c5f3f688..f1775bb19313 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -16,6 +16,7 @@ #include "xfs_qm.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 99c0b1234c3c..5641ae512c9e 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -22,7 +22,7 @@ xchk_setup_symlink( struct xfs_inode *ip) { /* Allocate the buffer without the inode lock held. 
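Among the changes above, the xfs_trans_log_buf() call in xrep_init_btblock() now logs BBTOB(bp->b_length) - 1 rather than bp->b_length: the function takes an inclusive byte range, while b_length counts 512-byte basic blocks, not bytes. A tiny self-checking restatement of the arithmetic; the demo_* names are illustrative:

#include <assert.h>
#include <stdint.h>

#define DEMO_BBSHIFT    9       /* 512-byte basic blocks, as in BBTOB() */
#define DEMO_BBTOB(bbs) ((uint64_t)(bbs) << DEMO_BBSHIFT)

/*
 * Compute an inclusive [first, last] byte range covering a whole buffer
 * whose length is given in basic blocks, as xfs_trans_log_buf() expects.
 */
static void demo_log_whole_buf(unsigned int b_length_bbs,
                               uint64_t *first, uint64_t *last)
{
        *first = 0;
        *last = DEMO_BBTOB(b_length_bbs) - 1;   /* inclusive last byte */
}

int main(void)
{
        uint64_t first, last;

        demo_log_whole_buf(8, &first, &last);   /* 8 BBs = one 4k block */
        assert(first == 0 && last == 4095);
        return 0;
}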
*/ - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); + sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3362bae28b46..096203119934 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -329,7 +329,7 @@ TRACE_EVENT(xchk_btree_op_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(int, error) __field(void *, ret_ip) ), @@ -414,7 +414,7 @@ TRACE_EVENT(xchk_btree_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(void *, ret_ip) ), TP_fast_assign( @@ -452,7 +452,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(void *, ret_ip) ), TP_fast_assign( diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index cbda40d40326..cd743fad8478 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -12,8 +12,10 @@ #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_trace.h" -#include <linux/posix_acl_xattr.h> +#include "xfs_error.h" +#include "xfs_acl.h" +#include <linux/posix_acl_xattr.h> /* * Locking scheme: @@ -23,6 +25,7 @@ STATIC struct posix_acl * xfs_acl_from_disk( + struct xfs_mount *mp, const struct xfs_acl *aclp, int len, int max_entries) @@ -32,11 +35,18 @@ xfs_acl_from_disk( const struct xfs_acl_entry *ace; unsigned int count, i; - if (len < sizeof(*aclp)) + if (len < sizeof(*aclp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp, + len); return ERR_PTR(-EFSCORRUPTED); + } + count = be32_to_cpu(aclp->acl_cnt); - if (count > max_entries || XFS_ACL_SIZE(count) != len) + if (count > max_entries || XFS_ACL_SIZE(count) != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp, + len); return ERR_PTR(-EFSCORRUPTED); + } acl = posix_acl_alloc(count, GFP_KERNEL); if (!acl) @@ -112,7 +122,7 @@ xfs_get_acl(struct inode *inode, int type) { struct xfs_inode *ip = XFS_I(inode); struct posix_acl *acl = NULL; - struct xfs_acl *xfs_acl; + struct xfs_acl *xfs_acl = NULL; unsigned char *ea_name; int error; int len; @@ -135,12 +145,9 @@ xfs_get_acl(struct inode *inode, int type) * go out to the disk. 
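The hardening added to xfs_acl_from_disk() above follows a standard pattern for variable-length on-disk structures: verify that the fixed header fits before reading the count, then require that the declared count reproduces the buffer length exactly, so both truncated and padded blobs are rejected. A userspace sketch of the shape of that check; the demo_* types are stand-ins, and EUCLEAN is the Linux errno that the kernel's -EFSCORRUPTED maps to:

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

struct demo_acl_entry {
        uint32_t tag, id, perm;
};

struct demo_acl {
        uint32_t count;                 /* entries that follow */
        struct demo_acl_entry entries[];
};

#define DEMO_ACL_SIZE(cnt) \
        (sizeof(struct demo_acl) + (cnt) * sizeof(struct demo_acl_entry))

/*
 * Validate a variable-length blob before trusting `count`: the header
 * must fit, and the declared count must match the buffer length exactly.
 * The bound check runs first so the size computation cannot overflow.
 */
static int demo_acl_validate(const struct demo_acl *acl, size_t len,
                             uint32_t max_entries)
{
        if (len < sizeof(*acl))
                return -EUCLEAN;
        if (acl->count > max_entries || DEMO_ACL_SIZE(acl->count) != len)
                return -EUCLEAN;
        return 0;
}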
*/ len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); - if (!xfs_acl) - return ERR_PTR(-ENOMEM); - - error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, - &len, ATTR_ROOT); + error = xfs_attr_get(ip, ea_name, strlen(ea_name), + (unsigned char **)&xfs_acl, &len, + ATTR_ALLOC | ATTR_ROOT); if (error) { /* * If the attribute doesn't exist make sure we have a negative @@ -149,10 +156,10 @@ xfs_get_acl(struct inode *inode, int type) if (error != -ENOATTR) acl = ERR_PTR(error); } else { - acl = xfs_acl_from_disk(xfs_acl, len, + acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount)); + kmem_free(xfs_acl); } - kmem_free(xfs_acl); return acl; } @@ -180,7 +187,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct xfs_acl *xfs_acl; int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + xfs_acl = kmem_zalloc_large(len, 0); if (!xfs_acl) return -ENOMEM; @@ -190,15 +197,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) len -= sizeof(struct xfs_acl_entry) * (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, - len, ATTR_ROOT); + error = xfs_attr_set(ip, ea_name, strlen(ea_name), + (unsigned char *)xfs_acl, len, ATTR_ROOT); kmem_free(xfs_acl); } else { /* * A NULL ACL argument means we want to remove the ACL. */ - error = xfs_attr_remove(ip, ea_name, ATTR_ROOT); + error = xfs_attr_remove(ip, ea_name, + strlen(ea_name), + ATTR_ROOT); /* * If the attribute didn't exist to start with that's fine. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f16d5f196c6b..58e937be24ce 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -18,108 +18,22 @@ #include "xfs_bmap_util.h" #include "xfs_reflink.h" -/* - * structure owned by writepages passed to individual writepage calls - */ struct xfs_writepage_ctx { - struct xfs_bmbt_irec imap; - int fork; + struct iomap_writepage_ctx ctx; unsigned int data_seq; unsigned int cow_seq; - struct xfs_ioend *ioend; }; -struct block_device * -xfs_find_bdev_for_inode( - struct inode *inode) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - - if (XFS_IS_REALTIME_INODE(ip)) - return mp->m_rtdev_targp->bt_bdev; - else - return mp->m_ddev_targp->bt_bdev; -} - -struct dax_device * -xfs_find_daxdev_for_inode( - struct inode *inode) +static inline struct xfs_writepage_ctx * +XFS_WPC(struct iomap_writepage_ctx *ctx) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - - if (XFS_IS_REALTIME_INODE(ip)) - return mp->m_rtdev_targp->bt_daxdev; - else - return mp->m_ddev_targp->bt_daxdev; -} - -static void -xfs_finish_page_writeback( - struct inode *inode, - struct bio_vec *bvec, - int error) -{ - struct iomap_page *iop = to_iomap_page(bvec->bv_page); - - if (error) { - SetPageError(bvec->bv_page); - mapping_set_error(inode->i_mapping, -EIO); - } - - ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); - ASSERT(!iop || atomic_read(&iop->write_count) > 0); - - if (!iop || atomic_dec_and_test(&iop->write_count)) - end_page_writeback(bvec->bv_page); -} - -/* - * We're now finished for good with this ioend structure. Update the page - * state, release holds on bios, and finally free up memory. Do not use the - * ioend after this. 
- */ -STATIC void -xfs_destroy_ioend( - struct xfs_ioend *ioend, - int error) -{ - struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - bool quiet = bio_flagged(bio, BIO_QUIET); - - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. - */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, iter_all) - xfs_finish_page_writeback(inode, bvec, error); - bio_put(bio); - } - - if (unlikely(error && !quiet)) { - xfs_err_ratelimited(XFS_I(inode)->i_mount, - "writeback error on sector %llu", start); - } + return container_of(ctx, struct xfs_writepage_ctx, ctx); } /* * Fast and loose check if this write could update the on-disk inode size. */ -static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend) { return ioend->io_offset + ioend->io_size > XFS_I(ioend->io_inode)->i_d.di_size; @@ -127,7 +41,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) STATIC int xfs_setfilesize_trans_alloc( - struct xfs_ioend *ioend) + struct iomap_ioend *ioend) { struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; struct xfs_trans *tp; @@ -137,7 +51,7 @@ xfs_setfilesize_trans_alloc( if (error) return error; - ioend->io_append_trans = tp; + ioend->io_private = tp; /* * We may pass freeze protection with a transaction. So tell lockdep @@ -200,11 +114,11 @@ xfs_setfilesize( STATIC int xfs_setfilesize_ioend( - struct xfs_ioend *ioend, + struct iomap_ioend *ioend, int error) { struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_trans *tp = ioend->io_append_trans; + struct xfs_trans *tp = ioend->io_private; /* * The transaction may have been allocated in the I/O submission thread, @@ -228,9 +142,8 @@ xfs_setfilesize_ioend( */ STATIC void xfs_end_ioend( - struct xfs_ioend *ioend) + struct iomap_ioend *ioend) { - struct list_head ioend_list; struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; @@ -257,7 +170,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - if (ioend->io_fork == XFS_COW_FORK) + if (ioend->io_flags & IOMAP_F_SHARED) xfs_reflink_cancel_cow_range(ip, offset, size, true); goto done; } @@ -265,154 +178,86 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. 
*/ - if (ioend->io_fork == XFS_COW_FORK) + if (ioend->io_flags & IOMAP_F_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_state == XFS_EXT_UNWRITTEN) + else if (ioend->io_type == IOMAP_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); else - ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); + ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private); done: - if (ioend->io_append_trans) + if (ioend->io_private) error = xfs_setfilesize_ioend(ioend, error); - list_replace_init(&ioend->io_list, &ioend_list); - xfs_destroy_ioend(ioend, error); - - while (!list_empty(&ioend_list)) { - ioend = list_first_entry(&ioend_list, struct xfs_ioend, - io_list); - list_del_init(&ioend->io_list); - xfs_destroy_ioend(ioend, error); - } - + iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } /* - * We can merge two adjacent ioends if they have the same set of work to do. - */ -static bool -xfs_ioend_can_merge( - struct xfs_ioend *ioend, - struct xfs_ioend *next) -{ - if (ioend->io_bio->bi_status != next->io_bio->bi_status) - return false; - if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) - return false; - if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ - (next->io_state == XFS_EXT_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - return true; -} - -/* * If the to be merged ioend has a preallocated transaction for file * size updates we need to ensure the ioend it is merged into also * has one. If it already has one we can simply cancel the transaction * as it is guaranteed to be clean. */ static void -xfs_ioend_merge_append_transactions( - struct xfs_ioend *ioend, - struct xfs_ioend *next) +xfs_ioend_merge_private( + struct iomap_ioend *ioend, + struct iomap_ioend *next) { - if (!ioend->io_append_trans) { - ioend->io_append_trans = next->io_append_trans; - next->io_append_trans = NULL; + if (!ioend->io_private) { + ioend->io_private = next->io_private; + next->io_private = NULL; } else { xfs_setfilesize_ioend(next, -ECANCELED); } } -/* Try to merge adjacent completions. */ -STATIC void -xfs_ioend_try_merge( - struct xfs_ioend *ioend, - struct list_head *more_ioends) -{ - struct xfs_ioend *next_ioend; - - while (!list_empty(more_ioends)) { - next_ioend = list_first_entry(more_ioends, struct xfs_ioend, - io_list); - if (!xfs_ioend_can_merge(ioend, next_ioend)) - break; - list_move_tail(&next_ioend->io_list, &ioend->io_list); - ioend->io_size += next_ioend->io_size; - if (next_ioend->io_append_trans) - xfs_ioend_merge_append_transactions(ioend, next_ioend); - } -} - -/* list_sort compare function for ioends */ -static int -xfs_ioend_compare( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_ioend *ia; - struct xfs_ioend *ib; - - ia = container_of(a, struct xfs_ioend, io_list); - ib = container_of(b, struct xfs_ioend, io_list); - if (ia->io_offset < ib->io_offset) - return -1; - else if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - /* Finish all pending io completions. 
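The completion path here now leans on iomap_sort_ioends() and iomap_ioend_try_merge(), but the underlying pattern is simple: splice the pending list out from under the lock, sort by file offset, coalesce adjacent completions, then process the survivors. A single-threaded userspace sketch using an array instead of a list_head, with the locking and the status/type compatibility checks (which the real merge also requires) deliberately omitted:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_ioend {
        uint64_t offset;        /* file offset of the completed write */
        uint64_t size;          /* bytes completed */
};

static int demo_cmp(const void *a, const void *b)
{
        const struct demo_ioend *ia = a, *ib = b;

        return (ia->offset > ib->offset) - (ia->offset < ib->offset);
}

/* Sort by offset, then coalesce runs where one ioend ends at the next. */
static size_t demo_merge_ioends(struct demo_ioend *v, size_t n)
{
        size_t out = 0;

        qsort(v, n, sizeof(*v), demo_cmp);
        for (size_t i = 0; i < n; i++) {
                if (out && v[out - 1].offset + v[out - 1].size == v[i].offset)
                        v[out - 1].size += v[i].size;   /* adjacent: merge */
                else
                        v[out++] = v[i];
        }
        return out;     /* number of merged completions to process */
}

int main(void)
{
        struct demo_ioend v[] = { {8192, 4096}, {0, 4096}, {4096, 4096} };
        size_t n = demo_merge_ioends(v, 3);

        for (size_t i = 0; i < n; i++)  /* prints one range: 0+12288 */
                printf("%llu+%llu\n", (unsigned long long)v[i].offset,
                                      (unsigned long long)v[i].size);
        return 0;
}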
*/ void xfs_end_io( struct work_struct *work) { - struct xfs_inode *ip; - struct xfs_ioend *ioend; - struct list_head completion_list; + struct xfs_inode *ip = + container_of(work, struct xfs_inode, i_ioend_work); + struct iomap_ioend *ioend; + struct list_head tmp; unsigned long flags; - ip = container_of(work, struct xfs_inode, i_ioend_work); - spin_lock_irqsave(&ip->i_ioend_lock, flags); - list_replace_init(&ip->i_ioend_list, &completion_list); + list_replace_init(&ip->i_ioend_list, &tmp); spin_unlock_irqrestore(&ip->i_ioend_lock, flags); - list_sort(NULL, &completion_list, xfs_ioend_compare); - - while (!list_empty(&completion_list)) { - ioend = list_first_entry(&completion_list, struct xfs_ioend, - io_list); + iomap_sort_ioends(&tmp); + while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, + io_list))) { list_del_init(&ioend->io_list); - xfs_ioend_try_merge(ioend, &completion_list); + iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private); xfs_end_ioend(ioend); } } +static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend) +{ + return ioend->io_private || + ioend->io_type == IOMAP_UNWRITTEN || + (ioend->io_flags & IOMAP_F_SHARED); +} + STATIC void xfs_end_bio( struct bio *bio) { - struct xfs_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = bio->bi_private; struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_mount *mp = ip->i_mount; unsigned long flags; - if (ioend->io_fork == XFS_COW_FORK || - ioend->io_state == XFS_EXT_UNWRITTEN || - ioend->io_append_trans != NULL) { - spin_lock_irqsave(&ip->i_ioend_lock, flags); - if (list_empty(&ip->i_ioend_list)) - WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, - &ip->i_ioend_work)); - list_add_tail(&ioend->io_list, &ip->i_ioend_list); - spin_unlock_irqrestore(&ip->i_ioend_lock, flags); - } else - xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); + ASSERT(xfs_ioend_needs_workqueue(ioend)); + + spin_lock_irqsave(&ip->i_ioend_lock, flags); + if (list_empty(&ip->i_ioend_list)) + WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, + &ip->i_ioend_work)); + list_add_tail(&ioend->io_list, &ip->i_ioend_list); + spin_unlock_irqrestore(&ip->i_ioend_lock, flags); } /* @@ -421,19 +266,19 @@ xfs_end_bio( */ static bool xfs_imap_valid( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb) + loff_t offset) { - if (offset_fsb < wpc->imap.br_startoff || - offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + if (offset < wpc->iomap.offset || + offset >= wpc->iomap.offset + wpc->iomap.length) return false; /* * If this is a COW mapping, it is sufficient to check that the mapping * covers the offset. Be careful to check this first because the caller * can revalidate a COW mapping without updating the data seqno. */ - if (wpc->fork == XFS_COW_FORK) + if (wpc->iomap.flags & IOMAP_F_SHARED) return true; /* @@ -443,17 +288,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. */ - if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) return false; if (xfs_inode_has_cow_data(ip) && - wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) return false; return true; } /* * Pass in a dellalloc extent and convert it to real extents, return the real - * extent that maps offset_fsb in wpc->imap. + * extent that maps offset_fsb in wpc->iomap. 
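xfs_imap_valid() above is an optimistic cache check: the cached mapping may be used only if it covers the offset and the owning fork's sequence number has not advanced since the mapping was sampled (COW mappings skip the seqno comparison, as the comment explains). A sketch of the data-fork case; the demo_* types are illustrative, and the kernel reads if_seq with READ_ONCE():

#include <stdbool.h>
#include <stdint.h>

struct demo_fork {
        uint32_t seq;           /* bumped on every extent-map change */
};

struct demo_cached_map {
        uint64_t offset;        /* start of cached range */
        uint64_t length;        /* bytes covered */
        uint32_t seq;           /* fork->seq sampled when cached */
};

/*
 * The cached mapping is trustworthy only if (a) it covers the offset and
 * (b) nobody has changed the extent map since we sampled it.
 */
static bool demo_map_valid(const struct demo_cached_map *map,
                           const struct demo_fork *fork, uint64_t offset)
{
        if (offset < map->offset || offset >= map->offset + map->length)
                return false;
        return map->seq == fork->seq;   /* stale seq => re-look-up */
}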
* * The current page is held locked so nothing could have removed the block * backing offset_fsb, although it could have moved from the COW to the data @@ -461,32 +306,38 @@ xfs_imap_valid( */ static int xfs_convert_blocks( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb) + int whichfork, + loff_t offset) { int error; + unsigned *seq; + + if (whichfork == XFS_COW_FORK) + seq = &XFS_WPC(wpc)->cow_seq; + else + seq = &XFS_WPC(wpc)->data_seq; /* - * Attempt to allocate whatever delalloc extent currently backs - * offset_fsb and put the result into wpc->imap. Allocate in a loop - * because it may take several attempts to allocate real blocks for a - * contiguous delalloc extent if free space is sufficiently fragmented. + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into wpc->iomap. Allocate in a loop because it + * may take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented. */ do { - error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, - &wpc->imap, wpc->fork == XFS_COW_FORK ? - &wpc->cow_seq : &wpc->data_seq); + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, + &wpc->iomap, seq); if (error) return error; - } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + } while (wpc->iomap.offset + wpc->iomap.length <= offset); return 0; } -STATIC int +static int xfs_map_blocks( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset) { @@ -496,6 +347,7 @@ xfs_map_blocks( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; + int whichfork = XFS_DATA_FORK; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; int retries = 0; @@ -519,7 +371,7 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - if (xfs_imap_valid(wpc, ip, offset_fsb)) + if (xfs_imap_valid(wpc, ip, offset)) return 0; /* @@ -541,10 +393,10 @@ retry: xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) cow_fsb = imap.br_startoff; if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { - wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); + XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - wpc->fork = XFS_COW_FORK; + whichfork = XFS_COW_FORK; goto allocate_blocks; } @@ -552,7 +404,7 @@ retry: * No COW extent overlap. Revalidate now that we may have updated * ->cow_seq. If the data mapping is still valid, we're done. */ - if (xfs_imap_valid(wpc, ip, offset_fsb)) { + if (xfs_imap_valid(wpc, ip, offset)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -564,11 +416,9 @@ retry: */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ - wpc->data_seq = READ_ONCE(ip->i_df.if_seq); + XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - wpc->fork = XFS_DATA_FORK; - /* landed in a hole or beyond EOF? 
*/ if (imap.br_startoff > offset_fsb) { imap.br_blockcount = imap.br_startoff - offset_fsb; @@ -592,11 +442,11 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - wpc->imap = imap; - trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); + trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: - error = xfs_convert_blocks(wpc, ip, offset_fsb); + error = xfs_convert_blocks(wpc, ip, whichfork, offset); if (error) { /* * If we failed to find the extent in the COW fork we might have @@ -605,7 +455,7 @@ allocate_blocks: * the former case, but prevent additional retries to avoid * looping forever for the latter case. */ - if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++) goto retry; ASSERT(error != -EAGAIN); return error; @@ -616,34 +466,22 @@ allocate_blocks: * original delalloc one. Trim the return extent to the next COW * boundary again to force a re-lookup. */ - if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && - cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) - wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) { + loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb); + + if (cow_offset < wpc->iomap.offset + wpc->iomap.length) + wpc->iomap.length = cow_offset - wpc->iomap.offset; + } - ASSERT(wpc->imap.br_startoff <= offset_fsb); - ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); + ASSERT(wpc->iomap.offset <= offset); + ASSERT(wpc->iomap.offset + wpc->iomap.length > offset); + trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap); return 0; } -/* - * Submit the bio for an ioend. We are passed an ioend with a bio attached to - * it, and we submit that bio. The ioend may be used for multiple bio - * submissions, so we only want to allocate an append transaction for the ioend - * once. In the case of multiple bio submission, each bio will take an IO - * reference to the ioend to ensure that the ioend completion is only done once - * all bios have been submitted and the ioend is really done. - * - * If @status is non-zero, it means that we have a situation where some part of - * the submission process has failed after we have marked paged for writeback - * and unlocked them. In this situation, we need to fail the bio and ioend - * rather than submit it to IO. This typically only happens on a filesystem - * shutdown. - */ -STATIC int -xfs_submit_ioend( - struct writeback_control *wbc, - struct xfs_ioend *ioend, +static int +xfs_prepare_ioend( + struct iomap_ioend *ioend, int status) { unsigned int nofs_flag; @@ -656,157 +494,24 @@ xfs_submit_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && ioend->io_fork == XFS_COW_FORK) { + if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } /* Reserve log space if we might write beyond the on-disk inode size. 
*/ if (!status && - (ioend->io_fork == XFS_COW_FORK || - ioend->io_state != XFS_EXT_UNWRITTEN) && + ((ioend->io_flags & IOMAP_F_SHARED) || + ioend->io_type != IOMAP_UNWRITTEN) && xfs_ioend_is_append(ioend) && - !ioend->io_append_trans) + !ioend->io_private) status = xfs_setfilesize_trans_alloc(ioend); memalloc_nofs_restore(nofs_flag); - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = xfs_end_bio; - - /* - * If we are failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. - */ - if (status) { - ioend->io_bio->bi_status = errno_to_blk_status(status); - bio_endio(ioend->io_bio); - return status; - } - - submit_bio(ioend->io_bio); - return 0; -} - -static struct xfs_ioend * -xfs_alloc_ioend( - struct inode *inode, - int fork, - xfs_exntst_t state, - xfs_off_t offset, - struct block_device *bdev, - sector_t sector, - struct writeback_control *wbc) -{ - struct xfs_ioend *ioend; - struct bio *bio; - - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = sector; - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - bio->bi_write_hint = inode->i_write_hint; - wbc_init_bio(wbc, bio); - - ioend = container_of(bio, struct xfs_ioend, io_inline_bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_fork = fork; - ioend->io_state = state; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = offset; - ioend->io_append_trans = NULL; - ioend->io_bio = bio; - return ioend; -} - -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to do perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in xfs_destroy_ioend(). - */ -static struct bio * -xfs_chain_bio( - struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); - bio_copy_dev(new, prev);/* also copies over blkcg information */ - new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; - new->bi_write_hint = prev->bi_write_hint; - - bio_chain(prev, new); - bio_get(prev); /* for xfs_destroy_ioend */ - submit_bio(prev); - return new; -} - -/* - * Test to see if we have an existing ioend structure that we could append to - * first, otherwise finish off the current ioend and start another. 
- */ -STATIC void -xfs_add_to_ioend( - struct inode *inode, - xfs_off_t offset, - struct page *page, - struct iomap_page *iop, - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - struct list_head *iolist) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - unsigned len = i_blocksize(inode); - unsigned poff = offset & (PAGE_SIZE - 1); - bool merged, same_page = false; - sector_t sector; - - sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + - ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - - if (!wpc->ioend || - wpc->fork != wpc->ioend->io_fork || - wpc->imap.br_state != wpc->ioend->io_state || - sector != bio_end_sector(wpc->ioend->io_bio) || - offset != wpc->ioend->io_offset + wpc->ioend->io_size) { - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, - wpc->imap.br_state, offset, bdev, sector, wbc); - } - - merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, - &same_page); - - if (iop && !same_page) - atomic_inc(&iop->write_count); - - if (!merged) { - if (bio_full(wpc->ioend->io_bio, len)) - wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio); - bio_add_page(wpc->ioend->io_bio, page, len, poff); - } - - wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, page, len); -} - -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned int offset, - unsigned int length) -{ - trace_xfs_invalidatepage(page->mapping->host, page, offset, length); - iomap_invalidatepage(page, offset, length); + if (xfs_ioend_needs_workqueue(ioend)) + ioend->io_bio->bi_end_io = xfs_end_bio; + return status; } /* @@ -820,8 +525,8 @@ xfs_vm_invalidatepage( * transaction as there is no space left for block reservation (typically why we * see a ENOSPC in writeback). */ -STATIC void -xfs_aops_discard_page( +static void +xfs_discard_page( struct page *page) { struct inode *inode = page->mapping->host; @@ -843,246 +548,14 @@ xfs_aops_discard_page( if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - xfs_vm_invalidatepage(page, 0, PAGE_SIZE); -} - -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * forward progress guarantees we need to provide. The current ioend we are - * adding blocks to is cached on the writepage context, and if the new block - * does not append to the cached ioend it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. 
- */ -static int -xfs_writepage_map( - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - struct inode *inode, - struct page *page, - uint64_t end_offset) -{ - LIST_HEAD(submit_list); - struct iomap_page *iop = to_iomap_page(page); - unsigned len = i_blocksize(inode); - struct xfs_ioend *ioend, *next; - uint64_t file_offset; /* file offset of page */ - int error = 0, count = 0, i; - - ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); - ASSERT(!iop || atomic_read(&iop->write_count) == 0); - - /* - * Walk through the page to find areas to write back. If we run off the - * end of the current map or find the current map invalid, grab a new - * one. - */ - for (i = 0, file_offset = page_offset(page); - i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; - i++, file_offset += len) { - if (iop && !test_bit(i, iop->uptodate)) - continue; - - error = xfs_map_blocks(wpc, inode, file_offset); - if (error) - break; - if (wpc->imap.br_startblock == HOLESTARTBLOCK) - continue; - xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, - &submit_list); - count++; - } - - ASSERT(wpc->ioend || list_empty(&submit_list)); - ASSERT(PageLocked(page)); - ASSERT(!PageWriteback(page)); - - /* - * On error, we have to fail the ioend here because we may have set - * pages under writeback, we have to make sure we run IO completion to - * mark the error state of the IO appropriately, so we can't cancel the - * ioend directly here. That means we have to mark this page as under - * writeback if we included any blocks from it in the ioend chain so - * that completion treats it correctly. - * - * If we didn't include the page in the ioend, the on error we can - * simply discard and unlock it as there are no other users of the page - * now. The caller will still need to trigger submission of outstanding - * ioends on the writepage context so they are treated correctly on - * error. - */ - if (unlikely(error)) { - if (!count) { - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); - goto done; - } - - /* - * If the page was not fully cleaned, we need to ensure that the - * higher layers come back to it correctly. That means we need - * to keep the page dirty, and for WB_SYNC_ALL writeback we need - * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed - * so another attempt to write this page in this writeback sweep - * will be made. - */ - set_page_writeback_keepwrite(page); - } else { - clear_page_dirty_for_io(page); - set_page_writeback(page); - } - - unlock_page(page); - - /* - * Preserve the original error if there was one, otherwise catch - * submission errors here and propagate into subsequent ioend - * submissions. - */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = xfs_submit_ioend(wbc, ioend, error); - if (error2 && !error) - error = error2; - } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - end_page_writeback(page); -done: - mapping_set_error(page->mapping, error); - return error; + iomap_invalidatepage(page, 0, PAGE_SIZE); } -/* - * Write out a dirty page. - * - * For delalloc space on the page we need to allocate space and flush it. - * For unwritten space on the page we need to start the conversion to - * regular allocated space. 
- */ -STATIC int -xfs_do_writepage( - struct page *page, - struct writeback_control *wbc, - void *data) -{ - struct xfs_writepage_ctx *wpc = data; - struct inode *inode = page->mapping->host; - loff_t offset; - uint64_t end_offset; - pgoff_t end_index; - - trace_xfs_writepage(inode, page, 0, 0); - - /* - * Refuse to write the page out if we are called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it. - */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; - - /* - * Given that we do not allow direct reclaim to call us, we should - * never be called while in a filesystem transaction. - */ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) - goto redirty; - - /* - * Is this page beyond the end of the file? - * - * The page index is less than the end_index, adjust the end_offset - * to the highest offset that this page should represent. - * ----------------------------------------------------- - * | file mapping | <EOF> | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - offset = i_size_read(inode); - end_index = offset >> PAGE_SHIFT; - if (page->index < end_index) - end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT; - else { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | <EOF> | - * ------------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ - unsigned offset_into_page = offset & (PAGE_SIZE - 1); - - /* - * Skip the page if it is fully outside i_size, e.g. due to a - * truncate operation that is in progress. We must redirty the - * page so that reclaim stops reclaiming it. Otherwise - * xfs_vm_releasepage() is called on it and gets confused. - * - * Note that the end_index is unsigned long, it would overflow - * if the given offset is greater than 16TB on 32-bit system - * and if we do check the page is fully outside i_size or not - * via "if (page->index >= end_index + 1)" as "end_index + 1" - * will be evaluated to 0. Hence this page will be redirtied - * and be written out repeatedly which would result in an - * infinite loop, the user program that perform this operation - * will hang. Instead, we can verify this situation by checking - * if the page to write is totally beyond the i_size or if it's - * offset is just equal to the EOF. - */ - if (page->index > end_index || - (page->index == end_index && offset_into_page == 0)) - goto redirty; - - /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." 
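The writepage code removed here (its logic now lives in iomap) classifies a page against i_size three ways, and its comments call out a subtle 32-bit overflow: comparing index > end_index rather than index >= end_index + 1 avoids wrapping to zero at the top of the index space. A compilable restatement of the arithmetic, assuming a 4096-byte page; constants and names are illustrative:

#include <stdint.h>

#define DEMO_PAGE_SIZE  4096u
#define DEMO_PAGE_SHIFT 12

enum demo_page_class { PAGE_INSIDE, PAGE_STRADDLES, PAGE_BEYOND };

/* Classify page `index` of a file whose size is `isize` bytes. */
static enum demo_page_class demo_classify(uint64_t index, uint64_t isize)
{
        uint64_t end_index = isize >> DEMO_PAGE_SHIFT;
        uint32_t offset_into_page = isize & (DEMO_PAGE_SIZE - 1);

        if (index < end_index)
                return PAGE_INSIDE;     /* whole page is below EOF */
        /*
         * Compare against end_index directly: computing end_index + 1
         * can wrap to 0 for files near the top of the index space.
         */
        if (index > end_index || offset_into_page == 0)
                return PAGE_BEYOND;     /* nothing to write; redirty */
        return PAGE_STRADDLES;          /* zero [offset_into_page, PAGE_SIZE) */
}

The straddling case must be zeroed past EOF on every writeback because mmap can expose the page tail to userspace.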
- */ - zero_user_segment(page, offset_into_page, PAGE_SIZE); - - /* Adjust the end_offset to the end of file */ - end_offset = offset; - } - - return xfs_writepage_map(wpc, wbc, inode, page, end_offset); - -redirty: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} +static const struct iomap_writeback_ops xfs_writeback_ops = { + .map_blocks = xfs_map_blocks, + .prepare_ioend = xfs_prepare_ioend, + .discard_page = xfs_discard_page, +}; STATIC int xfs_vm_writepage( @@ -1090,12 +563,8 @@ xfs_vm_writepage( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { }; - int ret; - ret = xfs_do_writepage(page, wbc, &wpc); - if (wpc.ioend) - ret = xfs_submit_ioend(wbc, wpc.ioend, ret); - return ret; + return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int @@ -1104,13 +573,9 @@ xfs_vm_writepages( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { }; - int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); - if (wpc.ioend) - ret = xfs_submit_ioend(wbc, wpc.ioend, ret); - return ret; + return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int @@ -1118,18 +583,11 @@ xfs_dax_writepages( struct address_space *mapping, struct writeback_control *wbc) { - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - return dax_writeback_mapping_range(mapping, - xfs_find_bdev_for_inode(mapping->host), wbc); -} + struct xfs_inode *ip = XFS_I(mapping->host); -STATIC int -xfs_vm_releasepage( - struct page *page, - gfp_t gfp_mask) -{ - trace_xfs_releasepage(page->mapping->host, page, 0, 0); - return iomap_releasepage(page, gfp_mask); + xfs_iflags_clear(ip, XFS_ITRUNCATED); + return dax_writeback_mapping_range(mapping, + xfs_inode_buftarg(ip)->bt_daxdev, wbc); } STATIC sector_t @@ -1152,7 +610,7 @@ xfs_vm_bmap( */ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; - return iomap_bmap(mapping, block, &xfs_iomap_ops); + return iomap_bmap(mapping, block, &xfs_read_iomap_ops); } STATIC int @@ -1160,8 +618,7 @@ xfs_vm_readpage( struct file *unused, struct page *page) { - trace_xfs_vm_readpage(page->mapping->host, 1); - return iomap_readpage(page, &xfs_iomap_ops); + return iomap_readpage(page, &xfs_read_iomap_ops); } STATIC int @@ -1171,8 +628,7 @@ xfs_vm_readpages( struct list_head *pages, unsigned nr_pages) { - trace_xfs_vm_readpages(mapping->host, nr_pages); - return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops); + return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops); } static int @@ -1181,8 +637,9 @@ xfs_iomap_swapfile_activate( struct file *swap_file, sector_t *span) { - sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file)); - return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops); + sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev; + return iomap_swapfile_activate(sis, swap_file, span, + &xfs_read_iomap_ops); } const struct address_space_operations xfs_address_space_operations = { @@ -1191,8 +648,8 @@ const struct address_space_operations xfs_address_space_operations = { .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, - .releasepage = xfs_vm_releasepage, - .invalidatepage = xfs_vm_invalidatepage, + .releasepage = iomap_releasepage, + .invalidatepage = iomap_invalidatepage, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, .migratepage = iomap_migrate_page, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 
45a1ea240cbb..e0bd68419764 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -6,29 +6,9 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct bio_set xfs_ioend_bioset; - -/* - * Structure for buffered I/O completions. - */ -struct xfs_ioend { - struct list_head io_list; /* next ioend in chain */ - int io_fork; /* inode fork written back */ - xfs_exntst_t io_state; /* extent state */ - struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ - xfs_off_t io_offset; /* offset in the file */ - struct xfs_trans *io_append_trans;/* xact. for size update */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! */ -}; - extern const struct address_space_operations xfs_address_space_operations; extern const struct address_space_operations xfs_dax_aops; int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); -extern struct block_device *xfs_find_bdev_for_inode(struct inode *); -extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *); - #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index dc93c51c17de..bbfa6ba84dcd 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -22,24 +22,21 @@ #include "xfs_attr_leaf.h" #include "xfs_quota.h" #include "xfs_dir2.h" +#include "xfs_error.h" /* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. + * Invalidate any incore buffers associated with this remote attribute value + * extent. We never log remote attribute value buffers, which means that they + * won't be attached to a transaction and are therefore safe to mark stale. + * The actual bunmapi will be taken care of later. */ STATIC int -xfs_attr3_leaf_freextent( - struct xfs_trans **trans, +xfs_attr3_rmt_stale( struct xfs_inode *dp, xfs_dablk_t blkno, int blkcnt) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_dablk_t tblkno; - xfs_daddr_t dblkno; - int tblkcnt; - int dblkcnt; int nmap; int error; @@ -47,47 +44,29 @@ xfs_attr3_leaf_freextent( * Roll through the "value", invalidating the attribute value's * blocks. */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { + while (blkcnt > 0) { /* * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { + if (error) return error; - } - ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (XFS_IS_CORRUPT(dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. + * Mark any incore buffers for the remote value as stale. We + * never log remote attr value buffers, so the buffer should be + * easy to kill. */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return -ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. 
- */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; - } + error = xfs_attr_rmtval_stale(dp, &map, 0); + if (error) + return error; - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; + blkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } return 0; @@ -101,86 +80,45 @@ xfs_attr3_leaf_freextent( */ STATIC int xfs_attr3_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entry; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; - struct xfs_attr_inactive_list *list; - struct xfs_attr_inactive_list *lp; - int error; - int count; - int size; - int tmp; - int i; - struct xfs_mount *mp = bp->b_mount; + int error = 0; + int i; - leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* - * Count the number of "remote" value extents. + * Find the remote value extents for this leaf and invalidate their + * incore buffers. */ - count = 0; entry = xfs_attr3_leaf_entryp(leaf); for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } - - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return 0; - } - - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, KM_SLEEP); + int blkcnt; - /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) + continue; - /* - * Invalidate each of the "remote" value extents. 
- */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (!name_rmt->valueblk) + continue; - if (error == 0) - error = tmp; /* save only the 1st errno */ + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + error = xfs_attr3_rmt_stale(dp, + be32_to_cpu(name_rmt->valueblk), blkcnt); + if (error) + goto err; } - kmem_free(list); + xfs_trans_brelse(*trans, bp); +err: return error; } @@ -190,37 +128,35 @@ xfs_attr3_leaf_inactive( */ STATIC int xfs_attr3_node_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp, - int level) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) { - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_dablk_t child_fsb; - xfs_daddr_t parent_blkno, child_blkno; - int error, i; - struct xfs_buf *child_bp; - struct xfs_da_node_entry *btree; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_blkinfo *info; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + struct xfs_buf *child_bp; struct xfs_da3_icnode_hdr ichdr; + int error, i; /* * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return -EIO; + xfs_buf_corruption_error(bp); + return -EFSCORRUPTED; } - node = bp->b_addr; - dp->d_ops->node_hdr_from_disk(&ichdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr); parent_blkno = bp->b_bn; if (!ichdr.count) { xfs_trans_brelse(*trans, bp); return 0; } - btree = dp->d_ops->node_tree_p(node); - child_fsb = be32_to_cpu(btree[0].before); + child_fsb = be32_to_cpu(ichdr.btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ /* @@ -235,7 +171,7 @@ xfs_attr3_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da3_node_read(*trans, dp, child_fsb, -1, &child_bp, + error = xfs_da3_node_read(*trans, dp, child_fsb, &child_bp, XFS_ATTR_FORK); if (error) return error; @@ -258,8 +194,9 @@ xfs_attr3_node_inactive( error = xfs_attr3_leaf_inactive(trans, dp, child_bp); break; default: - error = -EIO; + xfs_buf_corruption_error(child_bp); xfs_trans_brelse(*trans, child_bp); + error = -EFSCORRUPTED; break; } if (error) @@ -268,10 +205,17 @@ xfs_attr3_node_inactive( /* * Remove the subsidiary block from the cache and from the log. */ - error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp, - XFS_ATTR_FORK); + error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, + child_blkno, + XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, + &child_bp); if (error) return error; + error = bp->b_error; + if (error) { + xfs_trans_brelse(*trans, child_bp); + return error; + } xfs_trans_binval(*trans, child_bp); /* @@ -279,13 +223,15 @@ xfs_attr3_node_inactive( * child block number. 
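[Editor's aside] The rewritten xfs_attr3_leaf_inactive() above replaces the old count/allocate/collect/free dance with a single pass over the leaf entries, skipping non-remote entries with continue guards. The shape of that refactor, sketched with illustrative entry fields:

#include <stdio.h>

/* Toy leaf entry: 'local' means the value is stored inline. */
struct entry { int local; int valueblk; };

static int stale_remote_values(const struct entry *entries, int count)
{
	int i;

	for (i = 0; i < count; i++) {
		if (entries[i].local)		/* inline value, nothing incore */
			continue;
		if (!entries[i].valueblk)	/* no remote blocks assigned */
			continue;
		printf("stale remote blocks of entry %d\n", i);
	}
	return 0;
}

int main(void)
{
	struct entry e[] = { { 1, 0 }, { 0, 0 }, { 0, 42 } };

	return stale_remote_values(e, 3);
}

Dropping the intermediate list also removes one allocation from the inactivation path, which matters because it runs in contexts where allocation failure is awkward to handle.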
*/ if (i + 1 < ichdr.count) { - error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); + struct xfs_da3_icnode_hdr phdr; + + error = xfs_da3_node_read_mapped(*trans, dp, + parent_blkno, &bp, XFS_ATTR_FORK); if (error) return error; - node = bp->b_addr; - btree = dp->d_ops->node_tree_p(node); - child_fsb = be32_to_cpu(btree[i + 1].before); + xfs_da3_node_hdr_from_disk(dp->i_mount, &phdr, + bp->b_addr); + child_fsb = be32_to_cpu(phdr.btree[i + 1].before); xfs_trans_brelse(*trans, bp); } /* @@ -310,6 +256,7 @@ xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) { + struct xfs_mount *mp = dp->i_mount; struct xfs_da_blkinfo *info; struct xfs_buf *bp; xfs_daddr_t blkno; @@ -321,7 +268,7 @@ xfs_attr3_root_inactive( * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK); if (error) return error; blkno = bp->b_bn; @@ -341,7 +288,8 @@ xfs_attr3_root_inactive( error = xfs_attr3_leaf_inactive(trans, dp, bp); break; default: - error = -EIO; + error = -EFSCORRUPTED; + xfs_buf_corruption_error(bp); xfs_trans_brelse(*trans, bp); break; } @@ -351,9 +299,15 @@ xfs_attr3_root_inactive( /* * Invalidate the incore copy of the root block. */ - error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno, + XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp); if (error) return error; + error = bp->b_error; + if (error) { + xfs_trans_brelse(*trans, bp); + return error; + } xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 58fc820a70c6..d37743bdf274 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -49,14 +49,16 @@ xfs_attr_shortform_compare(const void *a, const void *b) * we can begin returning them to the user. */ static int -xfs_attr_shortform_list(xfs_attr_list_context_t *context) +xfs_attr_shortform_list( + struct xfs_attr_list_context *context) { - attrlist_cursor_kern_t *cursor; - xfs_attr_sf_sort_t *sbuf, *sbp; - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - xfs_inode_t *dp; - int sbsize, nsbuf, count, i; + struct attrlist_cursor_kern *cursor; + struct xfs_attr_sf_sort *sbuf, *sbp; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + struct xfs_inode *dp; + int sbsize, nsbuf, count, i; + int error = 0; ASSERT(context != NULL); dp = context->dp; @@ -84,6 +86,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (XFS_IS_CORRUPT(context->dp->i_mount, + !xfs_attr_namecheck(sfe->nameval, + sfe->namelen))) + return -EFSCORRUPTED; context->put_listent(context, sfe->flags, sfe->nameval, @@ -109,7 +115,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * It didn't all fit, so we have to sort everything on hashval. 
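[Editor's aside] Several hunks in this series convert bare -EIO/-EFSCORRUPTED returns into XFS_IS_CORRUPT() checks that also report where the corruption was observed. A stripped-down model of that macro pattern, assuming Linux's EUCLEAN value for the errno; the real macro additionally hooks into error injection:

#include <stdio.h>

#define EFSCORRUPTED 117	/* EUCLEAN, which the kernel maps this to */

/* Evaluate the condition once; log file and line when it trips. */
#define IS_CORRUPT(cond) \
	((cond) ? (fprintf(stderr, "corruption at %s:%d\n", \
			   __FILE__, __LINE__), 1) : 0)

static int check_nmap(int nmap)
{
	if (IS_CORRUPT(nmap != 1))
		return -EFSCORRUPTED;
	return 0;
}

int main(void)
{
	return check_nmap(2) ? 1 : 0;
}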
*/ sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); /* * Scan the attribute list for the rest of the entries, storing @@ -161,10 +167,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) break; } } - if (i == nsbuf) { - kmem_free(sbuf); - return 0; - } + if (i == nsbuf) + goto out; /* * Loop putting entries into the user buffer. @@ -174,6 +178,12 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->hashval = sbp->hash; cursor->offset = 0; } + if (XFS_IS_CORRUPT(context->dp->i_mount, + !xfs_attr_namecheck(sbp->name, + sbp->namelen))) { + error = -EFSCORRUPTED; + goto out; + } context->put_listent(context, sbp->flags, sbp->name, @@ -183,9 +193,9 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) break; cursor->offset++; } - +out: kmem_free(sbuf); - return 0; + return error; } /* @@ -213,7 +223,7 @@ xfs_attr_node_list_lookup( ASSERT(*pbp == NULL); cursor->blkno = 0; for (;;) { - error = xfs_da3_node_read(tp, dp, cursor->blkno, -1, &bp, + error = xfs_da3_node_read(tp, dp, cursor->blkno, &bp, XFS_ATTR_FORK); if (error) return error; @@ -229,7 +239,7 @@ xfs_attr_node_list_lookup( goto out_corruptbuf; } - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(mp, &nodehdr, node); /* Tree taller than we can handle; bail out! */ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) @@ -243,7 +253,7 @@ xfs_attr_node_list_lookup( else expected_level--; - btree = dp->d_ops->node_tree_p(node); + btree = nodehdr.btree; for (i = 0; i < nodehdr.count; btree++, i++) { if (cursor->hashval <= be32_to_cpu(btree->hashval)) { cursor->blkno = be32_to_cpu(btree->before); @@ -258,7 +268,7 @@ xfs_attr_node_list_lookup( return 0; /* We can't point back to the root. */ - if (cursor->blkno == 0) + if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) return -EFSCORRUPTED; } @@ -269,6 +279,7 @@ xfs_attr_node_list_lookup( return 0; out_corruptbuf: + xfs_buf_corruption_error(bp); xfs_trans_brelse(tp, bp); return -EFSCORRUPTED; } @@ -284,7 +295,7 @@ xfs_attr_node_list( struct xfs_buf *bp; struct xfs_inode *dp = context->dp; struct xfs_mount *mp = dp->i_mount; - int error; + int error = 0; trace_xfs_attr_node_list(context); @@ -298,8 +309,8 @@ xfs_attr_node_list( */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp, + XFS_ATTR_FORK); if ((error != 0) && (error != -EFSCORRUPTED)) return error; if (bp) { @@ -358,24 +369,27 @@ xfs_attr_node_list( */ for (;;) { leaf = bp->b_addr; - xfs_attr3_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); + if (error) + break; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; cursor->blkno = leafhdr.forw; xfs_trans_brelse(context->tp, bp); - error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, -1, &bp); + error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, + &bp); if (error) return error; } xfs_trans_brelse(context->tp, bp); - return 0; + return error; } /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. 
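[Editor's aside] xfs_attr_shortform_list() above now routes both the early-exit and the failure paths through a single out: label so the sorted copy is always freed. The general single-exit shape, as a sketch:

#include <stdlib.h>

static int process(size_t n)
{
	int error = 0;
	char *buf = malloc(n);

	if (!buf)
		return -12;	/* -ENOMEM */

	if (n < 16)		/* early exit still falls through cleanup */
		goto out;

	/* ... main work; on failure set error and goto out ... */
out:
	free(buf);
	return error;
}

int main(void)
{
	return process(32);
}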
*/ -void +int xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) @@ -417,7 +431,7 @@ xfs_attr3_leaf_list_int( } if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return; + return 0; } } else { entry = &entries[0]; @@ -457,6 +471,9 @@ xfs_attr3_leaf_list_int( valuelen = be32_to_cpu(name_rmt->valuelen); } + if (XFS_IS_CORRUPT(context->dp->i_mount, + !xfs_attr_namecheck(name, namelen))) + return -EFSCORRUPTED; context->put_listent(context, entry->flags, name, namelen, valuelen); if (context->seen_enough) @@ -464,7 +481,7 @@ xfs_attr3_leaf_list_int( cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return; + return 0; } /* @@ -479,13 +496,13 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_attr3_leaf_read(context->tp, context->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp); if (error) return error; - xfs_attr3_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(context->tp, bp); - return 0; + return error; } int diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 9fa4a7ee8cfc..ee6f4229cebc 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -21,7 +21,7 @@ #include "xfs_icache.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" - +#include "xfs_error.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -35,7 +35,7 @@ void xfs_bui_item_free( struct xfs_bui_log_item *buip) { - kmem_zone_free(xfs_bui_zone, buip); + kmem_cache_free(xfs_bui_zone, buip); } /* @@ -141,7 +141,7 @@ xfs_bui_init( { struct xfs_bui_log_item *buip; - buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); + buip = kmem_zone_zalloc(xfs_bui_zone, 0); xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; @@ -201,7 +201,7 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); - kmem_zone_free(xfs_bud_zone, budp); + kmem_cache_free(xfs_bud_zone, budp); } static const struct xfs_item_ops xfs_bud_item_ops = { @@ -218,7 +218,7 @@ xfs_trans_get_bud( { struct xfs_bud_log_item *budp; - budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); + budp = kmem_zone_zalloc(xfs_bud_zone, 0); xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); budp->bud_buip = buip; @@ -456,7 +456,7 @@ xfs_bui_recover( if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); - return -EIO; + return -EFSCORRUPTED; } /* @@ -490,7 +490,7 @@ xfs_bui_recover( */ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); - return -EIO; + return -EFSCORRUPTED; } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, @@ -525,6 +525,7 @@ xfs_bui_recover( type = bui_type; break; default: + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto err_inode; } @@ -542,9 +543,7 @@ xfs_bui_recover( irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; - error = xfs_bmap_unmap_extent(tp, ip, &irec); - if (error) - goto err_inode; + xfs_bmap_unmap_extent(tp, ip, &irec); } set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 98c6a7a71427..e62fb5216341 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -39,9 +39,9 @@ xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, 
xfs_fsblock_t fsb) { - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_BB(ip->i_mount, fsb); + return XFS_FSB_TO_DADDR(ip->i_mount, fsb); } /* @@ -53,15 +53,16 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) */ int xfs_zero_extent( - struct xfs_inode *ip, - xfs_fsblock_t start_fsb, - xfs_off_t count_fsb) + struct xfs_inode *ip, + xfs_fsblock_t start_fsb, + xfs_off_t count_fsb) { - struct xfs_mount *mp = ip->i_mount; - xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); - sector_t block = XFS_BB_TO_FSBT(mp, sector); + struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); + sector_t block = XFS_BB_TO_FSBT(mp, sector); - return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + return blkdev_issue_zeroout(target->bt_bdev, block << (mp->m_super->s_blocksize_bits - 9), count_fsb << (mp->m_super->s_blocksize_bits - 9), GFP_NOFS, 0); @@ -164,13 +165,6 @@ xfs_bmap_rtalloc( xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); - - /* Zero the extent if we were asked to do so */ - if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) { - error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); - if (error) - return error; - } } else { ap->length = 0; } @@ -179,29 +173,6 @@ xfs_bmap_rtalloc( #endif /* CONFIG_XFS_RT */ /* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. All offsets are considered outside - * the end of file for an empty fork, so 1 is returned in *eof in that case. - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof) -{ - struct xfs_bmbt_irec rec; - int error; - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); - if (error || *eof) - return error; - - *eof = endoff >= rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* * Extent tree block counting routines. */ @@ -229,106 +200,6 @@ xfs_bmap_count_leaves( } /* - * Count leaf blocks given a range of extent records originally - * in btree format. - */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - xfs_filblks_t *count) -{ - int b; - xfs_bmbt_rec_t *frp; - - for (b = 1; b <= numrecs; b++) { - frp = XFS_BMBT_REC_ADDR(mp, block, b); - *count += xfs_bmbt_disk_get_blockcount(frp); - } -} - -/* - * Recursively walks each level of a btree - * to count total fsblocks in use. 
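[Editor's aside] xfs_zero_extent() above converts filesystem blocks to 512-byte sectors before calling blkdev_issue_zeroout(); the << (s_blocksize_bits - 9) shifts do that unit conversion. Worked numerically, assuming 4096-byte blocks:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t blocksize_bits = 12;		/* 4096-byte blocks */
	uint64_t block = 7, count_fsb = 3;

	/* 1 fsblock = 2^(blocksize_bits - 9) sectors = 8 here */
	uint64_t sector = block << (blocksize_bits - 9);
	uint64_t nr_sects = count_fsb << (blocksize_bits - 9);

	printf("zero sectors %llu..%llu\n",
	       (unsigned long long)sector,
	       (unsigned long long)(sector + nr_sects - 1));	/* 56..79 */
	return 0;
}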
- */ -STATIC int -xfs_bmap_count_tree( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_ifork *ifp, - xfs_fsblock_t blockno, - int levelin, - xfs_extnum_t *nextents, - xfs_filblks_t *count) -{ - int error; - struct xfs_buf *bp, *nbp; - int level = levelin; - __be64 *pp; - xfs_fsblock_t bno = blockno; - xfs_fsblock_t nextbno; - struct xfs_btree_block *block, *nextblock; - int numrecs; - - error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - - if (--level) { - /* Not at node above leaves, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - while (nextbno != NULLFSBLOCK) { - error = xfs_btree_read_bufl(mp, tp, nextbno, &nbp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - nextblock = XFS_BUF_TO_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); - xfs_trans_brelse(tp, nbp); - } - - /* Dive to the next level */ - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, nextents, - count); - if (error) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", - XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - xfs_trans_brelse(tp, bp); - } else { - /* count all level 1 nodes and their leaves */ - for (;;) { - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - numrecs = be16_to_cpu(block->bb_numrecs); - (*nextents) += numrecs; - xfs_bmap_disk_count_leaves(mp, block, numrecs, count); - xfs_trans_brelse(tp, bp); - if (nextbno == NULLFSBLOCK) - break; - bno = nextbno; - error = xfs_btree_read_bufl(mp, tp, bno, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - } - } - return 0; -} - -/* * Count fsblocks of the given fork. Delayed allocation extents are * not counted towards the totals. */ @@ -340,26 +211,19 @@ xfs_bmap_count_blocks( xfs_extnum_t *nextents, xfs_filblks_t *count) { - struct xfs_mount *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - struct xfs_btree_block *block; /* current btree block */ - struct xfs_ifork *ifp; /* fork structure */ - xfs_fsblock_t bno; /* block # of "block" */ - int level; /* btree level, for checking */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur; + xfs_extlen_t btblocks = 0; int error; - bno = NULLFSBLOCK; - mp = ip->i_mount; *nextents = 0; *count = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); + if (!ifp) return 0; switch (XFS_IFORK_FORMAT(ip, whichfork)) { - case XFS_DINODE_FMT_EXTENTS: - *nextents = xfs_bmap_count_leaves(ifp, count); - return 0; case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); @@ -367,26 +231,23 @@ xfs_bmap_count_blocks( return error; } + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + error = xfs_btree_count_blocks(cur, &btblocks); + xfs_btree_del_cursor(cur, error); + if (error) + return error; + /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + * xfs_btree_count_blocks includes the root block contained in + * the inode fork in @btblocks, so subtract one because we're + * only interested in allocated disk blocks. 
*/ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, - nextents, count); - if (error) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", - XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - return 0; + *count += btblocks - 1; + + /* fall through */ + case XFS_DINODE_FMT_EXTENTS: + *nextents = xfs_bmap_count_leaves(ifp, count); + break; } return 0; @@ -864,6 +725,7 @@ xfs_alloc_file_space( xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; + xfs_fileoff_t endoffset_fsb; int nimaps; int quota_flag; int rt; @@ -891,7 +753,8 @@ xfs_alloc_file_space( imapp = &imaps[0]; nimaps = 1; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); - allocatesize_fsb = XFS_B_TO_FSB(mp, count); + endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); + allocatesize_fsb = endoffset_fsb - startoffset_fsb; /* * Allocate file space until done or until there is an error @@ -962,8 +825,8 @@ xfs_alloc_file_space( xfs_trans_ijoin(tp, ip, 0); error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, alloc_type, resblks, - imapp, &nimaps); + allocatesize_fsb, alloc_type, 0, imapp, + &nimaps); if (error) goto error0; @@ -1037,6 +900,7 @@ out_trans_cancel: goto out_unlock; } +/* Caller must first wait for the completion of any pending DIOs if required. */ int xfs_flush_unmap_range( struct xfs_inode *ip, @@ -1048,9 +912,6 @@ xfs_flush_unmap_range( xfs_off_t rounding, start, end; int error; - /* wait for the completion of any pending DIOs */ - inode_dio_wait(inode); - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); start = round_down(offset, rounding); end = round_up(offset + len, rounding) - 1; @@ -1082,10 +943,6 @@ xfs_free_file_space( if (len <= 0) /* if nothing being freed */ return 0; - error = xfs_flush_unmap_range(ip, offset, len); - if (error) - return error; - startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); @@ -1111,7 +968,8 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + error = iomap_zero_range(VFS_I(ip), offset, len, NULL, + &xfs_buffered_write_iomap_ops); if (error) return error; @@ -1129,48 +987,12 @@ xfs_free_file_space( return error; } -/* - * Preallocate and zero a range of a file. This mechanism has the allocation - * semantics of fallocate and in addition converts data in the range to zeroes. - */ -int -xfs_zero_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len) -{ - struct xfs_mount *mp = ip->i_mount; - uint blksize; - int error; - - trace_xfs_zero_file_space(ip); - - blksize = 1 << mp->m_sb.sb_blocklog; - - /* - * Punch a hole and prealloc the range. We use hole punch rather than - * unwritten extent conversion for two reasons: - * - * 1.) Hole punch handles partial block zeroing for us. - * - * 2.) If prealloc returns ENOSPC, the file range is still zero-valued - * by virtue of the hole punch. 
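[Editor's aside] The xfs_alloc_file_space() hunk above derives the allocation length from rounded start and end offsets rather than from the rounded byte count alone, which under-counts when offset is not block-aligned. A worked example with 4096-byte blocks:

#include <stdio.h>

#define BLKSZ 4096u

/* round a byte count up / down to whole blocks */
static unsigned b_to_fsb(unsigned bytes)  { return (bytes + BLKSZ - 1) / BLKSZ; }
static unsigned b_to_fsbt(unsigned bytes) { return bytes / BLKSZ; }

int main(void)
{
	unsigned offset = 2048, count = 4096;

	/* bytes 2048..6143 touch blocks 0 and 1, yet count alone rounds to 1 */
	unsigned old_len = b_to_fsb(count);
	unsigned new_len = b_to_fsb(offset + count) - b_to_fsbt(offset);

	printf("old=%u new=%u\n", old_len, new_len);	/* old=1 new=2 */
	return 0;
}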
- */ - error = xfs_free_file_space(ip, offset, len); - if (error || xfs_is_always_cow_inode(ip)) - return error; - - return xfs_alloc_file_space(ip, round_down(offset, blksize), - round_up(offset + len, blksize) - - round_down(offset, blksize), - XFS_BMAPI_PREALLOC); -} - static int xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { + struct xfs_mount *mp = ip->i_mount; int error; /* @@ -1184,6 +1006,17 @@ xfs_prepare_shift( } /* + * Shift operations must stabilize the start block offset boundary along + * with the full range of the operation. If we don't, a COW writeback + * completion could race with an insert, front merge with the start + * extent (after split) during the shift and corrupt the file. Start + * with the block just prior to the start to stabilize the boundary. + */ + offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + if (offset) + offset -= (1 << mp->m_sb.sb_blocklog); + + /* * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ @@ -1532,24 +1365,16 @@ xfs_swap_extent_rmap( trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); /* Remove the mapping from the donor file. */ - error = xfs_bmap_unmap_extent(tp, tip, &uirec); - if (error) - goto out; + xfs_bmap_unmap_extent(tp, tip, &uirec); /* Remove the mapping from the source file. */ - error = xfs_bmap_unmap_extent(tp, ip, &irec); - if (error) - goto out; + xfs_bmap_unmap_extent(tp, ip, &irec); /* Map the donor file's blocks into the source file. */ - error = xfs_bmap_map_extent(tp, ip, &uirec); - if (error) - goto out; + xfs_bmap_map_extent(tp, ip, &uirec); /* Map the source file's blocks into the donor file. */ - error = xfs_bmap_map_extent(tp, tip, &irec); - if (error) - goto out; + xfs_bmap_map_extent(tp, tip, &irec); error = xfs_defer_finish(tpp); tp = *tpp; @@ -1756,6 +1581,14 @@ xfs_swap_extents( goto out_unlock; } + error = xfs_qm_dqattach(ip); + if (error) + goto out_unlock; + + error = xfs_qm_dqattach(tip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(ip); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 7a78229cf1a7..9f993168b55b 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -30,8 +30,6 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) } #endif /* CONFIG_XFS_RT */ -int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, - int whichfork, int *eof); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); @@ -59,8 +57,6 @@ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len, int alloc_type); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); -int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, xfs_off_t len); int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ca0849043f54..217e4f82a44a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -198,20 +198,22 @@ xfs_buf_free_maps( } } -static struct xfs_buf * +static int _xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { struct xfs_buf *bp; int error; int i; + *bpp = NULL; bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) - return NULL; + return -ENOMEM; /* * We don't want certain flags to appear in b_flags unless 
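[Editor's aside] xfs_prepare_shift() above now rounds the flush/unmap start down to a block boundary and then backs up one more block, so the extent straddling the start offset is stabilized as well. The offset adjustment in isolation:

#include <stdio.h>

int main(void)
{
	unsigned blksz = 4096, offset = 10000;

	offset = offset / blksz * blksz;	/* round_down -> 8192 */
	if (offset)
		offset -= blksz;		/* include prior block -> 4096 */

	printf("stabilize from offset %u\n", offset);
	return 0;
}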
they are @@ -238,8 +240,8 @@ _xfs_buf_alloc( */ error = xfs_buf_get_maps(bp, nmaps); if (error) { - kmem_zone_free(xfs_buf_zone, bp); - return NULL; + kmem_cache_free(xfs_buf_zone, bp); + return error; } bp->b_bn = map[0].bm_bn; @@ -256,7 +258,8 @@ _xfs_buf_alloc( XFS_STATS_INC(bp->b_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); - return bp; + *bpp = bp; + return 0; } /* @@ -304,7 +307,7 @@ _xfs_buf_free_pages( * The buffer must not be on any hash - use xfs_buf_rele instead for * hashed and refcounted buffers */ -void +static void xfs_buf_free( xfs_buf_t *bp) { @@ -328,7 +331,7 @@ xfs_buf_free( kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); xfs_buf_free_maps(bp); - kmem_zone_free(xfs_buf_zone, bp); + kmem_cache_free(xfs_buf_zone, bp); } /* @@ -345,6 +348,15 @@ xfs_buf_allocate_memory( unsigned short page_count, i; xfs_off_t start, end; int error; + xfs_km_flags_t kmflag_mask = 0; + + /* + * assure zeroed buffer for non-read cases. + */ + if (!(flags & XBF_READ)) { + kmflag_mask |= KM_ZERO; + gfp_mask |= __GFP_ZERO; + } /* * for buffers that are contained within a single page, just allocate @@ -353,7 +365,9 @@ xfs_buf_allocate_memory( */ size = BBTOB(bp->b_length); if (size < PAGE_SIZE) { - bp->b_addr = kmem_alloc(size, KM_NOFS); + int align_mask = xfs_buftarg_dma_alignment(bp->b_target); + bp->b_addr = kmem_alloc_io(size, align_mask, + KM_NOFS | kmflag_mask); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; @@ -368,7 +382,7 @@ xfs_buf_allocate_memory( } bp->b_offset = offset_in_page(bp->b_addr); bp->b_pages = bp->b_page_array; - bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_pages[0] = kmem_to_page(bp->b_addr); bp->b_page_count = 1; bp->b_flags |= _XBF_KMEM; return 0; @@ -450,7 +464,7 @@ _xfs_buf_map_pages( unsigned nofs_flag; /* - * vm_map_ram() will allocate auxillary structures (e.g. + * vm_map_ram() will allocate auxiliary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim * that we are in such a context via PF_MEMALLOC_NOFS to prevent @@ -671,53 +685,39 @@ xfs_buf_incore( * cache hits, as metadata intensive workloads will see 3 orders of magnitude * more hits than misses. */ -struct xfs_buf * +int xfs_buf_get_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; + *bpp = NULL; error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); - - switch (error) { - case 0: - /* cache hit */ + if (!error) goto found; - case -EAGAIN: - /* cache hit, trylock failure, caller handles failure */ - ASSERT(flags & XBF_TRYLOCK); - return NULL; - case -ENOENT: - /* cache miss, go for insert */ - break; - case -EFSCORRUPTED: - default: - /* - * None of the higher layers understand failure types - * yet, so return NULL to signal a fatal lookup error. 
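[Editor's aside] Much of the buffer-cache rework in this series converts "return a pointer or NULL" interfaces into "return an errno, hand the object back through **bpp", so callers can tell -ENOMEM, -EAGAIN and corruption apart. The transformation in miniature, with illustrative types:

#include <stdlib.h>

struct buf { int len; };

/* Old style: the reason for failure is lost. */
static struct buf *buf_alloc_old(int len)
{
	struct buf *bp = malloc(sizeof(*bp));

	if (!bp)
		return NULL;
	bp->len = len;
	return bp;
}

/* New style: errno return, object via out parameter, *bpp NULL on failure. */
static int buf_alloc(int len, struct buf **bpp)
{
	struct buf *bp;

	*bpp = NULL;
	if (len <= 0)
		return -22;	/* -EINVAL, now distinguishable from -ENOMEM */
	bp = malloc(sizeof(*bp));
	if (!bp)
		return -12;	/* -ENOMEM */
	bp->len = len;
	*bpp = bp;
	return 0;
}

int main(void)
{
	struct buf *bp;
	int error = buf_alloc(64, &bp);

	free(buf_alloc_old(8));
	if (!error)
		free(bp);
	return error ? 1 : 0;
}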
- */ - return NULL; - } + if (error != -ENOENT) + return error; - new_bp = _xfs_buf_alloc(target, map, nmaps, flags); - if (unlikely(!new_bp)) - return NULL; + error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp); + if (error) + return error; error = xfs_buf_allocate_memory(new_bp, flags); if (error) { xfs_buf_free(new_bp); - return NULL; + return error; } error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); if (error) { xfs_buf_free(new_bp); - return NULL; + return error; } if (bp != new_bp) @@ -730,7 +730,7 @@ found: xfs_warn(target->bt_mount, "%s: failed to map pagesn", __func__); xfs_buf_relse(bp); - return NULL; + return error; } } @@ -743,7 +743,8 @@ found: XFS_STATS_INC(target->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); - return bp; + *bpp = bp; + return 0; } STATIC int @@ -795,46 +796,77 @@ xfs_buf_reverify( return bp->b_error; } -xfs_buf_t * +int xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - const struct xfs_buf_ops *ops) + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops, + xfs_failaddr_t fa) { struct xfs_buf *bp; + int error; flags |= XBF_READ; + *bpp = NULL; - bp = xfs_buf_get_map(target, map, nmaps, flags); - if (!bp) - return NULL; + error = xfs_buf_get_map(target, map, nmaps, flags, &bp); + if (error) + return error; trace_xfs_buf_read(bp, flags, _RET_IP_); if (!(bp->b_flags & XBF_DONE)) { + /* Initiate the buffer read and wait. */ XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; - _xfs_buf_read(bp, flags); - return bp; + error = _xfs_buf_read(bp, flags); + + /* Readahead iodone already dropped the buffer, so exit. */ + if (flags & XBF_ASYNC) + return 0; + } else { + /* Buffer already read; all we need to do is check it. */ + error = xfs_buf_reverify(bp, ops); + + /* Readahead already finished; drop the buffer and exit. */ + if (flags & XBF_ASYNC) { + xfs_buf_relse(bp); + return 0; + } + + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + ASSERT(bp->b_ops != NULL || ops == NULL); } - xfs_buf_reverify(bp, ops); + /* + * If we've had a read error, then the contents of the buffer are + * invalid and should not be used. To ensure that a followup read tries + * to pull the buffer from disk again, we clear the XBF_DONE flag and + * mark the buffer stale. This ensures that anyone who has a current + * reference to the buffer will interpret it's contents correctly and + * future cache lookups will also treat it as an empty, uninitialised + * buffer. 
+ */ + if (error) { + if (!XFS_FORCED_SHUTDOWN(target->bt_mount)) + xfs_buf_ioerror_alert(bp, fa); - if (flags & XBF_ASYNC) { - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); xfs_buf_relse(bp); - return NULL; + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; } - /* We do not want read in the flags */ - bp->b_flags &= ~XBF_READ; - ASSERT(bp->b_ops != NULL || ops == NULL); - return bp; + *bpp = bp; + return 0; } /* @@ -848,11 +880,14 @@ xfs_buf_readahead_map( int nmaps, const struct xfs_buf_ops *ops) { + struct xfs_buf *bp; + if (bdi_read_congested(target->bt_bdev->bd_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); + XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, + __this_address); } /* @@ -869,12 +904,13 @@ xfs_buf_read_uncached( const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + int error; *bpp = NULL; - bp = xfs_buf_get_uncached(target, numblks, flags); - if (!bp) - return -ENOMEM; + error = xfs_buf_get_uncached(target, numblks, flags, &bp); + if (error) + return error; /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); @@ -885,7 +921,7 @@ xfs_buf_read_uncached( xfs_buf_submit(bp); if (bp->b_error) { - int error = bp->b_error; + error = bp->b_error; xfs_buf_relse(bp); return error; } @@ -894,20 +930,23 @@ xfs_buf_read_uncached( return 0; } -xfs_buf_t * +int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - int flags) + int flags, + struct xfs_buf **bpp) { unsigned long page_count; int error, i; struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); + *bpp = NULL; + /* flags might contain irrelevant bits, pass only what we care about */ - bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); - if (unlikely(bp == NULL)) + error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); + if (error) goto fail; page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; @@ -917,8 +956,10 @@ xfs_buf_get_uncached( for (i = 0; i < page_count; i++) { bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); - if (!bp->b_pages[i]) + if (!bp->b_pages[i]) { + error = -ENOMEM; goto fail_free_mem; + } } bp->b_flags |= _XBF_PAGES; @@ -930,7 +971,8 @@ xfs_buf_get_uncached( } trace_xfs_buf_get_uncached(bp, _RET_IP_); - return bp; + *bpp = bp; + return 0; fail_free_mem: while (--i >= 0) @@ -938,9 +980,9 @@ xfs_buf_get_uncached( _xfs_buf_free_pages(bp); fail_free_buf: xfs_buf_free_maps(bp); - kmem_zone_free(xfs_buf_zone, bp); + kmem_cache_free(xfs_buf_zone, bp); fail: - return NULL; + return error; } /* @@ -1194,10 +1236,10 @@ __xfs_buf_ioerror( void xfs_buf_ioerror_alert( struct xfs_buf *bp, - const char *func) + xfs_failaddr_t func) { xfs_alert(bp->b_mount, -"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d", +"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, -bp->b_error); } @@ -1250,8 +1292,7 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int op, - int op_flags) + int op) { int page_index; int total_nr_pages = bp->b_page_count; @@ -1286,7 +1327,7 @@ next_chunk: bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - bio_set_op_attrs(bio, op, op_flags); + bio->bi_opf = op; for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1331,7 +1372,6 @@ _xfs_buf_ioapply( { struct blk_plug plug; 
int op; - int op_flags = 0; int offset; int size; int i; @@ -1373,15 +1413,14 @@ _xfs_buf_ioapply( dump_stack(); } } - } else if (bp->b_flags & XBF_READ_AHEAD) { - op = REQ_OP_READ; - op_flags = REQ_RAHEAD; } else { op = REQ_OP_READ; + if (bp->b_flags & XBF_READ_AHEAD) + op |= REQ_RAHEAD; } /* we only use the buffer cache for meta-data */ - op_flags |= REQ_META; + op |= REQ_META; /* * Walk all the vectors issuing IO on them. Set up the initial offset @@ -1393,7 +1432,7 @@ _xfs_buf_ioapply( size = BBTOB(bp->b_length); blk_start_plug(&plug); for (i = 0; i < bp->b_map_count; i++) { - xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); + xfs_buf_ioapply_map(bp, i, &offset, &size, op); if (bp->b_error) break; if (size <= 0) @@ -1741,7 +1780,7 @@ xfs_alloc_buftarg( { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); + btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; @@ -2052,8 +2091,9 @@ xfs_buf_delwri_pushbuf( int __init xfs_buf_init(void) { - xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", - KM_ZONE_HWALIGN, NULL); + xfs_buf_zone = kmem_cache_create("xfs_buf", + sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN, NULL); if (!xfs_buf_zone) goto out; @@ -2066,7 +2106,7 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - kmem_zone_destroy(xfs_buf_zone); + kmem_cache_destroy(xfs_buf_zone); } void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) @@ -2096,7 +2136,7 @@ xfs_verify_magic( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; } @@ -2114,7 +2154,7 @@ xfs_verify_magic16( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c6e57a3f409e..d79a1fe5d738 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -192,37 +192,40 @@ struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, - const struct xfs_buf_ops *ops); +int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); +int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops, xfs_failaddr_t fa); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops); -static inline struct xfs_buf * +static inline int xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks) + size_t numblks, + struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_get_map(target, &map, 1, 0); + + return xfs_buf_get_map(target, &map, 1, 0, bpp); } -static inline struct xfs_buf * +static inline int xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - 
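[Editor's aside] _xfs_buf_ioapply() drops the separate op_flags variable and composes the whole bio->bi_opf word in op, OR-ing REQ_RAHEAD and REQ_META into the base operation. The flag algebra, modeled with stand-in constants rather than the real REQ_* values:

#include <stdio.h>

/* stand-ins for the block layer's REQ_* values, not the real ones */
#define OP_READ    0u
#define OP_WRITE   1u
#define F_RAHEAD  (1u << 8)
#define F_META    (1u << 9)

int main(void)
{
	unsigned op = OP_READ;
	int readahead = 1;

	if (readahead)
		op |= F_RAHEAD;
	op |= F_META;		/* buffer cache is metadata-only */

	printf("bi_opf = 0x%x\n", op);	/* 0x300 */
	return 0;
}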
return xfs_buf_read_map(target, &map, 1, flags, ops); + + return xfs_buf_read_map(target, &map, 1, flags, bpp, ops, + __builtin_return_address(0)); } static inline void @@ -236,15 +239,14 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, - int flags); +int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, + struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ -extern void xfs_buf_free(xfs_buf_t *); extern void xfs_buf_rele(xfs_buf_t *); /* Locking and Unlocking Buffers */ @@ -260,7 +262,7 @@ extern void xfs_buf_ioend(struct xfs_buf *bp); extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) -extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); +extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); extern int __xfs_buf_submit(struct xfs_buf *bp, bool); static inline int xfs_buf_submit(struct xfs_buf *bp) @@ -350,6 +352,12 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) +static inline int +xfs_buftarg_dma_alignment(struct xfs_buftarg *bt) +{ + return queue_dma_alignment(bt->bt_bdev->bd_disk->queue); +} + int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7dcaec54a20b..663810e6cd59 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -27,6 +27,23 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +/* Is this log iovec plausibly large enough to contain the buffer log format? 
*/ +bool +xfs_buf_log_check_iovec( + struct xfs_log_iovec *iovec) +{ + struct xfs_buf_log_format *blfp = iovec->i_addr; + char *bmp_end; + char *item_end; + + if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len) + return false; + + item_end = (char *)iovec->i_addr + iovec->i_len; + bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size]; + return bmp_end <= item_end; +} + static inline int xfs_buf_log_format_size( struct xfs_buf_log_format *blfp) @@ -688,7 +705,7 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_push = xfs_buf_item_push, }; -STATIC int +STATIC void xfs_buf_item_get_format( struct xfs_buf_log_item *bip, int count) @@ -698,14 +715,11 @@ xfs_buf_item_get_format( if (count == 1) { bip->bli_formats = &bip->__bli_format; - return 0; + return; } bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), - KM_SLEEP); - if (!bip->bli_formats) - return -ENOMEM; - return 0; + 0); } STATIC void @@ -731,7 +745,6 @@ xfs_buf_item_init( struct xfs_buf_log_item *bip = bp->b_log_item; int chunks; int map_size; - int error; int i; /* @@ -747,7 +760,7 @@ xfs_buf_item_init( return 0; } - bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); + bip = kmem_zone_zalloc(xfs_buf_item_zone, 0); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; @@ -760,19 +773,22 @@ xfs_buf_item_init( * Discontiguous buffer support follows the layout of the underlying * buffer. This makes the implementation as simple as possible. */ - error = xfs_buf_item_get_format(bip, bp->b_map_count); - ASSERT(error == 0); - if (error) { /* to stop gcc throwing set-but-unused warnings */ - kmem_zone_free(xfs_buf_item_zone, bip); - return error; - } - + xfs_buf_item_get_format(bip, bp->b_map_count); for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), XFS_BLF_CHUNK); map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) { + kmem_cache_free(xfs_buf_item_zone, bip); + xfs_err(mp, + "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!", + map_size, + BBTOB(bp->b_maps[i].bm_len)); + return -EFSCORRUPTED; + } + bip->bli_formats[i].blf_type = XFS_LI_BUF; bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; @@ -805,6 +821,9 @@ xfs_buf_item_log_segment( uint end_bit; uint mask; + ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + /* * Convert byte offsets to bit numbers. */ @@ -851,7 +870,7 @@ xfs_buf_item_log_segment( * first_bit and last_bit. 
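[Editor's aside] xfs_buf_log_check_iovec() above defends log recovery against short iovecs with two checks: the fixed header must fit in the iovec, and the flexible dirty bitmap it describes must end inside it. A userspace model of the same logic, with a simplified format struct:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct log_fmt {
	uint32_t magic;
	uint32_t map_size;	/* number of bitmap words that follow */
	uint32_t data_map[];	/* flexible array member */
};

static bool check_iovec(const void *addr, size_t len)
{
	const struct log_fmt *f = addr;
	const char *item_end = (const char *)addr + len;
	const char *bmp_end;

	/* the fixed header (everything before the bitmap) must fit */
	if (offsetof(struct log_fmt, data_map) > len)
		return false;

	/* the bitmap described by map_size must end inside the iovec */
	bmp_end = (const char *)&f->data_map[f->map_size];
	return bmp_end <= item_end;
}

int main(void)
{
	uint32_t iovec[4] = { 0x1234, 2, 0, 0 };	/* header + 2 words */

	printf("%d %d\n", check_iovec(iovec, 16), check_iovec(iovec, 12));
	return 0;	/* prints "1 0" */
}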
*/ while ((bits_to_set - bits_set) >= NBWORD) { - *wordp |= 0xffffffff; + *wordp = 0xffffffff; bits_set += NBWORD; wordp++; } @@ -939,7 +958,7 @@ xfs_buf_item_free( { xfs_buf_item_free_format(bip); kmem_free(bip->bli_item.li_lv_shadow); - kmem_zone_free(xfs_buf_item_zone, bip); + kmem_cache_free(xfs_buf_item_zone, bip); } /* @@ -956,7 +975,7 @@ xfs_buf_item_relse( struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; if (list_empty(&bp->b_li_list)) @@ -1094,7 +1113,7 @@ xfs_buf_iodone_callback_error( if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); } lasttarg = bp->b_target; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 4a054b11011a..30114b510332 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -61,6 +61,7 @@ void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, struct list_head *); +bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 283df898dd9f..0d3b640cf1cc 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -17,6 +17,7 @@ #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_trans.h" +#include "xfs_error.h" /* * Directory file type support functions @@ -47,6 +48,7 @@ xfs_dir2_sf_getdents( { int i; /* shortform entry number */ struct xfs_inode *dp = args->dp; /* incore directory inode */ + struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ @@ -68,15 +70,15 @@ xfs_dir2_sf_getdents( return 0; /* - * Precalculate offsets for . and .. as we will always need them. - * - * XXX(hch): the second argument is sometimes 0 and sometimes - * geo->datablk + * Precalculate offsets for "." and ".." as we will always need them. + * This relies on the fact that directories always start with the + * entries for "." and "..". */ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, - dp->d_ops->data_dot_offset); + geo->data_entry_offset); dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, - dp->d_ops->data_dotdot_offset); + geo->data_entry_offset + + xfs_dir2_data_entsize(mp, sizeof(".") - 1)); /* * Put . entry unless we're starting past it. @@ -91,7 +93,7 @@ xfs_dir2_sf_getdents( * Put .. entry unless we're starting past it. 
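[Editor's aside] In the log-segment hunk above, once the range being dirtied covers a full 32-bit word there are no existing bits worth preserving, so the read-modify-write |= becomes a plain store of 0xffffffff. A self-contained version of that range-set:

#include <stdint.h>
#include <stdio.h>

#define NBWORD 32u

static void set_bit_range(uint32_t *map, unsigned first, unsigned nbits)
{
	unsigned word = first / NBWORD;
	unsigned bit  = first % NBWORD;

	/* partial leading word: must preserve neighbours, so OR bit by bit */
	while (nbits && bit) {
		map[word] |= 1u << bit;
		bit = (bit + 1) % NBWORD;
		if (!bit)
			word++;
		nbits--;
	}
	/* full words: straight store, nothing to preserve */
	while (nbits >= NBWORD) {
		map[word++] = 0xffffffff;
		nbits -= NBWORD;
	}
	/* partial trailing word */
	while (nbits--)
		map[word] |= 1u << bit++;
}

int main(void)
{
	uint32_t map[3] = { 0 };

	set_bit_range(map, 30, 40);	/* bits 30..69 */
	printf("%08x %08x %08x\n", map[0], map[1], map[2]);
	return 0;	/* c0000000 ffffffff 0000003f */
}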
*/ if (ctx->pos <= dotdot_offset) { - ino = dp->d_ops->sf_get_parent_ino(sfp); + ino = xfs_dir2_sf_get_parent_ino(sfp); ctx->pos = dotdot_offset & 0x7fffffff; if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) return 0; @@ -108,17 +110,21 @@ xfs_dir2_sf_getdents( xfs_dir2_sf_get_offset(sfep)); if (ctx->pos > off) { - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); continue; } - ino = dp->d_ops->sf_get_ino(sfp, sfep); - filetype = dp->d_ops->sf_get_ftype(sfep); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); + filetype = xfs_dir2_sf_get_ftype(mp, sfep); ctx->pos = off & 0x7fffffff; + if (XFS_IS_CORRUPT(dp->i_mount, + !xfs_dir2_namecheck(sfep->name, + sfep->namelen))) + return -EFSCORRUPTED; if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, - xfs_dir3_get_dtype(dp->i_mount, filetype))) + xfs_dir3_get_dtype(mp, filetype))) return 0; - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & @@ -135,17 +141,14 @@ xfs_dir2_block_getdents( struct dir_context *ctx) { struct xfs_inode *dp = args->dp; /* incore directory inode */ - xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ - xfs_dir2_data_entry_t *dep; /* block data entry */ - xfs_dir2_data_unused_t *dup; /* block unused entry */ - char *endptr; /* end of the data entries */ int error; /* error return value */ - char *ptr; /* current data entry */ int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; int lock_mode; + unsigned int offset; + unsigned int end; /* * If the block number in the offset is out of range, we're done. @@ -164,56 +167,55 @@ xfs_dir2_block_getdents( * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); - hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); - /* - * Set up values for the loop. - */ - ptr = (char *)dp->d_ops->data_entry_p(hdr); - endptr = xfs_dir3_data_endp(geo, hdr); /* * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). */ - while (ptr < endptr) { + offset = geo->data_entry_offset; + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; uint8_t filetype; - dup = (xfs_dir2_data_unused_t *)ptr; /* * Unused, skip it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ptr += be16_to_cpu(dup->length); + offset += be16_to_cpu(dup->length); continue; } - dep = (xfs_dir2_data_entry_t *)ptr; - /* * Bump pointer for the next iteration. */ - ptr += dp->d_ops->data_entsize(dep->namelen); + offset += xfs_dir2_data_entsize(dp->i_mount, dep->namelen); + /* * The entry is before the desired starting point, skip it. */ - if ((char *)dep - (char *)hdr < wantoff) + if (offset < wantoff) continue; - cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, - (char *)dep - (char *)hdr); + cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, offset); ctx->pos = cook & 0x7fffffff; - filetype = dp->d_ops->data_get_ftype(dep); + filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); /* * If it didn't fit, set the final offset to here & return. 
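[Editor's aside] The getdents rework in this file walks directory data blocks by an unsigned byte offset into bp->b_addr instead of juggling char pointers; each record supplies its own length, and free records are skipped in place. A minimal walk over toy variable-length records:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FREE_TAG 0xffff

/* toy record header: a length and a tag; FREE_TAG marks unused space */
struct rec { uint16_t len; uint16_t tag; };

static void walk(const unsigned char *buf, unsigned end)
{
	unsigned offset = 0;

	while (offset < end) {
		struct rec r;

		memcpy(&r, buf + offset, sizeof(r));
		if (r.tag == FREE_TAG) {	/* unused, skip it */
			offset += r.len;
			continue;
		}
		printf("live record at %u, len %u\n", offset, r.len);
		offset += r.len;
	}
}

int main(void)
{
	unsigned char buf[24] = { 0 };
	struct rec recs[3] = { { 8, 1 }, { 8, FREE_TAG }, { 8, 2 } };

	memcpy(buf +  0, &recs[0], sizeof(struct rec));
	memcpy(buf +  8, &recs[1], sizeof(struct rec));
	memcpy(buf + 16, &recs[2], sizeof(struct rec));
	walk(buf, sizeof(buf));
	return 0;
}

An offset is also easier to bounds-check against the block size than a raw pointer, which fits the corruption-hardening theme of the rest of the series.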
*/ + if (XFS_IS_CORRUPT(dp->i_mount, + !xfs_dir2_namecheck(dep->name, + dep->namelen))) { + error = -EFSCORRUPTED; + goto out_rele; + } if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), - xfs_dir3_get_dtype(dp->i_mount, filetype))) { - xfs_trans_brelse(args->trans, bp); - return 0; - } + xfs_dir3_get_dtype(dp->i_mount, filetype))) + goto out_rele; } /* @@ -222,8 +224,9 @@ xfs_dir2_block_getdents( */ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; +out_rele: xfs_trans_brelse(args->trans, bp); - return 0; + return error; } /* @@ -276,7 +279,7 @@ xfs_dir2_leaf_readbuf( new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); if (new_off > *cur_off) *cur_off = new_off; - error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp); + error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp); if (error) goto out; @@ -311,7 +314,8 @@ xfs_dir2_leaf_readbuf( break; } if (next_ra > *ra_blk) { - xfs_dir3_data_readahead(dp, next_ra, -2); + xfs_dir3_data_readahead(dp, next_ra, + XFS_DABUF_MAP_HOLE_OK); *ra_blk = next_ra; } ra_want -= geo->fsbcount; @@ -343,17 +347,17 @@ xfs_dir2_leaf_getdents( size_t bufsize) { struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp = NULL; /* data block buffer */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ - char *ptr = NULL; /* pointer to current data */ struct xfs_da_geometry *geo = args->geo; xfs_dablk_t rablk = 0; /* current readahead block */ xfs_dir2_off_t curoff; /* current overall offset */ int length; /* temporary length value */ int byteoff; /* offset in current block */ int lock_mode; + unsigned int offset = 0; int error = 0; /* error return value */ /* @@ -380,7 +384,7 @@ xfs_dir2_leaf_getdents( * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ - if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + if (!bp || offset >= geo->blksize) { if (bp) { xfs_trans_brelse(args->trans, bp); bp = NULL; @@ -393,36 +397,35 @@ xfs_dir2_leaf_getdents( if (error || !bp) break; - hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* * Find our position in the block. */ - ptr = (char *)dp->d_ops->data_entry_p(hdr); + offset = geo->data_entry_offset; byteoff = xfs_dir2_byte_to_off(geo, curoff); /* * Skip past the header. */ if (byteoff == 0) - curoff += dp->d_ops->data_entry_offset; + curoff += geo->data_entry_offset; /* * Skip past entries until we reach our offset. */ else { - while ((char *)ptr - (char *)hdr < byteoff) { - dup = (xfs_dir2_data_unused_t *)ptr; + while (offset < byteoff) { + dup = bp->b_addr + offset; if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { length = be16_to_cpu(dup->length); - ptr += length; + offset += length; continue; } - dep = (xfs_dir2_data_entry_t *)ptr; - length = - dp->d_ops->data_entsize(dep->namelen); - ptr += length; + dep = bp->b_addr + offset; + length = xfs_dir2_data_entsize(mp, + dep->namelen); + offset += length; } /* * Now set our real offset. @@ -430,32 +433,38 @@ xfs_dir2_leaf_getdents( curoff = xfs_dir2_db_off_to_byte(geo, xfs_dir2_byte_to_db(geo, curoff), - (char *)ptr - (char *)hdr); - if (ptr >= (char *)hdr + geo->blksize) { + offset); + if (offset >= geo->blksize) continue; - } } } + /* - * We have a pointer to an entry. - * Is it a live one? + * We have a pointer to an entry. Is it a live one? 
*/ - dup = (xfs_dir2_data_unused_t *)ptr; + dup = bp->b_addr + offset; + /* * No, it's unused, skip over it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { length = be16_to_cpu(dup->length); - ptr += length; + offset += length; curoff += length; continue; } - dep = (xfs_dir2_data_entry_t *)ptr; - length = dp->d_ops->data_entsize(dep->namelen); - filetype = dp->d_ops->data_get_ftype(dep); + dep = bp->b_addr + offset; + length = xfs_dir2_data_entsize(mp, dep->namelen); + filetype = xfs_dir2_data_get_ftype(mp, dep); ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + if (XFS_IS_CORRUPT(dp->i_mount, + !xfs_dir2_namecheck(dep->name, + dep->namelen))) { + error = -EFSCORRUPTED; + break; + } if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) @@ -464,7 +473,7 @@ xfs_dir2_leaf_getdents( /* * Advance to next entry in the block. */ - ptr += length; + offset += length; curoff += length; /* bufsize may have just been a guess; don't go negative */ bufsize = bufsize > length ? bufsize - length : 0; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8ec7aab89044..0b8350e84d28 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -13,6 +13,7 @@ #include "xfs_btree.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" +#include "xfs_discard.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_trace.h" @@ -44,7 +45,7 @@ xfs_trim_extents( xfs_log_force(mp, XFS_LOG_SYNC); error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); - if (error || !agbp) + if (error) goto out_put_perag; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); @@ -70,7 +71,10 @@ xfs_trim_extents( error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_del_cursor; + } ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index fb1ad4483081..d223e1ae90a6 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -48,7 +48,7 @@ static struct lock_class_key xfs_dquot_project_class; */ void xfs_qm_dqdestroy( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); @@ -56,7 +56,7 @@ xfs_qm_dqdestroy( mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); - kmem_zone_free(xfs_qm_dqzone, dqp); + kmem_cache_free(xfs_qm_dqzone, dqp); } /* @@ -113,8 +113,8 @@ xfs_qm_adjust_dqlimits( */ void xfs_qm_adjust_dqtimers( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) + struct xfs_mount *mp, + struct xfs_disk_dquot *d) { ASSERT(d->d_id); @@ -137,7 +137,7 @@ xfs_qm_adjust_dqtimers( (d->d_blk_hardlimit && (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = cpu_to_be32(get_seconds() + + d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; @@ -160,7 +160,7 @@ xfs_qm_adjust_dqtimers( (d->d_ino_hardlimit && (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = cpu_to_be32(get_seconds() + + d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; @@ -183,7 +183,7 @@ xfs_qm_adjust_dqtimers( (d->d_rtb_hardlimit && (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = cpu_to_be32(get_seconds() + + d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + 
mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; @@ -305,8 +305,8 @@ xfs_dquot_disk_alloc( /* Create the block mapping. */ xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - XFS_QM_DQALLOC_SPACE_RES(mp), &map, &nmaps); + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, + &nmaps); if (error) return error; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -320,10 +320,10 @@ xfs_dquot_disk_alloc( dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; bp->b_ops = &xfs_dquot_buf_ops; /* @@ -440,7 +440,7 @@ xfs_dquot_alloc( { struct xfs_dquot *dqp; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); + dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); dqp->dq_flags = type; dqp->q_core.d_id = cpu_to_be32(id); @@ -497,7 +497,7 @@ xfs_dquot_from_disk( struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; /* copy everything from disk dquot to the incore dquot */ - memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); /* * Reservation counters are defined as reservation plus current usage @@ -833,7 +833,7 @@ xfs_qm_id_for_quotatype( case XFS_DQ_GROUP: return ip->i_d.di_gid; case XFS_DQ_PROJ: - return xfs_get_projid(ip); + return ip->i_d.di_projid; } ASSERT(0); return 0; @@ -989,7 +989,7 @@ xfs_qm_dqput( */ void xfs_qm_dqrele( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { if (!dqp) return; @@ -1018,8 +1018,8 @@ xfs_qm_dqflush_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; - xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; + struct xfs_dquot *dqp = qip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; /* @@ -1126,11 +1126,11 @@ xfs_qm_dqflush( xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return -EIO; + return -EFSCORRUPTED; } /* This is the only portion of data that needs to persist */ - memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); + memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); /* * Clear the dirty field and remember the flush lsn for later use. 
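[Editor's aside] The quota timer hunks above replace get_seconds(), whose return type overflows on 32-bit ABIs in 2038, with the 64-bit ktime_get_real_seconds(); the expiry math itself stays "now plus grace period" (and the on-disk field remains 32 bits via cpu_to_be32()). Modeled in userspace:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	/* 64-bit "now", as ktime_get_real_seconds() provides in the kernel */
	int64_t now = (int64_t)time(NULL);
	int64_t btimelimit = 7 * 24 * 3600;	/* e.g. a one-week grace period */

	int64_t btimer = now + btimelimit;
	printf("soft limit expires at %lld\n", (long long)btimer);
	return 0;
}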
@@ -1188,8 +1188,8 @@ out_unlock: */ void xfs_dqlock2( - xfs_dquot_t *d1, - xfs_dquot_t *d2) + struct xfs_dquot *d1, + struct xfs_dquot *d2) { if (d1 && d2) { ASSERT(d1 != d2); @@ -1211,20 +1211,22 @@ xfs_dqlock2( int __init xfs_qm_init(void) { - xfs_qm_dqzone = - kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot"); + xfs_qm_dqzone = kmem_cache_create("xfs_dquot", + sizeof(struct xfs_dquot), + 0, 0, NULL); if (!xfs_qm_dqzone) goto out; - xfs_qm_dqtrxzone = - kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx"); + xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx", + sizeof(struct xfs_dquot_acct), + 0, 0, NULL); if (!xfs_qm_dqtrxzone) goto out_free_dqzone; return 0; out_free_dqzone: - kmem_zone_destroy(xfs_qm_dqzone); + kmem_cache_destroy(xfs_qm_dqzone); out: return -ENOMEM; } @@ -1232,14 +1234,14 @@ out: void xfs_qm_exit(void) { - kmem_zone_destroy(xfs_qm_dqtrxzone); - kmem_zone_destroy(xfs_qm_dqzone); + kmem_cache_destroy(xfs_qm_dqtrxzone); + kmem_cache_destroy(xfs_qm_dqzone); } /* * Iterate every dquot of a particular type. The caller must ensure that the * particular quota type is active. iter_fn can return negative error codes, - * or XFS_ITER_ABORT to indicate that it wants to stop iterating. + * or -ECANCELED to indicate that it wants to stop iterating. */ int xfs_qm_dqiterate( diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 4fe85709d55d..fe3e46df604b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -30,33 +30,36 @@ enum { /* * The incore dquot structure */ -typedef struct xfs_dquot { - uint dq_flags; /* various flags (XFS_DQ_*) */ - struct list_head q_lru; /* global free list of dquots */ - struct xfs_mount*q_mount; /* filesystem this relates to */ - uint q_nrefs; /* # active refs from inodes */ - xfs_daddr_t q_blkno; /* blkno of dquot buffer */ - int q_bufoffset; /* off of dq in buffer (# dquots) */ - xfs_fileoff_t q_fileoffset; /* offset in quotas file */ - - xfs_disk_dquot_t q_core; /* actual usage & quotas */ - xfs_dq_logitem_t q_logitem; /* dquot log item */ - xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ - xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ - xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ - xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ - xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ - int64_t q_low_space[XFS_QLOWSP_MAX]; - struct mutex q_qlock; /* quota lock */ - struct completion q_flush; /* flush completion queue */ - atomic_t q_pincount; /* dquot pin count */ - wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ -} xfs_dquot_t; +struct xfs_dquot { + uint dq_flags; + struct list_head q_lru; + struct xfs_mount *q_mount; + uint q_nrefs; + xfs_daddr_t q_blkno; + int q_bufoffset; + xfs_fileoff_t q_fileoffset; + + struct xfs_disk_dquot q_core; + struct xfs_dq_logitem q_logitem; + /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_bcount; + /* total inos allocd+reserved */ + xfs_qcnt_t q_res_icount; + /* total realtime blks used+reserved */ + xfs_qcnt_t q_res_rtbcount; + xfs_qcnt_t q_prealloc_lo_wmark; + xfs_qcnt_t q_prealloc_hi_wmark; + int64_t q_low_space[XFS_QLOWSP_MAX]; + struct mutex q_qlock; + struct completion q_flush; + atomic_t q_pincount; + struct wait_queue_head q_pinwait; +}; /* * Lock hierarchy for q_qlock: * XFS_QLOCK_NORMAL is the implicit default, - * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 */ enum { XFS_QLOCK_NORMAL = 0, @@ -64,21 +67,21 @@ 
enum { }; /* - * Manage the q_flush completion queue embedded in the dquot. This completion + * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to * disk. */ -static inline void xfs_dqflock(xfs_dquot_t *dqp) +static inline void xfs_dqflock(struct xfs_dquot *dqp) { wait_for_completion(&dqp->q_flush); } -static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp) +static inline bool xfs_dqflock_nowait(struct xfs_dquot *dqp) { return try_wait_for_completion(&dqp->q_flush); } -static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +static inline void xfs_dqfunlock(struct xfs_dquot *dqp) { complete(&dqp->q_flush); } @@ -112,7 +115,7 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) } } -static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) +static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type) { switch (type & XFS_DQ_ALLTYPES) { case XFS_DQ_USER: @@ -147,31 +150,30 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -extern void xfs_qm_dqdestroy(xfs_dquot_t *); -extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); -extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); -extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, - xfs_disk_dquot_t *); -extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, - struct xfs_dquot *); -extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, - uint type); -extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, +void xfs_qm_dqdestroy(struct xfs_dquot *dqp); +int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); +void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); +void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, + struct xfs_disk_dquot *d); +void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, + struct xfs_dquot *d); +xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); +int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, uint type, bool can_alloc, struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, - bool can_alloc, - struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, +int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, + bool can_alloc, + struct xfs_dquot **dqpp); +int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, uint type, struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_uncached(struct xfs_mount *mp, - xfs_dqid_t id, uint type, - struct xfs_dquot **dqpp); -extern void xfs_qm_dqput(xfs_dquot_t *); +int xfs_qm_dqget_uncached(struct xfs_mount *mp, + xfs_dqid_t id, uint type, + struct xfs_dquot **dqpp); +void xfs_qm_dqput(struct xfs_dquot *dqp); -extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); -extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); +void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 282ec5af293e..d60647d7197b 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -347,7 +347,7 @@ xfs_qm_qoff_logitem_init( { struct xfs_qoff_logitem *qf; - qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0); xfs_log_item_init(mp, 
&qf->qql_item, XFS_LI_QUOTAOFF, start ? &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 1aed34ccdabc..3bb19e556ade 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -11,25 +11,27 @@ struct xfs_trans; struct xfs_mount; struct xfs_qoff_logitem; -typedef struct xfs_dq_logitem { - struct xfs_log_item qli_item; /* common portion */ - struct xfs_dquot *qli_dquot; /* dquot ptr */ - xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ -} xfs_dq_logitem_t; +struct xfs_dq_logitem { + struct xfs_log_item qli_item; /* common portion */ + struct xfs_dquot *qli_dquot; /* dquot ptr */ + xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ +}; -typedef struct xfs_qoff_logitem { - struct xfs_log_item qql_item; /* common portion */ - struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ +struct xfs_qoff_logitem { + struct xfs_log_item qql_item; /* common portion */ + struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ unsigned int qql_flags; -} xfs_qoff_logitem_t; +}; -extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *); -extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *, - struct xfs_qoff_logitem *, uint); -extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *, - struct xfs_qoff_logitem *, uint); -extern void xfs_trans_log_quotaoff_item(struct xfs_trans *, - struct xfs_qoff_logitem *); +void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); +struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags); +struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp, + struct xfs_qoff_logitem *startqoff, + uint flags); +void xfs_trans_log_quotaoff_item(struct xfs_trans *tp, + struct xfs_qoff_logitem *qlp); #endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 544c9482a0ef..331765afc53e 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -213,7 +213,7 @@ xfs_errortag_init( struct xfs_mount *mp) { mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; @@ -257,7 +257,7 @@ xfs_errortag_test( xfs_warn_ratelimited(mp, "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, mp->m_fsname); + expression, file, line, mp->m_super->s_id); return true; } @@ -329,19 +329,40 @@ xfs_corruption_error( const char *tag, int level, struct xfs_mount *mp, - void *buf, + const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr) { - if (level <= xfs_error_level) + if (buf && level <= xfs_error_level) xfs_hex_dump(buf, bufsize); xfs_error_report(tag, level, mp, filename, linenum, failaddr); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } /* + * Complain about the kinds of metadata corruption that we can't detect from a + * verifier, such as incorrect inter-block relationship data. Does not set + * bp->b_error. + */ +void +xfs_buf_corruption_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata corruption detected at %pS, %s block 0x%llx", + __return_address, bp->b_ops->name, bp->b_bn); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} + +/* * Warnings specifically for verifier errors. Differentiate CRC vs. 
invalid * values, and omit the stack trace unless the error level is tuned high. */ @@ -350,7 +371,7 @@ xfs_buf_verifier_error( struct xfs_buf *bp, int error, const char *name, - void *buf, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr) { @@ -402,7 +423,7 @@ xfs_inode_verifier_error( struct xfs_inode *ip, int error, const char *name, - void *buf, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr) { diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 602aa7d62b66..31a5d321ba9a 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -12,16 +12,17 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, const char *filename, int linenum, xfs_failaddr_t failaddr); extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, void *buf, size_t bufsize, + struct xfs_mount *mp, const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_buf_corruption_error(struct xfs_buf *bp); extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, void *buf, size_t bufsz, + const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); extern void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, void *buf, size_t bufsz, + const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ @@ -37,32 +38,6 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, /* Dump 128 bytes of any corrupt buffer */ #define XFS_CORRUPTION_DUMP_LEN (128) -/* - * Macros to set EFSCORRUPTED & return/branch. - */ -#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \ - { \ - int fs_is_ok = (x); \ - ASSERT(fs_is_ok); \ - if (unlikely(!fs_is_ok)) { \ - XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ - XFS_ERRLEVEL_LOW, mp); \ - error = -EFSCORRUPTED; \ - goto l; \ - } \ - } - -#define XFS_WANT_CORRUPTED_RETURN(mp, x) \ - { \ - int fs_is_ok = (x); \ - ASSERT(fs_is_ok); \ - if (unlikely(!fs_is_ok)) { \ - XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ - XFS_ERRLEVEL_LOW, mp); \ - return -EFSCORRUPTED; \ - } \ - } - #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); extern void xfs_errortag_del(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 0ed68379e551..3991e59cfd18 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -33,7 +33,7 @@ xfs_extent_busy_insert( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); + new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0); new->agno = agno; new->bno = bno; new->length = len; @@ -367,7 +367,7 @@ restart: * If this is a metadata allocation, try to reuse the busy * extent instead of trimming the allocation. 
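The xfs_error.h hunk above removes the XFS_WANT_CORRUPTED_GOTO/RETURN macros, which bundled an assert, the error assignment, and the control transfer behind one name. Call sites now open-code the check with XFS_IS_CORRUPT, as the xfs_trim_extents() hunk earlier in this diff shows; note that the condition inverts, because the old macros asserted the good case while XFS_IS_CORRUPT tests for the corrupt one. Before and after, side by side:

        /* before: error assignment and goto hidden inside the macro */
        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);

        /* after: inverted test, report, and control flow all explicit */
        if (XFS_IS_CORRUPT(mp, i != 1)) {
                error = -EFSCORRUPTED;
                goto out_del_cursor;
        }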
*/ - if (!xfs_alloc_is_userdata(args->datatype) && + if (!(args->datatype & XFS_ALLOC_USERDATA) && !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { if (!xfs_extent_busy_update_extent(args->mp, args->pag, busyp, fbno, flen, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 86f6512d6864..6ea847f6e298 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -21,7 +21,7 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_trace.h" - +#include "xfs_error.h" kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; @@ -39,7 +39,7 @@ xfs_efi_item_free( if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else - kmem_zone_free(xfs_efi_zone, efip); + kmem_cache_free(xfs_efi_zone, efip); } /* @@ -163,9 +163,9 @@ xfs_efi_init( if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efi_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, KM_SLEEP); + efip = kmem_zalloc(size, 0); } else { - efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); + efip = kmem_zone_zalloc(xfs_efi_zone, 0); } xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); @@ -228,6 +228,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -243,7 +244,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else - kmem_zone_free(xfs_efd_zone, efdp); + kmem_cache_free(xfs_efd_zone, efdp); } /* @@ -333,9 +334,9 @@ xfs_trans_get_efd( if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + (nextents - 1) * sizeof(struct xfs_extent), - KM_SLEEP); + 0); } else { - efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); + efdp = kmem_zone_zalloc(xfs_efd_zone, 0); } xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, @@ -624,7 +625,7 @@ xfs_efi_recover( */ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip); - return -EIO; + return -EFSCORRUPTED; } } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 28101bbc0b78..b8a4a3f29b36 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -28,6 +28,7 @@ #include <linux/falloc.h> #include <linux/backing-dev.h> #include <linux/mman.h> +#include <linux/fadvise.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -186,8 +187,14 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return -EAGAIN; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, + is_sync_kiocb(iocb)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -214,7 +221,7 @@ xfs_file_dax_read( xfs_ilock(ip, XFS_IOLOCK_SHARED); } - ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); @@ -350,7 +357,7 @@ restart: trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, - NULL, &xfs_iomap_ops); + NULL, &xfs_buffered_write_iomap_ops); if (error) return error; } else @@ -369,21 +376,23 @@ static int xfs_dio_write_end_io( struct kiocb *iocb, ssize_t size, + int error, unsigned flags) { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip 
= XFS_I(inode); loff_t offset = iocb->ki_pos; unsigned int nofs_flag; - int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (size <= 0) - return size; + if (error) + return error; + if (!size) + return 0; /* * Capture amount written on completion as we can't reliably account @@ -440,6 +449,10 @@ out: return error; } +static const struct iomap_dio_ops xfs_dio_write_ops = { + .end_io = xfs_dio_write_end_io, +}; + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -479,8 +492,7 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) @@ -540,15 +552,13 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); - /* - * If unaligned, this is the only IO in-flight. If it has not yet - * completed, wait on it before we release the iolock to prevent - * subsequent overlapping IO. + * If unaligned, this is the only IO in-flight. Wait on it before we + * release the iolock to prevent subsequent overlapping IO. */ - if (ret == -EIOCBQUEUED && unaligned_io) - inode_dio_wait(inode); + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, + &xfs_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_io); out: xfs_iunlock(ip, iolock); @@ -587,7 +597,7 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); - ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); @@ -634,7 +644,8 @@ write_retry: current->backing_dev_info = inode_to_bdi(inode); trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); - ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -811,6 +822,36 @@ xfs_file_fallocate( if (error) goto out_unlock; + /* + * Must wait for all AIO to complete before we continue as AIO can + * change the file size on completion without holding any locks we + * currently hold. We must do this first because AIO can update both + * the on disk and in memory inode sizes, and the operations that follow + * require the in-memory size to be fully up-to-date. + */ + inode_dio_wait(inode); + + /* + * Now that AIO and DIO have drained, we flush and (if necessary) invalidate + * the cached range over the first operation we are about to run. + * + * We care about zero and collapse here because they both run a hole + * punch over the range first. Because that can zero data, and the range + * of invalidation for the shift operations is much larger, we still do + * the required flush for collapse in xfs_prepare_shift(). + * + * Insert has the same range requirements as collapse, and we extend the + * file first which can zero data. Hence insert has the same + * flush/invalidate requirements as collapse and so they are both + * handled at the right time by xfs_prepare_shift().
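+ *
+ * (Illustrative numbers for the zero-range rounding further below: with
+ * a 4096 byte block size, offset = 1000 and len = 5000 round out to
+ * offset = 0 and len = 8192, so both partially covered blocks are hole
+ * punched and then preallocated in full.)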
+ */ + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_COLLAPSE_RANGE)) { + error = xfs_flush_unmap_range(ip, offset, len); + if (error) + goto out_unlock; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -874,16 +915,30 @@ xfs_file_fallocate( } if (mode & FALLOC_FL_ZERO_RANGE) { - error = xfs_zero_file_space(ip, offset, len); + /* + * Punch a hole and prealloc the range. We use a hole + * punch rather than unwritten extent conversion for two + * reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * 2.) If prealloc returns ENOSPC, the file range is + * still zero-valued by virtue of the hole punch. + */ + unsigned int blksize = i_blocksize(inode); + + trace_xfs_zero_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out_unlock; + + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); } else if (mode & FALLOC_FL_UNSHARE_RANGE) { error = xfs_reflink_unshare(ip, offset, len); if (error) goto out_unlock; - - if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); - } } else { /* * If always_cow mode we can't use preallocations and @@ -893,12 +948,14 @@ xfs_file_fallocate( error = -EOPNOTSUPP; goto out_unlock; } + } + if (!xfs_is_always_cow_inode(ip)) { error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); + if (error) + goto out_unlock; } - if (error) - goto out_unlock; } if (file->f_flags & O_DSYNC) @@ -933,6 +990,30 @@ out_unlock: return error; } +STATIC int +xfs_file_fadvise( + struct file *file, + loff_t start, + loff_t end, + int advice) +{ + struct xfs_inode *ip = XFS_I(file_inode(file)); + int ret; + int lockflags = 0; + + /* + * Operations creating pages in page cache need protection from hole + * punching and similar ops. + */ + if (advice == POSIX_FADV_WILLNEED) { + lockflags = XFS_IOLOCK_SHARED; + xfs_ilock(ip, lockflags); + } + ret = generic_fadvise(file, start, end, advice); + if (lockflags) + xfs_iunlock(ip, lockflags); + return ret; +} STATIC loff_t xfs_file_remap_range( @@ -1028,7 +1109,7 @@ xfs_dir_open( */ mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - error = xfs_dir3_data_readahead(ip, 0, -1); + error = xfs_dir3_data_readahead(ip, 0, 0); xfs_iunlock(ip, mode); return error; } @@ -1125,12 +1206,16 @@ __xfs_filemap_fault( if (IS_DAX(inode)) { pfn_t pfn; - ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, + (write_fault && !vmf->cow_page) ? + &xfs_direct_write_iomap_ops : + &xfs_read_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) - ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); + ret = iomap_page_mkwrite(vmf, + &xfs_buffered_write_iomap_ops); else ret = filemap_fault(vmf); } @@ -1194,22 +1279,22 @@ static const struct vm_operations_struct xfs_file_vm_ops = { STATIC int xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) + struct file *file, + struct vm_area_struct *vma) { - struct dax_device *dax_dev; + struct inode *inode = file_inode(file); + struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode)); - dax_dev = xfs_find_daxdev_for_inode(file_inode(filp)); /* * We don't support synchronous mappings for non-DAX files and * for DAX files if the underlying dax_device is not synchronous.
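 * (For reference: userspace requests such a mapping with
 * mmap(..., MAP_SHARED_VALIDATE | MAP_SYNC, ...), and the check below
 * rejects the request unless the backing device supports synchronous
 * page faults.)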
*/ - if (!daxdev_mapping_supported(vma, dax_dev)) + if (!daxdev_mapping_supported(vma, target->bt_daxdev)) return -EOPNOTSUPP; - file_accessed(filp); + file_accessed(file); vma->vm_ops = &xfs_file_vm_ops; - if (IS_DAX(file_inode(filp))) + if (IS_DAX(inode)) vma->vm_flags |= VM_HUGEPAGE; return 0; } @@ -1232,6 +1317,7 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, + .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, }; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 574a7a8b4736..1a88025e68a3 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -18,6 +18,7 @@ #include "xfs_trace.h" #include "xfs_ag_resv.h" #include "xfs_trans.h" +#include "xfs_filestream.h" struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; @@ -158,16 +159,15 @@ xfs_filestream_pick_ag( if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); - if (err && !trylock) { + if (err) { xfs_perag_put(pag); - return err; + if (err != -EAGAIN) + return err; + /* Couldn't lock the AGF, skip this AG. */ + continue; } } - /* Might fail sometimes during the 1st pass with trylock set. */ - if (!pag->pagf_init) - goto next_ag; - /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; @@ -374,7 +374,7 @@ xfs_filestream_new_ag( startag = (item->ag + 1) % mp->m_sb.sb_agcount; } - if (xfs_alloc_is_userdata(ap->datatype)) + if (ap->datatype & XFS_ALLOC_USERDATA) flags |= XFS_PICK_USERDATA; if (ap->tp->t_flags & XFS_TRANS_LOWMODE) flags |= XFS_PICK_LOWSPACE; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 5a8f9641562a..918456ca29e1 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -146,6 +146,7 @@ xfs_fsmap_owner_from_rmap( dest->fmr_owner = XFS_FMR_OWN_FREE; break; default: + ASSERT(0); return -EFSCORRUPTED; } return 0; @@ -250,7 +251,7 @@ xfs_getfsmap_helper( rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* Are we just counting mappings? */ @@ -259,14 +260,14 @@ xfs_getfsmap_helper( info->head->fmh_entries++; if (info->last) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; info->head->fmh_entries++; rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* @@ -276,7 +277,7 @@ xfs_getfsmap_helper( */ if (rec_daddr > info->next_daddr) { if (info->head->fmh_entries >= info->head->fmh_count) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; fmr.fmr_device = info->dev; fmr.fmr_physical = info->next_daddr; @@ -295,7 +296,7 @@ xfs_getfsmap_helper( /* Fill out the extent we found */ if (info->head->fmh_entries >= info->head->fmh_count) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); @@ -328,7 +329,7 @@ out: rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* Transform a rmapbt irec into a fsmap */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0b0fd10a36d4..8dc2e5414276 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -40,11 +40,11 @@ xfs_inode_alloc( * KM_MAYFAIL and return NULL here on ENOMEM. 
Set the * code up to do this anyway. */ - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + ip = kmem_zone_alloc(xfs_inode_zone, 0); if (!ip) return NULL; if (inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); + kmem_cache_free(xfs_inode_zone, ip); return NULL; } @@ -104,7 +104,7 @@ xfs_inode_free_callback( ip->i_itemp = NULL; } - kmem_zone_free(xfs_inode_zone, ip); + kmem_cache_free(xfs_inode_zone, ip); } static void @@ -1419,7 +1419,7 @@ xfs_inode_match_id( return 0; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - xfs_get_projid(ip) != eofb->eof_prid) + ip->i_d.di_projid != eofb->eof_prid) return 0; return 1; @@ -1443,7 +1443,7 @@ xfs_inode_match_id_union( return 1; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - xfs_get_projid(ip) == eofb->eof_prid) + ip->i_d.di_projid == eofb->eof_prid) return 1; return 0; diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d99a0a3e5f40..490fee22b878 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -55,7 +55,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { - kmem_zone_free(xfs_icreate_zone, ICR_ITEM(lip)); + kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip)); } static const struct xfs_item_ops xfs_icreate_item_ops = { @@ -89,7 +89,7 @@ xfs_icreate_log( { struct xfs_icreate_item *icp; - icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + icp = kmem_zone_zalloc(xfs_icreate_zone, 0); xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, &xfs_icreate_item_ops); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6467d5e1df2d..c5077e6326c7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -55,6 +55,12 @@ xfs_extlen_t xfs_get_extsz_hint( struct xfs_inode *ip) { + /* + * No point in aligning allocations if we need to COW to actually + * write to them. + */ + if (xfs_is_always_cow_inode(ip)) + return 0; if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) return ip->i_d.di_extsize; if (XFS_IS_REALTIME_INODE(ip)) @@ -809,7 +815,7 @@ xfs_ialloc( ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); inode->i_rdev = rdev; - xfs_set_projid(ip, prid); + ip->i_d.di_projid = prid; if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; @@ -845,8 +851,7 @@ xfs_ialloc( inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; - ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; - ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; + ip->i_d.di_crtime = tv; } @@ -1418,7 +1423,7 @@ xfs_link( * the tree quota mechanism could be circumvented. */ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + tdp->i_d.di_projid != sip->i_d.di_projid)) { error = -EXDEV; goto error_return; } @@ -1513,10 +1518,8 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; xfs_filblks_t unmap_len; int error = 0; - int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!atomic_read(&VFS_I(ip)->i_count) || @@ -1536,21 +1539,22 @@ xfs_itruncate_extents_flags( * the end of the file (in a crash where the space is allocated * but the inode size is not yet updated), simply remove any * blocks which show up between the new EOF and the maximum - * possible file size. If the first block to be removed is - * beyond the maximum file size (ie it is the same as last_block), - * then there is nothing to do. + * possible file size. 
+ * + * We have to free all the blocks to the bmbt maximum offset, even if + * the page cache can't scale that far. */ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (first_unmap_block == last_block) + if (first_unmap_block >= XFS_MAX_FILEOFF) { + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); return 0; + } - ASSERT(first_unmap_block < last_block); - unmap_len = last_block - first_unmap_block + 1; - while (!done) { + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; + while (unmap_len > 0) { ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, - XFS_ITRUNC_MAX_EXTENTS, &done); + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, + flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out; @@ -1570,7 +1574,7 @@ xfs_itruncate_extents_flags( if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, - first_unmap_block, last_block, true); + first_unmap_block, XFS_MAX_FILEOFF, true); if (error) goto out; @@ -2018,7 +2022,7 @@ xfs_iunlink_add_backref( if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) return 0; - iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu = kmem_zalloc(sizeof(*iu), KM_NOFS); iu->iu_agino = prev_agino; iu->iu_next_unlinked = this_agino; @@ -2130,8 +2134,10 @@ xfs_iunlink_update_bucket( * passed in because either we're adding or removing ourselves from the * head of the list. */ - if (old_value == new_agino) + if (old_value == new_agino) { + xfs_buf_corruption_error(agibp); return -EFSCORRUPTED; + } agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); offset = offsetof(struct xfs_agi, agi_unlinked) + @@ -2194,6 +2200,8 @@ xfs_iunlink_update_inode( /* Make sure the old pointer isn't garbage. */ old_value = be32_to_cpu(dip->di_next_unlinked); if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); error = -EFSCORRUPTED; goto out; } @@ -2205,8 +2213,11 @@ xfs_iunlink_update_inode( */ *old_next_agino = old_value; if (old_value == next_agino) { - if (next_agino != NULLAGINO) + if (next_agino != NULLAGINO) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; + } goto out; } @@ -2257,8 +2268,10 @@ xfs_iunlink( */ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || - !xfs_verify_agino_or_null(mp, agno, next_agino)) + !xfs_verify_agino_or_null(mp, agno, next_agino)) { + xfs_buf_corruption_error(agibp); return -EFSCORRUPTED; + } if (next_agino != NULLAGINO) { struct xfs_perag *pag; @@ -2533,6 +2546,7 @@ xfs_ifree_cluster( struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_ino_t inum; + int error; inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); @@ -2561,12 +2575,11 @@ xfs_ifree_cluster( * complete before we get a lock on it, and hence we may fail * to mark all the active inodes on the buffer stale. 
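In the xfs_itruncate_extents_flags() hunk above, the done flag disappears because __xfs_bunmapi() takes the block count by reference and decrements it by however much it unmapped, so the loop condition is simply the remaining length. A sketch of that calling convention (transaction rolling between batches elided; see the full hunk for the real loop):

        xfs_filblks_t unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;

        while (unmap_len > 0) {
                /* __xfs_bunmapi() shrinks unmap_len as extents go away */
                error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
                                flags, XFS_ITRUNC_MAX_EXTENTS);
                if (error)
                        break;
        }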
*/ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * igeo->blocks_per_cluster, - XBF_UNMAPPED); - - if (!bp) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * igeo->blocks_per_cluster, + XBF_UNMAPPED, &bp); + if (error) + return error; /* * This buffer may not have been correctly initialised as we @@ -3196,6 +3209,7 @@ xfs_rename( struct xfs_trans *tp; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; + struct xfs_buf *agibp; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); @@ -3270,7 +3284,7 @@ xfs_rename( * tree quota mechanism would be circumvented. */ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + target_dp->i_d.di_projid != src_ip->i_d.di_projid)) { error = -EXDEV; goto out_trans_cancel; } @@ -3282,7 +3296,8 @@ xfs_rename( spaceres); /* - * Set up the target. + * Check for expected errors before we dirty the transaction + * so we can return an error without a transaction abort. */ if (target_ip == NULL) { /* @@ -3294,6 +3309,45 @@ xfs_rename( if (error) goto out_trans_cancel; } + } else { + /* + * If target exists and it's a directory, check whether + * it can be destroyed. + */ + if (S_ISDIR(VFS_I(target_ip)->i_mode) && + (!xfs_dir_isempty(target_ip) || + (VFS_I(target_ip)->i_nlink > 2))) { + error = -EEXIST; + goto out_trans_cancel; + } + } + + /* + * Directory entry creation below may acquire the AGF. Remove + * the whiteout from the unlinked list first to preserve correct + * AGI/AGF locking order. This dirties the transaction so failures + * after this point will abort and log recovery will clean up the + * mess. + * + * For whiteouts, we need to bump the link count on the whiteout + * inode. After this point, we have a real link, so clear the tmpfile + * state flag from the inode so it doesn't accidentally get misused + * in future. + */ + if (wip) { + ASSERT(VFS_I(wip)->i_nlink == 0); + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_cancel; + + xfs_bumplink(tp, wip); + VFS_I(wip)->i_state &= ~I_LINKABLE; + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { /* * If target does not exist and the rename crosses * directories, adjust the target directory link count @@ -3312,22 +3366,6 @@ } } else { /* target_ip != NULL */ /* - * If target exists and it's a directory, check that both - * target and source are directories and that target can be - * destroyed, or that neither is a directory. - */ - if (S_ISDIR(VFS_I(target_ip)->i_mode)) { - /* - * Make sure target dir is empty. - */ - if (!(xfs_dir_isempty(target_ip)) || - (VFS_I(target_ip)->i_nlink > 2)) { - error = -EEXIST; - goto out_trans_cancel; - } - } - - /* * Link the source inode under the target name. * If the source inode is a directory and we are moving * it across directories, its ".." entry will be @@ -3336,6 +3374,22 @@ * In case there is already an entry with the same * name at the destination directory, remove it first. */ + + /* + * Check whether the replace operation will need to allocate + * blocks. This happens when the shortform directory lacks + * space and we have to convert it to a block format directory. + * When more blocks are necessary, we must lock the AGI first + * to preserve locking order (AGI -> AGF).
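+ *
+ * (A shortform directory lives entirely in the inode's literal area;
+ * the first entry that no longer fits forces the conversion to block
+ * format, which allocates the directory's first data block and hence
+ * locks the AGF.)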
+ */ + if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) { + error = xfs_read_agi(mp, tp, + XFS_INO_TO_AGNO(mp, target_ip->i_ino), + &agibp); + if (error) + goto out_trans_cancel; + } + error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, spaceres); if (error) @@ -3417,30 +3471,6 @@ xfs_rename( if (error) goto out_trans_cancel; - /* - * For whiteouts, we need to bump the link count on the whiteout inode. - * This means that failures all the way up to this point leave the inode - * on the unlinked list and so cleanup is a simple matter of dropping - * the remaining reference to it. If we fail here after bumping the link - * count, we're shutting down the filesystem so we'll never see the - * intermediate state on disk. - */ - if (wip) { - ASSERT(VFS_I(wip)->i_nlink == 0); - xfs_bumplink(tp, wip); - error = xfs_iunlink_remove(tp, wip); - if (error) - goto out_trans_cancel; - xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); - - /* - * Now we have a real link, clear the "I'm a tmpfile" state - * flag from the inode so it doesn't accidentally get misused in - * future. - */ - VFS_I(wip)->i_state &= ~I_LINKABLE; - } - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 558173f95a03..492e53992fa9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -37,9 +37,6 @@ typedef struct xfs_inode { struct xfs_ifork *i_cowfp; /* copy on write extents */ struct xfs_ifork i_df; /* data fork */ - /* operations vectors */ - const struct xfs_dir_ops *d_ops; /* directory ops vector */ - /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ @@ -177,30 +174,11 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) return ret; } -/* - * Project quota id helpers (previously projid was 16bit only - * and using two 16bit values to hold new 32bit projid was chosen - * to retain compatibility with "old" filesystems). - */ -static inline prid_t -xfs_get_projid(struct xfs_inode *ip) -{ - return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; -} - -static inline void -xfs_set_projid(struct xfs_inode *ip, - prid_t projid) -{ - ip->i_d.di_projid_hi = (uint16_t) (projid >> 16); - ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff); -} - static inline prid_t xfs_get_initial_prid(struct xfs_inode *dp) { if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - return xfs_get_projid(dp); + return dp->i_d.di_projid; return XFS_PROJID_DEFAULT; } @@ -220,6 +198,13 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) } /* + * Return the buftarg used for data allocations on a given inode. + */ +#define xfs_inode_buftarg(ip) \ + (XFS_IS_REALTIME_INODE(ip) ? \ + (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) + +/* * In-core inode flags. 
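The xfs_get_projid()/xfs_set_projid() helpers deleted above existed only because the on-disk format splits the 32-bit project ID across two 16-bit fields, a compatibility artifact noted in the removed comment. The incore field is now a plain 32-bit di_projid, and only the disk/log conversion sites still do the packing, as the xfs_inode_to_log_dinode() hunk below shows. The round trip, as a standalone sketch:

        /* split the 32-bit projid into the two on-disk halves ... */
        uint16_t lo = projid & 0xffff;
        uint16_t hi = projid >> 16;

        /* ... and reassemble it, exactly as the removed helper did */
        prid_t reassembled = (prid_t)hi << 16 | lo;     /* == projid */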
*/ #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c9a502eed204..8bd5d0de6321 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,6 +17,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_error.h" #include <linux/iversion.h> @@ -309,8 +310,8 @@ xfs_inode_to_log_dinode( to->di_format = from->di_format; to->di_uid = from->di_uid; to->di_gid = from->di_gid; - to->di_projid_lo = from->di_projid_lo; - to->di_projid_hi = from->di_projid_hi; + to->di_projid_lo = from->di_projid & 0xffff; + to->di_projid_hi = from->di_projid >> 16; memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); @@ -340,8 +341,8 @@ xfs_inode_to_log_dinode( if (from->di_version == 3) { to->di_changecount = inode_peek_iversion(inode); - to->di_crtime.t_sec = from->di_crtime.t_sec; - to->di_crtime.t_nsec = from->di_crtime.t_nsec; + to->di_crtime.t_sec = from->di_crtime.tv_sec; + to->di_crtime.t_nsec = from->di_crtime.tv_nsec; to->di_flags2 = from->di_flags2; to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; @@ -651,7 +652,7 @@ xfs_inode_item_init( struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); - iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); iip->ili_inode = ip; xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, @@ -666,7 +667,7 @@ xfs_inode_item_destroy( xfs_inode_t *ip) { kmem_free(ip->i_itemp->ili_item.li_lv_shadow); - kmem_zone_free(xfs_ili_zone, ip->i_itemp); + kmem_cache_free(xfs_ili_zone, ip->i_itemp); } @@ -828,8 +829,10 @@ xfs_inode_item_format_convert( { struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; - if (buf->i_len != sizeof(*in_f32)) + if (buf->i_len != sizeof(*in_f32)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; + } in_f->ilf_type = in_f32->ilf_type; in_f->ilf_size = in_f32->ilf_size; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6f7848cd5527..d42de92cb283 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -33,6 +33,8 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_reflink.h" +#include "xfs_ioctl.h" #include <linux/mount.h> #include <linux/namei.h> @@ -67,7 +69,7 @@ xfs_find_handle( return -EBADF; inode = file_inode(f.file); } else { - error = user_lpath((const char __user *)hreq->path, &path); + error = user_path_at(AT_FDCWD, hreq->path, 0, &path); if (error) return error; inode = d_inode(path.dentry); @@ -290,82 +292,6 @@ xfs_readlink_by_handle( return error; } -int -xfs_set_dmattrs( - xfs_inode_t *ip, - uint evmask, - uint16_t state) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - ip->i_d.di_dmevmask = evmask; - ip->i_d.di_dmstate = state; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp); - - return error; -} - -STATIC int -xfs_fssetdm_by_handle( - struct file *parfilp, - void __user *arg) -{ - int error; - struct fsdmidata fsd; - xfs_fsop_setdm_handlereq_t dmhreq; - struct dentry *dentry; - - if (!capable(CAP_MKNOD)) - return -EPERM; - if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) - 
return -EFAULT; - - error = mnt_want_write_file(parfilp); - if (error) - return error; - - dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) { - mnt_drop_write_file(parfilp); - return PTR_ERR(dentry); - } - - if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { - error = -EPERM; - goto out; - } - - if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { - error = -EFAULT; - goto out; - } - - error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, - fsd.fsd_dmstate); - - out: - mnt_drop_write_file(parfilp); - dput(dentry); - return error; -} - STATIC int xfs_attrlist_by_handle( struct file *parfilp, @@ -396,7 +322,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); if (!kbuf) goto out_dput; @@ -431,14 +357,17 @@ xfs_attrmulti_attr_get( { unsigned char *kbuf; int error = -EFAULT; + size_t namelen; if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; - kbuf = kmem_zalloc_large(*len, KM_SLEEP); + kbuf = kmem_zalloc_large(*len, 0); if (!kbuf) return -ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); + namelen = strlen(name); + error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len, + flags); if (error) goto out_kfree; @@ -460,6 +389,7 @@ xfs_attrmulti_attr_set( { unsigned char *kbuf; int error; + size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; @@ -470,7 +400,8 @@ xfs_attrmulti_attr_set( if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + namelen = strlen(name); + error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags); if (!error) xfs_forget_acl(inode, name, flags); kfree(kbuf); @@ -484,10 +415,12 @@ xfs_attrmulti_attr_remove( uint32_t flags) { int error; + size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - error = xfs_attr_remove(XFS_I(inode), name, flags); + namelen = strlen(name); + error = xfs_attr_remove(XFS_I(inode), name, namelen, flags); if (!error) xfs_forget_acl(inode, name, flags); return error; @@ -536,6 +469,13 @@ xfs_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { + if ((ops[i].am_flags & ATTR_ROOT) && + (ops[i].am_flags & ATTR_SECURE)) { + ops[i].am_error = -EINVAL; + continue; + } + ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; + ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) @@ -588,13 +528,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( struct file *filp, - unsigned int cmd, xfs_flock64_t *bf) { struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; - enum xfs_prealloc_flags flags = 0; + enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; int error; @@ -607,6 +546,9 @@ xfs_ioc_space( if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (xfs_is_always_cow_inode(ip)) + return -EOPNOTSUPP; + if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; if (filp->f_mode & FMODE_NOCMTIME) @@ -620,6 +562,7 @@ xfs_ioc_space( error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; + inode_dio_wait(inode); switch (bf->l_whence) { case 0: /*SEEK_SET*/ @@ -635,73 +578,21 @@ xfs_ioc_space( goto out_unlock; } - /* - * length of <= 0 for resv/unresv/zero is invalid. 
length for - * alloc/free is ignored completely and we have no idea what userspace - * might have set it to, so set it to zero to allow range - * checks to pass. - */ - switch (cmd) { - case XFS_IOC_ZERO_RANGE: - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - if (bf->l_len <= 0) { - error = -EINVAL; - goto out_unlock; - } - break; - default: - bf->l_len = 0; - break; - } - - if (bf->l_start < 0 || - bf->l_start > inode->i_sb->s_maxbytes || - bf->l_start + bf->l_len < 0 || - bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) { + if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) { error = -EINVAL; goto out_unlock; } - switch (cmd) { - case XFS_IOC_ZERO_RANGE: - flags |= XFS_PREALLOC_SET; - error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); - break; - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - flags |= XFS_PREALLOC_SET; - error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, - XFS_BMAPI_PREALLOC); - break; - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - error = xfs_free_file_space(ip, bf->l_start, bf->l_len); - break; - case XFS_IOC_ALLOCSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP: - case XFS_IOC_FREESP64: - flags |= XFS_PREALLOC_CLEAR; - if (bf->l_start > XFS_ISIZE(ip)) { - error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), 0); - if (error) - goto out_unlock; - } - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = bf->l_start; - - error = xfs_vn_setattr_size(file_dentry(filp), &iattr); - break; - default: - ASSERT(0); - error = -EINVAL; + if (bf->l_start > XFS_ISIZE(ip)) { + error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), + bf->l_start - XFS_ISIZE(ip), 0); + if (error) + goto out_unlock; } + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = bf->l_start; + error = xfs_vn_setattr_size(file_dentry(filp), &iattr); if (error) goto out_unlock; @@ -831,7 +722,7 @@ xfs_bulkstat_fmt( /* * Check the incoming bulk request @hdr from userspace and initialize the * internal @breq bulk request appropriately. Returns 0 if the bulk request - * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual + * should proceed; -ECANCELED if there's nothing to do; or the usual * negative error code. */ static int @@ -889,13 +780,13 @@ xfs_bulk_ireq_setup( /* Asking for an inode past the end of the AG? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) - return XFS_ITER_ABORT; + return -ECANCELED; } else if (hdr->agno) return -EINVAL; /* Asking for an inode past the end of the FS? We're done! 
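xfs_bulk_ireq_setup() above now reports the nothing-to-do case as plain -ECANCELED rather than the private XFS_ITER_ABORT value, so callers must filter that sentinel out before treating the return code as a failure, as the xfs_ioc_bulkstat() hunk below does:

        error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
        if (error == -ECANCELED)
                goto out_teardown;      /* empty result; success to userspace */
        if (error < 0)
                return error;           /* a real failure */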
*/ if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) - return XFS_ITER_ABORT; + return -ECANCELED; return 0; } @@ -936,7 +827,7 @@ xfs_ioc_bulkstat( return -EFAULT; error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) goto out_teardown; if (error < 0) return error; @@ -986,7 +877,7 @@ xfs_ioc_inumbers( return -EFAULT; error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) goto out_teardown; if (error < 0) return error; @@ -1038,6 +929,10 @@ xfs_ioc_ag_geometry( if (copy_from_user(&ageo, arg, sizeof(ageo))) return -EFAULT; + if (ageo.ag_flags) + return -EINVAL; + if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) + return -EINVAL; error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); if (error) @@ -1112,7 +1007,7 @@ xfs_fill_fsxattr( fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_cowextsize = ip->i_d.di_cowextsize << ip->i_mount->m_sb.sb_blocklog; - fa->fsx_projid = xfs_get_projid(ip); + fa->fsx_projid = ip->i_d.di_projid; if (attr) { if (ip->i_afp) { @@ -1307,11 +1202,9 @@ xfs_ioctl_setattr_dax_invalidate( * have to check the device for dax support or flush pagecache. */ if (fa->fsx_xflags & FS_XFLAG_DAX) { - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) - return -EINVAL; - if (S_ISREG(inode->i_mode) && - !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), - sb->s_blocksize)) + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (!bdev_dax_supported(target->bt_bdev, sb->s_blocksize)) return -EINVAL; } @@ -1566,7 +1459,7 @@ xfs_ioctl_setattr( } if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && - xfs_get_projid(ip) != fa->fsx_projid) { + ip->i_d.di_projid != fa->fsx_projid) { code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ @@ -1603,13 +1496,13 @@ xfs_ioctl_setattr( VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ - if (xfs_get_projid(ip) != fa->fsx_projid) { + if (ip->i_d.di_projid != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } ASSERT(ip->i_d.di_version > 1); - xfs_set_projid(ip, fa->fsx_projid); + ip->i_d.di_projid = fa->fsx_projid; } /* @@ -1881,7 +1774,7 @@ xfs_ioc_getfsmap( info.mp = ip->i_mount; info.data = arg; error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + if (error == -ECANCELED) { error = 0; aborted = true; } else if (error) @@ -2119,24 +2012,17 @@ xfs_file_ioctl( return xfs_ioc_setlabel(filp, mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - case XFS_IOC_ZERO_RANGE: { + case XFS_IOC_FREESP64: { xfs_flock64_t bf; if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(filp, cmd, &bf); + return xfs_ioc_space(filp, &bf); } case XFS_IOC_DIOINFO: { - struct dioattr da; - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct dioattr da; da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); @@ -2180,22 +2066,6 @@ xfs_file_ioctl( case XFS_IOC_SETXFLAGS: return xfs_ioc_setxflags(ip, filp, arg); - case XFS_IOC_FSSETDM: { - struct fsdmidata dmi; - - if (copy_from_user(&dmi, arg, sizeof(dmi))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - - error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, - dmi.fsd_dmstate); - mnt_drop_write_file(filp); - return error; - } - case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: case XFS_IOC_GETBMAPX: @@ -2223,8 +2093,6 @@ xfs_file_ioctl( return -EFAULT; return xfs_open_by_handle(filp, &hreq); } - case XFS_IOC_FSSETDM_BY_HANDLE: - return xfs_fssetdm_by_handle(filp, arg); case XFS_IOC_READLINK_BY_HANDLE: { xfs_fsop_handlereq_t hreq; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 654c0bb1bcf8..420bd95dc326 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -9,7 +9,6 @@ extern int xfs_ioc_space( struct file *filp, - unsigned int cmd, xfs_flock64_t *bf); int @@ -71,12 +70,6 @@ xfs_file_compat_ioctl( unsigned int cmd, unsigned long arg); -extern int -xfs_set_dmattrs( - struct xfs_inode *ip, - uint evmask, - uint16_t state); - struct xfs_ibulk; struct xfs_bstat; struct xfs_inogrp; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 7fcf7569743f..769581a79c58 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -107,7 +107,7 @@ xfs_ioctl32_bstime_copyin( xfs_bstime_t *bstime, compat_xfs_bstime_t __user *bstime32) { - compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */ + old_time32_t sec32; /* tv_sec differs on 64 vs. 32 */ if (get_user(sec32, &bstime32->tv_sec) || get_user(bstime->tv_nsec, &bstime32->tv_nsec)) @@ -381,7 +381,7 @@ xfs_compat_attrlist_by_handle( return PTR_ERR(dentry); error = -ENOMEM; - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); if (!kbuf) goto out_dput; @@ -450,6 +450,13 @@ xfs_compat_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { + if ((ops[i].am_flags & ATTR_ROOT) && + (ops[i].am_flags & ATTR_SECURE)) { + ops[i].am_error = -EINVAL; + continue; + } + ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; + ops[i].am_error = strncpy_from_user((char *)attr_name, compat_ptr(ops[i].am_attrname), MAXNAMELEN); @@ -500,44 +507,6 @@ xfs_compat_attrmulti_by_handle( return error; } -STATIC int -xfs_compat_fssetdm_by_handle( - struct file *parfilp, - void __user *arg) -{ - int error; - struct fsdmidata fsd; - compat_xfs_fsop_setdm_handlereq_t dmhreq; - struct dentry *dentry; - - if (!capable(CAP_MKNOD)) - return -EPERM; - if (copy_from_user(&dmhreq, arg, - sizeof(compat_xfs_fsop_setdm_handlereq_t))) - return -EFAULT; - - dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { - error = -EPERM; - goto out; - } - - if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { - error = -EFAULT; - goto out; - } - - error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, - fsd.fsd_dmstate); - -out: - dput(dentry); - return error; -} - long xfs_file_compat_ioctl( struct file *filp, @@ -547,79 +516,23 @@ xfs_file_compat_ioctl( struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - void __user *arg = (void 
__user *)p; + void __user *arg = compat_ptr(p); int error; trace_xfs_file_compat_ioctl(ip); switch (cmd) { - /* No size or alignment issues on any arch */ - case XFS_IOC_DIOINFO: - case XFS_IOC_FSGEOMETRY_V4: - case XFS_IOC_FSGEOMETRY: - case XFS_IOC_AG_GEOMETRY: - case XFS_IOC_FSGETXATTR: - case XFS_IOC_FSSETXATTR: - case XFS_IOC_FSGETXATTRA: - case XFS_IOC_FSSETDM: - case XFS_IOC_GETBMAP: - case XFS_IOC_GETBMAPA: - case XFS_IOC_GETBMAPX: - case XFS_IOC_FSCOUNTS: - case XFS_IOC_SET_RESBLKS: - case XFS_IOC_GET_RESBLKS: - case XFS_IOC_FSGROWFSLOG: - case XFS_IOC_GOINGDOWN: - case XFS_IOC_ERROR_INJECTION: - case XFS_IOC_ERROR_CLEARALL: - case FS_IOC_GETFSMAP: - case XFS_IOC_SCRUB_METADATA: - case XFS_IOC_BULKSTAT: - case XFS_IOC_INUMBERS: - return xfs_file_ioctl(filp, cmd, p); -#if !defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32) - /* - * These are handled fine if no alignment issues. To support x32 - * which uses native 64-bit alignment we must emit these cases in - * addition to the ia-32 compat set below. - */ - case XFS_IOC_ALLOCSP: - case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - case XFS_IOC_FSGEOMETRY_V1: - case XFS_IOC_FSGROWFSDATA: - case XFS_IOC_FSGROWFSRT: - case XFS_IOC_ZERO_RANGE: -#ifdef CONFIG_X86_X32 - /* - * x32 special: this gets a different cmd number from the ia-32 compat - * case below; the associated data will match native 64-bit alignment. - */ - case XFS_IOC_SWAPEXT: -#endif - return xfs_file_ioctl(filp, cmd, p); -#endif #if defined(BROKEN_X86_ALIGNMENT) case XFS_IOC_ALLOCSP_32: case XFS_IOC_FREESP_32: case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: - case XFS_IOC_RESVSP_32: - case XFS_IOC_UNRESVSP_32: - case XFS_IOC_RESVSP64_32: - case XFS_IOC_UNRESVSP64_32: - case XFS_IOC_ZERO_RANGE_32: { + case XFS_IOC_FREESP64_32: { struct xfs_flock64 bf; if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(filp, cmd, &bf); + return xfs_ioc_space(filp, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); @@ -702,9 +615,8 @@ xfs_file_compat_ioctl( return xfs_compat_attrlist_by_handle(filp, arg); case XFS_IOC_ATTRMULTI_BY_HANDLE_32: return xfs_compat_attrmulti_by_handle(filp, arg); - case XFS_IOC_FSSETDM_BY_HANDLE_32: - return xfs_compat_fssetdm_by_handle(filp, arg); default: - return -ENOIOCTLCMD; + /* try the native version */ + return xfs_file_ioctl(filp, cmd, (unsigned long)arg); } } diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 7985344d3aa6..053de7d894cd 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -32,7 +32,7 @@ #endif typedef struct compat_xfs_bstime { - compat_time_t tv_sec; /* seconds */ + old_time32_t tv_sec; /* seconds */ __s32 tv_nsec; /* and nanoseconds */ } compat_xfs_bstime_t; @@ -99,7 +99,7 @@ typedef struct compat_xfs_fsop_handlereq { _IOWR('X', 108, struct compat_xfs_fsop_handlereq) /* The bstat field in the swapext struct needs translation */ -typedef struct compat_xfs_swapext { +struct compat_xfs_swapext { int64_t sx_version; /* version */ int64_t sx_fdtarget; /* fd of target file */ int64_t sx_fdtmp; /* fd of tmp file */ @@ -107,7 +107,7 @@ typedef struct compat_xfs_swapext { xfs_off_t sx_length; /* leng from offset */ char sx_pad[16]; /* pad space, unused */ struct compat_xfs_bstat sx_stat; /* stat of target b4 copy */ -} __compat_packed compat_xfs_swapext_t; +} __compat_packed; 
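/*
 * Editor's note, not part of the patch: the struct above keeps a separate
 * 32-bit layout because the embedded compat_xfs_bstat changes its size and
 * packing, so the ioctl number below differs from the native XFS_IOC_SWAPEXT
 * and the kernel must translate field by field. A minimal sketch of that
 * translation pattern, with purely illustrative names:
 *
 *	static int hypothetical_swapext_copyin(struct xfs_swapext *sxp,
 *			struct compat_xfs_swapext __user *sxu)
 *	{
 *		if (get_user(sxp->sx_version, &sxu->sx_version) ||
 *		    get_user(sxp->sx_fdtarget, &sxu->sx_fdtarget) ||
 *		    get_user(sxp->sx_fdtmp, &sxu->sx_fdtmp))
 *			return -EFAULT;
 *		... remaining fields and sx_stat need the same treatment ...
 *		return 0;
 *	}
 */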
#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) @@ -143,15 +143,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq { #define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \ _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) -typedef struct compat_xfs_fsop_setdm_handlereq { - struct compat_xfs_fsop_handlereq hreq; /* handle information */ - /* ptr to struct fsdmidata */ - compat_uptr_t data; /* DMAPI data */ -} compat_xfs_fsop_setdm_handlereq_t; - -#define XFS_IOC_FSSETDM_BY_HANDLE_32 \ - _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq) - #ifdef BROKEN_X86_ALIGNMENT /* on ia32 l_start is on a 32-bit boundary */ typedef struct compat_xfs_flock64 { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3a4310d7cb59..bb590a267a7f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -29,8 +29,8 @@ #include "xfs_reflink.h" -#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ - << mp->m_writeio_log) +#define XFS_ALLOC_ALIGN(mp, off) \ + (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) static int xfs_alert_fsblock_zero( @@ -54,11 +54,12 @@ xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, - bool shared) + u16 flags) { struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); - if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); if (imap->br_startblock == HOLESTARTBLOCK) { @@ -77,14 +78,13 @@ xfs_bmbt_to_iomap( } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); - iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + iomap->bdev = target->bt_bdev; + iomap->dax_dev = target->bt_daxdev; + iomap->flags = flags; if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; - if (shared) - iomap->flags |= IOMAP_F_SHARED; return 0; } @@ -95,18 +95,30 @@ xfs_hole_to_iomap( xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb) { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb); iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb); - iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); - iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + iomap->bdev = target->bt_bdev; + iomap->dax_dev = target->bt_daxdev; +} + +static inline xfs_fileoff_t +xfs_iomap_end_fsb( + struct xfs_mount *mp, + loff_t offset, + loff_t count) +{ + ASSERT(offset <= mp->m_super->s_maxbytes); + return min(XFS_B_TO_FSB(mp, offset + count), + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); } -xfs_extlen_t +static xfs_extlen_t xfs_eof_alignment( - struct xfs_inode *ip, - xfs_extlen_t extsize) + struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; xfs_extlen_t align = 0; @@ -129,111 +141,80 @@ xfs_eof_alignment( align = 0; } - /* - * Always round up the allocation request to an extent boundary - * (when file on a real-time subvolume or has di_extsize hint). - */ - if (extsize) { - if (align) - align = roundup_64(align, extsize); - else - align = extsize; - } - return align; } -STATIC int +/* + * Check if last_fsb is outside the last extent, and if so grow it to the next + * stripe unit boundary. 
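+ * If the aligned offset is still covered by the last extent in the data
+ * fork, there is nothing to grow and end_fsb is returned unchanged.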
+ */ +xfs_fileoff_t xfs_iomap_eof_align_last_fsb( struct xfs_inode *ip, - xfs_extlen_t extsize, - xfs_fileoff_t *last_fsb) + xfs_fileoff_t end_fsb) { - xfs_extlen_t align = xfs_eof_alignment(ip, extsize); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + xfs_extlen_t align = xfs_eof_alignment(ip); + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + + /* + * Always round up the allocation request to the extent hint boundary. + */ + if (extsz) { + if (align) + align = roundup_64(align, extsz); + else + align = extsz; + } if (align) { - xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); - int eof, error; + xfs_fileoff_t aligned_end_fsb = roundup_64(end_fsb, align); - error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) - return error; - if (eof) - *last_fsb = new_last_fsb; + xfs_iext_last(ifp, &icur); + if (!xfs_iext_get_extent(ifp, &icur, &irec) || + aligned_end_fsb >= irec.br_startoff + irec.br_blockcount) + return aligned_end_fsb; } - return 0; + + return end_fsb; } int xfs_iomap_write_direct( - xfs_inode_t *ip, - xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *imap, - int nmaps) + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t count_fsb, + struct xfs_bmbt_irec *imap) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t count_fsb, resaligned; - xfs_extlen_t extsz; - int nimaps; - int quota_flag; - int rt; - xfs_trans_t *tp; - uint qblocks, resblks, resrtextents; - int error; - int lockmode; - int bmapi_flags = XFS_BMAPI_PREALLOC; - uint tflags = 0; - - rt = XFS_IS_REALTIME_INODE(ip); - extsz = xfs_get_extsz_hint(ip); - lockmode = XFS_ILOCK_SHARED; /* locked by caller */ - - ASSERT(xfs_isilocked(ip, lockmode)); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + xfs_filblks_t resaligned; + int nimaps; + int quota_flag; + uint qblocks, resblks; + unsigned int resrtextents = 0; + int error; + int bmapi_flags = XFS_BMAPI_PREALLOC; + uint tflags = 0; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > XFS_ISIZE(ip)) { - /* - * Assert that the in-core extent list is present since this can - * call xfs_iread_extents() and we only have the ilock shared. - * This should be safe because the lock was held around a bmapi - * call in the caller and we only need it to access the in-core - * list. - */ - ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & - XFS_IFEXTENTS); - error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb); - if (error) - goto out_unlock; - } else { - if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) - last_fsb = min(last_fsb, (xfs_fileoff_t) - imap->br_blockcount + - imap->br_startoff); - } - count_fsb = last_fsb - offset_fsb; ASSERT(count_fsb > 0); - resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz); - if (unlikely(rt)) { + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, + xfs_get_extsz_hint(ip)); + if (unlikely(XFS_IS_REALTIME_INODE(ip))) { resrtextents = qblocks = resaligned; resrtextents /= mp->m_sb.sb_rextsize; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); quota_flag = XFS_QMOPT_RES_RTBLKS; } else { - resrtextents = 0; resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); quota_flag = XFS_QMOPT_RES_REGBLKS; } - /* - * Drop the shared lock acquired by the caller, attach the dquot if - * necessary and move on to transaction setup. 
- */ - xfs_iunlock(ip, lockmode); error = xfs_qm_dqattach(ip); if (error) return error; @@ -263,8 +244,7 @@ xfs_iomap_write_direct( if (error) return error; - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) @@ -277,8 +257,8 @@ xfs_iomap_write_direct( * caller gave to us. */ nimaps = 1; - error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, - bmapi_flags, resblks, imap, &nimaps); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0, + imap, &nimaps); if (error) goto out_res_cancel; @@ -297,11 +277,11 @@ xfs_iomap_write_direct( goto out_unlock; } - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) error = xfs_alert_fsblock_zero(ip, imap); out_unlock: - xfs_iunlock(ip, lockmode); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_res_cancel: @@ -410,19 +390,19 @@ xfs_iomap_prealloc_size( if (offset + count <= XFS_ISIZE(ip)) return 0; - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && - (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))) + if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) && + (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))) return 0; /* * If an explicit allocsize is set, the file is small, or we * are writing behind a hole, then use the minimum prealloc: */ - if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || + if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || !xfs_iext_peek_prev_extent(ifp, icur, &prev) || prev.br_startoff + prev.br_blockcount < offset_fsb) - return mp->m_writeio_blocks; + return mp->m_allocsize_blocks; /* * Determine the initial size of the preallocation. We are beyond the @@ -515,219 +495,13 @@ xfs_iomap_prealloc_size( while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; check_writeio: - if (alloc_blocks < mp->m_writeio_blocks) - alloc_blocks = mp->m_writeio_blocks; + if (alloc_blocks < mp->m_allocsize_blocks) + alloc_blocks = mp->m_allocsize_blocks; trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, - mp->m_writeio_blocks); + mp->m_allocsize_blocks); return alloc_blocks; } -static int -xfs_file_iomap_begin_delay( - struct inode *inode, - loff_t offset, - loff_t count, - unsigned flags, - struct iomap *iomap) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t maxbytes_fsb = - XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - xfs_fileoff_t end_fsb; - struct xfs_bmbt_irec imap, cmap; - struct xfs_iext_cursor icur, ccur; - xfs_fsblock_t prealloc_blocks = 0; - bool eof = false, cow_eof = false, shared = false; - int whichfork = XFS_DATA_FORK; - int error = 0; - - ASSERT(!XFS_IS_REALTIME_INODE(ip)); - ASSERT(!xfs_get_extsz_hint(ip)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; - goto out_unlock; - } - - XFS_STATS_INC(mp, xs_blk_mapw); - - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } - - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - - /* - * Search the data fork fork first to look up our source mapping. 
We - * always need the data fork map, as we have to return it to the - * iomap code so that the higher level write code can read data in to - * perform read-modify-write cycles for unaligned writes. - */ - eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); - if (eof) - imap.br_startoff = end_fsb; /* fake hole until the end */ - - /* We never need to allocate blocks for zeroing a hole. */ - if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); - goto out_unlock; - } - - /* - * Search the COW fork extent list even if we did not find a data fork - * extent. This serves two purposes: first this implements the - * speculative preallocation using cowextsize, so that we also unshare - * block adjacent to shared blocks instead of just the shared blocks - * themselves. Second the lookup in the extent list is generally faster - * than going out to the shared extent tree. - */ - if (xfs_is_cow_inode(ip)) { - if (!ip->i_cowfp) { - ASSERT(!xfs_is_reflink_inode(ip)); - xfs_ifork_init_cow(ip); - } - cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, - &ccur, &cmap); - if (!cow_eof && cmap.br_startoff <= offset_fsb) { - trace_xfs_reflink_cow_found(ip, &cmap); - whichfork = XFS_COW_FORK; - goto done; - } - } - - if (imap.br_startoff <= offset_fsb) { - /* - * For reflink files we may need a delalloc reservation when - * overwriting shared extents. This includes zeroing of - * existing extents that contain data. - */ - if (!xfs_is_cow_inode(ip) || - ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { - trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, - &imap); - goto done; - } - - xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); - - /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_inode_need_cow(ip, &imap, &shared); - if (error) - goto out_unlock; - - /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) { - trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, - &imap); - goto done; - } - - /* - * Fork all the shared blocks from our write offset until the - * end of the extent. - */ - whichfork = XFS_COW_FORK; - end_fsb = imap.br_startoff + imap.br_blockcount; - } else { - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES - * pages to keep the chunks of work done where somewhat - * symmetric with the work writeback does. This is a completely - * arbitrary number pulled out of thin air. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. 
- */ - count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - - if (xfs_is_always_cow_inode(ip)) - whichfork = XFS_COW_FORK; - } - - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_unlock; - - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset, - count, &icur); - if (prealloc_blocks) { - xfs_extlen_t align; - xfs_off_t end_offset; - xfs_fileoff_t p_end_fsb; - - end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); - p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + - prealloc_blocks; - - align = xfs_eof_alignment(ip, 0); - if (align) - p_end_fsb = roundup_64(p_end_fsb, align); - - p_end_fsb = min(p_end_fsb, maxbytes_fsb); - ASSERT(p_end_fsb > offset_fsb); - prealloc_blocks = p_end_fsb - end_fsb; - } - } - -retry: - error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, - whichfork == XFS_DATA_FORK ? &imap : &cmap, - whichfork == XFS_DATA_FORK ? &icur : &ccur, - whichfork == XFS_DATA_FORK ? eof : cow_eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ - trace_xfs_delalloc_enospc(ip, offset, count); - if (prealloc_blocks) { - prealloc_blocks = 0; - goto retry; - } - /*FALLTHRU*/ - default: - goto out_unlock; - } - - /* - * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch - * them out if the write happens to fail. - */ - iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, whichfork, - whichfork == XFS_DATA_FORK ? &imap : &cmap); -done: - if (whichfork == XFS_COW_FORK) { - if (imap.br_startoff > offset_fsb) { - xfs_trim_extent(&cmap, offset_fsb, - imap.br_startoff - offset_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); - goto out_unlock; - } - /* ensure we only report blocks we have a reservation for */ - xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); - shared = true; - } - error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; -} - int xfs_iomap_write_unwritten( xfs_inode_t *ip, @@ -765,6 +539,11 @@ xfs_iomap_write_unwritten( */ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + /* Attach dquots so that bmbt splits are accounted correctly. */ + error = xfs_qm_dqattach(ip); + if (error) + return error; + do { /* * Set up a transaction to convert the range of extents @@ -783,6 +562,11 @@ xfs_iomap_write_unwritten( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto error_on_bmapi_transaction; + /* * Modify the unwritten extent state of the buffer. 
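 * Each pass of the surrounding do/while loop converts one mapping in its
 * own transaction, so progress is committed incrementally even when the
 * range spans many extents.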
*/ @@ -814,7 +598,7 @@ xfs_iomap_write_unwritten( if (error) return error; - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) return xfs_alert_fsblock_zero(ip, &imap); if ((numblks_fsb = imap.br_blockcount) == 0) { @@ -840,23 +624,42 @@ error_on_bmapi_transaction: static inline bool imap_needs_alloc( struct inode *inode, + unsigned flags, struct xfs_bmbt_irec *imap, int nimaps) { - return !nimaps || - imap->br_startblock == HOLESTARTBLOCK || - imap->br_startblock == DELAYSTARTBLOCK || - (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); + /* don't allocate blocks when just zeroing */ + if (flags & IOMAP_ZERO) + return false; + if (!nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK) + return true; + /* we convert unwritten extents before copying the data for DAX */ + if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN) + return true; + return false; } static inline bool -needs_cow_for_zeroing( +imap_needs_cow( + struct xfs_inode *ip, + unsigned int flags, struct xfs_bmbt_irec *imap, int nimaps) { - return nimaps && - imap->br_startblock != HOLESTARTBLOCK && - imap->br_state != XFS_EXT_UNWRITTEN; + if (!xfs_is_cow_inode(ip)) + return false; + + /* when zeroing we don't have to COW holes or unwritten extents */ + if (flags & IOMAP_ZERO) { + if (!nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_state == XFS_EXT_UNWRITTEN) + return false; + } + + return true; } static int @@ -872,15 +675,8 @@ xfs_ilock_for_iomap( * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_cow_inode(ip) && is_write) { - /* - * FIXME: It could still overwrite on unshared extents and not - * need allocation. - */ - if (flags & IOMAP_NOWAIT) - return -EAGAIN; + if (xfs_is_cow_inode(ip) && is_write) mode = XFS_ILOCK_EXCL; - } /* * Extents not yet cached requires exclusive access, don't block. This @@ -917,111 +713,73 @@ relock: } static int -xfs_file_iomap_begin( +xfs_direct_write_iomap_begin( struct inode *inode, loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; - xfs_fileoff_t offset_fsb, end_fsb; + struct xfs_bmbt_irec imap, cmap; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; + u16 iomap_flags = 0; unsigned lockmode; + ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) && - !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { - /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, flags, - iomap); - } - /* - * Lock the inode in the manner required for the specified operation and - * check for as many conditions that would result in blocking as - * possible. This removes most of the non-blocking checks from the - * mapping code below. + * Writes that span EOF might trigger an IO size update on completion, + * so consider them to be dirty for the purposes of O_DSYNC even if + * there is no other metadata changes pending or have been made here. 
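+ * IOMAP_F_DIRTY tells the direct I/O completion path that a data sync is
+ * still required, so it must not take the pure-overwrite FUA shortcut for
+ * O_DSYNC writes.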
*/ + if (offset + length > i_size_read(inode)) + iomap_flags |= IOMAP_F_DIRTY; + error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; - ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset > mp->m_super->s_maxbytes - length) - length = mp->m_super->s_maxbytes - offset; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - end_fsb = XFS_B_TO_FSB(mp, offset + length); - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if (flags & IOMAP_REPORT) { - /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared); - if (error) + if (imap_needs_cow(ip, flags, &imap, nimaps)) { + error = -EAGAIN; + if (flags & IOMAP_NOWAIT) goto out_unlock; - } - - /* Non-modifying mapping requested, so we are done */ - if (!(flags & (IOMAP_WRITE | IOMAP_ZERO))) - goto out_found; - - /* - * Break shared extents if necessary. Checks for non-blocking IO have - * been done up front, so we don't need to do them here. - */ - if (xfs_is_cow_inode(ip)) { - struct xfs_bmbt_irec cmap; - bool directio = (flags & IOMAP_DIRECT); - - /* if zeroing doesn't need COW allocation, then we are done. */ - if ((flags & IOMAP_ZERO) && - !needs_cow_for_zeroing(&imap, nimaps)) - goto out_found; /* may drop and re-acquire the ilock */ - cmap = imap; - error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode, - directio); + error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared, + &lockmode, flags & IOMAP_DIRECT); if (error) goto out_unlock; - - /* - * For buffered writes we need to report the address of the - * previous block (if there was any) so that the higher level - * write code can perform read-modify-write operations; we - * won't need the CoW fork mapping until writeback. For direct - * I/O, which must be block aligned, we need to report the - * newly allocated address. If the data fork has a hole, copy - * the COW fork mapping to avoid allocating to the data fork. - */ - if (directio || imap.br_startblock == HOLESTARTBLOCK) - imap = cmap; - + if (shared) + goto out_found_cow; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - /* Don't need to allocate over holes when doing zeroing operations. */ - if (flags & IOMAP_ZERO) - goto out_found; + if (imap_needs_alloc(inode, flags, &imap, nimaps)) + goto allocate_blocks; - if (!imap_needs_alloc(inode, &imap, nimaps)) - goto out_found; + xfs_iunlock(ip, lockmode); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); - /* If nowait is set bail since we are going to make allocations. */ - if (flags & IOMAP_NOWAIT) { - error = -EAGAIN; +allocate_blocks: + error = -EAGAIN; + if (flags & IOMAP_NOWAIT) goto out_unlock; - } /* * We cap the maximum length we map to a sane size to keep the chunks @@ -1033,48 +791,273 @@ xfs_file_iomap_begin( * lower level functions are updated. */ length = min_t(loff_t, length, 1024 * PAGE_SIZE); + end_fsb = xfs_iomap_end_fsb(mp, offset, length); - /* - * xfs_iomap_write_direct() expects the shared lock. It is unlocked on - * return. 
- */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); + if (offset + length > XFS_ISIZE(ip)) + end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); + else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); + xfs_iunlock(ip, lockmode); + + error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, + &imap); if (error) return error; - iomap->flags |= IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW); -out_finish: - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); - -out_found: - ASSERT(nimaps); +out_found_cow: xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - goto out_finish; + length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); + trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + if (imap.br_startblock != HOLESTARTBLOCK) { + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + if (error) + return error; + } + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); out_unlock: xfs_iunlock(ip, lockmode); return error; } +const struct iomap_ops xfs_direct_write_iomap_ops = { + .iomap_begin = xfs_direct_write_iomap_begin, +}; + static int -xfs_file_iomap_end_delalloc( - struct xfs_inode *ip, +xfs_buffered_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + struct xfs_bmbt_irec imap, cmap; + struct xfs_iext_cursor icur, ccur; + xfs_fsblock_t prealloc_blocks = 0; + bool eof = false, cow_eof = false, shared = false; + int allocfork = XFS_DATA_FORK; + int error = 0; + + /* we can't use delayed allocations when using extent size hints */ + if (xfs_get_extsz_hint(ip)) + return xfs_direct_write_iomap_begin(inode, offset, count, + flags, iomap, srcmap); + + ASSERT(!XFS_IS_REALTIME_INODE(ip)); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + XFS_STATS_INC(mp, xs_blk_mapw); + + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } + + /* + * Search the data fork fork first to look up our source mapping. We + * always need the data fork map, as we have to return it to the + * iomap code so that the higher level write code can read data in to + * perform read-modify-write cycles for unaligned writes. + */ + eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); + if (eof) + imap.br_startoff = end_fsb; /* fake hole until the end */ + + /* We never need to allocate blocks for zeroing a hole. */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); + goto out_unlock; + } + + /* + * Search the COW fork extent list even if we did not find a data fork + * extent. 
This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. + */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + goto found_cow; + } + } + + if (imap.br_startoff <= offset_fsb) { + /* + * For reflink files we may need a delalloc reservation when + * overwriting shared extents. This includes zeroing of + * existing extents that contain data. + */ + if (!xfs_is_cow_inode(ip) || + ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto found_imap; + } + + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_bmap_trim_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto found_imap; + } + + /* + * Fork all the shared blocks from our write offset until the + * end of the extent. + */ + allocfork = XFS_COW_FORK; + end_fsb = imap.br_startoff + imap.br_blockcount; + } else { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat + * symmetric with the work writeback does. This is a completely + * arbitrary number pulled out of thin air. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = xfs_iomap_end_fsb(mp, offset, count); + + if (xfs_is_always_cow_inode(ip)) + allocfork = XFS_COW_FORK; + } + + error = xfs_qm_dqattach_locked(ip, false); + if (error) + goto out_unlock; + + if (eof) { + prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset, + count, &icur); + if (prealloc_blocks) { + xfs_extlen_t align; + xfs_off_t end_offset; + xfs_fileoff_t p_end_fsb; + + end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1); + p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; + + align = xfs_eof_alignment(ip); + if (align) + p_end_fsb = roundup_64(p_end_fsb, align); + + p_end_fsb = min(p_end_fsb, + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + ASSERT(p_end_fsb > offset_fsb); + prealloc_blocks = p_end_fsb - end_fsb; + } + } + +retry: + error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, + allocfork == XFS_DATA_FORK ? &imap : &cmap, + allocfork == XFS_DATA_FORK ? &icur : &ccur, + allocfork == XFS_DATA_FORK ? eof : cow_eof); + switch (error) { + case 0: + break; + case -ENOSPC: + case -EDQUOT: + /* retry without any preallocation */ + trace_xfs_delalloc_enospc(ip, offset, count); + if (prealloc_blocks) { + prealloc_blocks = 0; + goto retry; + } + /*FALLTHRU*/ + default: + goto out_unlock; + } + + if (allocfork == XFS_COW_FORK) { + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap); + goto found_cow; + } + + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. 
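+ * Only ranges carrying IOMAP_F_NEW are punched out in
+ * xfs_buffered_write_iomap_end(), so delalloc blocks that existed before
+ * this write are never discarded by a short write.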
+ */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW); + +found_imap: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + +found_cow: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (imap.br_startoff <= offset_fsb) { + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + if (error) + return error; + } else { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); + } + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +static int +xfs_buffered_write_iomap_end( + struct inode *inode, loff_t offset, loff_t length, ssize_t written, + unsigned flags, struct iomap *iomap) { + struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_fsb; xfs_fileoff_t end_fsb; int error = 0; + if (iomap->type != IOMAP_DELALLOC) + return 0; + /* * Behave as if the write failed if drop writes is enabled. Set the NEW * flag to force delalloc cleanup. @@ -1119,24 +1102,51 @@ xfs_file_iomap_end_delalloc( return 0; } +const struct iomap_ops xfs_buffered_write_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, + .iomap_end = xfs_buffered_write_iomap_end, +}; + static int -xfs_file_iomap_end( +xfs_read_iomap_begin( struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { - if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) - return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, - length, written, iomap); - return 0; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + int nimaps = 1, error = 0; + bool shared = false; + unsigned lockmode; + + ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, 0); + if (!error && (flags & IOMAP_REPORT)) + error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + xfs_iunlock(ip, lockmode); + + if (error) + return error; + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0); } -const struct iomap_ops xfs_iomap_ops = { - .iomap_begin = xfs_file_iomap_begin, - .iomap_end = xfs_file_iomap_end, +const struct iomap_ops xfs_read_iomap_ops = { + .iomap_begin = xfs_read_iomap_begin, }; static int @@ -1145,7 +1155,8 @@ xfs_seek_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1178,8 +1189,7 @@ xfs_seek_iomap_begin( /* * Fake a hole until the end of the file. 
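 * This path only serves SEEK_HOLE/SEEK_DATA, which merely needs the
 * mapping to be accurate up to the next real extent, so a synthesized
 * hole is sufficient here.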
*/ - data_fsb = min(XFS_B_TO_FSB(mp, offset + length), - XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + data_fsb = xfs_iomap_end_fsb(mp, offset, length); } /* @@ -1193,7 +1203,7 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1215,7 +1225,7 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1231,7 +1241,8 @@ xfs_xattr_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1261,7 +1272,7 @@ out_unlock: if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, false); + return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 5c2f6aa6d78f..7d3703556d0e 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -11,13 +11,14 @@ struct xfs_inode; struct xfs_bmbt_irec; -int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int); +int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, + xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); +xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, + xfs_fileoff_t end_fsb); int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *, bool shared); -xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); + struct xfs_bmbt_irec *, u16); static inline xfs_filblks_t xfs_aligned_fsb_count( @@ -39,7 +40,9 @@ xfs_aligned_fsb_count( return count_fsb; } -extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_direct_write_iomap_ops; +extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ff3c1fae5357..81f2f93caec0 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -20,6 +20,7 @@ #include "xfs_symlink.h" #include "xfs_dir2.h" #include "xfs_iomap.h" +#include "xfs_error.h" #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -49,8 +50,10 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = xfs_attr_set(ip, xattr->name, + strlen(xattr->name), + xattr->value, xattr->value_len, + ATTR_SECURE); if (error < 0) break; } @@ -470,20 +473,57 @@ xfs_vn_get_link_inline( struct inode *inode, struct delayed_call *done) { + struct xfs_inode *ip = XFS_I(inode); char *link; - ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); + ASSERT(ip->i_df.if_flags & XFS_IFINLINE); /* * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if * if_data is junk. 
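 * XFS_IS_CORRUPT() also emits a corruption report, including the failing
 * address, so a junk if_data shows up in the log rather than failing
 * silently.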
*/ - link = XFS_I(inode)->i_df.if_u1.if_data; - if (!link) + link = ip->i_df.if_u1.if_data; + if (XFS_IS_CORRUPT(ip->i_mount, !link)) return ERR_PTR(-EFSCORRUPTED); return link; } +static uint32_t +xfs_stat_blksize( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If the file blocks are being allocated from a realtime volume, then + * always return the realtime extent size. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + + /* + * Allow large block sizes to be reported to userspace programs if the + * "largeio" mount option is used. + * + * If compatibility mode is specified, simply return the basic unit of + * caching so that we don't get inefficient read/modify/write I/O from + * user apps. Otherwise.... + * + * If the underlying volume is a stripe, then return the stripe width in + * bytes as the recommended I/O size. It is not a stripe and we've set a + * default buffered I/O size, return that, otherwise return the compat + * default. + */ + if (mp->m_flags & XFS_MOUNT_LARGEIO) { + if (mp->m_swidth) + return mp->m_swidth << mp->m_sb.sb_blocklog; + if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + return 1U << mp->m_allocsize_log; + } + + return PAGE_SIZE; +} + STATIC int xfs_vn_getattr( const struct path *path, @@ -516,8 +556,7 @@ xfs_vn_getattr( if (ip->i_d.di_version == 3) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; - stat->btime.tv_sec = ip->i_d.di_crtime.t_sec; - stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec; + stat->btime = ip->i_d.di_crtime; } } @@ -543,16 +582,7 @@ xfs_vn_getattr( stat->rdev = inode->i_rdev; break; default: - if (XFS_IS_REALTIME_INODE(ip)) { - /* - * If the file blocks are being allocated from a - * realtime volume, then return the inode's realtime - * extent size or the realtime volume's extent size. - */ - stat->blksize = - xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; - } else - stat->blksize = xfs_preferred_iosize(mp); + stat->blksize = xfs_stat_blksize(ip); stat->rdev = 0; break; } @@ -664,7 +694,7 @@ xfs_setattr_nonsize( ASSERT(gdqp == NULL); error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), xfs_kgid_to_gid(gid), - xfs_get_projid(ip), + ip->i_d.di_projid, qflags, &udqp, &gdqp, NULL); if (error) return error; @@ -793,6 +823,7 @@ xfs_setattr_nonsize( out_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -882,10 +913,10 @@ xfs_setattr_size( if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = iomap_zero_range(inode, oldsize, newsize - oldsize, - &did_zeroing, &xfs_iomap_ops); + &did_zeroing, &xfs_buffered_write_iomap_ops); } else { error = iomap_truncate_page(inode, newsize, &did_zeroing, - &xfs_iomap_ops); + &xfs_buffered_write_iomap_ops); } if (error) @@ -1113,7 +1144,7 @@ xfs_vn_fiemap( &xfs_xattr_iomap_ops); } else { error = iomap_fiemap(inode, fieinfo, start, length, - &xfs_iomap_ops); + &xfs_read_iomap_ops); } xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); @@ -1226,7 +1257,7 @@ xfs_inode_supports_dax( return false; /* Device has to support DAX too. 
*/ - return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL; + return xfs_inode_buftarg(ip)->bt_daxdev != NULL; } STATIC void @@ -1289,9 +1320,7 @@ xfs_setup_inode( lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_dir_key); lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); - ip->d_ops = ip->i_mount->m_dir_inode_ops; } else { - ip->d_ops = ip->i_mount->m_nondir_inode_ops; lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f5c955d35be4..4b31c29b7e6b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -84,7 +84,7 @@ xfs_bulkstat_one_int( /* xfs_iget returns the following without needing * further change. */ - buf->bs_projectid = xfs_get_projid(ip); + buf->bs_projectid = ip->i_d.di_projid; buf->bs_ino = ino; buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; @@ -97,8 +97,8 @@ xfs_bulkstat_one_int( buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; buf->bs_ctime = inode->i_ctime.tv_sec; buf->bs_ctime_nsec = inode->i_ctime.tv_nsec; - buf->bs_btime = dic->di_crtime.t_sec; - buf->bs_btime_nsec = dic->di_crtime.t_nsec; + buf->bs_btime = dic->di_crtime.tv_sec; + buf->bs_btime_nsec = dic->di_crtime.tv_nsec; buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; @@ -137,7 +137,7 @@ xfs_bulkstat_one_int( xfs_irele(ip); error = bc->formatter(bc->breq, buf); - if (error == XFS_IBULK_ABORT) + if (error == -ECANCELED) goto out_advance; if (error) goto out; @@ -169,7 +169,7 @@ xfs_bulkstat_one( ASSERT(breq->icount == 1); bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -181,7 +181,7 @@ xfs_bulkstat_one( * If we reported one inode to userspace then we abort because we hit * the end of the buffer. Don't leak that back to userspace. */ - if (error == XFS_IWALK_ABORT) + if (error == -ECANCELED) error = 0; return error; @@ -243,7 +243,7 @@ xfs_bulkstat( return 0; bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -342,7 +342,7 @@ xfs_inumbers_walk( int error; error = ic->formatter(ic->breq, &inogrp); - if (error && error != XFS_IBULK_ABORT) + if (error && error != -ECANCELED) return error; ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index e90c1fc5b981..96a1e2a9be3f 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -18,9 +18,6 @@ struct xfs_ibulk { /* Only iterate within the same AG as startino */ #define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) -/* Return value that means we want to abort the walk. */ -#define XFS_IBULK_ABORT (XFS_IWALK_ABORT) - /* * Advance the user buffer pointer by one record of the given size. If the * buffer is now full, return the appropriate error code. @@ -34,13 +31,21 @@ xfs_ibulk_advance( breq->ubuffer = b + bytes; breq->ocount++; - return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; + return breq->ocount == breq->icount ? -ECANCELED : 0; } /* * Return stat information in bulk (by-inode) for the filesystem. */ +/* + * Return codes for the formatter function are 0 to continue iterating, and + * non-zero to stop iterating. Any non-zero value will be passed up to the + * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop + * iteration, as neither bulkstat nor inumbers will ever generate that error + * code on their own. 
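+ *
+ * For illustration only (not part of this patch), a formatter that copies
+ * out one record and then ends the walk early could be written as:
+ *
+ *	static int one_rec_fmt(struct xfs_ibulk *breq,
+ *			const struct xfs_bulkstat *bstat)
+ *	{
+ *		if (copy_to_user(breq->ubuffer, bstat, sizeof(*bstat)))
+ *			return -EFAULT;
+ *		return -ECANCELED;
+ *	}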
+ */ + typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat); diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 8c7d727149ea..233dcc8784db 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -31,7 +31,7 @@ * inode it finds, it calls a walk function with the relevant inode number and * a pointer to caller-provided data. The walk function can return the usual * negative error code to stop the iteration; 0 to continue the iteration; or - * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the + * -ECANCELED to stop the iteration. This return value is returned to the * caller. * * Internally, we allow the walk function to do anything, which means that we @@ -298,7 +298,8 @@ xfs_iwalk_ag_start( error = xfs_inobt_get_rec(*curpp, irec, has_more); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1); + if (XFS_IS_CORRUPT(mp, *has_more != 1)) + return -EFSCORRUPTED; /* * If the LE lookup yielded an inobt record before the cursor position, @@ -616,7 +617,7 @@ xfs_iwalk_threaded( if (xfs_pwork_ctl_want_abort(&pctl)) break; - iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); + iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); iwag->mp = mp; iwag->iwalk_fn = iwalk_fn; iwag->data = data; diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 6c960e10ed4d..37a795f03267 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -6,12 +6,17 @@ #ifndef __XFS_IWALK_H__ #define __XFS_IWALK_H__ +/* + * Return codes for the inode/inobt walk function are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iwalk or inobt_walk caller. The special value -ECANCELED can be used to + * stop iteration, as neither iwalk nor inobt_walk will ever generate that + * error code on their own. + */ + /* Walk all inodes in the filesystem starting from @startino. */ typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, void *data); -/* Return values for xfs_iwalk_fn. */ -#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE) -#define XFS_IWALK_ABORT (XFS_ITER_ABORT) int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, unsigned int flags, xfs_iwalk_fn iwalk_fn, @@ -30,8 +35,6 @@ typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, const struct xfs_inobt_rec_incore *irec, void *data); -/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */ -#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT) int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, unsigned int flags, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ca15105681ca..8738bb03f253 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -223,26 +223,32 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); #define ASSERT_ALWAYS(expr) \ - (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) #ifdef DEBUG #define ASSERT(expr) \ - (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) #else /* !DEBUG */ #ifdef XFS_WARN #define ASSERT(expr) \ - (likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) + (likely(expr) ? 
(void)0 : asswarn(NULL, #expr, __FILE__, __LINE__)) #else /* !DEBUG && !XFS_WARN */ -#define ASSERT(expr) ((void)0) +#define ASSERT(expr) ((void)0) #endif /* XFS_WARN */ #endif /* DEBUG */ +#define XFS_IS_CORRUPT(mp, expr) \ + (unlikely(expr) ? xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \ + NULL, 0, __FILE__, __LINE__, \ + __this_address), \ + true : false) + #define STATIC static noinline #ifdef CONFIG_XFS_RT diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7fc3c1ad36bc..f6006d94a581 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -57,10 +57,6 @@ xlog_state_get_iclog_space( struct xlog_ticket *ticket, int *continued_write, int *logoffsetp); -STATIC int -xlog_state_release_iclog( - struct xlog *log, - struct xlog_in_core *iclog); STATIC void xlog_state_switch_iclogs( struct xlog *log, @@ -83,7 +79,10 @@ STATIC void xlog_ungrant_log_space( struct xlog *log, struct xlog_ticket *ticket); - +STATIC void +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog); #if defined(DEBUG) STATIC void xlog_verify_dest_ptr( @@ -214,15 +213,42 @@ xlog_grant_head_wake( { struct xlog_ticket *tic; int need_bytes; + bool woken_task = false; list_for_each_entry(tic, &head->waiters, t_queue) { + + /* + * There is a chance that the size of the CIL checkpoints in + * progress at the last AIL push target calculation resulted in + * limiting the target to the log head (l_last_sync_lsn) at the + * time. This may not reflect where the log head is now as the + * CIL checkpoints may have completed. + * + * Hence when we are woken here, it may be that the head of the + * log that has moved rather than the tail. As the tail didn't + * move, there still won't be space available for the + * reservation we require. However, if the AIL has already + * pushed to the target defined by the old log head location, we + * will hang here waiting for something else to update the AIL + * push target. + * + * Therefore, if there isn't space to wake the first waiter on + * the grant head, we need to push the AIL again to ensure the + * target reflects both the current log tail and log head + * position before we wait for the tail to move again. + */ + need_bytes = xlog_ticket_reservation(log, head, tic); - if (*free_bytes < need_bytes) + if (*free_bytes < need_bytes) { + if (!woken_task) + xlog_grant_push_ail(log, need_bytes); return false; + } *free_bytes -= need_bytes; trace_xfs_log_grant_wake_up(log, tic); wake_up_process(tic->t_task); + woken_task = true; } return true; @@ -428,8 +454,7 @@ xfs_log_reserve( XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, - KM_SLEEP); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt @@ -526,16 +551,71 @@ xfs_log_done( return lsn; } +static bool +__xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog) +{ + lockdep_assert_held(&log->l_icloglock); + + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); + + iclog->ic_state = XLOG_STATE_SYNCING; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); + /* cycle incremented when incrementing curr_block */ + return true; + } + + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + return false; +} + +/* + * Flush iclog to disk if this is the last reference to the given iclog and the + * it is in the WANT_SYNC state. + */ +static int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog) +{ + lockdep_assert_held(&log->l_icloglock); + + if (iclog->ic_state == XLOG_STATE_IOERROR) + return -EIO; + + if (atomic_dec_and_test(&iclog->ic_refcnt) && + __xlog_state_release_iclog(log, iclog)) { + spin_unlock(&log->l_icloglock); + xlog_sync(log, iclog); + spin_lock(&log->l_icloglock); + } + + return 0; +} + int xfs_log_release_iclog( - struct xfs_mount *mp, + struct xfs_mount *mp, struct xlog_in_core *iclog) { - if (xlog_state_release_iclog(mp->m_log, iclog)) { + struct xlog *log = mp->m_log; + bool sync; + + if (iclog->ic_state == XLOG_STATE_IOERROR) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return -EIO; } + if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { + sync = __xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + if (sync) + xlog_sync(log, iclog); + } return 0; } @@ -840,10 +920,7 @@ out_err: iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); error = xlog_state_release_iclog(log, iclog); - - spin_lock(&log->l_icloglock); switch (iclog->ic_state) { default: if (!XLOG_FORCED_SHUTDOWN(log)) { @@ -898,8 +975,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) #ifdef DEBUG first_iclog = iclog = log->l_iclog; do { - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { - ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); + if (iclog->ic_state != XLOG_STATE_IOERROR) { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); ASSERT(iclog->ic_offset == 0); } iclog = iclog->ic_next; @@ -924,21 +1001,17 @@ xfs_log_unmount_write(xfs_mount_t *mp) spin_lock(&log->l_icloglock); iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); error = xlog_state_release_iclog(log, iclog); - - spin_lock(&log->l_icloglock); - - if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE - || iclog->ic_state == XLOG_STATE_DIRTY - || iclog->ic_state == XLOG_STATE_IOERROR) ) { - - xlog_wait(&iclog->ic_force_wait, - &log->l_icloglock); - } else { + switch (iclog->ic_state) { + case XLOG_STATE_ACTIVE: + case XLOG_STATE_DIRTY: + case XLOG_STATE_IOERROR: spin_unlock(&log->l_icloglock); + break; + default: + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + break; } } @@ -1228,7 +1301,7 @@ xlog_ioend_work( * didn't succeed. 
*/ aborted = true; - } else if (iclog->ic_state & XLOG_STATE_IOERROR) { + } else if (iclog->ic_state == XLOG_STATE_IOERROR) { aborted = true; } @@ -1404,6 +1477,7 @@ xlog_alloc_log( */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { + int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); @@ -1415,8 +1489,8 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kmem_alloc_large(log->l_iclog_size, - KM_MAYFAIL); + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, + KM_MAYFAIL | KM_ZERO); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG @@ -1452,7 +1526,7 @@ xlog_alloc_log( log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, - mp->m_fsname); + mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -1468,6 +1542,8 @@ out_free_iclog: prev_iclog = iclog->ic_next; kmem_free(iclog->ic_data); kmem_free(iclog); + if (prev_iclog == log->l_iclog) + break; } out_free_log: kmem_free(log); @@ -1700,7 +1776,7 @@ xlog_write_iclog( * across the log IO to archieve that. */ down(&iclog->ic_sema); - if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) { + if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) { /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of @@ -1708,13 +1784,11 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog, XFS_LI_ABORTED); + xlog_state_done_syncing(iclog, true); up(&iclog->ic_sema); return; } - iclog->ic_io_size = count; - bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE)); bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev); iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; @@ -1724,9 +1798,9 @@ xlog_write_iclog( if (need_flush) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; - xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size); + xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count); if (is_vmalloc_addr(iclog->ic_data)) - flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size); + flush_kernel_vmap_range(iclog->ic_data, count); /* * If this log buffer would straddle the end of the log we will have @@ -1942,7 +2016,6 @@ xlog_dealloc_log( /* * Update counters atomically now that memcpy is done. */ -/* ARGSUSED */ static inline void xlog_state_finish_copy( struct xlog *log, @@ -1950,16 +2023,11 @@ xlog_state_finish_copy( int record_cnt, int copy_bytes) { - spin_lock(&log->l_icloglock); + lockdep_assert_held(&log->l_icloglock); be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; - - spin_unlock(&log->l_icloglock); -} /* xlog_state_finish_copy */ - - - +} /* * print out info relating to regions written which consume @@ -2236,15 +2304,18 @@ xlog_write_copy_finish( int log_offset, struct xlog_in_core **commit_iclog) { + int error; + if (*partial_copy) { /* * This iclog has already been marked WANT_SYNC by * xlog_state_get_iclog_space. */ + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); *record_cnt = 0; *data_cnt = 0; - return xlog_state_release_iclog(log, iclog); + goto release_iclog; } *partial_copy = 0; @@ -2252,21 +2323,25 @@ xlog_write_copy_finish( if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { /* no more space in this iclog - push it. 
*/ + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); *record_cnt = 0; *data_cnt = 0; - spin_lock(&log->l_icloglock); xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - if (!commit_iclog) - return xlog_state_release_iclog(log, iclog); + goto release_iclog; + spin_unlock(&log->l_icloglock); ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; } return 0; + +release_iclog: + error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + return error; } /* @@ -2328,7 +2403,7 @@ xlog_write( int contwr = 0; int record_cnt = 0; int data_cnt = 0; - int error; + int error = 0; *start_lsn = 0; @@ -2479,13 +2554,17 @@ next_lv: ASSERT(len == 0); + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (!commit_iclog) - return xlog_state_release_iclog(log, iclog); + if (commit_iclog) { + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } else { + error = xlog_state_release_iclog(log, iclog); + } + spin_unlock(&log->l_icloglock); - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - return 0; + return error; } @@ -2496,21 +2575,35 @@ next_lv: ***************************************************************************** */ -/* Clean iclogs starting from the head. This ordering must be - * maintained, so an iclog doesn't become ACTIVE beyond one that - * is SYNCING. This is also required to maintain the notion that we use - * a ordered wait queue to hold off would be writers to the log when every - * iclog is trying to sync to disk. +/* + * An iclog has just finished IO completion processing, so we need to update + * the iclog state and propagate that up into the overall log state. Hence we + * prepare the iclog for cleaning, and then clean all the pending dirty iclogs + * starting from the head, and then wake up any threads that are waiting for the + * iclog to be marked clean. + * + * The ordering of marking iclogs ACTIVE must be maintained, so an iclog + * doesn't become ACTIVE beyond one that is SYNCING. This is also required to + * maintain the notion that we use an ordered wait queue to hold off would-be + * writers to the log when every iclog is trying to sync to disk. * - * State Change: DIRTY -> ACTIVE + * Caller must hold the icloglock before calling us. + * + * State Change: !IOERROR -> DIRTY -> ACTIVE */ STATIC void -xlog_state_clean_log( - struct xlog *log) +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) { - xlog_in_core_t *iclog; - int changed = 0; + struct xlog_in_core *iclog; + int changed = 0; + + /* Prepare the completed iclog. */ + if (dirty_iclog->ic_state != XLOG_STATE_IOERROR) + dirty_iclog->ic_state = XLOG_STATE_DIRTY; + /* Walk all the iclogs to update the ordered active state. */ iclog = log->l_iclog; do { if (iclog->ic_state == XLOG_STATE_DIRTY) { @@ -2548,7 +2641,13 @@ xlog_state_clean_log( iclog = iclog->ic_next; } while (iclog != log->l_iclog); - /* log is locked when we are called */ + + /* + * Wake up threads waiting in xfs_log_force() for the dirty iclog + * to be cleaned. + */ + wake_up_all(&dirty_iclog->ic_force_wait); + /* * Change state for the dummy log recording. * We usually go to NEED.
But we go to NEED2 if the changed indicates @@ -2582,7 +2681,7 @@ xlog_state_clean_log( ASSERT(0); } } -} /* xlog_state_clean_log */ +} STATIC xfs_lsn_t xlog_get_lowest_lsn( @@ -2592,7 +2691,8 @@ xlog_get_lowest_lsn( xfs_lsn_t lowest_lsn = 0, lsn; do { - if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) continue; lsn = be64_to_cpu(iclog->ic_header.h_lsn); @@ -2603,30 +2703,150 @@ xlog_get_lowest_lsn( return lowest_lsn; } +/* + * Completion of an iclog IO does not imply that a transaction has completed, as + * transactions can be large enough to span many iclogs. We cannot change the + * tail of the log half way through a transaction as this may be the only + * transaction in the log and moving the tail to point to the middle of it + * will prevent recovery from finding the start of the transaction. Hence we + * should only update the last_sync_lsn if this iclog contains transaction + * completion callbacks on it. + * + * We have to do this before we drop the icloglock to ensure we are the only one + * that can update it. + * + * If we are moving the last_sync_lsn forwards, we also need to ensure we kick + * the reservation grant head pushing. This is due to the fact that the push + * target is bound by the current last_sync_lsn value. Hence if we have a large + * amount of log space bound up in this committing transaction then the + * last_sync_lsn value may be the limiting factor preventing tail pushing from + * freeing space in the log. Hence once we've updated the last_sync_lsn we + * should push the AIL to ensure the push target (and hence the grant head) is + * no longer bound by the old log head location and can move forwards and make + * progress again. + */ +static void +xlog_state_set_callback( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t header_lsn) +{ + iclog->ic_state = XLOG_STATE_CALLBACK; + + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + header_lsn) <= 0); + + if (list_empty_careful(&iclog->ic_callbacks)) + return; + + atomic64_set(&log->l_last_sync_lsn, header_lsn); + xlog_grant_push_ail(log, 0); +} + +/* + * Return true if we need to stop processing, false to continue to the next + * iclog. The caller will need to run callbacks if the iclog is returned in the + * XLOG_STATE_CALLBACK state. + */ +static bool +xlog_state_iodone_process_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + bool *ioerror) +{ + xfs_lsn_t lowest_lsn; + xfs_lsn_t header_lsn; + + switch (iclog->ic_state) { + case XLOG_STATE_ACTIVE: + case XLOG_STATE_DIRTY: + /* + * Skip all iclogs in the ACTIVE & DIRTY states: + */ + return false; + case XLOG_STATE_IOERROR: + /* + * Between marking a filesystem SHUTDOWN and stopping the log, + * we do flush all iclogs to disk (if there wasn't a log I/O + * error). So, we do want things to go smoothly in case of just + * a SHUTDOWN w/o a LOG_IO_ERROR. + */ + *ioerror = true; + return false; + case XLOG_STATE_DONE_SYNC: + /* + * Now that we have an iclog that is in the DONE_SYNC state, do + * one more check here to see if we have chased our tail around. + * If this is not the lowest lsn iclog, then we will leave it + * for another completion to process.
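/*
 * Aside (approximate reconstruction, not quoted from this series): an
 * xfs_lsn_t packs the log cycle number into the high 32 bits and a basic
 * block number into the low 32 bits, so the XFS_LSN_CMP() ordering used in
 * the lowest-lsn and last_sync_lsn checks above reduces to a two-level
 * compare along these lines:
 */
static inline int demo_lsn_cmp(uint64_t lsn1, uint64_t lsn2)
{
	uint32_t cycle1 = lsn1 >> 32, cycle2 = lsn2 >> 32;
	uint32_t block1 = lsn1, block2 = lsn2;

	if (cycle1 != cycle2)
		return cycle1 < cycle2 ? -999 : 999;
	if (block1 != block2)
		return block1 < block2 ? -999 : 999;
	return 0;	/* same cycle and block: the LSNs are equal */
}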
+ */ + header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) + return false; + xlog_state_set_callback(log, iclog, header_lsn); + return false; + default: + /* + * Can only perform callbacks in order. Since this iclog is not + * in the DONE_SYNC state, we skip the rest and just try to + * clean up. + */ + return true; + } +} + +/* + * Keep processing entries in the iclog callback list until we come around and + * it is empty. We need to atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more callbacks being added. + * + * This function is called with the icloglock held and returns with it held. We + * drop it while running callbacks, however, as holding it over thousands of + * callbacks is unnecessary and causes excessive contention if we do. + */ +static void +xlog_state_do_iclog_callbacks( + struct xlog *log, + struct xlog_in_core *iclog, + bool aborted) + __releases(&log->l_icloglock) + __acquires(&log->l_icloglock) +{ + spin_unlock(&log->l_icloglock); + spin_lock(&iclog->ic_callback_lock); + while (!list_empty(&iclog->ic_callbacks)) { + LIST_HEAD(tmp); + + list_splice_init(&iclog->ic_callbacks, &tmp); + + spin_unlock(&iclog->ic_callback_lock); + xlog_cil_process_committed(&tmp, aborted); + spin_lock(&iclog->ic_callback_lock); + } + + /* + * Pick up the icloglock while still holding the callback lock so we + * serialise against anyone trying to add more callbacks to this iclog + * now we've finished processing. + */ + spin_lock(&log->l_icloglock); + spin_unlock(&iclog->ic_callback_lock); +} + STATIC void xlog_state_do_callback( struct xlog *log, - bool aborted, - struct xlog_in_core *ciclog) -{ - xlog_in_core_t *iclog; - xlog_in_core_t *first_iclog; /* used to know when we've - * processed all iclogs once */ - int flushcnt = 0; - xfs_lsn_t lowest_lsn; - int ioerrors; /* counter: iclogs with errors */ - int loopdidcallbacks; /* flag: inner loop did callbacks*/ - int funcdidcallbacks; /* flag: function did callbacks */ - int repeats; /* for issuing console warnings if - * looping too many times */ - int wake = 0; + bool aborted) +{ + struct xlog_in_core *iclog; + struct xlog_in_core *first_iclog; + bool cycled_icloglock; + bool ioerror; + int flushcnt = 0; + int repeats = 0; spin_lock(&log->l_icloglock); - first_iclog = iclog = log->l_iclog; - ioerrors = 0; - funcdidcallbacks = 0; - repeats = 0; - do { /* * Scan all iclogs starting with the one pointed to by the @@ -2638,134 +2858,29 @@ xlog_state_do_callback( */ first_iclog = log->l_iclog; iclog = log->l_iclog; - loopdidcallbacks = 0; + cycled_icloglock = false; + ioerror = false; repeats++; do { + if (xlog_state_iodone_process_iclog(log, iclog, + &ioerror)) + break; - /* skip all iclogs in the ACTIVE & DIRTY states */ - if (iclog->ic_state & - (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + if (iclog->ic_state != XLOG_STATE_CALLBACK && + iclog->ic_state != XLOG_STATE_IOERROR) { iclog = iclog->ic_next; continue; } /* - * Between marking a filesystem SHUTDOWN and stopping - * the log, we do flush all iclogs to disk (if there - * wasn't a log I/O error). So, we do want things to - * go smoothly in case of just a SHUTDOWN w/o a - * LOG_IO_ERROR. + * Running callbacks will drop the icloglock which means + * we'll have to run at least one more complete loop. */ - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { - /* - * Can only perform callbacks in order. 
Since - * this iclog is not in the DONE_SYNC/ - * DO_CALLBACK state, we skip the rest and - * just try to clean up. If we set our iclog - * to DO_CALLBACK, we will not process it when - * we retry since a previous iclog is in the - * CALLBACK and the state cannot change since - * we are holding the l_icloglock. - */ - if (!(iclog->ic_state & - (XLOG_STATE_DONE_SYNC | - XLOG_STATE_DO_CALLBACK))) { - if (ciclog && (ciclog->ic_state == - XLOG_STATE_DONE_SYNC)) { - ciclog->ic_state = XLOG_STATE_DO_CALLBACK; - } - break; - } - /* - * We now have an iclog that is in either the - * DO_CALLBACK or DONE_SYNC states. The other - * states (WANT_SYNC, SYNCING, or CALLBACK were - * caught by the above if and are going to - * clean (i.e. we aren't doing their callbacks) - * see the above if. - */ - - /* - * We will do one more check here to see if we - * have chased our tail around. - */ - - lowest_lsn = xlog_get_lowest_lsn(log); - if (lowest_lsn && - XFS_LSN_CMP(lowest_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { - iclog = iclog->ic_next; - continue; /* Leave this iclog for - * another thread */ - } - - iclog->ic_state = XLOG_STATE_CALLBACK; - - - /* - * Completion of a iclog IO does not imply that - * a transaction has completed, as transactions - * can be large enough to span many iclogs. We - * cannot change the tail of the log half way - * through a transaction as this may be the only - * transaction in the log and moving th etail to - * point to the middle of it will prevent - * recovery from finding the start of the - * transaction. Hence we should only update the - * last_sync_lsn if this iclog contains - * transaction completion callbacks on it. - * - * We have to do this before we drop the - * icloglock to ensure we are the only one that - * can update it. - */ - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - if (!list_empty_careful(&iclog->ic_callbacks)) - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); - - } else - ioerrors++; - - spin_unlock(&log->l_icloglock); - - /* - * Keep processing entries in the callback list until - * we come around and it is empty. We need to - * atomically see that the list is empty and change the - * state to DIRTY so that we don't miss any more - * callbacks being added. - */ - spin_lock(&iclog->ic_callback_lock); - while (!list_empty(&iclog->ic_callbacks)) { - LIST_HEAD(tmp); - - list_splice_init(&iclog->ic_callbacks, &tmp); - - spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp, aborted); - spin_lock(&iclog->ic_callback_lock); - } - - loopdidcallbacks++; - funcdidcallbacks++; - - spin_lock(&log->l_icloglock); - spin_unlock(&iclog->ic_callback_lock); - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) - iclog->ic_state = XLOG_STATE_DIRTY; - - /* - * Transition from DIRTY to ACTIVE if applicable. - * NOP if STATE_IOERROR. - */ - xlog_state_clean_log(log); - - /* wake up threads waiting in xfs_log_force() */ - wake_up_all(&iclog->ic_force_wait); + cycled_icloglock = true; + xlog_state_do_iclog_callbacks(log, iclog, aborted); + xlog_state_clean_iclog(log, iclog); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2776,50 +2891,13 @@ xlog_state_do_callback( "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } - } while (!ioerrors && loopdidcallbacks); + } while (!ioerror && cycled_icloglock); -#ifdef DEBUG - /* - * Make one last gasp attempt to see if iclogs are being left in limbo. 
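/*
 * Aside (standalone sketch of the pattern; the demo_* names are invented):
 * the rewritten xlog_state_do_callback() above re-walks the whole iclog ring
 * whenever running callbacks forced it to cycle the icloglock, because the
 * ring may have changed while the lock was dropped:
 */
struct demo_iclog {
	struct demo_iclog	*next;
	bool			need_callbacks;
};

static void demo_run_ring_callbacks(struct demo_iclog *head, spinlock_t *lock)
{
	bool cycled;

	spin_lock(lock);
	do {
		struct demo_iclog *ic = head;

		cycled = false;
		do {
			if (ic->need_callbacks) {
				ic->need_callbacks = false;
				cycled = true;
				/* slow callback work runs unlocked */
				spin_unlock(lock);
				spin_lock(lock);
			}
			ic = ic->next;
		} while (ic != head);
	} while (cycled);	/* the lock was dropped: scan once more */
	spin_unlock(lock);
}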
- * If the above loop finds an iclog earlier than the current iclog and - * in one of the syncing states, the current iclog is put into - * DO_CALLBACK and the callbacks are deferred to the completion of the - * earlier iclog. Walk the iclogs in order and make sure that no iclog - * is in DO_CALLBACK unless an earlier iclog is in one of the syncing - * states. - * - * Note that SYNCING|IOABORT is a valid state so we cannot just check - * for ic_state == SYNCING. - */ - if (funcdidcallbacks) { - first_iclog = iclog = log->l_iclog; - do { - ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); - /* - * Terminate the loop if iclogs are found in states - * which will cause other threads to clean up iclogs. - * - * SYNCING - i/o completion will go through logs - * DONE_SYNC - interrupt thread should be waiting for - * l_icloglock - * IOERROR - give up hope all ye who enter here - */ - if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state & XLOG_STATE_SYNCING || - iclog->ic_state == XLOG_STATE_DONE_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR ) - break; - iclog = iclog->ic_next; - } while (first_iclog != iclog); - } -#endif + if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || + log->l_iclog->ic_state == XLOG_STATE_IOERROR) + wake_up_all(&log->l_flush_wait); - if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) - wake = 1; spin_unlock(&log->l_icloglock); - - if (wake) - wake_up_all(&log->l_flush_wait); } @@ -2845,8 +2923,6 @@ xlog_state_done_syncing( spin_lock(&log->l_icloglock); - ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || - iclog->ic_state == XLOG_STATE_IOERROR); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* @@ -2855,8 +2931,10 @@ xlog_state_done_syncing( * and none should ever be attempted to be written to disk * again. */ - if (iclog->ic_state != XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_SYNCING) iclog->ic_state = XLOG_STATE_DONE_SYNC; + else + ASSERT(iclog->ic_state == XLOG_STATE_IOERROR); /* * Someone could be sleeping prior to writing out the next @@ -2865,7 +2943,7 @@ xlog_state_done_syncing( */ wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); - xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ + xlog_state_do_callback(log, aborted); /* also cleans log */ } /* xlog_state_done_syncing */ @@ -2899,7 +2977,6 @@ xlog_state_get_iclog_space( int log_offset; xlog_rec_header_t *head; xlog_in_core_t *iclog; - int error; restart: spin_lock(&log->l_icloglock); @@ -2948,24 +3025,22 @@ restart: * can fit into remaining data section. */ if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + int error = 0; + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); /* - * If I'm the only one writing to this iclog, sync it to disk. - * We need to do an atomic compare and decrement here to avoid - * racing with concurrent atomic_dec_and_lock() calls in + * If we are the only one writing to this iclog, sync it to + * disk. We need to do an atomic compare and decrement here to + * avoid racing with concurrent atomic_dec_and_lock() calls in * xlog_state_release_iclog() when there is more than one * reference to the iclog. 
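/*
 * Aside (behavioural note on the compare-and-decrement described above):
 * atomic_add_unless(v, -1, 1) decrements v only when it is not already 1 and
 * returns true if it did, so a false return means "we would have been the
 * last holder". In miniature:
 */
static void demo_add_unless_semantics(void)
{
	atomic_t cnt = ATOMIC_INIT(2);

	/* count is 2: decremented to 1, returns true */
	WARN_ON(!atomic_add_unless(&cnt, -1, 1));
	/* count is 1: left untouched, returns false - take the slow path */
	WARN_ON(atomic_add_unless(&cnt, -1, 1));
}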
*/ - if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { - /* we are the only one */ - spin_unlock(&log->l_icloglock); + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) error = xlog_state_release_iclog(log, iclog); - if (error) - return error; - } else { - spin_unlock(&log->l_icloglock); - } + spin_unlock(&log->l_icloglock); + if (error) + return error; goto restart; } @@ -3077,60 +3152,6 @@ xlog_ungrant_log_space( } /* - * Flush iclog to disk if this is the last reference to the given iclog and - * the WANT_SYNC bit is set. - * - * When this function is entered, the iclog is not necessarily in the - * WANT_SYNC state. It may be sitting around waiting to get filled. - * - * - */ -STATIC int -xlog_state_release_iclog( - struct xlog *log, - struct xlog_in_core *iclog) -{ - int sync = 0; /* do we sync? */ - - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - - ASSERT(atomic_read(&iclog->ic_refcnt) > 0); - if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) - return 0; - - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_WANT_SYNC); - - if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { - /* update tail before writing to iclog */ - xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); - sync++; - iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); - xlog_verify_tail_lsn(log, iclog, tail_lsn); - /* cycle incremented when incrementing curr_block */ - } - spin_unlock(&log->l_icloglock); - - /* - * We let the log lock go, so it's possible that we hit a log I/O - * error or some other SHUTDOWN condition that marks the iclog - * as XLOG_STATE_IOERROR before the bwrite. However, we know that - * this iclog has consistent data, so we ignore IOERROR - * flags after this point. - */ - if (sync) - xlog_sync(log, iclog); - return 0; -} /* xlog_state_release_iclog */ - - -/* * This routine will mark the current iclog in the ring as WANT_SYNC * and move the current iclog pointer to the next iclog in the ring. * When this routine is called from xlog_state_get_iclog_space(), the @@ -3223,7 +3244,7 @@ xfs_log_force( spin_lock(&log->l_icloglock); iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; if (iclog->ic_state == XLOG_STATE_DIRTY || @@ -3253,12 +3274,9 @@ xfs_log_force( atomic_inc(&iclog->ic_refcnt); lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); - if (xlog_state_release_iclog(log, iclog)) - return -EIO; + goto out_error; - spin_lock(&log->l_icloglock); if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || iclog->ic_state == XLOG_STATE_DIRTY) goto out_unlock; @@ -3283,11 +3301,11 @@ xfs_log_force( if (!(flags & XFS_LOG_SYNC)) goto out_unlock; - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) return -EIO; return 0; @@ -3312,7 +3330,7 @@ __xfs_log_force_lsn( spin_lock(&log->l_icloglock); iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { @@ -3341,10 +3359,8 @@ __xfs_log_force_lsn( * will go out then. 
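/*
 * Aside (usage sketch; the demo_* names are invented): xlog_wait(), as used
 * in the force paths above, enqueues the task on the wait queue while the
 * caller still holds the spinlock, then drops the lock before sleeping and
 * does NOT retake it. Waker and waiter therefore pair up like this:
 */
static DEFINE_SPINLOCK(demo_lock);
static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static bool demo_done;

static void demo_waker(void)
{
	spin_lock(&demo_lock);
	demo_done = true;	/* publish the condition under the lock... */
	wake_up_all(&demo_wq);	/* ...and wake while still holding it */
	spin_unlock(&demo_lock);
}

static void demo_waiter(void)
{
	spin_lock(&demo_lock);
	if (!demo_done) {
		xlog_wait(&demo_wq, &demo_lock); /* returns with lock dropped */
		return;
	}
	spin_unlock(&demo_lock);
}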
*/ if (!already_slept && - (iclog->ic_prev->ic_state & - (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - + (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_prev->ic_write_wait, @@ -3353,24 +3369,23 @@ __xfs_log_force_lsn( } atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) - return -EIO; + goto out_error; if (log_flushed) *log_flushed = 1; - spin_lock(&log->l_icloglock); } if (!(flags & XFS_LOG_SYNC) || - (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) + (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY)) goto out_unlock; - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) return -EIO; return 0; @@ -3433,8 +3448,8 @@ xlog_state_want_sync( if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); } else { - ASSERT(iclog->ic_state & - (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); } } @@ -3455,7 +3470,7 @@ xfs_log_ticket_put( { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) - kmem_zone_free(xfs_log_ticket_zone, ticket); + kmem_cache_free(xfs_log_ticket_zone, ticket); } xlog_ticket_t * @@ -3811,7 +3826,7 @@ xlog_state_ioerror( xlog_in_core_t *iclog, *ic; iclog = log->l_iclog; - if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { + if (iclog->ic_state != XLOG_STATE_IOERROR) { /* * Mark all the incore logs IOERROR. * From now on, no log flushes will result. @@ -3871,7 +3886,7 @@ xfs_log_force_umount( * Somebody could've already done the hard work for us. * No need to get locks for this. */ - if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { + if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) { ASSERT(XLOG_FORCED_SHUTDOWN(log)); return 1; } @@ -3919,22 +3934,11 @@ xfs_log_force_umount( * item committed callback functions will do this again under lock to * avoid races. */ + spin_lock(&log->l_cilp->xc_push_lock); wake_up_all(&log->l_cilp->xc_commit_wait); - xlog_state_do_callback(log, true, NULL); + spin_unlock(&log->l_cilp->xc_push_lock); + xlog_state_do_callback(log, true); -#ifdef XFSERRORDEBUG - { - xlog_in_core_t *iclog; - - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - do { - ASSERT(iclog->ic_callback == 0); - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - spin_unlock(&log->l_icloglock); - } -#endif /* return non-zero if log IOERROR transition had already happened */ return retval; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index fa5602d0fd7f..48435cf2aa16 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -38,7 +38,7 @@ xlog_cil_ticket_alloc( struct xlog_ticket *tic; tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, - KM_SLEEP|KM_NOFS); + KM_NOFS); /* * set the current reservation to zero so we know to steal the basic @@ -179,14 +179,14 @@ xlog_cil_alloc_shadow_bufs( /* * We free and allocate here as a realloc would copy - * unecessary data. We don't use kmem_zalloc() for the + * unnecessary data. 
We don't use kmem_zalloc() for the * same reason - we don't need to zero the data area in * the buffer, only the log vector header and the iovec * storage. */ kmem_free(lip->li_lv_shadow); - lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); + lv = kmem_alloc_large(buf_size, KM_NOFS); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; @@ -660,7 +660,7 @@ xlog_cil_push( if (!cil) return 0; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); down_write(&cil->xc_ctx_lock); @@ -682,7 +682,7 @@ xlog_cil_push( } - /* check for a previously pushed seqeunce */ + /* check for a previously pushed sequence */ if (push_seq < cil->xc_ctx->sequence) { spin_unlock(&cil->xc_push_lock); goto out_skip; @@ -847,7 +847,7 @@ restart: goto out_abort; spin_lock(&commit_iclog->ic_callback_lock); - if (commit_iclog->ic_state & XLOG_STATE_IOERROR) { + if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { spin_unlock(&commit_iclog->ic_callback_lock); goto out_abort; } @@ -1179,11 +1179,11 @@ xlog_cil_init( struct xfs_cil *cil; struct xfs_cil_ctx *ctx; - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) return -ENOMEM; - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL); if (!ctx) { kmem_free(cil); return -ENOMEM; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b880c23cb6e4..b192c5a9f9fd 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -40,17 +40,15 @@ static inline uint xlog_get_client_id(__be32 i) /* * In core log state */ -#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ -#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ -#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ -#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ -#define XLOG_STATE_DO_CALLBACK \ - 0x0010 /* Process callback functions */ -#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ -#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ -#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ -#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ -#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ +enum xlog_iclog_state { + XLOG_STATE_ACTIVE, /* Current IC log being written to */ + XLOG_STATE_WANT_SYNC, /* Want to sync this iclog; no more writes */ + XLOG_STATE_SYNCING, /* This IC log is syncing */ + XLOG_STATE_DONE_SYNC, /* Done syncing to disk */ + XLOG_STATE_CALLBACK, /* Callback functions now */ + XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */ + XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */ +}; /* * Flags to log ticket @@ -179,8 +177,6 @@ typedef struct xlog_ticket { * - ic_next is the pointer to the next iclog in the ring. * - ic_log is a pointer back to the global log structure. * - ic_size is the full size of the log buffer, minus the cycle headers. - * - ic_io_size is the size of the currently pending log buffer write, which - * might be smaller than ic_size * - ic_offset is the current number of bytes written to in this iclog. * - ic_refcnt is bumped when someone is writing to the log. * - ic_state is the state of the iclog. 
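/*
 * Aside (illustrative; xlog_iclog_is_idle is an invented helper, not part of
 * this series): once ic_state is an enum, an iclog is in exactly one state,
 * which is why every "state & (A|B)" bitmask test in this series becomes one
 * or two equality checks. A multi-state test is now idiomatically a switch:
 */
static inline bool xlog_iclog_is_idle(enum xlog_iclog_state state)
{
	switch (state) {
	case XLOG_STATE_ACTIVE:
	case XLOG_STATE_DIRTY:
		return true;	/* nothing in flight for this iclog */
	default:
		return false;
	}
}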
@@ -205,9 +201,8 @@ typedef struct xlog_in_core { struct xlog_in_core *ic_prev; struct xlog *ic_log; u32 ic_size; - u32 ic_io_size; u32 ic_offset; - unsigned short ic_state; + enum xlog_iclog_state ic_state; char *ic_datap; /* pointer to iclog data */ /* Callback structures need their own cacheline */ @@ -399,8 +394,6 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG void *l_iclog_bak[XLOG_MAX_ICLOGS]; - /* log record crc error injection factor */ - uint32_t l_badcrc_factor; #endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; @@ -542,7 +535,11 @@ xlog_cil_force(struct xlog *log) * by a spinlock. This matches the semantics of all the wait queues used in the * log code. */ -static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +static inline void +xlog_wait( + struct wait_queue_head *wq, + struct spinlock *lock) + __releases(lock) { DECLARE_WAITQUEUE(wait, current); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13d1d3e95b88..25cfc85dbaa7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -97,14 +97,15 @@ xlog_alloc_buffer( struct xlog *log, int nbblks) { + int align_mask = xfs_buftarg_dma_alignment(log->l_targ); + /* * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. */ - if (!xlog_verify_bno(log, 0, nbblks)) { + if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) { xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return NULL; } @@ -125,7 +126,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); + return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO); } /* @@ -150,11 +151,10 @@ xlog_do_io( { int error; - if (!xlog_verify_bno(log, blk_no, nbblks)) { + if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) { xfs_warn(log->l_mp, "Invalid log block/length (0x%llx, 0x%x) for buffer", blk_no, nbblks); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return -EFSCORRUPTED; } @@ -242,19 +242,17 @@ xlog_header_check_recover( * (XLOG_FMT_UNKNOWN). This stops us from trying to recover * a dirty log created in IRIX. */ - if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { + if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) { xfs_warn(mp, "dirty log written in incompatible format - can't recover"); xlog_header_check_dump(mp, head); - XFS_ERROR_REPORT("xlog_header_check_recover(1)", - XFS_ERRLEVEL_HIGH, mp); return -EFSCORRUPTED; - } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + } + if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid, + &head->h_fs_uuid))) { xfs_warn(mp, "dirty log entry has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); - XFS_ERROR_REPORT("xlog_header_check_recover(2)", - XFS_ERRLEVEL_HIGH, mp); return -EFSCORRUPTED; } return 0; @@ -277,11 +275,10 @@ xlog_header_check_mount( * by IRIX and continue. 
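/*
 * Aside (approximate; the real macro lives in fs/xfs/xfs_error.h): the
 * XFS_IS_CORRUPT() checks used throughout the recovery hunks here evaluate
 * an expression and, when it fires, log a corruption report against the
 * mount before yielding true, so call sites collapse to a single
 * "return -EFSCORRUPTED". In spirit:
 *
 *	#define XFS_IS_CORRUPT(mp, expr)				\
 *		(unlikely(expr) ?					\
 *		 (xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp),	\
 *				NULL, 0, __FILE__, __LINE__,		\
 *				__this_address), true) : false)
 */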
*/ xfs_warn(mp, "null uuid in log - IRIX style log"); - } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid, + &head->h_fs_uuid))) { xfs_warn(mp, "log has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); - XFS_ERROR_REPORT("xlog_header_check_mount", - XFS_ERRLEVEL_HIGH, mp); return -EFSCORRUPTED; } return 0; @@ -297,7 +294,7 @@ xlog_recover_iodone( * this during recovery. One strike! */ if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); } } @@ -469,7 +466,7 @@ xlog_find_verify_log_record( xfs_warn(log->l_mp, "Log inconsistent (didn't find previous header)"); ASSERT(0); - error = -EIO; + error = -EFSCORRUPTED; goto out; } @@ -1345,10 +1342,11 @@ xlog_find_tail( error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer, &rhead_blk, &rhead, &wrapped); if (error < 0) - return error; + goto done; if (!error) { xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - return -EIO; + error = -EFSCORRUPTED; + goto done; } *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); @@ -1697,11 +1695,10 @@ xlog_clear_stale_blocks( * the distance from the beginning of the log to the * tail. */ - if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { - XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, + head_block < tail_block || + head_block >= log->l_logBBsize)) return -EFSCORRUPTED; - } tail_distance = tail_block + (log->l_logBBsize - head_block); } else { /* @@ -1709,11 +1706,10 @@ xlog_clear_stale_blocks( * so the distance from the head to the tail is just * the tail block minus the head block. */ - if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ - XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, + head_block >= tail_block || + head_cycle != tail_cycle + 1)) return -EFSCORRUPTED; - } tail_distance = tail_block - head_block; } @@ -1938,6 +1934,12 @@ xlog_recover_buffer_pass1( struct list_head *bucket; struct xfs_buf_cancel *bcp; + if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { + xfs_err(log->l_mp, "bad buffer log item size (%d)", + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + /* * If this isn't a cancel buffer item, then just return. */ @@ -1960,7 +1962,7 @@ xlog_recover_buffer_pass1( } } - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); bcp->bc_blkno = buf_f->blf_blkno; bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; @@ -2133,13 +2135,11 @@ xlog_recover_do_inode_buffer( */ logged_nextp = item->ri_buf[item_index].i_addr + next_unlinked_offset - reg_buf_offset; - if (unlikely(*logged_nextp == 0)) { + if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { xfs_alert(mp, "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). 
" "Trying to replay bad (0) inode di_next_unlinked field.", item, bp); - XFS_ERROR_REPORT("xlog_recover_do_inode_buf", - XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -2574,6 +2574,7 @@ xlog_recover_do_reg_buffer( int bit; int nbits; xfs_failaddr_t fa; + const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); @@ -2616,7 +2617,7 @@ xlog_recover_do_reg_buffer( "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { + if (item->ri_buf[i].i_len < size_disk_dquot) { xfs_alert(mp, "XFS: dquot too small (%d) in %s.", item->ri_buf[i].i_len, __func__); @@ -2744,15 +2745,10 @@ xlog_recover_buffer_pass2( if (buf_f->blf_flags & XFS_BLF_INODE_BUF) buf_flags |= XBF_UNMAPPED; - bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, NULL); - if (!bp) - return -ENOMEM; - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); - goto out_release; - } + error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, &bp, NULL); + if (error) + return error; /* * Recover the buffer only if we get an LSN from it and it's less than @@ -2930,7 +2926,7 @@ xlog_recover_inode_pass2( if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; } else { - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) @@ -2949,17 +2945,10 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, - &xfs_inode_buf_ops); - if (!bp) { - error = -ENOMEM; + error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + 0, &bp, &xfs_inode_buf_ops); + if (error) goto error; - } - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); - goto out_release; - } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2967,22 +2956,18 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! 
*/ - if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) { + if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); - XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", - XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_release; } ldip = item->ri_buf[1].i_addr; - if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) { + if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", __func__, item, in_f->ilf_ino); - XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", - XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_release; } @@ -3164,7 +3149,7 @@ xlog_recover_inode_pass2( default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - error = -EIO; + error = -EFSCORRUPTED; goto out_release; } } @@ -3245,12 +3230,12 @@ xlog_recover_dquot_pass2( recddq = item->ri_buf[1].i_addr; if (recddq == NULL) { xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); - return -EIO; + return -EFSCORRUPTED; } - if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { xfs_alert(log->l_mp, "dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); - return -EIO; + return -EFSCORRUPTED; } /* @@ -3277,7 +3262,7 @@ xlog_recover_dquot_pass2( if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", dq_f->qlf_id, fa); - return -EIO; + return -EFSCORRUPTED; } ASSERT(dq_f->qlf_len == 1); @@ -3535,6 +3520,7 @@ xfs_cui_copy_format( memcpy(dst_cui_fmt, src_cui_fmt, len); return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -3599,8 +3585,10 @@ xlog_recover_cud_pass2( struct xfs_ail *ailp = log->l_ailp; cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } cui_id = cud_formatp->cud_cui_id; /* @@ -3652,6 +3640,7 @@ xfs_bui_copy_format( memcpy(dst_bui_fmt, src_bui_fmt, len); return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -3675,8 +3664,10 @@ xlog_recover_bui_pass2( bui_formatp = item->ri_buf[0].i_addr; - if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } buip = xfs_bui_init(mp); error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); if (error) { @@ -3718,8 +3709,10 @@ xlog_recover_bud_pass2( struct xfs_ail *ailp = log->l_ailp; bud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } bui_id = bud_formatp->bud_bui_id; /* @@ -4016,7 +4009,7 @@ xlog_recover_commit_pass1( xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } } @@ -4064,7 +4057,7 @@ xlog_recover_commit_pass2( xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } } @@ -4161,7 +4154,7 @@ xlog_recover_add_item( { xlog_recover_item_t *item; - item = 
kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + item = kmem_zalloc(sizeof(xlog_recover_item_t), 0); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -4185,7 +4178,7 @@ xlog_recover_add_to_cont_trans( ASSERT(len <= sizeof(struct xfs_trans_header)); if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); - return -EIO; + return -EFSCORRUPTED; } xlog_recover_add_item(&trans->r_itemq); @@ -4201,7 +4194,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); + ptr = kmem_realloc(old_ptr, len + old_len, 0); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -4241,13 +4234,13 @@ xlog_recover_add_to_trans( xfs_warn(log->l_mp, "%s: bad header magic number", __func__); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } /* @@ -4261,7 +4254,7 @@ xlog_recover_add_to_trans( return 0; } - ptr = kmem_alloc(len, KM_SLEEP); + ptr = kmem_alloc(len, 0); memcpy(ptr, dp, len); in_f = (struct xfs_inode_log_format *)ptr; @@ -4283,15 +4276,24 @@ xlog_recover_add_to_trans( in_f->ilf_size); ASSERT(0); kmem_free(ptr); - return -EIO; + return -EFSCORRUPTED; } item->ri_total = in_f->ilf_size; item->ri_buf = kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), - KM_SLEEP); + 0); + } + + if (item->ri_total <= item->ri_cnt) { + xfs_warn(log->l_mp, + "log item region count (%d) overflowed size (%d)", + item->ri_cnt, item->ri_total); + ASSERT(0); + kmem_free(ptr); + return -EFSCORRUPTED; } - ASSERT(item->ri_total > item->ri_cnt); + /* Description region is ri_buf[0] */ item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; @@ -4378,7 +4380,7 @@ xlog_recovery_process_trans( default: xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); ASSERT(0); - error = -EIO; + error = -EFSCORRUPTED; break; } if (error || freeit) @@ -4423,7 +4425,7 @@ xlog_recover_ophdr_to_trans( * This is a new transaction so allocate a new recovery container to * hold the recovery ops that will follow. */ - trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); + trans = kmem_zalloc(sizeof(struct xlog_recover), 0); trans->r_log_tid = tid; trans->r_lsn = be64_to_cpu(rhead->h_lsn); INIT_LIST_HEAD(&trans->r_itemq); @@ -4458,7 +4460,7 @@ xlog_recover_process_ophdr( xfs_warn(log->l_mp, "%s: bad clientid 0x%x", __func__, ohead->oh_clientid); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } /* @@ -4468,7 +4470,7 @@ xlog_recover_process_ophdr( if (dp + len > end) { xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); WARN_ON(1); - return -EIO; + return -EFSCORRUPTED; } trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); @@ -5022,16 +5024,27 @@ xlog_recover_process_one_iunlink( } /* - * xlog_iunlink_recover + * Recover AGI unlinked lists + * + * This is called during recovery to process any inodes which we unlinked but + * not freed when the system crashed. These inodes will be on the lists in the + * AGI blocks. What we do here is scan all the AGIs and fully truncate and free + * any inodes found on the lists. Each inode is removed from the lists when it + * has been fully truncated and is freed. The freeing of the inode and its + * removal from the list must be atomic. 
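/*
 * Aside (generic sketch; the demo helper is invented): the cond_resched()
 * added to the AGI bucket walk below is the stock way to bound scheduling
 * latency in a long CPU-bound loop, especially on non-preemptible kernels:
 */
static void demo_drain(struct list_head *todo)
{
	while (!list_empty(todo)) {
		struct list_head *item = todo->next;

		list_del_init(item);
		/* ... process one item ... */
		cond_resched();	/* yield if another task is due to run */
	}
}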
+ * + * If everything we touch in the agi processing loop is already in memory, this + * loop can hold the cpu for a long time. It runs without lock contention, + * memory allocation contention, the need to wait for IO, etc, and so will run + * until we either run out of inodes to process, run low on memory or we run out + * of log space. * - * This is called during recovery to process any inodes which - * we unlinked but not freed when the system crashed. These - * inodes will be on the lists in the AGI blocks. What we do - * here is scan all the AGIs and fully truncate and free any - * inodes found on the lists. Each inode is removed from the - * lists when it has been fully truncated and is freed. The - * freeing of the inode and its removal from the list must be - * atomic. + * This behaviour is bad for latency on single CPU and non-preemptible kernels, + * and can prevent other filesystem work (such as CIL pushes) from running. This + * can lead to deadlocks if the recovery process runs out of log reservation + * space. Hence we need to yield the CPU when there is other kernel work + * scheduled on this CPU to ensure other scheduled work can run without undue + * latency. */ STATIC void xlog_recover_process_iunlinks( @@ -5078,6 +5091,7 @@ xlog_recover_process_iunlinks( while (agino != NULLAGINO) { agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); + cond_resched(); } } xfs_buf_rele(agibp); @@ -5158,8 +5172,10 @@ xlog_recover_process( * If the filesystem is CRC enabled, this mismatch becomes a * fatal log corruption failure. */ - if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } } xlog_unpack_data(rhead, dp, log); @@ -5176,31 +5192,25 @@ xlog_valid_rec_header( { int hlen; - if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { - XFS_ERROR_REPORT("xlog_valid_rec_header(1)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, + rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) return -EFSCORRUPTED; - } - if (unlikely( - (!rhead->h_version || - (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { + if (XFS_IS_CORRUPT(log->l_mp, + (!rhead->h_version || + (be32_to_cpu(rhead->h_version) & + (~XLOG_VERSION_OKBITS))))) { xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", __func__, be32_to_cpu(rhead->h_version)); - return -EIO; + return -EFSCORRUPTED; } /* LR body must have data or it wouldn't have been written */ hlen = be32_to_cpu(rhead->h_len); - if (unlikely( hlen <= 0 || hlen > INT_MAX )) { - XFS_ERROR_REPORT("xlog_valid_rec_header(2)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX)) return -EFSCORRUPTED; - } - if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { - XFS_ERROR_REPORT("xlog_valid_rec_header(3)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, + blkno > log->l_logBBsize || blkno > INT_MAX)) return -EFSCORRUPTED; - } return 0; } @@ -5282,8 +5292,12 @@ xlog_do_recovery_pass( "invalid iclog size (%d bytes), using lsunit (%d bytes)", h_size, log->l_mp->m_logbsize); h_size = log->l_mp->m_logbsize; - } else - return -EFSCORRUPTED; + } else { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, + log->l_mp); + error = -EFSCORRUPTED; + goto bread_err1; + } } if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && @@ -5527,7 +5541,7 @@ xlog_do_log_recovery( */ log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
sizeof(struct list_head), - KM_SLEEP); + 0); for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); @@ -5613,7 +5627,7 @@ xlog_do_recover( error = xfs_buf_submit(bp); if (error) { if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); ASSERT(0); } xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 9804efe525a9..e0f9d3b6abe9 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -20,8 +20,8 @@ __xfs_printk( const struct xfs_mount *mp, struct va_format *vaf) { - if (mp && mp->m_fsname) { - printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + if (mp && mp->m_super) { + printk("%sXFS (%s): %pV\n", level, mp->m_super->s_id, vaf); return; } printk("%sXFS: %pV\n", level, vaf); @@ -86,17 +86,25 @@ xfs_alert_tag( } void -asswarn(char *expr, char *file, int line) +asswarn( + struct xfs_mount *mp, + char *expr, + char *file, + int line) { - xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d", + xfs_warn(mp, "Assertion failed: %s, file: %s, line: %d", expr, file, line); WARN_ON(1); } void -assfail(char *expr, char *file, int line) +assfail( + struct xfs_mount *mp, + char *expr, + char *file, + int line) { - xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", + xfs_emerg(mp, "Assertion failed: %s, file: %s, line: %d", expr, file, line); if (xfs_globals.bug_on_assert) BUG(); @@ -105,7 +113,7 @@ assfail(char *expr, char *file, int line) } void -xfs_hex_dump(void *p, int length) +xfs_hex_dump(const void *p, int length) { print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 34447dca97d1..0b05e10995a0 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -57,9 +57,9 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) -extern void assfail(char *expr, char *f, int l); -extern void asswarn(char *expr, char *f, int l); +void assfail(struct xfs_mount *mp, char *expr, char *f, int l); +void asswarn(struct xfs_mount *mp, char *expr, char *f, int l); -extern void xfs_hex_dump(void *p, int length); +extern void xfs_hex_dump(const void *p, int length); #endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 322da6909290..56efe140c923 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -31,7 +31,7 @@ #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" - +#include "xfs_trace.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -82,7 +82,7 @@ xfs_uuid_mount( if (hole < 0) { xfs_uuid_table = kmem_realloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - KM_SLEEP); + 0); hole = xfs_uuid_table_size++; } xfs_uuid_table[hole] = *uuid; @@ -214,7 +214,7 @@ xfs_initialize_perag( spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { - BUG(); + WARN_ON_ONCE(1); spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); error = -EEXIST; @@ -360,108 +360,122 @@ release_buf: } /* - * Update alignment values based on mount options and sb values + * If the sunit/swidth change would move the precomputed root inode value, we + * must reject the ondisk change because repair will stumble over that. + * However, we allow the mount to proceed because we never rejected this + * combination before. Returns true to update the sb, false otherwise. 
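/*
 * Aside (worked example): mount options pass sunit/swidth in 512-byte basic
 * blocks (BBs). The validation code below first checks that BBTOB(x), i.e.
 * x << 9, the option expressed in bytes, is a whole number of fs blocks, and
 * then converts with XFS_BB_TO_FSBT(mp, bb) = bb >> (sb_blocklog - BBSHIFT).
 * For a 4096-byte block size (sb_blocklog = 12) and sunit = 128 BBs (64 KiB):
 *
 *	m_dalign = 128 >> (12 - 9) = 16 fs blocks = 16 * 4096 bytes = 64 KiB
 */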
+ */ +static inline int +xfs_check_new_dalign( + struct xfs_mount *mp, + int new_dalign, + bool *update_sb) +{ + struct xfs_sb *sbp = &mp->m_sb; + xfs_ino_t calc_ino; + + calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); + trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); + + if (sbp->sb_rootino == calc_ino) { + *update_sb = true; + return 0; + } + + xfs_warn(mp, +"Cannot change stripe alignment; would require moving root inode."); + + /* + * XXX: Next time we add a new incompat feature, this should start + * returning -EINVAL to fail the mount. Until then, spit out a warning + * that we're ignoring the administrator's instructions. + */ + xfs_warn(mp, "Skipping superblock stripe alignment update."); + *update_sb = false; + return 0; +} + +/* + * If we were provided with new sunit/swidth values as mount options, make sure + * that they pass basic alignment and superblock feature checks, and convert + * them into the same units (FSB) that everything else expects. This step + * /must/ be done before computing the inode geometry. */ STATIC int -xfs_update_alignment(xfs_mount_t *mp) +xfs_validate_new_dalign( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); + if (mp->m_dalign == 0) + return 0; - if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + mp->m_sb.sb_blocksize); + return -EINVAL; + } else { /* - * If stripe unit and stripe width are not multiples - * of the fs blocksize turn off alignment. + * Convert the stripe unit and width to FSBs. */ - if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || - (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, - "alignment check failed: sunit/swidth vs. blocksize(%d)", - sbp->sb_blocksize); + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. 
agsize(%d)", - sbp->sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, sbp->sb_blocksize); - return -EINVAL; - } - } - - /* - * Update superblock with new values - * and log changes - */ - if (xfs_sb_version_hasdalign(sbp)) { - if (sbp->sb_unit != mp->m_dalign) { - sbp->sb_unit = mp->m_dalign; - mp->m_update_sb = true; - } - if (sbp->sb_width != mp->m_swidth) { - sbp->sb_width = mp->m_swidth; - mp->m_update_sb = true; - } + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { xfs_warn(mp, - "cannot change alignment: superblock does not support data alignment"); + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } - } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && - xfs_sb_version_hasdalign(&mp->m_sb)) { - mp->m_dalign = sbp->sb_unit; - mp->m_swidth = sbp->sb_width; + } + + if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + xfs_warn(mp, +"cannot change alignment: superblock does not support data alignment"); + return -EINVAL; } return 0; } -/* - * Set the default minimum read and write sizes unless - * already specified in a mount option. - * We use smaller I/O sizes when the file system - * is being used for NFS service (wsync mount option). - */ -STATIC void -xfs_set_rw_sizes(xfs_mount_t *mp) +/* Update alignment values based on mount options and sb values. */ +STATIC int +xfs_update_alignment( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); - int readio_log, writeio_log; + struct xfs_sb *sbp = &mp->m_sb; - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { - if (mp->m_flags & XFS_MOUNT_WSYNC) { - readio_log = XFS_WSYNC_READIO_LOG; - writeio_log = XFS_WSYNC_WRITEIO_LOG; - } else { - readio_log = XFS_READIO_LOG_LARGE; - writeio_log = XFS_WRITEIO_LOG_LARGE; - } - } else { - readio_log = mp->m_readio_log; - writeio_log = mp->m_writeio_log; - } + if (mp->m_dalign) { + bool update_sb; + int error; - if (sbp->sb_blocklog > readio_log) { - mp->m_readio_log = sbp->sb_blocklog; - } else { - mp->m_readio_log = readio_log; - } - mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog); - if (sbp->sb_blocklog > writeio_log) { - mp->m_writeio_log = sbp->sb_blocklog; - } else { - mp->m_writeio_log = writeio_log; + if (sbp->sb_unit == mp->m_dalign && + sbp->sb_width == mp->m_swidth) + return 0; + + error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); + if (error || !update_sb) + return error; + + sbp->sb_unit = mp->m_dalign; + sbp->sb_width = mp->m_swidth; + mp->m_update_sb = true; + } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && + xfs_sb_version_hasdalign(&mp->m_sb)) { + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; } - mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); + + return 0; } /* @@ -687,12 +701,12 @@ xfs_mountfs( } /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. + * If we were given new sunit/swidth options, do some basic validation + * checks and convert the incore dalign and swidth values to the + * same units (FSB) that everything else uses. This /must/ happen + * before computing the inode geometry. 
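/*
 * Aside (worked example, assuming the old XFS_WRITEIO_LOG_LARGE default of
 * 16 carries over as the initial m_allocsize_log): the xfs_set_rw_sizes()
 * replacement in the hunk below folds the separate read/write sizes into a
 * single preferred allocation size. With a 4096-byte block size:
 *
 *	m_allocsize_log    = max(sb_blocklog, 16) = max(12, 16) = 16
 *	m_allocsize_blocks = 1U << (16 - 12)      = 16 fs blocks (64 KiB)
 */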
*/ - error = xfs_update_alignment(mp); + error = xfs_validate_new_dalign(mp); if (error) goto out; @@ -703,10 +717,22 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + /* + * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks + * is NOT aligned turn off m_dalign since allocator alignment is within + * an ag, therefore ag has to be aligned at stripe boundary. Note that + * we must compute the free space and rmap btree geometry before doing + * this. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, + NULL, mp->m_super->s_id); if (error) goto out; @@ -728,9 +754,12 @@ xfs_mountfs( goto out_remove_errortag; /* - * Set the minimum read and write sizes + * Update the preferred write size based on the information from the + * on-disk superblock. */ - xfs_set_rw_sizes(mp); + mp->m_allocsize_log = + max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log); + mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog); /* set the low space thresholds for dynamic preallocation */ xfs_set_low_space_thresholds(mp); @@ -796,9 +825,8 @@ xfs_mountfs( goto out_free_dir; } - if (!sbp->sb_logblocks) { + if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { xfs_warn(mp, "no log defined"); - XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_free_perag; } @@ -836,12 +864,10 @@ xfs_mountfs( ASSERT(rip != NULL); - if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) { + if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); - XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, - mp); error = -EFSCORRUPTED; goto out_rele_rip; } @@ -1277,7 +1303,7 @@ xfs_mod_fdblocks( printk_once(KERN_WARNING "Filesystem \"%s\": reserve blocks depleted! " "Consider increasing reserve pool size.", - mp->m_fsname); + mp->m_super->s_id); fdblocks_enospc: spin_unlock(&mp->m_sb_lock); return -ENOSPC; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 4adb6837439a..88ab09ed29e7 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -9,10 +9,8 @@ struct xlog; struct xfs_inode; struct xfs_mru_cache; -struct xfs_nameops; struct xfs_ail; struct xfs_quotainfo; -struct xfs_dir_ops; struct xfs_da_geometry; /* dynamic preallocation free space thresholds, 5% down to 1% */ @@ -59,7 +57,6 @@ struct xfs_error_cfg { typedef struct xfs_mount { struct super_block *m_super; - xfs_tid_t m_tid; /* next unused tid for fs */ /* * Bitsets of per-fs metadata that have been checked and/or are sick. @@ -89,8 +86,6 @@ typedef struct xfs_mount { struct percpu_counter m_delalloc_blks; struct xfs_buf *m_sb_bp; /* buffer for superblock */ - char *m_fsname; /* filesystem name */ - int m_fsname_len; /* strlen of fs name */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ int m_bsize; /* fs logical block size */ @@ -98,10 +93,8 @@ typedef struct xfs_mount { xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ spinlock_t m_agirotor_lock;/* .. 
and lock protecting it */ xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - uint m_readio_log; /* min read size log bytes */ - uint m_readio_blocks; /* min read size blocks */ - uint m_writeio_log; /* min write size log bytes */ - uint m_writeio_blocks; /* min write size blocks */ + uint m_allocsize_log;/* min write size log bytes */ + uint m_allocsize_blocks; /* min write size blocks */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ @@ -159,10 +152,6 @@ typedef struct xfs_mount { int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ - const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ - const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ - const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ - uint m_chsize; /* size of next field */ atomic_t m_active_trans; /* number trans frozen */ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ @@ -229,7 +218,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ #define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ -#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ +#define XFS_MOUNT_ALLOCSIZE (1ULL << 12) /* specified allocation size */ #define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */ #define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */ #define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ @@ -238,7 +227,7 @@ typedef struct xfs_mount { * allocation */ #define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ #define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ -#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred +#define XFS_MOUNT_LARGEIO (1ULL << 22) /* report large preferred * I/O size in stat() */ #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams allocator */ @@ -246,13 +235,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ - -/* - * Default minimum read and write sizes. - */ -#define XFS_READIO_LOG_LARGE 16 -#define XFS_WRITEIO_LOG_LARGE 16 - /* * Max and min values for mount-option defined I/O * preallocation sizes. @@ -260,37 +242,6 @@ typedef struct xfs_mount { #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT -/* - * Synchronous read and write sizes. This should be - * better for NFSv2 wsync filesystems. - */ -#define XFS_WSYNC_READIO_LOG 15 /* 32k */ -#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */ - -/* - * Allow large block sizes to be reported to userspace programs if the - * "largeio" mount option is used. - * - * If compatibility mode is specified, simply return the basic unit of caching - * so that we don't get inefficient read/modify/write I/O from user apps. - * Otherwise.... - * - * If the underlying volume is a stripe, then return the stripe width in bytes - * as the recommended I/O size. It is not a stripe and we've set a default - * buffered I/O size, return that, otherwise return the compat default. - */ -static inline unsigned long -xfs_preferred_iosize(xfs_mount_t *mp) -{ - if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE) - return PAGE_SIZE; - return (mp->m_swidth ? 
- (mp->m_swidth << mp->m_sb.sb_blocklog) : - ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ? - (1 << (int)max(mp->m_readio_log, mp->m_writeio_log)) : - PAGE_SIZE)); -} - #define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) @@ -327,13 +278,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* per-AG block reservation data structures*/ -enum xfs_ag_resv_type { - XFS_AG_RESV_NONE = 0, - XFS_AG_RESV_AGFL, - XFS_AG_RESV_METADATA, - XFS_AG_RESV_RMAPBT, -}; - struct xfs_ag_resv { /* number of blocks originally reserved here */ xfs_extlen_t ar_orig_reserved; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 74738813f60d..a06661dac5be 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -333,12 +333,12 @@ xfs_mru_cache_create( if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) return -EINVAL; - if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + if (!(mru = kmem_zalloc(sizeof(*mru), 0))) return -ENOMEM; /* An extra list is needed to avoid reaping up to a grp_time early. */ mru->grp_count = grp_count + 1; - mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0); if (!mru->lists) { err = -ENOMEM; diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index b6701b4f59a9..5f04d8a5ab2a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -111,6 +111,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); /* log structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 0c954cad7449..bb3008d390aa 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -12,6 +12,7 @@ #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_iomap.h" +#include "xfs_pnfs.h" /* * Ensure that we do not have any outstanding pNFS layouts that can be used by @@ -32,7 +33,7 @@ xfs_break_leased_layouts( struct xfs_inode *ip = XFS_I(inode); int error; - while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { + while ((error = break_layout(inode, false)) == -EWOULDBLOCK) { xfs_iunlock(ip, *iolock); *did_unlock = true; error = break_layout(inode, true); @@ -59,7 +60,7 @@ xfs_fs_get_uuid( printk_once(KERN_NOTICE "XFS (%s): using experimental pNFS feature, use at your own risk!\n", - mp->m_fsname); + mp->m_super->s_id); if (*len < sizeof(uuid_t)) return -EINVAL; @@ -142,43 +143,38 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); - xfs_iunlock(ip, lock_flags); - if (error) - goto out_unlock; + ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); + + if (!error && write && + (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) { + if (offset + length > XFS_ISIZE(ip)) + end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); + else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + end_fsb = min(end_fsb, imap.br_startoff + + imap.br_blockcount); + xfs_iunlock(ip, lock_flags); + + error = xfs_iomap_write_direct(ip, offset_fsb, + end_fsb - offset_fsb, &imap); + if (error) + goto out_unlock; - if (write) { - enum xfs_prealloc_flags flags = 0; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - - if (!nimaps || imap.br_startblock 
== HOLESTARTBLOCK) { - /* - * xfs_iomap_write_direct() expects to take ownership of - * the shared ilock. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_iomap_write_direct(ip, offset, length, - &imap, nimaps); - if (error) - goto out_unlock; - - /* - * Ensure the next transaction is committed - * synchronously so that the blocks allocated and - * handed out to the client are guaranteed to be - * present even after a server crash. - */ - flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; - } - - error = xfs_update_prealloc_flags(ip, flags); + /* + * Ensure the next transaction is committed synchronously so + * that the blocks allocated and handed out to the client are + * guaranteed to be present even after a server crash. + */ + error = xfs_update_prealloc_flags(ip, + XFS_PREALLOC_SET | XFS_PREALLOC_SYNC); if (error) goto out_unlock; + } else { + xfs_iunlock(ip, lock_flags); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5e7a37f0cf84..0b0909657bad 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -22,6 +22,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_error.h" /* * The global quota manager. There is only one of these for the entire @@ -29,10 +30,10 @@ * quota functionality, including maintaining the freelist and hash * tables of dquots. */ -STATIC int xfs_qm_init_quotainos(xfs_mount_t *); -STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); +STATIC int xfs_qm_init_quotainos(struct xfs_mount *mp); +STATIC int xfs_qm_init_quotainfo(struct xfs_mount *mp); -STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi); +STATIC void xfs_qm_destroy_quotainos(struct xfs_quotainfo *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it @@ -243,14 +244,14 @@ xfs_qm_unmount_quotas( STATIC int xfs_qm_dqattach_one( - xfs_inode_t *ip, - xfs_dqid_t id, - uint type, - bool doalloc, - xfs_dquot_t **IO_idqpp) + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, + bool doalloc, + struct xfs_dquot **IO_idqpp) { - xfs_dquot_t *dqp; - int error; + struct xfs_dquot *dqp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; @@ -341,7 +342,7 @@ xfs_qm_dqattach_locked( } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -539,12 +540,12 @@ xfs_qm_shrink_count( STATIC void xfs_qm_set_defquota( - xfs_mount_t *mp, - uint type, - xfs_quotainfo_t *qinf) + struct xfs_mount *mp, + uint type, + struct xfs_quotainfo *qinf) { - xfs_dquot_t *dqp; - struct xfs_def_quota *defq; + struct xfs_dquot *dqp; + struct xfs_def_quota *defq; struct xfs_disk_dquot *ddqp; int error; @@ -642,7 +643,7 @@ xfs_qm_init_quotainfo( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0); error = list_lru_init(&qinf->qi_lru); if (error) @@ -709,9 +710,9 @@ out_free_qinf: */ void xfs_qm_destroy_quotainfo( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_quotainfo_t *qi; + struct xfs_quotainfo *qi; qi = mp->m_quotainfo; ASSERT(qi != NULL); @@ -754,11 +755,15 @@ xfs_qm_qino_alloc( if ((flags & XFS_QMOPT_PQUOTA) && 
(mp->m_sb.sb_gquotino != NULLFSINO)) { ino = mp->m_sb.sb_gquotino; - ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); + if (XFS_IS_CORRUPT(mp, + mp->m_sb.sb_pquotino != NULLFSINO)) + return -EFSCORRUPTED; } else if ((flags & XFS_QMOPT_GQUOTA) && (mp->m_sb.sb_pquotino != NULLFSINO)) { ino = mp->m_sb.sb_pquotino; - ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); + if (XFS_IS_CORRUPT(mp, + mp->m_sb.sb_gquotino != NULLFSINO)) + return -EFSCORRUPTED; } if (ino != NULLFSINO) { error = xfs_iget(mp, NULL, ino, 0, 0, ip); @@ -978,7 +983,7 @@ xfs_qm_reset_dqcounts_buf( if (qip->i_d.di_nblocks == 0) return 0; - map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0); lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); @@ -1559,7 +1564,7 @@ error_rele: STATIC void xfs_qm_destroy_quotainos( - xfs_quotainfo_t *qi) + struct xfs_quotainfo *qi) { if (qi->qi_uquotaip) { xfs_irele(qi->qi_uquotaip); @@ -1693,7 +1698,7 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { - if (xfs_get_projid(ip) != prid) { + if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, true, &pq); @@ -1737,14 +1742,14 @@ error_rele: * Actually transfer ownership, and do dquot modifications. * These were already reserved. */ -xfs_dquot_t * +struct xfs_dquot * xfs_qm_vop_chown( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t **IO_olddq, - xfs_dquot_t *newdq) + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot **IO_olddq, + struct xfs_dquot *newdq) { - xfs_dquot_t *prevdq; + struct xfs_dquot *prevdq; uint bfield = XFS_IS_REALTIME_INODE(ip) ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; @@ -1827,7 +1832,7 @@ xfs_qm_vop_chown_reserve( } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) { + ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) { prjflags = XFS_QMOPT_ENOSPC; pdq_delblks = pdqp; if (delblks) { @@ -1928,7 +1933,7 @@ xfs_qm_vop_create_dqattach( } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); + ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index b41b75089548..4e57edca8bce 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -54,7 +54,7 @@ struct xfs_def_quota { * Various quota information for individual filesystems. * The mount structure keeps a pointer to this. 
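
The ASSERT-to-XFS_IS_CORRUPT conversions in this hunk turn debug-only traps into runtime checks that report the corruption and let the caller return -EFSCORRUPTED on production kernels too. A rough userspace analogue of the idiom; the macro body is a sketch of the behaviour, not the kernel's actual definition in xfs_error.h, and NULLFSINO here merely stands in for the kernel's sentinel:

	#include <stdbool.h>
	#include <stdio.h>

	#define EFSCORRUPTED	117	/* EUCLEAN, as used by Linux filesystems */
	#define NULLFSINO	(-1LL)	/* stand-in for the kernel sentinel */

	/*
	 * Sketch of the XFS_IS_CORRUPT() idiom: if the corruption test
	 * fires, report it with source context and evaluate to true so the
	 * caller can bail out with -EFSCORRUPTED.  The kernel macro also
	 * dumps mount and stack information; that part is elided here.
	 */
	#define IS_CORRUPT(expr)						\
		((expr) ? (fprintf(stderr, "corruption: %s at %s:%d\n",		\
				   #expr, __FILE__, __LINE__), true) : false)

	/* loosely mirrors the quota-inode cross-checks in xfs_qm_qino_alloc() */
	static int pick_quota_ino(long long gquotino, long long pquotino)
	{
		if (gquotino != NULLFSINO &&
		    IS_CORRUPT(pquotino != NULLFSINO))
			return -EFSCORRUPTED;
		return 0;
	}

	int main(void)
	{
		/* both inodes set at once is inconsistent -> -EFSCORRUPTED */
		return pick_quota_ino(24, 42) ? 1 : 0;
	}
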
*/ -typedef struct xfs_quotainfo { +struct xfs_quotainfo { struct radix_tree_root qi_uquota_tree; struct radix_tree_root qi_gquota_tree; struct radix_tree_root qi_pquota_tree; @@ -64,9 +64,9 @@ typedef struct xfs_quotainfo { struct xfs_inode *qi_pquotaip; /* project quota inode */ struct list_lru qi_lru; int qi_dquots; - time_t qi_btimelimit; /* limit for blks timer */ - time_t qi_itimelimit; /* limit for inodes timer */ - time_t qi_rtbtimelimit;/* limit for rt blks timer */ + time64_t qi_btimelimit; /* limit for blks timer */ + time64_t qi_itimelimit; /* limit for inodes timer */ + time64_t qi_rtbtimelimit;/* limit for rt blks timer */ xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ @@ -76,8 +76,8 @@ typedef struct xfs_quotainfo { struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; -} xfs_quotainfo_t; + struct shrinker qi_shrinker; +}; static inline struct radix_tree_root * xfs_dquot_tree( diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 5d72e88598b4..fc2fa418919f 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -54,13 +54,13 @@ xfs_fill_statvfs_from_dquot( */ void xfs_qm_statvfs( - xfs_inode_t *ip, + struct xfs_inode *ip, struct kstatfs *statp) { - xfs_mount_t *mp = ip->i_mount; - xfs_dquot_t *dqp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; - if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) { + if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index da7ad0383037..1ea82764bf89 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -19,9 +19,72 @@ #include "xfs_qm.h" #include "xfs_icache.h" -STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); -STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, - uint); +STATIC int +xfs_qm_log_quotaoff( + struct xfs_mount *mp, + struct xfs_qoff_logitem **qoffstartp, + uint flags) +{ + struct xfs_trans *tp; + int error; + struct xfs_qoff_logitem *qoffi; + + *qoffstartp = NULL; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); + if (error) + goto out; + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + xfs_log_sb(tp); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotaoff's performance. 
+ */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + if (error) + goto out; + + *qoffstartp = qoffi; +out: + return error; +} + +STATIC int +xfs_qm_log_quotaoff_end( + struct xfs_mount *mp, + struct xfs_qoff_logitem *startqoff, + uint flags) +{ + struct xfs_trans *tp; + int error; + struct xfs_qoff_logitem *qoffi; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); + if (error) + return error; + + qoffi = xfs_trans_get_qoff_item(tp, startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotaoff's performance. + */ + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp); +} /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -40,7 +103,7 @@ xfs_qm_scall_quotaoff( uint dqtype; int error; uint inactivate_flags; - xfs_qoff_logitem_t *qoffstart; + struct xfs_qoff_logitem *qoffstart; /* * No file system can have quotas enabled on disk but not in core. @@ -538,74 +601,6 @@ out_unlock: return error; } -STATIC int -xfs_qm_log_quotaoff_end( - xfs_mount_t *mp, - xfs_qoff_logitem_t *startqoff, - uint flags) -{ - xfs_trans_t *tp; - int error; - xfs_qoff_logitem_t *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); - if (error) - return error; - - qoffi = xfs_trans_get_qoff_item(tp, startqoff, - flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); -} - - -STATIC int -xfs_qm_log_quotaoff( - xfs_mount_t *mp, - xfs_qoff_logitem_t **qoffstartp, - uint flags) -{ - xfs_trans_t *tp; - int error; - xfs_qoff_logitem_t *qoffi; - - *qoffstartp = NULL; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); - if (error) - goto out; - - qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; - spin_unlock(&mp->m_sb_lock); - - xfs_log_sb(tp); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp); - if (error) - goto out; - - *qoffstartp = qoffi; -out: - return error; -} - /* Fill out the quota context. 
*/ static void xfs_qm_scall_getquota_fill_qc( diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index cd6c7210a373..38669e827206 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -37,9 +37,9 @@ xfs_qm_fill_state( tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; tstate->nextents = ip->i_d.di_nextents; - tstate->spc_timelimit = q->qi_btimelimit; - tstate->ino_timelimit = q->qi_itimelimit; - tstate->rt_spc_timelimit = q->qi_rtbtimelimit; + tstate->spc_timelimit = (u32)q->qi_btimelimit; + tstate->ino_timelimit = (u32)q->qi_itimelimit; + tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit; tstate->spc_warnlimit = q->qi_bwarnlimit; tstate->ino_warnlimit = q->qi_iwarnlimit; tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; @@ -201,6 +201,9 @@ xfs_fs_rm_xquota( if (XFS_IS_QUOTA_ON(mp)) return -EINVAL; + if (uflags & ~(FS_USER_QUOTA | FS_GROUP_QUOTA | FS_PROJ_QUOTA)) + return -EINVAL; + if (uflags & FS_USER_QUOTA) flags |= XFS_DQ_USER; if (uflags & FS_GROUP_QUOTA) diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d8288aa0670a..8eeed73928cd 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -17,7 +17,7 @@ #include "xfs_refcount_item.h" #include "xfs_log.h" #include "xfs_refcount.h" - +#include "xfs_error.h" kmem_zone_t *xfs_cui_zone; kmem_zone_t *xfs_cud_zone; @@ -34,7 +34,7 @@ xfs_cui_item_free( if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) kmem_free(cuip); else - kmem_zone_free(xfs_cui_zone, cuip); + kmem_cache_free(xfs_cui_zone, cuip); } /* @@ -144,9 +144,9 @@ xfs_cui_init( ASSERT(nextents > 0); if (nextents > XFS_CUI_MAX_FAST_EXTENTS) cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), - KM_SLEEP); + 0); else - cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); + cuip = kmem_zone_zalloc(xfs_cui_zone, 0); xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); cuip->cui_format.cui_nextents = nextents; @@ -206,7 +206,7 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); - kmem_zone_free(xfs_cud_zone, cudp); + kmem_cache_free(xfs_cud_zone, cudp); } static const struct xfs_item_ops xfs_cud_item_ops = { @@ -223,7 +223,7 @@ xfs_trans_get_cud( { struct xfs_cud_log_item *cudp; - cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); + cudp = kmem_zone_zalloc(xfs_cud_zone, 0); xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); cudp->cud_cuip = cuip; @@ -497,7 +497,7 @@ xfs_cui_recover( */ set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_cui_release(cuip); - return -EIO; + return -EFSCORRUPTED; } } @@ -536,6 +536,7 @@ xfs_cui_recover( type = refc_type; break; default: + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto abort_error; } @@ -555,26 +556,24 @@ xfs_cui_recover( irec.br_blockcount = new_len; switch (type) { case XFS_REFCOUNT_INCREASE: - error = xfs_refcount_increase_extent(tp, &irec); + xfs_refcount_increase_extent(tp, &irec); break; case XFS_REFCOUNT_DECREASE: - error = xfs_refcount_decrease_extent(tp, &irec); + xfs_refcount_decrease_extent(tp, &irec); break; case XFS_REFCOUNT_ALLOC_COW: - error = xfs_refcount_alloc_cow_extent(tp, + xfs_refcount_alloc_cow_extent(tp, irec.br_startblock, irec.br_blockcount); break; case XFS_REFCOUNT_FREE_COW: - error = xfs_refcount_free_cow_extent(tp, + xfs_refcount_free_cow_extent(tp, irec.br_startblock, irec.br_blockcount); break; default: ASSERT(0); } - if (error) - goto abort_error; requeue_only = true; } } diff --git 
a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c4ec7afd1170..b0ce04ffd3cd 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -143,8 +143,6 @@ xfs_reflink_find_shared( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) return error; - if (!agbp) - return -ENOMEM; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); @@ -223,8 +221,8 @@ xfs_reflink_trim_around_shared( } } -bool -xfs_inode_need_cow( +int +xfs_bmap_trim_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared) @@ -308,13 +306,13 @@ static int xfs_find_trim_cow_extent( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, bool *shared, bool *found) { xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_iext_cursor icur; - struct xfs_bmbt_irec got; *found = false; @@ -322,23 +320,22 @@ xfs_find_trim_cow_extent( * If we don't find an overlapping extent, trim the range we need to * allocate to fit the hole we found. */ - if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) - got.br_startoff = offset_fsb + count_fsb; - if (got.br_startoff > offset_fsb) { + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap)) + cmap->br_startoff = offset_fsb + count_fsb; + if (cmap->br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, - got.br_startoff - imap->br_startoff); - return xfs_inode_need_cow(ip, imap, shared); + cmap->br_startoff - imap->br_startoff); + return xfs_bmap_trim_cow(ip, imap, shared); } *shared = true; - if (isnullstartblock(got.br_startblock)) { - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + if (isnullstartblock(cmap->br_startblock)) { + xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount); return 0; } /* real extent found - no need to allocate */ - xfs_trim_extent(&got, offset_fsb, count_fsb); - *imap = got; + xfs_trim_extent(cmap, offset_fsb, count_fsb); *found = true; return 0; } @@ -348,6 +345,7 @@ int xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode, bool convert_now) @@ -367,7 +365,7 @@ xfs_reflink_allocate_cow( xfs_ifork_init_cow(ip); } - error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) return error; if (found) @@ -392,7 +390,7 @@ xfs_reflink_allocate_cow( /* * Check for an overlapping extent again now that we dropped the ilock. */ - error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) goto out_trans_cancel; if (found) { @@ -410,8 +408,8 @@ xfs_reflink_allocate_cow( /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, - resblks, imap, &nimaps); + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, + &nimaps); if (error) goto out_unreserve; @@ -427,15 +425,15 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: - xfs_trim_extent(imap, offset_fsb, count_fsb); + xfs_trim_extent(cmap, offset_fsb, count_fsb); /* * COW fork extents are supposed to remain unwritten until we're ready * to initiate a disk write. For direct I/O we are going to write the * data and need the conversion, but for buffered writes we're done. 
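
The xfs_find_trim_cow_extent() rework above keeps the data-fork mapping (imap) and the COW-fork mapping (cmap) in separate irecs and clips one against the other with xfs_trim_extent(). The trim itself is plain interval arithmetic; the following self-contained reconstruction uses a cut-down struct and re-derives the logic from the call sites, so treat it as a sketch rather than the kernel function:

	#include <stdint.h>
	#include <stdio.h>

	struct irec {			/* cut-down xfs_bmbt_irec */
		uint64_t br_startoff;	/* file offset, in filesystem blocks */
		uint64_t br_blockcount;
	};

	/*
	 * Clip @irec to the window [bno, bno + len).  Same interval
	 * arithmetic as the kernel's xfs_trim_extent(); re-derived here
	 * for illustration.
	 */
	static void trim_extent(struct irec *irec, uint64_t bno, uint64_t len)
	{
		uint64_t end = bno + len;
		uint64_t irec_end = irec->br_startoff + irec->br_blockcount;

		if (irec_end <= bno || irec->br_startoff >= end) {
			irec->br_blockcount = 0;	/* no overlap at all */
			return;
		}
		if (bno > irec->br_startoff) {
			irec->br_blockcount -= bno - irec->br_startoff;
			irec->br_startoff = bno;
		}
		if (irec_end > end)
			irec->br_blockcount -= irec_end - end;
	}

	int main(void)
	{
		/* a 16-block data mapping; a COW extent begins 4 blocks in */
		struct irec imap = { .br_startoff = 100, .br_blockcount = 16 };
		uint64_t cow_startoff = 104;

		/* as in xfs_find_trim_cow_extent(): stop imap where cmap starts */
		trim_extent(&imap, imap.br_startoff,
			    cow_startoff - imap.br_startoff);
		printf("imap now [%llu, +%llu)\n",
		       (unsigned long long)imap.br_startoff,
		       (unsigned long long)imap.br_blockcount);
		return 0;
	}

Run, this prints "imap now [100, +4)": the data mapping has been trimmed to end exactly where the COW extent begins, so each range is served by exactly one fork.
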
*/ - if (!convert_now || imap->br_state == XFS_EXT_NORM) + if (!convert_now || cmap->br_state == XFS_EXT_NORM) return 0; - trace_xfs_reflink_convert_cow(ip, imap); + trace_xfs_reflink_convert_cow(ip, cmap); return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: @@ -495,10 +493,8 @@ xfs_reflink_cancel_cow_blocks( ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(*tpp, - del.br_startblock, del.br_blockcount); - if (error) - break; + xfs_refcount_free_cow_extent(*tpp, del.br_startblock, + del.br_blockcount); xfs_bmap_add_free(*tpp, del.br_startblock, del.br_blockcount, NULL); @@ -675,15 +671,10 @@ xfs_reflink_end_cow_extent( trace_xfs_reflink_cow_remap(ip, &del); /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(tp, del.br_startblock, - del.br_blockcount); - if (error) - goto out_cancel; + xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp, ip, &del); - if (error) - goto out_cancel; + xfs_bmap_map_extent(tp, ip, &del); /* Charge this new data fork mapping to the on-disk quota. */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, @@ -1070,14 +1061,10 @@ xfs_reflink_remap_extent( uirec.br_blockcount, uirec.br_startblock); /* Update the refcount tree */ - error = xfs_refcount_increase_extent(tp, &uirec); - if (error) - goto out_cancel; + xfs_refcount_increase_extent(tp, &uirec); /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp, ip, &uirec); - if (error) - goto out_cancel; + xfs_bmap_map_extent(tp, ip, &uirec); /* Update quota accounting. */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, @@ -1190,11 +1177,11 @@ xfs_reflink_remap_blocks( } /* - * Grab the exclusive iolock for a data copy from src to dest, making - * sure to abide vfs locking order (lowest pointer value goes first) and - * breaking the pnfs layout leases on dest before proceeding. The loop - * is needed because we cannot call the blocking break_layout() with the - * src iolock held, and therefore have to back out both locks. + * Grab the exclusive iolock for a data copy from src to dest, making sure to + * abide vfs locking order (lowest pointer value goes first) and breaking the + * layout leases before proceeding. The loop is needed because we cannot call + * the blocking break_layout() with the iolocks held, and therefore have to + * back out both locks. */ static int xfs_iolock_two_inodes_and_break_layout( @@ -1203,33 +1190,44 @@ xfs_iolock_two_inodes_and_break_layout( { int error; -retry: - if (src < dest) { - inode_lock_shared(src); - inode_lock_nested(dest, I_MUTEX_NONDIR2); - } else { - /* src >= dest */ - inode_lock(dest); - } + if (src > dest) + swap(src, dest); - error = break_layout(dest, false); - if (error == -EWOULDBLOCK) { - inode_unlock(dest); - if (src < dest) - inode_unlock_shared(src); +retry: + /* Wait to break both inodes' layouts before we start locking. */ + error = break_layout(src, true); + if (error) + return error; + if (src != dest) { error = break_layout(dest, true); if (error) return error; - goto retry; } + + /* Lock one inode and make sure nobody got in and leased it. */ + inode_lock(src); + error = break_layout(src, false); + if (error) { + inode_unlock(src); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + if (src == dest) + return 0; + + /* Lock the other inode and make sure nobody got in and leased it. 
*/ + inode_lock_nested(dest, I_MUTEX_NONDIR2); + error = break_layout(dest, false); if (error) { + inode_unlock(src); inode_unlock(dest); - if (src < dest) - inode_unlock_shared(src); + if (error == -EWOULDBLOCK) + goto retry; return error; } - if (src > dest) - inode_lock_shared_nested(src, I_MUTEX_NONDIR2); + return 0; } @@ -1247,10 +1245,10 @@ xfs_reflink_remap_unlock( xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); inode_unlock(inode_out); if (!same_inode) - inode_unlock_shared(inode_in); + inode_unlock(inode_in); } /* @@ -1270,7 +1268,7 @@ xfs_reflink_zero_posteof( trace_xfs_zero_eof(ip, isize, pos - isize); return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, - &xfs_iomap_ops); + &xfs_buffered_write_iomap_ops); } /* @@ -1325,7 +1323,7 @@ xfs_reflink_remap_prep( if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); else - xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest, + xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest, XFS_MMAPLOCK_EXCL); /* Check file eligibility and prepare for block sharing. */ @@ -1381,85 +1379,6 @@ out_unlock: return ret; } -/* - * The user wants to preemptively CoW all shared blocks in this file, - * which enables us to turn off the reflink flag. Iterate all - * extents which are not prealloc/delalloc to see which ranges are - * mentioned in the refcount tree, then read those blocks into the - * pagecache, dirty them, fsync them back out, and then we can update - * the inode flag. What happens if we run out of memory? :) - */ -STATIC int -xfs_reflink_dirty_extents( - struct xfs_inode *ip, - xfs_fileoff_t fbno, - xfs_filblks_t end, - xfs_off_t isize) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; - xfs_off_t fpos; - xfs_off_t flen; - struct xfs_bmbt_irec map[2]; - int nmaps; - int error = 0; - - while (end - fbno > 0) { - nmaps = 1; - /* - * Look for extents in the file. Skip holes, delalloc, or - * unwritten extents; they can't be reflinked. - */ - error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); - if (error) - goto out; - if (nmaps == 0) - break; - if (!xfs_bmap_is_real_extent(&map[0])) - goto next; - - map[1] = map[0]; - while (map[1].br_blockcount) { - agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); - agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); - aglen = map[1].br_blockcount; - - error = xfs_reflink_find_shared(mp, NULL, agno, agbno, - aglen, &rbno, &rlen, true); - if (error) - goto out; - if (rbno == NULLAGBLOCK) - break; - - /* Dirty the pages */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + - (rbno - agbno)); - flen = XFS_FSB_TO_B(mp, rlen); - if (fpos + flen > isize) - flen = isize - fpos; - error = iomap_file_dirty(VFS_I(ip), fpos, flen, - &xfs_iomap_ops); - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (error) - goto out; - - map[1].br_blockcount -= (rbno - agbno + rlen); - map[1].br_startoff += (rbno - agbno + rlen); - map[1].br_startblock += (rbno - agbno + rlen); - } - -next: - fbno = map[0].br_startoff + map[0].br_blockcount; - } -out: - return error; -} - /* Does this inode need the reflink flag? */ int xfs_reflink_inode_has_shared_extents( @@ -1536,7 +1455,8 @@ xfs_reflink_clear_inode_flag( * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. 
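
The rewritten xfs_iolock_two_inodes_and_break_layout() above avoids deadlock by always locking the lower-addressed inode first and restarting from scratch whenever break_layout() would have to sleep. The address-ordering half of that scheme can be isolated into a runnable pthread sketch; fake_inode and lock_two are invented names, and lease breaking with its retry loop is omitted:

	#include <pthread.h>
	#include <stdio.h>

	struct fake_inode {
		pthread_mutex_t i_lock;		/* stands in for the inode lock */
	};

	/*
	 * Lock two "inodes" for a cross-file operation.  As in the function
	 * above, the lower address is always taken first, so two threads
	 * locking the same pair in opposite argument order cannot deadlock;
	 * src == dest takes the lock only once.
	 */
	static void lock_two(struct fake_inode *src, struct fake_inode *dest)
	{
		if (src > dest) {		/* the kernel uses swap() here */
			struct fake_inode *tmp = src;
			src = dest;
			dest = tmp;
		}
		pthread_mutex_lock(&src->i_lock);
		if (src != dest)
			pthread_mutex_lock(&dest->i_lock);
	}

	int main(void)
	{
		struct fake_inode a = { PTHREAD_MUTEX_INITIALIZER };
		struct fake_inode b = { PTHREAD_MUTEX_INITIALIZER };

		lock_two(&b, &a);		/* argument order does not matter */
		puts("both inodes locked in address order");
		return 0;
	}
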
*/ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, + true); if (error) return error; @@ -1596,10 +1516,7 @@ xfs_reflink_unshare( xfs_off_t offset, xfs_off_t len) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t fbno; - xfs_filblks_t end; - xfs_off_t isize; + struct inode *inode = VFS_I(ip); int error; if (!xfs_is_reflink_inode(ip)) @@ -1607,20 +1524,13 @@ xfs_reflink_unshare( trace_xfs_reflink_unshare(ip, offset, len); - inode_dio_wait(VFS_I(ip)); + inode_dio_wait(inode); - /* Try to CoW the selected ranges */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - fbno = XFS_B_TO_FSBT(mp, offset); - isize = i_size_read(VFS_I(ip)); - end = XFS_B_TO_FSB(mp, offset + len); - error = xfs_reflink_dirty_extents(ip, fbno, end, isize); + error = iomap_file_unshare(inode, offset, len, + &xfs_buffered_write_iomap_ops); if (error) - goto out_unlock; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - /* Wait for the IO to finish */ - error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + goto out; + error = filemap_write_and_wait(inode->i_mapping); if (error) goto out; @@ -1628,11 +1538,8 @@ xfs_reflink_unshare( error = xfs_reflink_try_clear_inode_flag(ip); if (error) goto out; - return 0; -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 28a43b7f581d..3e4fd46373ab 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -22,11 +22,11 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); -bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, +int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); -extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, +int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode, bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 77ed557b6127..4911b68f95dd 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -17,7 +17,7 @@ #include "xfs_rmap_item.h" #include "xfs_log.h" #include "xfs_rmap.h" - +#include "xfs_error.h" kmem_zone_t *xfs_rui_zone; kmem_zone_t *xfs_rud_zone; @@ -34,7 +34,7 @@ xfs_rui_item_free( if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kmem_free(ruip); else - kmem_zone_free(xfs_rui_zone, ruip); + kmem_cache_free(xfs_rui_zone, ruip); } /* @@ -142,9 +142,9 @@ xfs_rui_init( ASSERT(nextents > 0); if (nextents > XFS_RUI_MAX_FAST_EXTENTS) - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); else - ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); + ruip = kmem_zone_zalloc(xfs_rui_zone, 0); xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -171,8 +171,10 @@ xfs_rui_copy_format( src_rui_fmt = buf->i_addr; len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); - if (buf->i_len != len) + if (buf->i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return 
-EFSCORRUPTED; + } memcpy(dst_rui_fmt, src_rui_fmt, len); return 0; @@ -227,7 +229,7 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); - kmem_zone_free(xfs_rud_zone, rudp); + kmem_cache_free(xfs_rud_zone, rudp); } static const struct xfs_item_ops xfs_rud_item_ops = { @@ -244,7 +246,7 @@ xfs_trans_get_rud( { struct xfs_rud_log_item *rudp; - rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); + rudp = kmem_zone_zalloc(xfs_rud_zone, 0); xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); rudp->rud_ruip = ruip; @@ -539,7 +541,7 @@ xfs_rui_recover( */ set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); xfs_rui_release(ruip); - return -EIO; + return -EFSCORRUPTED; } } @@ -581,6 +583,7 @@ xfs_rui_recover( type = XFS_RMAP_FREE; break; default: + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); error = -EFSCORRUPTED; goto abort_error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5fa4db3c3e32..6209e7b6b895 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -792,8 +792,7 @@ xfs_growfs_rt_alloc( */ nmap = 1; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_METADATA, resblks, &map, - &nmap); + XFS_BMAPI_METADATA, 0, &map, &nmap); if (!error && nmap < 1) error = -ENOSPC; if (error) @@ -827,12 +826,10 @@ xfs_growfs_rt_alloc( * Get a buffer for the block. */ d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - if (bp == NULL) { - error = -EIO; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0, &bp); + if (error) goto out_trans_cancel; - } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* @@ -865,7 +862,7 @@ xfs_alloc_rsum_cache( * lower bound on the minimum level with any free extents. We can * continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); + mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); if (!mp->m_rsum_cache) xfs_warn(mp, "could not allocate realtime summary cache"); } @@ -963,7 +960,7 @@ xfs_growfs_rt( /* * Allocate a new (fake) mount/sb. */ - nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + nmp = kmem_alloc(sizeof(*nmp), 0); /* * Loop over the bitmap blocks. * We will do everything one bitmap block at a time. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f9450235533c..2094386af8ac 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -37,10 +37,10 @@ #include "xfs_reflink.h" #include <linux/magic.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> static const struct super_operations xfs_super_operations; -struct bio_set xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -51,7 +51,7 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ * Table driven mount option parser. 
*/ enum { - Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize, + Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, @@ -59,382 +59,62 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, + Opt_discard, Opt_nodiscard, Opt_dax, }; -static const match_table_t tokens = { - {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */ - {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */ - {Opt_logdev, "logdev=%s"}, /* log device */ - {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */ - {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */ - {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */ - {Opt_noalign, "noalign"}, /* turn off stripe alignment */ - {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */ - {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */ - {Opt_swidth, "swidth=%u"}, /* data volume stripe width */ - {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */ - {Opt_grpid, "grpid"}, /* group-ID from parent directory */ - {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */ - {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */ - {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ - {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ - {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ - {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ - {Opt_inode32, "inode32"}, /* inode allocation limited to - * XFS_MAXINUMBER_32 */ - {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */ - {Opt_noikeep, "noikeep"}, /* free empty inode clusters */ - {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */ - {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes - * in stat(). 
*/ - {Opt_attr2, "attr2"}, /* do use attr2 attribute format */ - {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */ - {Opt_filestreams,"filestreams"},/* use filestreams allocator */ - {Opt_quota, "quota"}, /* disk quotas (user) */ - {Opt_noquota, "noquota"}, /* no quotas */ - {Opt_usrquota, "usrquota"}, /* user quota enabled */ - {Opt_grpquota, "grpquota"}, /* group quota enabled */ - {Opt_prjquota, "prjquota"}, /* project quota enabled */ - {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */ - {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */ - {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */ - {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */ - {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */ - {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */ - {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */ - {Opt_discard, "discard"}, /* Discard unused blocks */ - {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ - {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ - {Opt_err, NULL}, +static const struct fs_parameter_spec xfs_fs_parameters[] = { + fsparam_u32("logbufs", Opt_logbufs), + fsparam_string("logbsize", Opt_logbsize), + fsparam_string("logdev", Opt_logdev), + fsparam_string("rtdev", Opt_rtdev), + fsparam_flag("wsync", Opt_wsync), + fsparam_flag("noalign", Opt_noalign), + fsparam_flag("swalloc", Opt_swalloc), + fsparam_u32("sunit", Opt_sunit), + fsparam_u32("swidth", Opt_swidth), + fsparam_flag("nouuid", Opt_nouuid), + fsparam_flag("grpid", Opt_grpid), + fsparam_flag("nogrpid", Opt_nogrpid), + fsparam_flag("bsdgroups", Opt_bsdgroups), + fsparam_flag("sysvgroups", Opt_sysvgroups), + fsparam_string("allocsize", Opt_allocsize), + fsparam_flag("norecovery", Opt_norecovery), + fsparam_flag("inode64", Opt_inode64), + fsparam_flag("inode32", Opt_inode32), + fsparam_flag("ikeep", Opt_ikeep), + fsparam_flag("noikeep", Opt_noikeep), + fsparam_flag("largeio", Opt_largeio), + fsparam_flag("nolargeio", Opt_nolargeio), + fsparam_flag("attr2", Opt_attr2), + fsparam_flag("noattr2", Opt_noattr2), + fsparam_flag("filestreams", Opt_filestreams), + fsparam_flag("quota", Opt_quota), + fsparam_flag("noquota", Opt_noquota), + fsparam_flag("usrquota", Opt_usrquota), + fsparam_flag("grpquota", Opt_grpquota), + fsparam_flag("prjquota", Opt_prjquota), + fsparam_flag("uquota", Opt_uquota), + fsparam_flag("gquota", Opt_gquota), + fsparam_flag("pquota", Opt_pquota), + fsparam_flag("uqnoenforce", Opt_uqnoenforce), + fsparam_flag("gqnoenforce", Opt_gqnoenforce), + fsparam_flag("pqnoenforce", Opt_pqnoenforce), + fsparam_flag("qnoenforce", Opt_qnoenforce), + fsparam_flag("discard", Opt_discard), + fsparam_flag("nodiscard", Opt_nodiscard), + fsparam_flag("dax", Opt_dax), + {} }; - -STATIC int -suffix_kstrtoint(const substring_t *s, unsigned int base, int *res) -{ - int last, shift_left_factor = 0, _res; - char *value; - int ret = 0; - - value = match_strdup(s); - if (!value) - return -ENOMEM; - - last = strlen(value) - 1; - if (value[last] == 'K' || value[last] == 'k') { - shift_left_factor = 10; - value[last] = '\0'; - } - if (value[last] == 'M' || value[last] == 'm') { - shift_left_factor = 20; - value[last] = '\0'; - } - if (value[last] == 'G' || value[last] == 'g') { - shift_left_factor = 30; - value[last] = '\0'; - } - - if (kstrtoint(value, base, &_res)) - ret = -EINVAL; - kfree(value); - *res = _res << shift_left_factor; - return ret; -} - -/* - * This function fills in xfs_mount_t fields based on 
mount args. - * Note: the superblock has _not_ yet been read in. - * - * Note that this function leaks the various device name allocations on - * failure. The caller takes care of them. - * - * *sb is const because this is also used to test options on the remount - * path, and we don't want this to have any side effects at remount time. - * Today this function does not change *sb, but just to future-proof... - */ -STATIC int -xfs_parseargs( - struct xfs_mount *mp, - char *options) -{ - const struct super_block *sb = mp->m_super; - char *p; - substring_t args[MAX_OPT_ARGS]; - int dsunit = 0; - int dswidth = 0; - int iosize = 0; - uint8_t iosizelog = 0; - - /* - * set up the mount name first so all the errors will refer to the - * correct device. - */ - mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); - if (!mp->m_fsname) - return -ENOMEM; - mp->m_fsname_len = strlen(mp->m_fsname) + 1; - - /* - * Copy binary VFS mount flags we are interested in. - */ - if (sb_rdonly(sb)) - mp->m_flags |= XFS_MOUNT_RDONLY; - if (sb->s_flags & SB_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (sb->s_flags & SB_SYNCHRONOUS) - mp->m_flags |= XFS_MOUNT_WSYNC; - - /* - * Set some default flags that could be cleared by the mount option - * parsing. - */ - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - - /* - * These can be overridden by the mount option parsing. - */ - mp->m_logbufs = -1; - mp->m_logbsize = -1; - - if (!options) - goto done; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_logbufs: - if (match_int(args, &mp->m_logbufs)) - return -EINVAL; - break; - case Opt_logbsize: - if (suffix_kstrtoint(args, 10, &mp->m_logbsize)) - return -EINVAL; - break; - case Opt_logdev: - kfree(mp->m_logname); - mp->m_logname = match_strdup(args); - if (!mp->m_logname) - return -ENOMEM; - break; - case Opt_rtdev: - kfree(mp->m_rtname); - mp->m_rtname = match_strdup(args); - if (!mp->m_rtname) - return -ENOMEM; - break; - case Opt_allocsize: - case Opt_biosize: - if (suffix_kstrtoint(args, 10, &iosize)) - return -EINVAL; - iosizelog = ffs(iosize) - 1; - break; - case Opt_grpid: - case Opt_bsdgroups: - mp->m_flags |= XFS_MOUNT_GRPID; - break; - case Opt_nogrpid: - case Opt_sysvgroups: - mp->m_flags &= ~XFS_MOUNT_GRPID; - break; - case Opt_wsync: - mp->m_flags |= XFS_MOUNT_WSYNC; - break; - case Opt_norecovery: - mp->m_flags |= XFS_MOUNT_NORECOVERY; - break; - case Opt_noalign: - mp->m_flags |= XFS_MOUNT_NOALIGN; - break; - case Opt_swalloc: - mp->m_flags |= XFS_MOUNT_SWALLOC; - break; - case Opt_sunit: - if (match_int(args, &dsunit)) - return -EINVAL; - break; - case Opt_swidth: - if (match_int(args, &dswidth)) - return -EINVAL; - break; - case Opt_inode32: - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - break; - case Opt_inode64: - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - break; - case Opt_nouuid: - mp->m_flags |= XFS_MOUNT_NOUUID; - break; - case Opt_ikeep: - mp->m_flags |= XFS_MOUNT_IKEEP; - break; - case Opt_noikeep: - mp->m_flags &= ~XFS_MOUNT_IKEEP; - break; - case Opt_largeio: - mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; - break; - case Opt_nolargeio: - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - break; - case Opt_attr2: - mp->m_flags |= XFS_MOUNT_ATTR2; - break; - case Opt_noattr2: - mp->m_flags &= ~XFS_MOUNT_ATTR2; - mp->m_flags |= XFS_MOUNT_NOATTR2; - break; - case Opt_filestreams: - mp->m_flags |= XFS_MOUNT_FILESTREAMS; - break; - case Opt_noquota: - mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; - mp->m_qflags &= 
~XFS_ALL_QUOTA_ENFD; - mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; - break; - case Opt_quota: - case Opt_uquota: - case Opt_usrquota: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | - XFS_UQUOTA_ENFD); - break; - case Opt_qnoenforce: - case Opt_uqnoenforce: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_UQUOTA_ENFD; - break; - case Opt_pquota: - case Opt_prjquota: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_PQUOTA_ENFD); - break; - case Opt_pqnoenforce: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_PQUOTA_ENFD; - break; - case Opt_gquota: - case Opt_grpquota: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_GQUOTA_ENFD); - break; - case Opt_gqnoenforce: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_GQUOTA_ENFD; - break; - case Opt_discard: - mp->m_flags |= XFS_MOUNT_DISCARD; - break; - case Opt_nodiscard: - mp->m_flags &= ~XFS_MOUNT_DISCARD; - break; -#ifdef CONFIG_FS_DAX - case Opt_dax: - mp->m_flags |= XFS_MOUNT_DAX; - break; -#endif - default: - xfs_warn(mp, "unknown mount option [%s].", p); - return -EINVAL; - } - } - - /* - * no recovery flag requires a read-only mount - */ - if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_warn(mp, "no-recovery mounts must be read-only."); - return -EINVAL; - } - - if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { - xfs_warn(mp, - "sunit and swidth options incompatible with the noalign option"); - return -EINVAL; - } - -#ifndef CONFIG_XFS_QUOTA - if (XFS_IS_QUOTA_RUNNING(mp)) { - xfs_warn(mp, "quota support not available in this kernel."); - return -EINVAL; - } -#endif - - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { - xfs_warn(mp, "sunit and swidth must be specified together"); - return -EINVAL; - } - - if (dsunit && (dswidth % dsunit != 0)) { - xfs_warn(mp, - "stripe width (%d) must be a multiple of the stripe unit (%d)", - dswidth, dsunit); - return -EINVAL; - } - -done: - if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { - /* - * At this point the superblock has not been read - * in, therefore we do not know the block size. - * Before the mount call ends we will convert - * these to FSBs. 
- */ - mp->m_dalign = dsunit; - mp->m_swidth = dswidth; - } - - if (mp->m_logbufs != -1 && - mp->m_logbufs != 0 && - (mp->m_logbufs < XLOG_MIN_ICLOGS || - mp->m_logbufs > XLOG_MAX_ICLOGS)) { - xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", - mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); - return -EINVAL; - } - if (mp->m_logbsize != -1 && - mp->m_logbsize != 0 && - (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || - mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || - !is_power_of_2(mp->m_logbsize))) { - xfs_warn(mp, - "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", - mp->m_logbsize); - return -EINVAL; - } - - if (iosizelog) { - if (iosizelog > XFS_MAX_IO_LOG || - iosizelog < XFS_MIN_IO_LOG) { - xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", - iosizelog, XFS_MIN_IO_LOG, - XFS_MAX_IO_LOG); - return -EINVAL; - } - - mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; - mp->m_readio_log = iosizelog; - mp->m_writeio_log = iosizelog; - } - - return 0; -} - struct proc_xfs_info { uint64_t flag; char *str; }; -STATIC void -xfs_showargs( - struct xfs_mount *mp, - struct seq_file *m) +static int +xfs_fs_show_options( + struct seq_file *m, + struct dentry *root) { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ @@ -448,30 +128,24 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, ",filestreams" }, { XFS_MOUNT_GRPID, ",grpid" }, { XFS_MOUNT_DISCARD, ",discard" }, - { XFS_MOUNT_SMALL_INUMS, ",inode32" }, + { XFS_MOUNT_LARGEIO, ",largeio" }, { XFS_MOUNT_DAX, ",dax" }, { 0, NULL } }; - static struct proc_xfs_info xfs_info_unset[] = { - /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" }, - { XFS_MOUNT_SMALL_INUMS, ",inode64" }, - { 0, NULL } - }; + struct xfs_mount *mp = XFS_M(root->d_sb); struct proc_xfs_info *xfs_infop; for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { if (mp->m_flags & xfs_infop->flag) seq_puts(m, xfs_infop->str); } - for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) { - if (!(mp->m_flags & xfs_infop->flag)) - seq_puts(m, xfs_infop->str); - } - if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, ",inode%d", + (mp->m_flags & XFS_MOUNT_SMALL_INUMS) ? 32 : 64); + + if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) seq_printf(m, ",allocsize=%dk", - (int)(1 << mp->m_writeio_log) >> 10); + (1 << mp->m_allocsize_log) >> 10); if (mp->m_logbufs > 0) seq_printf(m, ",logbufs=%d", mp->m_logbufs); @@ -510,32 +184,8 @@ xfs_showargs( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); -} - -static uint64_t -xfs_max_file_offset( - unsigned int blockshift) -{ - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long long... - * page->index << (PAGE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). 
- */ - -#if BITS_PER_LONG == 32 - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_SIZE; - bitshift = BITS_PER_LONG; -#endif - return (((uint64_t)pagefactor) << bitshift) - 1; + return 0; } /* @@ -808,32 +458,33 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, + 0, mp->m_super->s_id); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); if (!mp->m_eofblocks_workqueue) goto out_destroy_reclaim; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, - mp->m_fsname); + mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_eofb; @@ -1037,13 +688,13 @@ xfs_fs_drop_inode( return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); } -STATIC void -xfs_free_fsname( +static void +xfs_mount_free( struct xfs_mount *mp) { - kfree(mp->m_fsname); kfree(mp->m_rtname); kfree(mp->m_logname); + kmem_free(mp); } STATIC int @@ -1204,181 +855,6 @@ xfs_quiesce_attr( xfs_log_quiesce(mp); } -STATIC int -xfs_test_remount_options( - struct super_block *sb, - char *options) -{ - int error = 0; - struct xfs_mount *tmp_mp; - - tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL); - if (!tmp_mp) - return -ENOMEM; - - tmp_mp->m_super = sb; - error = xfs_parseargs(tmp_mp, options); - xfs_free_fsname(tmp_mp); - kmem_free(tmp_mp); - - return error; -} - -STATIC int -xfs_fs_remount( - struct super_block *sb, - int *flags, - char *options) -{ - struct xfs_mount *mp = XFS_M(sb); - xfs_sb_t *sbp = &mp->m_sb; - substring_t args[MAX_OPT_ARGS]; - char *p; - int error; - - /* First, check for complete junk; i.e. invalid options */ - error = xfs_test_remount_options(sb, options); - if (error) - return error; - - sync_filesystem(sb); - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_inode64: - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); - break; - case Opt_inode32: - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); - break; - default: - /* - * Logically we would return an error here to prevent - * users from believing they might have changed - * mount options using remount which can't be changed. - * - * But unfortunately mount(8) adds all options from - * mtab and fstab to the mount arguments in some cases - * so we can't blindly reject options, but have to - * check for each specified option if it actually - * differs from the currently set option and only - * reject it if that's the case. 
- * - * Until that is implemented we return success for - * every remount request, and silently ignore all - * options that we can't actually change. - */ -#if 0 - xfs_info(mp, - "mount option \"%s\" not supported for remount", p); - return -EINVAL; -#else - break; -#endif - } - } - - /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) { - if (mp->m_flags & XFS_MOUNT_NORECOVERY) { - xfs_warn(mp, - "ro->rw transition prohibited on norecovery mount"); - return -EINVAL; - } - - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_ro_compat_feature(sbp, - XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { - xfs_warn(mp, -"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", - (sbp->sb_features_ro_compat & - XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); - return -EINVAL; - } - - mp->m_flags &= ~XFS_MOUNT_RDONLY; - - /* - * If this is the first remount to writeable state we - * might have some superblock changes to update. - */ - if (mp->m_update_sb) { - error = xfs_sync_sb(mp, false); - if (error) { - xfs_warn(mp, "failed to write sb changes"); - return error; - } - mp->m_update_sb = false; - } - - /* - * Fill out the reserve pool if it is empty. Use the stashed - * value if it is non-zero, otherwise go with the default. - */ - xfs_restore_resvblks(mp); - xfs_log_work_queue(mp); - - /* Recover any CoW blocks that never got remapped. */ - error = xfs_reflink_recover_cow(mp); - if (error) { - xfs_err(mp, - "Error %d recovering leftover CoW allocations.", error); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } - xfs_start_block_reaping(mp); - - /* Create the per-AG metadata reservation pool .*/ - error = xfs_fs_reserve_ag_blocks(mp); - if (error && error != -ENOSPC) - return error; - } - - /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { - /* - * Cancel background eofb scanning so it cannot race with the - * final log force+buftarg wait and deadlock the remount. - */ - xfs_stop_block_reaping(mp); - - /* Get rid of any leftover CoW reservations... */ - error = xfs_icache_free_cowblocks(mp, NULL); - if (error) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } - - /* Free the per-AG metadata reservation pool. */ - error = xfs_fs_unreserve_ag_blocks(mp); - if (error) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } - - /* - * Before we sync the metadata, we need to free up the reserve - * block pool so that the used block count in the superblock on - * disk is correct at the end of the remount. Stash the current - * reserve pool size so that if we get remounted rw, we can - * return it to the same size. - */ - xfs_save_resvblks(mp); - - xfs_quiesce_attr(mp); - mp->m_flags |= XFS_MOUNT_RDONLY; - } - - return 0; -} - /* * Second stage of a freeze. The data is already frozen so we only * need to take care of the metadata. Once that's done sync the superblock @@ -1409,15 +885,6 @@ xfs_fs_unfreeze( return 0; } -STATIC int -xfs_fs_show_options( - struct seq_file *m, - struct dentry *root) -{ - xfs_showargs(XFS_M(root->d_sb), m); - return 0; -} - /* * This function fills in xfs_mount_t fields based on mount args. * Note: the superblock _has_ now been read in. 
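
The xfs_fs_show_options() conversion above keeps option emission table-driven while folding the old xfs_showargs() wrapper into the super_operations callback itself. A minimal stand-alone sketch of that pattern, with invented flag values and option names rather than the XFS ones:

	#include <linux/seq_file.h>

	struct demo_opt {
		unsigned int	flag;
		const char	*str;
	};

	/* emit ",name" for every flag bit that is set */
	static void demo_show_options(struct seq_file *m, unsigned int flags)
	{
		static const struct demo_opt opts[] = {
			{ 0x01, ",wsync" },
			{ 0x02, ",discard" },
			{ 0, NULL }
		};
		const struct demo_opt *p;

		for (p = opts; p->flag; p++)
			if (flags & p->flag)
				seq_puts(m, p->str);
	}

Only boolean options fit the table; valued options such as logbufs and allocsize still take explicit seq_printf() calls, as the hunk above shows.
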
@@ -1540,60 +1007,337 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_delalloc_blks); } -static struct xfs_mount * -xfs_mount_alloc( +static void +xfs_fs_put_super( struct super_block *sb) { - struct xfs_mount *mp; + struct xfs_mount *mp = XFS_M(sb); - mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); - if (!mp) - return NULL; + /* if ->fill_super failed, we have no mount to tear down */ + if (!sb->s_fs_info) + return; - mp->m_super = sb; - spin_lock_init(&mp->m_sb_lock); - spin_lock_init(&mp->m_agirotor_lock); - INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); - spin_lock_init(&mp->m_perag_lock); - mutex_init(&mp->m_growlock); - atomic_set(&mp->m_active_trans, 0); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); - INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); - mp->m_kobj.kobject.kset = xfs_kset; - /* - * We don't create the finobt per-ag space reservation until after log - * recovery, so we must set this to true so that an ifree transaction - * started during log recovery will not depend on space reservations - * for finobt expansion. - */ - mp->m_finobt_nores = true; - return mp; + xfs_notice(mp, "Unmounting Filesystem"); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + free_percpu(mp->m_stats.xs_stats); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + + sb->s_fs_info = NULL; + xfs_mount_free(mp); } +static long +xfs_fs_nr_cached_objects( + struct super_block *sb, + struct shrink_control *sc) +{ + /* Paranoia: catch incorrect calls during mount setup or teardown */ + if (WARN_ON_ONCE(!sb->s_fs_info)) + return 0; + return xfs_reclaim_inodes_count(XFS_M(sb)); +} -STATIC int -xfs_fs_fill_super( +static long +xfs_fs_free_cached_objects( struct super_block *sb, - void *data, - int silent) + struct shrink_control *sc) { - struct inode *root; - struct xfs_mount *mp = NULL; - int flags = 0, error = -ENOMEM; + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); +} + +static const struct super_operations xfs_super_operations = { + .alloc_inode = xfs_fs_alloc_inode, + .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, + .drop_inode = xfs_fs_drop_inode, + .put_super = xfs_fs_put_super, + .sync_fs = xfs_fs_sync_fs, + .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, + .statfs = xfs_fs_statfs, + .show_options = xfs_fs_show_options, + .nr_cached_objects = xfs_fs_nr_cached_objects, + .free_cached_objects = xfs_fs_free_cached_objects, +}; + +static int +suffix_kstrtoint( + const char *s, + unsigned int base, + int *res) +{ + int last, shift_left_factor = 0, _res; + char *value; + int ret = 0; + value = kstrdup(s, GFP_KERNEL); + if (!value) + return -ENOMEM; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoint(value, base, &_res)) + ret = -EINVAL; + kfree(value); + *res = _res << shift_left_factor; + return ret; +} + +/* + * Set mount state from a mount option. + * + * NOTE: mp->m_super is NULL here! 
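+ *
+ * Values may carry a binary suffix: e.g. "logbsize=256k" is parsed by
+ * suffix_kstrtoint() above into 262144 before validation.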
+ */ +static int +xfs_fc_parse_param( + struct fs_context *fc, + struct fs_parameter *param) +{ + struct xfs_mount *mp = fc->s_fs_info; + struct fs_parse_result result; + int size = 0; + int opt; + + opt = fs_parse(fc, xfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_logbufs: + mp->m_logbufs = result.uint_32; + return 0; + case Opt_logbsize: + if (suffix_kstrtoint(param->string, 10, &mp->m_logbsize)) + return -EINVAL; + return 0; + case Opt_logdev: + kfree(mp->m_logname); + mp->m_logname = kstrdup(param->string, GFP_KERNEL); + if (!mp->m_logname) + return -ENOMEM; + return 0; + case Opt_rtdev: + kfree(mp->m_rtname); + mp->m_rtname = kstrdup(param->string, GFP_KERNEL); + if (!mp->m_rtname) + return -ENOMEM; + return 0; + case Opt_allocsize: + if (suffix_kstrtoint(param->string, 10, &size)) + return -EINVAL; + mp->m_allocsize_log = ffs(size) - 1; + mp->m_flags |= XFS_MOUNT_ALLOCSIZE; + return 0; + case Opt_grpid: + case Opt_bsdgroups: + mp->m_flags |= XFS_MOUNT_GRPID; + return 0; + case Opt_nogrpid: + case Opt_sysvgroups: + mp->m_flags &= ~XFS_MOUNT_GRPID; + return 0; + case Opt_wsync: + mp->m_flags |= XFS_MOUNT_WSYNC; + return 0; + case Opt_norecovery: + mp->m_flags |= XFS_MOUNT_NORECOVERY; + return 0; + case Opt_noalign: + mp->m_flags |= XFS_MOUNT_NOALIGN; + return 0; + case Opt_swalloc: + mp->m_flags |= XFS_MOUNT_SWALLOC; + return 0; + case Opt_sunit: + mp->m_dalign = result.uint_32; + return 0; + case Opt_swidth: + mp->m_swidth = result.uint_32; + return 0; + case Opt_inode32: + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + return 0; + case Opt_inode64: + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + return 0; + case Opt_nouuid: + mp->m_flags |= XFS_MOUNT_NOUUID; + return 0; + case Opt_ikeep: + mp->m_flags |= XFS_MOUNT_IKEEP; + return 0; + case Opt_noikeep: + mp->m_flags &= ~XFS_MOUNT_IKEEP; + return 0; + case Opt_largeio: + mp->m_flags |= XFS_MOUNT_LARGEIO; + return 0; + case Opt_nolargeio: + mp->m_flags &= ~XFS_MOUNT_LARGEIO; + return 0; + case Opt_attr2: + mp->m_flags |= XFS_MOUNT_ATTR2; + return 0; + case Opt_noattr2: + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + return 0; + case Opt_filestreams: + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + return 0; + case Opt_noquota: + mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; + return 0; + case Opt_quota: + case Opt_uquota: + case Opt_usrquota: + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_UQUOTA_ENFD); + return 0; + case Opt_qnoenforce: + case Opt_uqnoenforce: + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_UQUOTA_ENFD; + return 0; + case Opt_pquota: + case Opt_prjquota: + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_PQUOTA_ENFD); + return 0; + case Opt_pqnoenforce: + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_PQUOTA_ENFD; + return 0; + case Opt_gquota: + case Opt_grpquota: + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_GQUOTA_ENFD); + return 0; + case Opt_gqnoenforce: + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_GQUOTA_ENFD; + return 0; + case Opt_discard: + mp->m_flags |= XFS_MOUNT_DISCARD; + return 0; + case Opt_nodiscard: + mp->m_flags &= ~XFS_MOUNT_DISCARD; + return 0; +#ifdef CONFIG_FS_DAX + case Opt_dax: + mp->m_flags |= XFS_MOUNT_DAX; + return 0; +#endif + default: + xfs_warn(mp, "unknown mount option [%s].", param->key); + return -EINVAL; + } + + return 0; 
+} + +static int +xfs_fc_validate_params( + struct xfs_mount *mp) +{ /* - * allocate mp and do all low-level struct initializations before we - * attach it to the super + * no recovery flag requires a read-only mount */ - mp = xfs_mount_alloc(sb); - if (!mp) - goto out; - sb->s_fs_info = mp; + if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return -EINVAL; + } - error = xfs_parseargs(mp, (char *)data); + if ((mp->m_flags & XFS_MOUNT_NOALIGN) && + (mp->m_dalign || mp->m_swidth)) { + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); + return -EINVAL; + } + + if (!IS_ENABLED(CONFIG_XFS_QUOTA) && mp->m_qflags != 0) { + xfs_warn(mp, "quota support not available in this kernel."); + return -EINVAL; + } + + if ((mp->m_dalign && !mp->m_swidth) || + (!mp->m_dalign && mp->m_swidth)) { + xfs_warn(mp, "sunit and swidth must be specified together"); + return -EINVAL; + } + + if (mp->m_dalign && (mp->m_swidth % mp->m_dalign != 0)) { + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", + mp->m_swidth, mp->m_dalign); + return -EINVAL; + } + + if (mp->m_logbufs != -1 && + mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return -EINVAL; + } + + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return -EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) && + (mp->m_allocsize_log > XFS_MAX_IO_LOG || + mp->m_allocsize_log < XFS_MIN_IO_LOG)) { + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", + mp->m_allocsize_log, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG); + return -EINVAL; + } + + return 0; +} + +static int +xfs_fc_fill_super( + struct super_block *sb, + struct fs_context *fc) +{ + struct xfs_mount *mp = sb->s_fs_info; + struct inode *root; + int flags = 0, error; + + mp->m_super = sb; + + error = xfs_fc_validate_params(mp); if (error) - goto out_free_fsname; + goto out_free_names; sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; @@ -1615,12 +1359,12 @@ xfs_fs_fill_super( msleep(xfs_globals.mount_delay * 1000); } - if (silent) + if (fc->sb_flags & SB_SILENT) flags |= XFS_MFSI_QUIET; error = xfs_open_devices(mp); if (error) - goto out_free_fsname; + goto out_free_names; error = xfs_init_mount_workqueues(mp); if (error) @@ -1649,6 +1393,26 @@ xfs_fs_fill_super( if (error) goto out_free_sb; + /* + * XFS block mappings use 54 bits to store the logical block offset. + * This should suffice to handle the maximum file size that the VFS + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON + * to check this assertion. + * + * Avoid integer overflow by comparing the maximum bmbt offset to the + * maximum pagecache offset in units of fs blocks. 
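+ *
+ * For example, with 4k blocks on a 64-bit kernel the left side is
+ * (2^63 - 1) >> 12, roughly 2^51 blocks, far below the 2^54 - 1 bmbt
+ * limit; even at the minimum 512 byte block size the two sides are
+ * equal and the check still passes.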
+ */ + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { + xfs_warn(mp, +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), + XFS_MAX_FILEOFF); + error = -EINVAL; + goto out_free_sb; + } + error = xfs_filestream_mount(mp); if (error) goto out_free_sb; @@ -1660,9 +1424,11 @@ xfs_fs_fill_super( sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; + sb->s_time_min = S32_MIN; + sb->s_time_max = S32_MAX; sb->s_iflags |= SB_I_CGROUPWB; set_posix_acl_flag(sb); @@ -1755,11 +1521,9 @@ xfs_fs_fill_super( xfs_destroy_mount_workqueues(mp); out_close_devices: xfs_close_devices(mp); - out_free_fsname: + out_free_names: sb->s_fs_info = NULL; - xfs_free_fsname(mp); - kfree(mp); - out: + xfs_mount_free(mp); return error; out_unmount: @@ -1768,80 +1532,252 @@ xfs_fs_fill_super( goto out_free_sb; } -STATIC void -xfs_fs_put_super( - struct super_block *sb) +static int +xfs_fc_get_tree( + struct fs_context *fc) { - struct xfs_mount *mp = XFS_M(sb); + return get_tree_bdev(fc, xfs_fc_fill_super); +} - /* if ->fill_super failed, we have no mount to tear down */ - if (!sb->s_fs_info) - return; +static int +xfs_remount_rw( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + int error; - xfs_notice(mp, "Unmounting Filesystem"); - xfs_filestream_unmount(mp); - xfs_unmountfs(mp); + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } - xfs_freesb(mp); - free_percpu(mp->m_stats.xs_stats); - xfs_destroy_percpu_counters(mp); - xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_warn(mp, + "ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + return -EINVAL; + } - sb->s_fs_info = NULL; - xfs_free_fsname(mp); - kfree(mp); + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + /* + * If this is the first remount to writeable state we might have some + * superblock changes to update. + */ + if (mp->m_update_sb) { + error = xfs_sync_sb(mp, false); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + return error; + } + mp->m_update_sb = false; + } + + /* + * Fill out the reserve pool if it is empty. Use the stashed value if + * it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + + /* Recover any CoW blocks that never got remapped. 
 */
+	error = xfs_reflink_recover_cow(mp);
+	if (error) {
+		xfs_err(mp,
+	"Error %d recovering leftover CoW allocations.", error);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		return error;
+	}
+	xfs_start_block_reaping(mp);
+
+	/* Create the per-AG metadata reservation pool. */
+	error = xfs_fs_reserve_ag_blocks(mp);
+	if (error && error != -ENOSPC)
+		return error;
+
+	return 0;
 }
 
-STATIC struct dentry *
-xfs_fs_mount(
-	struct file_system_type	*fs_type,
-	int			flags,
-	const char		*dev_name,
-	void			*data)
+static int
+xfs_remount_ro(
+	struct xfs_mount	*mp)
 {
-	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+	int error;
+
+	/*
+	 * Cancel background eofb scanning so it cannot race with the final
+	 * log force+buftarg wait and deadlock the remount.
+	 */
+	xfs_stop_block_reaping(mp);
+
+	/* Get rid of any leftover CoW reservations... */
+	error = xfs_icache_free_cowblocks(mp, NULL);
+	if (error) {
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		return error;
+	}
+
+	/* Free the per-AG metadata reservation pool. */
+	error = xfs_fs_unreserve_ag_blocks(mp);
+	if (error) {
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		return error;
+	}
+
+	/*
+	 * Before we sync the metadata, we need to free up the reserve block
+	 * pool so that the used block count in the superblock on disk is
+	 * correct at the end of the remount. Stash the current reserve pool
+	 * size so that if we get remounted rw, we can return it to the same
+	 * size.
+	 */
+	xfs_save_resvblks(mp);
+
+	xfs_quiesce_attr(mp);
+	mp->m_flags |= XFS_MOUNT_RDONLY;
+
+	return 0;
 }
 
-static long
-xfs_fs_nr_cached_objects(
-	struct super_block	*sb,
-	struct shrink_control	*sc)
+/*
+ * Logically we would return an error here to prevent users from believing
+ * they might have changed mount options using remount which can't be changed.
+ *
+ * But unfortunately mount(8) adds all options from mtab and fstab to the mount
+ * arguments in some cases so we can't blindly reject options, but have to
+ * check for each specified option if it actually differs from the currently
+ * set option and only reject it if that's the case.
+ *
+ * Until that is implemented we return success for every remount request, and
+ * silently ignore all options that we can't actually change.
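+ *
+ * For example, remounting with "noikeep" today returns success but
+ * leaves XFS_MOUNT_IKEEP as it was; only the inode32/inode64 and ro/rw
+ * transitions below take effect.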
+ */ +static int +xfs_fc_reconfigure( + struct fs_context *fc) { - /* Paranoia: catch incorrect calls during mount setup or teardown */ - if (WARN_ON_ONCE(!sb->s_fs_info)) - return 0; - return xfs_reclaim_inodes_count(XFS_M(sb)); + struct xfs_mount *mp = XFS_M(fc->root->d_sb); + struct xfs_mount *new_mp = fc->s_fs_info; + xfs_sb_t *sbp = &mp->m_sb; + int flags = fc->sb_flags; + int error; + + error = xfs_fc_validate_params(new_mp); + if (error) + return error; + + sync_filesystem(mp->m_super); + + /* inode32 -> inode64 */ + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && + !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); + } + + /* inode64 -> inode32 */ + if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS) && + (new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); + } + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(flags & SB_RDONLY)) { + error = xfs_remount_rw(mp); + if (error) + return error; + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (flags & SB_RDONLY)) { + error = xfs_remount_ro(mp); + if (error) + return error; + } + + return 0; } -static long -xfs_fs_free_cached_objects( - struct super_block *sb, - struct shrink_control *sc) +static void xfs_fc_free( + struct fs_context *fc) { - return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); + struct xfs_mount *mp = fc->s_fs_info; + + /* + * mp is stored in the fs_context when it is initialized. + * mp is transferred to the superblock on a successful mount, + * but if an error occurs before the transfer we have to free + * it here. + */ + if (mp) + xfs_mount_free(mp); } -static const struct super_operations xfs_super_operations = { - .alloc_inode = xfs_fs_alloc_inode, - .destroy_inode = xfs_fs_destroy_inode, - .dirty_inode = xfs_fs_dirty_inode, - .drop_inode = xfs_fs_drop_inode, - .put_super = xfs_fs_put_super, - .sync_fs = xfs_fs_sync_fs, - .freeze_fs = xfs_fs_freeze, - .unfreeze_fs = xfs_fs_unfreeze, - .statfs = xfs_fs_statfs, - .remount_fs = xfs_fs_remount, - .show_options = xfs_fs_show_options, - .nr_cached_objects = xfs_fs_nr_cached_objects, - .free_cached_objects = xfs_fs_free_cached_objects, +static const struct fs_context_operations xfs_context_ops = { + .parse_param = xfs_fc_parse_param, + .get_tree = xfs_fc_get_tree, + .reconfigure = xfs_fc_reconfigure, + .free = xfs_fc_free, }; +static int xfs_init_fs_context( + struct fs_context *fc) +{ + struct xfs_mount *mp; + + mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO); + if (!mp) + return -ENOMEM; + + spin_lock_init(&mp->m_sb_lock); + spin_lock_init(&mp->m_agirotor_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + spin_lock_init(&mp->m_perag_lock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); + mp->m_kobj.kobject.kset = xfs_kset; + /* + * We don't create the finobt per-ag space reservation until after log + * recovery, so we must set this to true so that an ifree transaction + * started during log recovery will not depend on space reservations + * for finobt expansion. + */ + mp->m_finobt_nores = true; + + /* + * These can be overridden by the mount option parsing. 
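+ * For example, a "logbufs=8" mount option replaces the -1 placeholder
+ * assigned here.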
+ */
+	mp->m_logbufs = -1;
+	mp->m_logbsize = -1;
+	mp->m_allocsize_log = 16; /* 64k */
+
+	/*
+	 * Copy binary VFS mount flags we are interested in.
+	 */
+	if (fc->sb_flags & SB_RDONLY)
+		mp->m_flags |= XFS_MOUNT_RDONLY;
+	if (fc->sb_flags & SB_DIRSYNC)
+		mp->m_flags |= XFS_MOUNT_DIRSYNC;
+	if (fc->sb_flags & SB_SYNCHRONOUS)
+		mp->m_flags |= XFS_MOUNT_WSYNC;
+
+	fc->s_fs_info = mp;
+	fc->ops = &xfs_context_ops;
+
+	return 0;
+}
+
 static struct file_system_type xfs_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= "xfs",
-	.mount			= xfs_fs_mount,
+	.init_fs_context	= xfs_init_fs_context,
+	.parameters		= xfs_fs_parameters,
 	.kill_sb		= kill_block_super,
 	.fs_flags		= FS_REQUIRES_DEV,
 };
@@ -1850,37 +1786,39 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-	if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
-			offsetof(struct xfs_ioend, io_inline_bio),
-			BIOSET_NEED_BVECS))
-		goto out;
-
-	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
-						"xfs_log_ticket");
+	xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket",
+						sizeof(struct xlog_ticket),
+						0, 0, NULL);
 	if (!xfs_log_ticket_zone)
-		goto out_free_ioend_bioset;
+		goto out;
 
-	xfs_bmap_free_item_zone = kmem_zone_init(
-			sizeof(struct xfs_extent_free_item),
-			"xfs_bmap_free_item");
+	xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item",
+					sizeof(struct xfs_extent_free_item),
+					0, 0, NULL);
 	if (!xfs_bmap_free_item_zone)
 		goto out_destroy_log_ticket_zone;
 
-	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
-					    "xfs_btree_cur");
+	xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur",
+					       sizeof(struct xfs_btree_cur),
+					       0, 0, NULL);
 	if (!xfs_btree_cur_zone)
 		goto out_destroy_bmap_free_item_zone;
 
-	xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
-					   "xfs_da_state");
+	xfs_da_state_zone = kmem_cache_create("xfs_da_state",
+					      sizeof(struct xfs_da_state),
+					      0, 0, NULL);
 	if (!xfs_da_state_zone)
 		goto out_destroy_btree_cur_zone;
 
-	xfs_ifork_zone = kmem_zone_init(sizeof(struct xfs_ifork), "xfs_ifork");
+	xfs_ifork_zone = kmem_cache_create("xfs_ifork",
+					   sizeof(struct xfs_ifork),
+					   0, 0, NULL);
 	if (!xfs_ifork_zone)
 		goto out_destroy_da_state_zone;
 
-	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+	xfs_trans_zone = kmem_cache_create("xfs_trans",
+					   sizeof(struct xfs_trans),
+					   0, 0, NULL);
 	if (!xfs_trans_zone)
 		goto out_destroy_ifork_zone;
 
@@ -1890,111 +1828,121 @@ xfs_init_zones(void)
 	 * The size of the zone allocated buf log item is the maximum
 	 * size possible under XFS. This wastes a little bit of memory,
 	 * but it is much faster.
*/ - xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), - "xfs_buf_item"); + xfs_buf_item_zone = kmem_cache_create("xfs_buf_item", + sizeof(struct xfs_buf_log_item), + 0, 0, NULL); if (!xfs_buf_item_zone) goto out_destroy_trans_zone; - xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + - ((XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), "xfs_efd_item"); + xfs_efd_zone = kmem_cache_create("xfs_efd_item", + (sizeof(struct xfs_efd_log_item) + + (XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(struct xfs_extent)), + 0, 0, NULL); if (!xfs_efd_zone) goto out_destroy_buf_item_zone; - xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + - ((XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), "xfs_efi_item"); + xfs_efi_zone = kmem_cache_create("xfs_efi_item", + (sizeof(struct xfs_efi_log_item) + + (XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(struct xfs_extent)), + 0, 0, NULL); if (!xfs_efi_zone) goto out_destroy_efd_zone; - xfs_inode_zone = - kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | - KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); + xfs_inode_zone = kmem_cache_create("xfs_inode", + sizeof(struct xfs_inode), 0, + (SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD | SLAB_ACCOUNT), + xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; - xfs_ili_zone = - kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", - KM_ZONE_SPREAD, NULL); + xfs_ili_zone = kmem_cache_create("xfs_ili", + sizeof(struct xfs_inode_log_item), 0, + SLAB_MEM_SPREAD, NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; - xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), - "xfs_icr"); + + xfs_icreate_zone = kmem_cache_create("xfs_icr", + sizeof(struct xfs_icreate_item), + 0, 0, NULL); if (!xfs_icreate_zone) goto out_destroy_ili_zone; - xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item), - "xfs_rud_item"); + xfs_rud_zone = kmem_cache_create("xfs_rud_item", + sizeof(struct xfs_rud_log_item), + 0, 0, NULL); if (!xfs_rud_zone) goto out_destroy_icreate_zone; - xfs_rui_zone = kmem_zone_init( + xfs_rui_zone = kmem_cache_create("xfs_rui_item", xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), - "xfs_rui_item"); + 0, 0, NULL); if (!xfs_rui_zone) goto out_destroy_rud_zone; - xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item), - "xfs_cud_item"); + xfs_cud_zone = kmem_cache_create("xfs_cud_item", + sizeof(struct xfs_cud_log_item), + 0, 0, NULL); if (!xfs_cud_zone) goto out_destroy_rui_zone; - xfs_cui_zone = kmem_zone_init( + xfs_cui_zone = kmem_cache_create("xfs_cui_item", xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), - "xfs_cui_item"); + 0, 0, NULL); if (!xfs_cui_zone) goto out_destroy_cud_zone; - xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item), - "xfs_bud_item"); + xfs_bud_zone = kmem_cache_create("xfs_bud_item", + sizeof(struct xfs_bud_log_item), + 0, 0, NULL); if (!xfs_bud_zone) goto out_destroy_cui_zone; - xfs_bui_zone = kmem_zone_init( + xfs_bui_zone = kmem_cache_create("xfs_bui_item", xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), - "xfs_bui_item"); + 0, 0, NULL); if (!xfs_bui_zone) goto out_destroy_bud_zone; return 0; out_destroy_bud_zone: - kmem_zone_destroy(xfs_bud_zone); + kmem_cache_destroy(xfs_bud_zone); out_destroy_cui_zone: - kmem_zone_destroy(xfs_cui_zone); + kmem_cache_destroy(xfs_cui_zone); out_destroy_cud_zone: - kmem_zone_destroy(xfs_cud_zone); + kmem_cache_destroy(xfs_cud_zone); out_destroy_rui_zone: - 
kmem_zone_destroy(xfs_rui_zone); + kmem_cache_destroy(xfs_rui_zone); out_destroy_rud_zone: - kmem_zone_destroy(xfs_rud_zone); + kmem_cache_destroy(xfs_rud_zone); out_destroy_icreate_zone: - kmem_zone_destroy(xfs_icreate_zone); + kmem_cache_destroy(xfs_icreate_zone); out_destroy_ili_zone: - kmem_zone_destroy(xfs_ili_zone); + kmem_cache_destroy(xfs_ili_zone); out_destroy_inode_zone: - kmem_zone_destroy(xfs_inode_zone); + kmem_cache_destroy(xfs_inode_zone); out_destroy_efi_zone: - kmem_zone_destroy(xfs_efi_zone); + kmem_cache_destroy(xfs_efi_zone); out_destroy_efd_zone: - kmem_zone_destroy(xfs_efd_zone); + kmem_cache_destroy(xfs_efd_zone); out_destroy_buf_item_zone: - kmem_zone_destroy(xfs_buf_item_zone); + kmem_cache_destroy(xfs_buf_item_zone); out_destroy_trans_zone: - kmem_zone_destroy(xfs_trans_zone); + kmem_cache_destroy(xfs_trans_zone); out_destroy_ifork_zone: - kmem_zone_destroy(xfs_ifork_zone); + kmem_cache_destroy(xfs_ifork_zone); out_destroy_da_state_zone: - kmem_zone_destroy(xfs_da_state_zone); + kmem_cache_destroy(xfs_da_state_zone); out_destroy_btree_cur_zone: - kmem_zone_destroy(xfs_btree_cur_zone); + kmem_cache_destroy(xfs_btree_cur_zone); out_destroy_bmap_free_item_zone: - kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_cache_destroy(xfs_bmap_free_item_zone); out_destroy_log_ticket_zone: - kmem_zone_destroy(xfs_log_ticket_zone); - out_free_ioend_bioset: - bioset_exit(&xfs_ioend_bioset); + kmem_cache_destroy(xfs_log_ticket_zone); out: return -ENOMEM; } @@ -2007,25 +1955,24 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); - kmem_zone_destroy(xfs_bui_zone); - kmem_zone_destroy(xfs_bud_zone); - kmem_zone_destroy(xfs_cui_zone); - kmem_zone_destroy(xfs_cud_zone); - kmem_zone_destroy(xfs_rui_zone); - kmem_zone_destroy(xfs_rud_zone); - kmem_zone_destroy(xfs_icreate_zone); - kmem_zone_destroy(xfs_ili_zone); - kmem_zone_destroy(xfs_inode_zone); - kmem_zone_destroy(xfs_efi_zone); - kmem_zone_destroy(xfs_efd_zone); - kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_trans_zone); - kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_da_state_zone); - kmem_zone_destroy(xfs_btree_cur_zone); - kmem_zone_destroy(xfs_bmap_free_item_zone); - kmem_zone_destroy(xfs_log_ticket_zone); - bioset_exit(&xfs_ioend_bioset); + kmem_cache_destroy(xfs_bui_zone); + kmem_cache_destroy(xfs_bud_zone); + kmem_cache_destroy(xfs_cui_zone); + kmem_cache_destroy(xfs_cud_zone); + kmem_cache_destroy(xfs_rui_zone); + kmem_cache_destroy(xfs_rud_zone); + kmem_cache_destroy(xfs_icreate_zone); + kmem_cache_destroy(xfs_ili_zone); + kmem_cache_destroy(xfs_inode_zone); + kmem_cache_destroy(xfs_efi_zone); + kmem_cache_destroy(xfs_efd_zone); + kmem_cache_destroy(xfs_buf_item_zone); + kmem_cache_destroy(xfs_trans_zone); + kmem_cache_destroy(xfs_ifork_zone); + kmem_cache_destroy(xfs_da_state_zone); + kmem_cache_destroy(xfs_btree_cur_zone); + kmem_cache_destroy(xfs_bmap_free_item_zone); + kmem_cache_destroy(xfs_log_ticket_zone); } STATIC int __init diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 763e43d22dee..b552cf6d3379 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -11,9 +11,11 @@ #ifdef CONFIG_XFS_QUOTA extern int xfs_qm_init(void); extern void xfs_qm_exit(void); +# define XFS_QUOTA_STRING "quota, " #else # define xfs_qm_init() (0) # define xfs_qm_exit() do { } while (0) +# define XFS_QUOTA_STRING #endif #ifdef CONFIG_XFS_POSIX_ACL @@ -50,6 +52,12 @@ extern void xfs_qm_exit(void); # define XFS_WARN_STRING #endif +#ifdef CONFIG_XFS_ASSERT_FATAL +# define 
XFS_ASSERT_FATAL_STRING "fatal assert, " +#else +# define XFS_ASSERT_FATAL_STRING +#endif + #ifdef DEBUG # define XFS_DBG_STRING "debug" #else @@ -63,6 +71,8 @@ extern void xfs_qm_exit(void); XFS_SCRUB_STRING \ XFS_REPAIR_STRING \ XFS_WARN_STRING \ + XFS_QUOTA_STRING \ + XFS_ASSERT_FATAL_STRING \ XFS_DBG_STRING /* DBG must be last */ struct xfs_inode; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index ed66fd2de327..d762d42ed0ff 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -17,6 +17,7 @@ #include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_quota.h" +#include "xfs_symlink.h" #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_trans.h" @@ -52,20 +53,10 @@ xfs_readlink_bmap_ilocked( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, - &xfs_symlink_buf_ops); - if (!bp) - return -ENOMEM; - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; - goto out; - } + error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &bp, &xfs_symlink_buf_ops); + if (error) + return error; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); if (pathlen < byte_cnt) byte_cnt = pathlen; @@ -289,12 +280,10 @@ xfs_symlink( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - if (!bp) { - error = -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0, &bp); + if (error) goto out_trans_cancel; - } bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); @@ -432,13 +421,12 @@ xfs_inactive_symlink_rmt( * Invalidate the block(s). No validation is done. */ for (i = 0; i < nmaps; i++) { - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); - if (!bp) { - error = -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, + &bp); + if (error) goto error_trans_cancel; - } xfs_trans_binval(tp, bp); } /* diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index 9743d8c9394b..b1fa091427e6 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -5,7 +5,7 @@ #ifndef __XFS_SYMLINK_H #define __XFS_SYMLINK_H 1 -/* Kernel only symlink defintions */ +/* Kernel only symlink definitions */ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ddd0bf7a4740..f1bc88f4367c 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -63,19 +63,6 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; -/* - * xfs_mount kobject. The mp kobject also serves as the per-mount parent object - * that is identified by the fsname under sysfs. 
- */ - -static inline struct xfs_mount * -to_mp(struct kobject *kobject) -{ - struct xfs_kobj *kobj = to_kobj(kobject); - - return container_of(kobj, struct xfs_mount, m_kobj); -} - static struct attribute *xfs_mp_attrs[] = { NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8094b1920eef..e242988f57fb 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -23,6 +23,7 @@ struct xlog; struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; +struct xlog_rec_header; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; @@ -30,6 +31,10 @@ struct xfs_btree_cur; struct xfs_refcount_irec; struct xfs_fsmap; struct xfs_rmap_irec; +struct xfs_icreate_log; +struct xfs_owner_info; +struct xfs_trans_res; +struct xfs_inobt_rec_incore; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -213,8 +218,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(void *, leaf); - __field(int, pos); + __field(void *, leaf) + __field(int, pos) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -720,7 +725,7 @@ TRACE_EVENT(xfs_iomap_prealloc_size, __entry->writeio_blocks = writeio_blocks; ), TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " - "m_writeio_blocks %u", + "m_allocsize_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->blocks, __entry->shift, __entry->writeio_blocks) ) @@ -1153,71 +1158,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); -DECLARE_EVENT_CLASS(xfs_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, - unsigned int len), - TP_ARGS(inode, page, off, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(pgoff_t, pgoff) - __field(loff_t, size) - __field(unsigned long, offset) - __field(unsigned int, length) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = XFS_I(inode)->i_ino; - __entry->pgoff = page_offset(page); - __entry->size = i_size_read(inode); - __entry->offset = off; - __entry->length = len; - ), - TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " - "length %x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->pgoff, - __entry->size, - __entry->offset, - __entry->length) -) - -#define DEFINE_PAGE_EVENT(name) \ -DEFINE_EVENT(xfs_page_class, name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ - unsigned int len), \ - TP_ARGS(inode, page, off, len)) -DEFINE_PAGE_EVENT(xfs_writepage); -DEFINE_PAGE_EVENT(xfs_releasepage); -DEFINE_PAGE_EVENT(xfs_invalidatepage); - -DECLARE_EVENT_CLASS(xfs_readpage_class, - TP_PROTO(struct inode *inode, int nr_pages), - TP_ARGS(inode, nr_pages), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(int, nr_pages) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->nr_pages = nr_pages; - ), - TP_printk("dev %d:%d ino 0x%llx nr_pages %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->nr_pages) -) - -#define DEFINE_READPAGE_EVENT(name) \ -DEFINE_EVENT(xfs_readpage_class, name, \ - TP_PROTO(struct inode *inode, int nr_pages), \ - TP_ARGS(inode, nr_pages)) -DEFINE_READPAGE_EVENT(xfs_vm_readpage); -DEFINE_READPAGE_EVENT(xfs_vm_readpages); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode 
*ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), @@ -1637,8 +1577,11 @@ DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_near_first); -DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); -DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); +DEFINE_ALLOC_EVENT(xfs_alloc_cur); +DEFINE_ALLOC_EVENT(xfs_alloc_cur_right); +DEFINE_ALLOC_EVENT(xfs_alloc_cur_left); +DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup); +DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done); DEFINE_ALLOC_EVENT(xfs_alloc_near_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); @@ -1658,6 +1601,32 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); +TRACE_EVENT(xfs_alloc_cur_check, + TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno, + xfs_extlen_t len, xfs_extlen_t diff, bool new), + TP_ARGS(mp, btnum, bno, len, diff, new), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agblock_t, bno) + __field(xfs_extlen_t, len) + __field(xfs_extlen_t, diff) + __field(bool, new) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->btnum = btnum; + __entry->bno = bno; + __entry->len = len; + __entry->diff = diff; + __entry->new = new; + ), + TP_printk("dev %d:%d btree %s bno 0x%x len 0x%x diff 0x%x new %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->bno, __entry->len, __entry->diff, __entry->new) +) + DECLARE_EVENT_CLASS(xfs_da_class, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), @@ -3575,6 +3544,56 @@ TRACE_EVENT(xfs_pwork_init, __entry->nr_threads, __entry->pid) ) +DECLARE_EVENT_CLASS(xfs_kmem_class, + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), + TP_ARGS(size, flags, caller_ip), + TP_STRUCT__entry( + __field(ssize_t, size) + __field(int, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->size = size; + __entry->flags = flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("size %zd flags 0x%x caller %pS", + __entry->size, + __entry->flags, + (char *)__entry->caller_ip) +) + +#define DEFINE_KMEM_EVENT(name) \ +DEFINE_EVENT(xfs_kmem_class, name, \ + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ + TP_ARGS(size, flags, caller_ip)) +DEFINE_KMEM_EVENT(kmem_alloc); +DEFINE_KMEM_EVENT(kmem_alloc_io); +DEFINE_KMEM_EVENT(kmem_alloc_large); +DEFINE_KMEM_EVENT(kmem_realloc); +DEFINE_KMEM_EVENT(kmem_zone_alloc); + +TRACE_EVENT(xfs_check_new_dalign, + TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), + TP_ARGS(mp, new_dalign, calc_rootino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, new_dalign) + __field(xfs_ino_t, sb_rootino) + __field(xfs_ino_t, calc_rootino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->new_dalign = new_dalign; + __entry->sb_rootino = mp->m_sb.sb_rootino; + __entry->calc_rootino = calc_rootino; + ), + TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_dalign, __entry->sb_rootino, + __entry->calc_rootino) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index d42a68d8313b..3b208f9a865c 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -71,7 +71,7 @@ xfs_trans_free( if 
(!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); - kmem_zone_free(xfs_trans_zone, tp); + kmem_cache_free(xfs_trans_zone, tp); } /* @@ -90,7 +90,7 @@ xfs_trans_dup( trace_xfs_trans_dup(tp, _RET_IP_); - ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + ntp = kmem_zone_zalloc(xfs_trans_zone, 0); /* * Initialize the new transaction structure. @@ -263,7 +263,7 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ - tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + tp = kmem_zone_zalloc(xfs_trans_zone, 0); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 64d7f171ebd3..752c7fef9de7 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -169,21 +169,21 @@ int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); -struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, - struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - uint flags); +int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, + struct xfs_buf **bpp); -static inline struct xfs_buf * +static inline int xfs_trans_get_buf( struct xfs_trans *tp, struct xfs_buftarg *target, xfs_daddr_t blkno, int numblks, - uint flags) + uint flags, + struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_trans_get_buf_map(tp, target, &map, 1, flags); + return xfs_trans_get_buf_map(tp, target, &map, 1, flags, bpp); } int xfs_trans_read_buf_map(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6ccfd75d3c24..00cc5b8734be 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -427,15 +427,15 @@ xfsaild_push( case XFS_ITEM_FLUSHING: /* - * The item or its backing buffer is already beeing + * The item or its backing buffer is already being * flushed. The typical reason for that is that an * inode buffer is locked because we already pushed the * updates to it as part of inode clustering. * * We do not want to to stop flushing just because lots - * of items are already beeing flushed, but we need to + * of items are already being flushed, but we need to * re-try the flushing relatively soon if most of the - * AIL is beeing flushed. + * AIL is being flushed. */ XFS_STATS_INC(mp, xs_push_ail_flushing); trace_xfs_ail_flushing(lip); @@ -612,7 +612,7 @@ xfsaild( * The push is run asynchronously in a workqueue, which means the caller needs * to handle waiting on the async flush for space to become available. * We don't want to interrupt any push that is in progress, hence we only queue - * work if we set the pushing bit approriately. + * work if we set the pushing bit appropriately. * * We do this unlocked - we only need to know whether there is anything in the * AIL at the time we are called. 
We don't need to access the contents of @@ -836,7 +836,7 @@ xfs_trans_ail_init( init_waitqueue_head(&ailp->ail_empty); ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->ail_mount->m_fsname); + ailp->ail_mount->m_super->s_id); if (IS_ERR(ailp->ail_task)) goto out_free_ailp; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index b5b3a78ef31c..08174ffa2118 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -112,19 +112,22 @@ xfs_trans_bjoin( * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ -struct xfs_buf * +int xfs_trans_get_buf_map( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { xfs_buf_t *bp; struct xfs_buf_log_item *bip; + int error; + *bpp = NULL; if (!tp) - return xfs_buf_get_map(target, map, nmaps, flags); + return xfs_buf_get_map(target, map, nmaps, flags, bpp); /* * If we find the buffer in the cache with this transaction @@ -146,19 +149,20 @@ xfs_trans_get_buf_map( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_get_buf_recur(bip); - return bp; + *bpp = bp; + return 0; } - bp = xfs_buf_get_map(target, map, nmaps, flags); - if (bp == NULL) { - return NULL; - } + error = xfs_buf_get_map(target, map, nmaps, flags, &bp); + if (error) + return error; ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_log_item); - return bp; + *bpp = bp; + return 0; } /* @@ -276,7 +280,7 @@ xfs_trans_read_buf_map( ASSERT(bp->b_ops != NULL); error = xfs_buf_reverify(bp, ops); if (error) { - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __return_address); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, @@ -298,36 +302,17 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags, ops); - if (!bp) { - if (!(flags & XBF_TRYLOCK)) - return -ENOMEM; - return tp ? 0 : -EAGAIN; - } - - /* - * If we've had a read error, then the contents of the buffer are - * invalid and should not be used. To ensure that a followup read tries - * to pull the buffer from disk again, we clear the XBF_DONE flag and - * mark the buffer stale. This ensures that anyone who has a current - * reference to the buffer will interpret it's contents correctly and - * future cache lookups will also treat it as an empty, uninitialised - * buffer. 
- */ - if (bp->b_error) { - error = bp->b_error; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_buf_ioerror_alert(bp, __func__); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - + error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops, + __return_address); + switch (error) { + case 0: + break; + default: if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; + /* fall through */ + case -ENOMEM: + case -EAGAIN: return error; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 1027c9ca6eb8..d1b9869bc5fa 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -25,8 +25,8 @@ STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); */ void xfs_trans_dqjoin( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(dqp->q_logitem.qli_dquot == dqp); @@ -49,8 +49,8 @@ xfs_trans_dqjoin( */ void xfs_trans_log_dquot( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -486,12 +486,12 @@ xfs_trans_apply_dquot_deltas( */ void xfs_trans_unreserve_and_mod_dquots( - xfs_trans_t *tp) + struct xfs_trans *tp) { int i, j; - xfs_dquot_t *dqp; + struct xfs_dquot *dqp; struct xfs_dqtrx *qtrx, *qa; - bool locked; + bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; @@ -571,21 +571,21 @@ xfs_quota_warn( */ STATIC int xfs_trans_dqresv( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dquot_t *dqp, - int64_t nblks, - long ninos, - uint flags) + struct xfs_trans *tp, + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int64_t nblks, + long ninos, + uint flags) { - xfs_qcnt_t hardlimit; - xfs_qcnt_t softlimit; - time_t timer; - xfs_qwarncnt_t warns; - xfs_qwarncnt_t warnlimit; - xfs_qcnt_t total_count; - xfs_qcnt_t *resbcountp; - xfs_quotainfo_t *q = mp->m_quotainfo; + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + time64_t timer; + xfs_qwarncnt_t warns; + xfs_qwarncnt_t warnlimit; + xfs_qcnt_t total_count; + xfs_qcnt_t *resbcountp; + struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_def_quota *defq; @@ -635,7 +635,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTLONGWARN); @@ -662,7 +663,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTLONGWARN); @@ -824,13 +826,13 @@ xfs_trans_reserve_quota_nblks( /* * This routine is called to allocate a quotaoff log item. 
*/ -xfs_qoff_logitem_t * +struct xfs_qoff_logitem * xfs_trans_get_qoff_item( - xfs_trans_t *tp, - xfs_qoff_logitem_t *startqoff, + struct xfs_trans *tp, + struct xfs_qoff_logitem *startqoff, uint flags) { - xfs_qoff_logitem_t *q; + struct xfs_qoff_logitem *q; ASSERT(tp != NULL); @@ -852,8 +854,8 @@ xfs_trans_get_qoff_item( */ void xfs_trans_log_quotaoff_item( - xfs_trans_t *tp, - xfs_qoff_logitem_t *qlp) + struct xfs_trans *tp, + struct xfs_qoff_logitem *qlp) { tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); @@ -863,7 +865,7 @@ STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) { - tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); + tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0); } void @@ -872,6 +874,6 @@ xfs_trans_free_dqinfo( { if (!tp->t_dqinfo) return; - kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo); + kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo); tp->t_dqinfo = NULL; } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 3123b5aaad2a..b0fedb543f97 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -11,6 +11,7 @@ #include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_attr.h" +#include "xfs_acl.h" #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -23,6 +24,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, int xflags = handler->flags; struct xfs_inode *ip = XFS_I(inode); int error, asize = size; + size_t namelen = strlen(name); /* Convert Linux syscall to XFS internal ATTR flags */ if (!size) { @@ -30,7 +32,8 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, value = NULL; } - error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); + error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value, + &asize, xflags); if (error) return error; return asize; @@ -66,6 +69,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, int xflags = handler->flags; struct xfs_inode *ip = XFS_I(inode); int error; + size_t namelen = strlen(name); /* Convert Linux syscall to XFS internal ATTR flags */ if (flags & XATTR_CREATE) @@ -73,10 +77,11 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, if (flags & XATTR_REPLACE) xflags |= ATTR_REPLACE; - if (!value) - return xfs_attr_remove(ip, (unsigned char *)name, xflags); - error = xfs_attr_set(ip, (unsigned char *)name, - (void *)value, size, xflags); + if (value) + error = xfs_attr_set(ip, name, namelen, (void *)value, size, + xflags); + else + error = xfs_attr_remove(ip, name, namelen, xflags); if (!error) xfs_forget_acl(inode, name, xflags); |
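
Functionally, the switch from ->mount/mount_bdev to init_fs_context/get_tree means an XFS mount can now be driven through the new mount API syscalls; legacy mount(2) reaches the same code via the VFS compatibility path. A sketch of the userspace side, with a made-up device and mountpoint, error handling omitted, assuming headers recent enough to define SYS_fsopen and the FSCONFIG_*/MOVE_MOUNT_* constants:

	#define _GNU_SOURCE
	#include <fcntl.h>		/* AT_FDCWD */
	#include <linux/mount.h>	/* FSCONFIG_*, MOVE_MOUNT_F_EMPTY_PATH */
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		int fsfd, mntfd;

		/* creates the fs_context; xfs_init_fs_context() runs here */
		fsfd = syscall(SYS_fsopen, "xfs", 0);

		/* each key/value pair below is fed to xfs_fc_parse_param() */
		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "logbufs", "8", 0);
		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "discard", NULL, 0);

		/* "source" is consumed by the VFS, not the filesystem */
		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source",
				"/dev/sdb1", 0);

		/* triggers xfs_fc_get_tree() -> xfs_fc_fill_super() */
		syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

		mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
		syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/mnt",
				MOVE_MOUNT_F_EMPTY_PATH);
		return 0;
	}

Because each option arrives as its own fsconfig(2) call, parse errors can be reported per option rather than for the whole option string, which is what makes the per-parameter xfs_fc_parse_param() shape natural.
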