diff options
Diffstat (limited to 'fs')
332 files changed, 9491 insertions, 5775 deletions
diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 4f64b95d57bd..095c54165dfd 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -27,6 +27,7 @@ kafs-objs := \ vlocation.o \ vnode.o \ volume.o \ - write.o + write.o \ + xattr.o obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 3062cceb5c2a..782d4d05a53b 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -350,7 +350,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_server *server; - struct uuid_v1 *r; + struct afs_uuid *r; unsigned loop; __be32 *b; int ret; @@ -380,7 +380,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) } _debug("unmarshall UUID"); - call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); if (!call->request) return -ENOMEM; @@ -453,7 +453,7 @@ static int afs_deliver_cb_probe(struct afs_call *call) static void SRXAFSCB_ProbeUuid(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, work); - struct uuid_v1 *r = call->request; + struct afs_uuid *r = call->request; struct { __be32 match; @@ -476,7 +476,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) */ static int afs_deliver_cb_probe_uuid(struct afs_call *call) { - struct uuid_v1 *r; + struct afs_uuid *r; unsigned loop; __be32 *b; int ret; @@ -502,15 +502,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) } _debug("unmarshall UUID"); - call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); if (!call->request) return -ENOMEM; b = call->buffer; r = call->request; - r->time_low = b[0]; - r->time_mid = htons(ntohl(b[1])); - r->time_hi_and_version = htons(ntohl(b[2])); + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); r->clock_seq_hi_and_reserved = ntohl(b[3]); 
r->clock_seq_low = ntohl(b[4]); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 949f960337f5..613a77058263 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -61,6 +61,7 @@ const struct inode_operations afs_dir_inode_operations = { .permission = afs_permission, .getattr = afs_getattr, .setattr = afs_setattr, + .listxattr = afs_listxattr, }; const struct dentry_operations afs_fs_dentry_operations = { diff --git a/fs/afs/file.c b/fs/afs/file.c index 0d5b8508869b..510cba15fa56 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -46,6 +46,7 @@ const struct inode_operations afs_file_inode_operations = { .getattr = afs_getattr, .setattr = afs_setattr, .permission = afs_permission, + .listxattr = afs_listxattr, }; const struct address_space_operations afs_fs_aops = { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index aae55dd15108..342316a9e3e0 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -28,6 +28,11 @@ struct afs_iget_data { struct afs_volume *volume; /* volume on which resides */ }; +static const struct inode_operations afs_symlink_inode_operations = { + .get_link = page_get_link, + .listxattr = afs_listxattr, +}; + /* * map the AFS file status to the inode member variables */ @@ -67,7 +72,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_fop = &afs_mntpt_file_operations; } else { inode->i_mode = S_IFLNK | vnode->status.mode; - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &afs_symlink_inode_operations; } inode_nohighmem(inode); break; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 393672997cc2..82e16556afea 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -410,6 +410,15 @@ struct afs_interface { unsigned mtu; /* MTU of interface */ }; +struct afs_uuid { + __be32 time_low; /* low part of timestamp */ + __be16 time_mid; /* mid part of timestamp */ + __be16 time_hi_and_version; /* high part of timestamp and version */ + __u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ + __u8 
clock_seq_low; /* clock seq low */ + __u8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + /*****************************************************************************/ /* * cache.c @@ -544,7 +553,7 @@ extern int afs_drop_inode(struct inode *); * main.c */ extern struct workqueue_struct *afs_wq; -extern struct uuid_v1 afs_uuid; +extern struct afs_uuid afs_uuid; /* * misc.c @@ -722,6 +731,11 @@ extern int afs_writeback_all(struct afs_vnode *); extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); +/* + * xattr.c + */ +extern const struct xattr_handler *afs_xattr_handlers[]; +extern ssize_t afs_listxattr(struct dentry *, char *, size_t); /*****************************************************************************/ /* diff --git a/fs/afs/main.c b/fs/afs/main.c index 51d7d17bca57..9944770849da 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -31,7 +31,7 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -struct uuid_v1 afs_uuid; +struct afs_uuid afs_uuid; struct workqueue_struct *afs_wq; /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index bd3b65cde282..690fea9d84c3 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -35,6 +35,7 @@ const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, .readlink = page_readlink, .getattr = afs_getattr, + .listxattr = afs_listxattr, }; const struct inode_operations afs_autocell_inode_operations = { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5990eb160bd..02781e78ffb6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -341,6 +341,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; size_t offset; + s64 tx_total_len; u32 abort_code; int ret; @@ -364,9 +365,20 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, srx.transport.sin.sin_port = call->port; 
memcpy(&srx.transport.sin.sin_addr, addr, 4); + /* Work out the length we're going to transmit. This is awkward for + * calls such as FS.StoreData where there's an extra injection of data + * after the initial fixed part. + */ + tx_total_len = call->request_size; + if (call->send_pages) { + tx_total_len += call->last_to - call->first_offset; + tx_total_len += (call->last - call->first) * PAGE_SIZE; + } + /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp, + (unsigned long)call, + tx_total_len, gfp, (async ? afs_wake_up_async_call : afs_wake_up_call_waiter)); @@ -738,6 +750,8 @@ void afs_send_empty_reply(struct afs_call *call) _enter(""); + rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0); + msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); @@ -772,6 +786,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) _enter(""); + rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len); + iov[0].iov_base = (void *) buf; iov[0].iov_len = len; msg.msg_name = NULL; diff --git a/fs/afs/security.c b/fs/afs/security.c index ecb86a670180..faca66227ecf 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -327,12 +327,11 @@ int afs_permission(struct inode *inode, int mask) if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; } else if (mask & MAY_READ) { - if (!(access & AFS_ACE_READ)) + if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; } else if (mask & MAY_WRITE) { if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ - AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */ - AFS_ACE_WRITE))) /* chmod */ + AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */ goto permission_denied; } else { BUG(); diff --git a/fs/afs/super.c b/fs/afs/super.c index c79633e5cfd8..67680c2d96cf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -319,6 +319,7 @@ static int afs_fill_super(struct super_block *sb, 
sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; + sb->s_xattr = afs_xattr_handlers; ret = super_setup_bdi(sb); if (ret) return ret; diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c new file mode 100644 index 000000000000..2830e4f48d85 --- /dev/null +++ b/fs/afs/xattr.c @@ -0,0 +1,121 @@ +/* Extended attribute handling for AFS. We use xattrs to get and set metadata + * instead of providing pioctl(). + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/xattr.h> +#include "internal.h" + +static const char afs_xattr_list[] = + "afs.cell\0" + "afs.fid\0" + "afs.volume"; + +/* + * Retrieve a list of the supported xattrs. + */ +ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + if (size == 0) + return sizeof(afs_xattr_list); + if (size < sizeof(afs_xattr_list)) + return -ERANGE; + memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list)); + return sizeof(afs_xattr_list); +} + +/* + * Get the name of the cell on which a file resides. 
+ */ +static int afs_xattr_get_cell(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_cell *cell = vnode->volume->cell; + size_t namelen; + + namelen = strlen(cell->name); + if (size == 0) + return namelen; + if (namelen > size) + return -ERANGE; + memcpy(buffer, cell->name, namelen); /* copy only the name: size is the caller's buffer capacity and may exceed the string, so using it would read past cell->name */ + return namelen; +} + +static const struct xattr_handler afs_xattr_afs_cell_handler = { + .name = "afs.cell", + .get = afs_xattr_get_cell, +}; + +/* + * Get the volume ID, vnode ID and vnode uniquifier of a file as a sequence of + * hex numbers separated by colons. + */ +static int afs_xattr_get_fid(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + char text[8 + 1 + 8 + 1 + 8 + 1]; + size_t len; + + len = sprintf(text, "%x:%x:%x", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + if (size == 0) + return len; + if (len > size) + return -ERANGE; + memcpy(buffer, text, len); + return len; +} + +static const struct xattr_handler afs_xattr_afs_fid_handler = { + .name = "afs.fid", + .get = afs_xattr_get_fid, +}; + +/* + * Get the name of the volume on which a file resides.
+ */ +static int afs_xattr_get_volume(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + const char *volname = vnode->volume->vlocation->vldb.name; + size_t namelen; + + namelen = strlen(volname); + if (size == 0) + return namelen; + if (namelen > size) + return -ERANGE; + memcpy(buffer, volname, namelen); /* copy only the name: size is the caller's buffer capacity and may exceed the string, so using it would read past volname */ + return namelen; +} + +static const struct xattr_handler afs_xattr_afs_volume_handler = { + .name = "afs.volume", + .get = afs_xattr_get_volume, +}; + +const struct xattr_handler *afs_xattr_handlers[] = { + &afs_xattr_afs_cell_handler, + &afs_xattr_afs_fid_handler, + &afs_xattr_afs_volume_handler, + NULL +}; @@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1568,6 +1568,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_pos = iocb->aio_offset; req->common.ki_complete = aio_complete; req->common.ki_flags = iocb_flags(req->common.ki_filp); + req->common.ki_hint = file_write_hint(file); if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1586,6 +1587,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_flags |= IOCB_EVENTFD; } + ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); + if (unlikely(ret)) { + pr_debug("EINVAL: aio_rw_flags\n"); + goto out_put_req; + } + + if ((req->common.ki_flags & IOCB_NOWAIT) && + !(req->common.ki_flags & IOCB_DIRECT)) { + ret = -EOPNOTSUPP; + goto out_put_req; + } + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 519599dddd36..9941dc8342df 100644
--- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -225,6 +225,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio_init(&bio, vecs, nr_pages); bio.bi_bdev = bdev; bio.bi_iter.bi_sector = pos >> 9; + bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; @@ -262,8 +263,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (vecs != inline_vecs) kfree(vecs); - if (unlikely(bio.bi_error)) - return bio.bi_error; + if (unlikely(bio.bi_status)) + ret = blk_status_to_errno(bio.bi_status); + + bio_uninit(&bio); + return ret; } @@ -288,16 +292,18 @@ static void blkdev_bio_end_io(struct bio *bio) bool should_dirty = dio->should_dirty; if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_error && !dio->bio.bi_error) - dio->bio.bi_error = bio->bi_error; + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; } else { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; - ssize_t ret = dio->bio.bi_error; + ssize_t ret; - if (likely(!ret)) { + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret, 0); @@ -334,7 +340,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - int ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -358,12 +364,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) for (;;) { bio->bi_bdev = bdev; bio->bi_iter.bi_sector = pos >> 9; + bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { - bio->bi_error = ret; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } @@ -412,7 +419,8 @@ 
__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); - ret = dio->bio.bi_error; + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; @@ -436,7 +444,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio)); + blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); if (!blkdev_dio_pool) return -ENOMEM; return 0; @@ -624,7 +632,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct block_device *bdev = I_BDEV(bd_inode); int error; - error = filemap_write_and_wait_range(filp->f_mapping, start, end); + error = file_write_and_wait_range(filp, start, end); if (error) return error; @@ -1743,6 +1751,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) return -ENOMEM; filp->f_mapping = bdev->bd_inode->i_mapping; + filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return blkdev_get(bdev, filp->f_mode, filp); } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 247b8dfaf6e5..8d8370ddb6b2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -78,12 +78,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (ret) - return ret; - } - ret = 0; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) @@ -119,6 +113,13 @@ out: int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + int ret; + + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) + return ret; + } return __btrfs_set_acl(NULL, inode, acl, type); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 24865da63d8f..f723c11bb763 100644 --- a/fs/btrfs/backref.c +++ 
b/fs/btrfs/backref.c @@ -16,7 +16,7 @@ * Boston, MA 021110-1307, USA. */ -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/rbtree.h> #include "ctree.h" #include "disk-io.h" @@ -2305,7 +2305,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = vmalloc(alloc_bytes); + data = kvmalloc(alloc_bytes, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); @@ -2339,9 +2339,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, if (IS_ERR(fspath)) return (void *)fspath; - ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + ifp = kmalloc(sizeof(*ifp), GFP_KERNEL); if (!ifp) { - vfree(fspath); + kvfree(fspath); return ERR_PTR(-ENOMEM); } @@ -2356,6 +2356,6 @@ void free_ipath(struct inode_fs_paths *ipath) { if (!ipath) return; - vfree(ipath->fspath); + kvfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b8622e4d1744..d87ac27a5f2b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -310,7 +310,8 @@ struct btrfs_dio_private { * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); + blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, + blk_status_t); }; /* diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ab14c2e635ca..11d37c94ce05 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -94,7 +94,7 @@ #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/string.h> #include "ctree.h" #include "disk-io.h" @@ -1638,12 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, struct bio *bio; unsigned int j; - bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); - if (!bio) { - pr_info("btrfsic: bio_alloc() for %u pages 
failed!\n", - num_pages - i); - return -1; - } + bio = btrfs_io_bio_alloc(num_pages - i); bio->bi_bdev = block_ctx->dev->bdev; bio->bi_iter.bi_sector = dev_bytenr >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -1668,14 +1663,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, dev_bytenr += (j - i) * PAGE_SIZE; i = j; } - for (i = 0; i < num_pages; i++) { + for (i = 0; i < num_pages; i++) block_ctx->datav[i] = kmap(block_ctx->pagev[i]); - if (!block_ctx->datav[i]) { - pr_info("btrfsic: kmap() failed (dev %s)!\n", - block_ctx->dev->name); - return -1; - } - } return block_ctx->len; } @@ -2129,7 +2118,7 @@ static void btrfsic_bio_end_io(struct bio *bp) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bp->bi_error) + if (bp->bi_status) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2143,7 +2132,7 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bp->bi_error, + bp->bi_status, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2822,44 +2811,47 @@ static void __btrfsic_submit_bio(struct bio *bio) dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - unsigned int i; + unsigned int i = 0; u64 dev_bytenr; u64 cur_bytenr; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; int bio_is_patched; char **mapped_datav; + unsigned int segs = bio_segments(bio); dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_vcnt, + bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, 
bio->bi_bdev); - mapped_datav = kmalloc_array(bio->bi_vcnt, + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; - bio_for_each_segment_all(bvec, bio, i) { - BUG_ON(bvec->bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = kmap(bvec.bv_page); + i++; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec->bv_len, bvec->bv_offset); - cur_bytenr += bvec->bv_len; + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, bio->bi_vcnt, + mapped_datav, segs, bio, &bio_is_patched, NULL, bio->bi_opf); - bio_for_each_segment_all(bvec, bio, i) - kunmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) + kunmap(bvec.bv_page); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & @@ -2923,13 +2915,10 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, fs_info->sectorsize, PAGE_SIZE); return -1; } - state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + state = kvzalloc(sizeof(*state), GFP_KERNEL); if (!state) { - state = vzalloc(sizeof(*state)); - if (!state) { - pr_info("btrfs check-integrity: vzalloc() failed!\n"); - return -1; - } + pr_info("btrfs check-integrity: allocation failed!\n"); + return -1; } if (!btrfsic_is_initialized) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 10e6b282d09d..2c0b7b57fcd5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/bit_spinlock.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -42,48 +43,7 @@ #include "extent_io.h" 
#include "extent_map.h" -struct compressed_bio { - /* number of bios pending for this compressed extent */ - refcount_t pending_bios; - - /* the pages with the compressed data on them */ - struct page **compressed_pages; - - /* inode that owns this data */ - struct inode *inode; - - /* starting offset in the inode for our pages */ - u64 start; - - /* number of bytes in the inode we're working on */ - unsigned long len; - - /* number of bytes on disk */ - unsigned long compressed_len; - - /* the compression algorithm for this bio */ - int compress_type; - - /* number of compressed pages in the array */ - unsigned long nr_pages; - - /* IO errors */ - int errors; - int mirror_num; - - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u32 sums; -}; - -static int btrfs_decompress_bio(int type, struct page **pages_in, - u64 disk_start, struct bio *orig_bio, - size_t srclen); +static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, unsigned long disk_size) @@ -94,12 +54,6 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; } -static struct bio *compressed_bio_alloc(struct block_device *bdev, - u64 first_byte, gfp_t gfp_flags) -{ - return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags); -} - static int check_compressed_csum(struct btrfs_inode *inode, struct compressed_bio *cb, u64 disk_start) @@ -155,7 +109,7 @@ static void end_compressed_bio_read(struct bio *bio) unsigned long index; int ret; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -173,11 +127,8 @@ static void end_compressed_bio_read(struct bio *bio) /* ok, we're the last bio for this extent, lets start * the decompression. 
*/ - ret = btrfs_decompress_bio(cb->compress_type, - cb->compressed_pages, - cb->start, - cb->orig_bio, - cb->compressed_len); + ret = btrfs_decompress_bio(cb); + csum_failed: if (ret) cb->errors = 1; @@ -268,7 +219,7 @@ static void end_compressed_bio_write(struct bio *bio) struct page *page; unsigned long index; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -287,7 +238,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_error ? 0 : 1); + bio->bi_status ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -320,7 +271,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. */ -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -335,13 +286,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; struct block_device *bdev; - int ret; + blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) - return -ENOMEM; + return BLK_STS_RESOURCE; refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -355,11 +306,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - if (!bio) { - kfree(cb); - return -ENOMEM; - } + bio = btrfs_bio_alloc(bdev, first_byte); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -368,17 +315,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, /* 
create and submit bios for the compressed pages */ bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + int submit = 0; + page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + submit = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(bio); @@ -400,14 +347,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } bio_put(bio); - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - BUG_ON(!bio); + bio = btrfs_bio_alloc(bdev, first_byte); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -434,7 +380,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -569,7 +515,7 @@ next: * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -586,7 +532,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret = -ENOMEM; + blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; u32 *sums; @@ -600,7 +546,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) - return -EIO; + return BLK_STS_IOERR; 
compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -638,7 +584,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, __GFP_HIGHMEM); if (!cb->compressed_pages[pg_index]) { faili = pg_index - 1; - ret = -ENOMEM; + ret = BLK_STS_RESOURCE; goto fail2; } } @@ -650,28 +596,26 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); - if (!comp_bio) - goto fail2; + comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { + int submit = 0; + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + submit = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(comp_bio); @@ -697,15 +641,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } bio_put(comp_bio); - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, - GFP_NOFS); - BUG_ON(!comp_bio); + comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -726,7 +668,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if 
(ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } @@ -801,6 +743,7 @@ static struct list_head *find_workspace(int type) struct list_head *workspace; int cpus = num_online_cpus(); int idx = type - 1; + unsigned nofs_flag; struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; @@ -830,7 +773,15 @@ again: atomic_inc(total_ws); spin_unlock(ws_lock); + /* + * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have + * to turn it off here because we might get called from the restricted + * context of btrfs_compress_bio/btrfs_compress_pages + */ + nofs_flag = memalloc_nofs_save(); workspace = btrfs_compress_op[idx]->alloc_workspace(); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(workspace)) { atomic_dec(total_ws); wake_up(ws_wait); @@ -961,19 +912,16 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * be contiguous. They all correspond to the range of bytes covered by * the compressed extent. 
*/ -static int btrfs_decompress_bio(int type, struct page **pages_in, - u64 disk_start, struct bio *orig_bio, - size_t srclen) +static int btrfs_decompress_bio(struct compressed_bio *cb) { struct list_head *workspace; int ret; + int type = cb->compress_type; workspace = find_workspace(type); - - ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in, - disk_start, orig_bio, - srclen); + ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb); free_workspace(type, workspace); + return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 39ec43ab8df1..87f6d3332163 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,6 +34,45 @@ /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) +struct compressed_bio { + /* number of bios pending for this compressed extent */ + refcount_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* the compression algorithm for this bio */ + int compress_type; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + void btrfs_init_compress(void); void btrfs_exit_compress(void); @@ -48,12 +87,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 
start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages); -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); enum btrfs_compression_type { @@ -78,10 +117,7 @@ struct btrfs_compress_op { unsigned long *total_out); int (*decompress_bio)(struct list_head *workspace, - struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen); + struct compressed_bio *cb); int (*decompress)(struct list_head *workspace, unsigned char *data_in, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a3a75f1de002..3f4daa9d6e2c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,7 +19,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/rbtree.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -3667,14 +3667,14 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, /* make room in the right data area */ data_end = leaf_data_end(fs_info, right); memmove_extent_buffer(right, - btrfs_leaf_data(right) + data_end - push_space, - btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_OFFSET + data_end - push_space, + BTRFS_LEAF_DATA_OFFSET + data_end, BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, btrfs_leaf_data(right) + + copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - btrfs_leaf_data(left) + leaf_data_end(fs_info, left), + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), @@ -3888,9 +3888,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_offset_nr(right, push_items - 1); - 
copy_extent_buffer(left, right, btrfs_leaf_data(left) + + copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left) - push_space, - btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset_nr(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); @@ -3917,9 +3917,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - leaf_data_end(fs_info, right); - memmove_extent_buffer(right, btrfs_leaf_data(right) + + memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, right), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(0), @@ -4069,8 +4069,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems * sizeof(struct btrfs_item)); copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, btrfs_leaf_data(l) + + BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - + data_copy_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); @@ -4607,8 +4607,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + data_end, old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; @@ -4634,8 +4634,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, } } - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + size_diff, 
BTRFS_LEAF_DATA_OFFSET + data_end, old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); @@ -4707,8 +4707,8 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - data_size, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end - data_size, BTRFS_LEAF_DATA_OFFSET + data_end, old_data - data_end); data_end = old_data; @@ -4790,8 +4790,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end - total_data, BTRFS_LEAF_DATA_OFFSET + data_end, old_data - data_end); data_end = old_data; } @@ -4983,9 +4983,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { int data_end = leaf_data_end(fs_info, leaf); - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, - btrfs_leaf_data(leaf) + data_end, + BTRFS_LEAF_DATA_OFFSET + data_end, last_off - data_end); for (i = slot + nr; i < nritems; i++) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4f8f75d9e839..3f3eb7b17cac 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,7 +48,6 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; @@ -716,6 +715,10 @@ struct btrfs_delayed_root; #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 +#define 
BTRFS_FS_QUOTA_OVERRIDE 14 +/* Used to record internally whether fs has been frozen */ +#define BTRFS_FS_FROZEN 15 + /* * Indicate that a whole-filesystem exclusive operation is running * (device replace, resize, device add/delete, balance) @@ -748,8 +751,7 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; /* keep track of unallocated space */ - spinlock_t free_chunk_lock; - u64 free_chunk_space; + atomic64_t free_chunk_space; struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; @@ -797,17 +799,7 @@ struct btrfs_fs_info { * so it is also safe. */ u64 max_inline; - /* - * Protected by ->chunk_mutex and sb->s_umount. - * - * The reason that we use two lock to protect it is because only - * remount and mount operations can change it and these two operations - * are under sb->s_umount, but the read side (chunk allocation) can not - * acquire sb->s_umount or the deadlock would happen. So we use two - * locks to protect it. On the write side, we must acquire two locks, - * and on the read side, we just need acquire one of them. 
- */ - u64 alloc_start; + struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; @@ -1107,9 +1099,6 @@ struct btrfs_fs_info { */ struct list_head pinned_chunks; - /* Used to record internally whether fs has been frozen */ - int fs_frozen; - /* Cached block sizes */ u32 nodesize; u32 sectorsize; @@ -1277,21 +1266,20 @@ struct btrfs_root { /* For qgroup metadata space reserve */ atomic64_t qgroup_meta_rsv; }; + static inline u32 btrfs_inode_sectorsize(const struct inode *inode) { return btrfs_sb(inode->i_sb)->sectorsize; } -static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) -{ - return blocksize - sizeof(struct btrfs_header); -} - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return __BTRFS_LEAF_DATA_SIZE(info->nodesize); + + return info->nodesize - sizeof(struct btrfs_header); } +#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) + static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1553,8 +1541,27 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ s->member = cpu_to_le##bits(val); \ } + +static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + BUILD_BUG_ON(sizeof(u64) != + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + BUILD_BUG_ON(sizeof(u64) != + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + + BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); 
BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); @@ -2324,10 +2331,6 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s) return btrfs_csum_sizes[t]; } -static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) -{ - return offsetof(struct btrfs_leaf, items); -} /* * The leaf data grows from end-to-front in the node. @@ -2538,11 +2541,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(btrfs_leaf_data(leaf) + \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(btrfs_leaf_data(leaf) + \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) @@ -2680,7 +2683,9 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2703,9 +2708,13 @@ enum btrfs_flush_state { COMMIT_TRANS = 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); 
+int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2722,8 +2731,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); @@ -3031,12 +3040,14 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, const char *name, u16 name_len, int mod); int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, + struct extent_buffer *leaf, int slot, struct btrfs_dir_item *dir_item); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); +bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, + unsigned long start, u16 name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3078,8 +3089,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle 
*trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3094,7 +3105,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); @@ -3171,6 +3182,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); +void btrfs_set_range_writeback(void *private_data, u64 start, u64 end); int btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index be70d90dfee5..93ffa898df6d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -470,7 +470,8 @@ add_tail: static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) + struct btrfs_delayed_ref_node *update, + int *old_ref_mod_ret) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; @@ -523,6 +524,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * 
currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing_ref->total_ref_mod; + if (old_ref_mod_ret) + *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing_ref->total_ref_mod += update->ref_mod; @@ -550,7 +553,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int *qrecord_inserted_ret) + int action, int is_data, int *qrecord_inserted_ret, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -638,7 +642,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref, + old_ref_mod); /* * we've updated the existing ref, free the newly * allocated ref @@ -646,6 +651,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (old_ref_mod) + *old_ref_mod = 0; if (is_data && count_mod < 0) delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; @@ -655,6 +662,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; + if (new_ref_mod) + *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -778,7 +787,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ 
-813,7 +823,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, 0, 0, action, 0, - &qrecord_inserted); + &qrecord_inserted, old_ref_mod, + new_ref_mod); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); @@ -838,7 +849,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action) + u64 owner, u64 offset, u64 reserved, int action, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -878,7 +890,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, ref_root, reserved, - action, 1, &qrecord_inserted); + action, 1, &qrecord_inserted, + old_ref_mod, new_ref_mod); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, @@ -909,7 +922,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, NULL); + extent_op->is_data, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c0264ff01b53..ce88e4ac5276 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -247,12 +247,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_data_ref(struct 
btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action); + u64 owner, u64 offset, u64 reserved, int action, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5fe1ca8abc70..bee3edeea7a3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -388,7 +388,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c24d615e3d7f..41cb9196eaa8 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -395,8 +395,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, dir_item)) - return NULL; total_len = btrfs_item_size_nr(leaf, path->slots[0]); while (cur < total_len) { @@ -405,6 +403,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); + if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item)) + return NULL; if (btrfs_dir_name_len(leaf, 
dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; @@ -453,9 +453,11 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, int verify_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, + int slot, struct btrfs_dir_item *dir_item) { u16 namelen = BTRFS_NAME_LEN; + int ret; u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { @@ -472,6 +474,12 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, return 1; } + namelen = btrfs_dir_name_len(leaf, dir_item); + ret = btrfs_is_name_len_valid(leaf, slot, + (unsigned long)(dir_item + 1), namelen); + if (!ret) + return 1; + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ if ((btrfs_dir_data_len(leaf, dir_item) + btrfs_dir_name_len(leaf, dir_item)) > @@ -484,3 +492,67 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, return 0; } + +bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, + unsigned long start, u16 name_len) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_key key; + u32 read_start; + u32 read_end; + u32 item_start; + u32 item_end; + u32 size; + bool ret = true; + + ASSERT(start > BTRFS_LEAF_DATA_OFFSET); + + read_start = start - BTRFS_LEAF_DATA_OFFSET; + read_end = read_start + name_len; + item_start = btrfs_item_offset_nr(leaf, slot); + item_end = btrfs_item_end_nr(leaf, slot); + + btrfs_item_key_to_cpu(leaf, &key, slot); + + switch (key.type) { + case BTRFS_DIR_ITEM_KEY: + case BTRFS_XATTR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + size = sizeof(struct btrfs_dir_item); + break; + case BTRFS_INODE_REF_KEY: + size = sizeof(struct btrfs_inode_ref); + break; + case BTRFS_INODE_EXTREF_KEY: + size = sizeof(struct btrfs_inode_extref); + break; + case BTRFS_ROOT_REF_KEY: + case BTRFS_ROOT_BACKREF_KEY: + size = sizeof(struct btrfs_root_ref); + break; + default: + ret = false; + goto out; + } + + if (read_start < item_start) { + ret = false; + goto out; + } + if (read_end > item_end) 
{ + ret = false; + goto out; + } + + /* there shall be item(s) before name */ + if (read_start - item_start < size) { + ret = false; + goto out; + } + +out: + if (!ret) + btrfs_crit(fs_info, "invalid dir item name len: %u", + (unsigned int)name_len); + return ret; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5f678dcb20e6..086dcbadce09 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -87,9 +87,8 @@ struct btrfs_end_io_wq { bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; - int error; + blk_status_t status; enum btrfs_wq_endio_type metadata; - struct list_head list; struct btrfs_work work; }; @@ -118,9 +117,9 @@ void btrfs_end_io_wq_exit(void) * just before they are sent down the IO stack. */ struct async_submit_bio { - struct inode *inode; + void *private_data; + struct btrfs_fs_info *fs_info; struct bio *bio; - struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; int mirror_num; @@ -131,7 +130,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; - int error; + blk_status_t status; }; /* @@ -799,7 +798,7 @@ static void end_workqueue_bio(struct bio *bio) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = bio->bi_error; + end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -836,19 +835,19 @@ static void end_workqueue_bio(struct bio *bio) btrfs_queue_work(wq, &end_io_wq->work); } -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { struct btrfs_end_io_wq *end_io_wq; end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) - return -ENOMEM; + return BLK_STS_RESOURCE; end_io_wq->private = bio->bi_private; end_io_wq->end_io = bio->bi_end_io; end_io_wq->info = info; - end_io_wq->error = 0; + 
end_io_wq->status = 0; end_io_wq->bio = bio; end_io_wq->metadata = metadata; @@ -868,14 +867,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; - int ret; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, + ret = async->submit_bio_start(async->private_data, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) - async->error = ret; + async->status = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -885,7 +884,7 @@ static void run_one_async_done(struct btrfs_work *work) int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = BTRFS_I(async->inode)->root->fs_info; + fs_info = async->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -898,13 +897,13 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ - if (async->error) { - async->bio->bi_error = async->error; + if (async->status) { + async->bio->bi_status = async->status; bio_endio(async->bio); return; } - async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->submit_bio_done(async->private_data, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); } @@ -916,20 +915,20 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, void *private_data, + extent_submit_bio_hook_t 
*submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return -ENOMEM; + return BLK_STS_RESOURCE; - async->inode = inode; + async->private_data = private_data; + async->fs_info = fs_info; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; - async->error = 0; + async->status = 0; atomic_inc(&fs_info->nr_async_submits); @@ -959,7 +958,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return 0; } -static int btree_csum_one_bio(struct bio *bio) +static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; @@ -972,12 +971,12 @@ static int btree_csum_one_bio(struct bio *bio) break; } - return ret; + return errno_to_blk_status(ret); } -static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -986,11 +985,12 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret; + struct inode *inode = private_data; + blk_status_t ret; /* * when we're called for a write, we're already in the async @@ -998,7 +998,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, */ ret 
= btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1015,13 +1015,14 @@ static int check_async_write(unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(bio_flags); - int ret; + blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { /* @@ -1043,8 +1044,8 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ - ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0, - bio_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, + bio_offset, private_data, __btree_submit_bio_start, __btree_submit_bio_done); } @@ -1054,7 +1055,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, return 0; out_w_error: - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); return ret; } @@ -1222,10 +1223,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf) buf->start + buf->len - 1); } -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); + filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -1255,9 +1256,9 @@ void clean_tree_block(struct btrfs_fs_info *fs_info, btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, 
&buf->bflags)) { - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - -buf->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -1347,8 +1348,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->log_transid_committed = -1; root->last_log_commit = 0; if (!dummy) - extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&root->dirty_log_pages, NULL); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -1820,7 +1820,7 @@ static void end_workqueue_fn(struct btrfs_work *work) end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - bio->bi_error = end_io_wq->error; + bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); @@ -2309,7 +2309,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) inode->i_mapping->a_ops = &btree_aops; RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode); BTRFS_I(inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); @@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); @@ -2662,12 +2661,11 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); 
atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); - fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->free_chunk_space = 0; + atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ @@ -2704,10 +2702,8 @@ int open_ctree(struct super_block *sb, fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(&fs_info->freed_extents[0], - fs_info->btree_inode->i_mapping); - extent_io_tree_init(&fs_info->freed_extents[1], - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&fs_info->freed_extents[0], NULL); + extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_BARRIER, &fs_info->flags); @@ -3485,65 +3481,61 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio) { - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); + complete(bio->bi_private); } /* - * trigger flushes for one the devices. If you pass wait == 0, the flushes are - * sent down. With wait == 1, it waits for the previous flush. - * - * any device where the flush fails with eopnotsupp are flagged as not-barrier - * capable + * Submit a flush request to the device if it supports it. Error handling is + * done in the waiting counterpart. 
*/ -static int write_dev_flush(struct btrfs_device *device, int wait) +static void write_dev_flush(struct btrfs_device *device) { struct request_queue *q = bdev_get_queue(device->bdev); - struct bio *bio; - int ret = 0; + struct bio *bio = device->flush_bio; if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) - return 0; + return; - if (wait) { - bio = device->flush_bio; - if (!bio) - return 0; + bio_reset(bio); + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; - wait_for_completion(&device->flush_wait); + submit_bio(bio); + device->flush_bio_sent = 1; +} - if (bio->bi_error) { - ret = bio->bi_error; - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_FLUSH_ERRS); - } +/* + * If the flush bio has been submitted by write_dev_flush, wait for it. + */ +static blk_status_t wait_dev_flush(struct btrfs_device *device) +{ + struct bio *bio = device->flush_bio; - /* drop the reference from the wait == 0 run */ - bio_put(bio); - device->flush_bio = NULL; + if (!device->flush_bio_sent) + return 0; - return ret; - } + device->flush_bio_sent = 0; + wait_for_completion_io(&device->flush_wait); - /* - * one reference for us, and we leave it for the - * caller - */ - device->flush_bio = NULL; - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - return -ENOMEM; + return bio->bi_status; +} - bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - init_completion(&device->flush_wait); - bio->bi_private = &device->flush_wait; - device->flush_bio = bio; +static int check_barrier_error(struct btrfs_fs_devices *fsdevs) +{ + int dev_flush_error = 0; + struct btrfs_device *dev; + + list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) { + if (!dev->bdev || dev->last_flush_error) + dev_flush_error++; + } - bio_get(bio); - btrfsic_submit_bio(bio); + if 
(dev_flush_error > + fsdevs->fs_info->num_tolerated_disk_barrier_failures) + return -EIO; return 0; } @@ -3556,25 +3548,21 @@ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; - int errors_send = 0; int errors_wait = 0; - int ret; + blk_status_t ret; /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { if (dev->missing) continue; - if (!dev->bdev) { - errors_send++; + if (!dev->bdev) continue; - } if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_flush(dev, 0); - if (ret) - errors_send++; + write_dev_flush(dev); + dev->last_flush_error = 0; } /* wait for all the barriers */ @@ -3588,13 +3576,23 @@ static int barrier_all_devices(struct btrfs_fs_info *info) if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_flush(dev, 1); - if (ret) + ret = wait_dev_flush(dev); + if (ret) { + dev->last_flush_error = ret; + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); errors_wait++; + } + } + + if (errors_wait) { + /* + * At some point we need the status of all disks + * to arrive at the volume status. So error checking + * is being pushed to a separate loop. 
+ */ + return check_barrier_error(info->fs_devices); } - if (errors_send > info->num_tolerated_disk_barrier_failures || - errors_wait > info->num_tolerated_disk_barrier_failures) - return -EIO; return 0; } @@ -4049,9 +4047,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->start, transid, fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - buf->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + buf->len, + fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { btrfs_print_leaf(fs_info, buf); @@ -4578,11 +4576,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - - /* - memset(cur_trans, 0, sizeof(*cur_trans)); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); - */ } static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) @@ -4638,6 +4631,12 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } +static struct btrfs_fs_info *btree_fs_info(void *private_data) +{ + struct inode *inode = private_data; + return btrfs_sb(inode->i_sb); +} + static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, @@ -4645,6 +4644,8 @@ static const struct extent_io_ops btree_extent_io_ops = { /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_failed_hook = btree_io_failed_hook, + .set_range_writeback = btrfs_set_range_writeback, + .tree_fs_info = btree_fs_info, /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 21f1ceb85b76..0a634d3ffc16 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -118,16 +118,16 @@ int btrfs_buffer_uptodate(struct 
extent_buffer *buf, u64 parent_transid, int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset, +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, void *private_data, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 87144c9f9593..fa66980726c9 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -282,6 +282,11 @@ static int btrfs_get_name(struct dentry *parent, char *name, name_len = btrfs_inode_ref_name_len(leaf, iref); } + ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len); + if (!ret) { + btrfs_free_path(path); + return -EIO; + } read_extent_buffer(leaf, name, name_ptr, name_len); btrfs_free_path(path); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 33d979e9ea2a..375f8c728d91 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -97,10 +97,11 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int delalloc); static int 
block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush); + enum btrfs_reserve_flush_enum flush, + bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); @@ -766,6 +767,26 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, return NULL; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + /* * after adding space to the filesystem, we need to clear the full flags * on all the space infos. 
@@ -2092,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && @@ -2099,15 +2121,21 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, parent, root_objectid, - owner, offset, 0, - BTRFS_ADD_DELAYED_REF); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_ADD_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) + add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid); + return ret; } @@ -2411,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, head = btrfs_delayed_node_to_head(node); trace_run_delayed_ref_head(fs_info, node, head, node->action); + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, node->bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -node->num_bytes); + btrfs_put_block_group(cache); + } + if (insert_reserved) { btrfs_pin_extent(fs_info, node->bytenr, node->num_bytes, 1); @@ -3364,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3483,7 +3522,7 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = 
btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); if (ret) goto out_put; @@ -3514,6 +3553,7 @@ out: block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -3924,88 +3964,83 @@ static const char *alloc_name(u64 flags) }; } -static int update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, - struct btrfs_space_info **space_info) +static int create_space_info(struct btrfs_fs_info *info, u64 flags, + struct btrfs_space_info **new) { - struct btrfs_space_info *found; + + struct btrfs_space_info *space_info; int i; - int factor; int ret; - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - - found = __find_space_info(info, flags); - if (found) { - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; - return 0; - } - found = kzalloc(sizeof(*found), GFP_NOFS); - if (!found) + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); if (ret) { - kfree(found); + kfree(space_info); return ret; } for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&found->block_groups[i]); - init_rwsem(&found->groups_sem); - spin_lock_init(&found->lock); - found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - found->total_bytes = total_bytes; - found->disk_total = total_bytes * 
factor; - found->bytes_used = bytes_used; - found->disk_used = bytes_used * factor; - found->bytes_pinned = 0; - found->bytes_reserved = 0; - found->bytes_readonly = bytes_readonly; - found->bytes_may_use = 0; - found->full = 0; - found->max_extent_size = 0; - found->force_alloc = CHUNK_ALLOC_NO_FORCE; - found->chunk_alloc = 0; - found->flush = 0; - init_waitqueue_head(&found->wait); - INIT_LIST_HEAD(&found->ro_bgs); - INIT_LIST_HEAD(&found->tickets); - INIT_LIST_HEAD(&found->priority_tickets); - - ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + init_waitqueue_head(&space_info->wait); + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, info->space_info_kobj, "%s", - alloc_name(found->flags)); + alloc_name(space_info->flags)); if (ret) { - percpu_counter_destroy(&found->total_bytes_pinned); - kfree(found); + percpu_counter_destroy(&space_info->total_bytes_pinned); + kfree(space_info); return ret; } - *space_info = found; - list_add_rcu(&found->list, &info->space_info); + *new = space_info; + list_add_rcu(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = found; + info->data_sinfo = space_info; return ret; } +static void update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + found = __find_space_info(info, flags); + ASSERT(found); + spin_lock(&found->lock); + 
found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); + spin_unlock(&found->lock); + *space_info = found; +} + static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & @@ -4121,7 +4156,7 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) return btrfs_reduce_alloc_profile(fs_info, flags); } -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; @@ -4138,6 +4173,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { @@ -4187,7 +4237,7 @@ again: data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); + alloc_target = btrfs_data_alloc_profile(fs_info); /* * It is ugly that we don't call nolock join * transaction for the free space inode case here. 
@@ -4238,7 +4288,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1, 0, + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } @@ -4278,12 +4328,8 @@ commit_trans: return ret; } -/* - * New check_data_free_space() with ability for precious data reservation - * Will replace old btrfs_check_data_free_space(), but for patch split, - * add a new function first and then replace it. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4298,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, start, len); - if (ret) + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -4341,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. 
*/ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4351,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); } static void force_metadata_allocation(struct btrfs_fs_info *info) @@ -4463,9 +4512,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } if (left < thresh) { - u64 flags; + u64 flags = btrfs_system_alloc_profile(fs_info); - flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); /* * Ignore failure to create system chunk. We might end up not * needing it, as we might not need to COW all nodes/leafs from @@ -4506,10 +4554,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(fs_info, flags); if (!space_info) { - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); - BUG_ON(ret); /* -ENOMEM */ + ret = create_space_info(fs_info, flags, &space_info); + if (ret) + return ret; } - BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); @@ -4614,11 +4662,11 @@ out: return ret; } -static int can_overcommit(struct btrfs_root *root, +static int can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 profile; u64 space_size; @@ -4629,7 +4677,11 @@ static int can_overcommit(struct btrfs_root *root, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - profile = 
btrfs_get_alloc_profile(root, 0); + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + used = btrfs_space_info_used(space_info, false); /* @@ -4646,9 +4698,7 @@ static int can_overcommit(struct btrfs_root *root, used += space_info->bytes_may_use; - spin_lock(&fs_info->free_chunk_lock); - avail = fs_info->free_chunk_space; - spin_unlock(&fs_info->free_chunk_lock); + avail = atomic64_read(&fs_info->free_chunk_space); /* * If we have dup, raid1 or raid10 then only half of the free @@ -4698,14 +4748,14 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, } } -static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { u64 bytes; - int nr; + u64 nr; bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = (int)div64_u64(to_reclaim, bytes); + nr = div64_u64(to_reclaim, bytes); if (!nr) nr = 1; return nr; @@ -4716,24 +4766,23 @@ static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 max_reclaim; + u64 items; long time_left; unsigned long nr_pages; int loops; - int items; enum btrfs_reserve_flush_enum flush; /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &fs_info->delalloc_block_rsv; @@ 
-4776,7 +4825,7 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, flush)) { + if (can_overcommit(fs_info, space_info, orig, flush, false)) { spin_unlock(&space_info->lock); break; } @@ -4838,7 +4887,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) >= 0) { + bytes - delayed_rsv->size) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } @@ -4886,7 +4935,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes * 2, orig_bytes, + shrink_delalloc(fs_info, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4896,7 +4945,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; } ret = do_chunk_alloc(trans, fs_info, - btrfs_get_alloc_profile(root, 0), + btrfs_metadata_alloc_profile(fs_info), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) @@ -4917,8 +4966,9 @@ static int flush_space(struct btrfs_fs_info *fs_info, } static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, - struct btrfs_space_info *space_info) +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) { struct reserve_ticket *ticket; u64 used; @@ -4933,14 +4983,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) return 0; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if 
(can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) + used = btrfs_space_info_used(space_info, true); + + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -4954,17 +5004,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_root *root, u64 used) +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 thresh = div_factor_fine(space_info->total_bytes, 98); /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -5001,8 +5052,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); @@ -5024,8 +5075,9 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); if (last_tickets_id == 
space_info->tickets_id) { @@ -5063,8 +5115,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state = FLUSH_DELAYED_ITEMS_NR; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->extent_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { spin_unlock(&space_info->lock); return; @@ -5143,12 +5195,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -5170,7 +5222,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else if (can_overcommit(root, space_info, orig_bytes, flush)) { + } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); @@ -5197,7 +5250,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, orig_bytes, flush, "enospc"); queue_work(system_unbound_wq, - &root->fs_info->async_reclaim_work); + &fs_info->async_reclaim_work); } } else { list_add_tail(&ticket.list, @@ -5211,7 +5264,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, * the async reclaim as we will panic. 
*/ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(space_info, root, used) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && !work_busy(&fs_info->async_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); @@ -5269,9 +5323,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; + bool system_chunk = (root == fs_info->chunk_root); - ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, - flush); + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -5380,9 +5435,7 @@ static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, * overcommit, and if we can't then we just need to free up our space * and not satisfy any requests. */ - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = btrfs_space_info_used(space_info, true); if (used - num_bytes >= space_info->total_bytes) check_overcommit = true; again: @@ -5394,8 +5447,7 @@ again: * adding the ticket space would be a double count. */ if (check_overcommit && - !can_overcommit(fs_info->extent_root, space_info, 0, - flush)) + !can_overcommit(fs_info, space_info, 0, flush, false)) break; if (num_bytes >= ticket->bytes) { list_del_init(&ticket->list); @@ -6124,6 +6176,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * @inode: inode we're writing to * @start: start range we are writing to * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. 
* * This will do the following things * @@ -6141,16 +6195,17 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; } @@ -6169,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, reserved, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -6248,6 +6304,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + num_bytes); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6324,6 +6382,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes); 
set_extent_dirty(fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -6794,27 +6853,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, - u64 owner, u64 root_objectid) -{ - struct btrfs_space_info *space_info; - u64 flags; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; - } - - space_info = __find_space_info(fs_info, flags); - BUG_ON(!space_info); /* Logic bug */ - percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); -} - - static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info, struct btrfs_delayed_ref_node *node, u64 parent, @@ -7037,8 +7075,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } - add_pinned_bytes(info, -num_bytes, owner_objectid, - root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -7170,19 +7206,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - buf->start, buf->len, - parent, + int old_ref_mod, new_ref_mod; + + ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, + buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + BTRFS_DROP_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); BUG_ON(ret); /* -ENOMEM */ + pin = old_ref_mod >= 0 && new_ref_mod < 0; } - if (!last_ref) - return; - - if (btrfs_header_generation(buf) == trans->transid) { + if (last_ref && btrfs_header_generation(buf) == trans->transid) { struct btrfs_block_group_cache *cache; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -7191,6 +7227,7 @@ void 
btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -7206,18 +7243,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); - pin = 0; } out: if (pin) add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), root->root_key.objectid); - /* - * Deleting the buffer, clear the corrupt flag since it doesn't matter - * anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + if (last_ref) { + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + } } /* Can return -ENOMEM */ @@ -7226,12 +7264,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); /* * tree log blocks never actually go into the extent allocation @@ -7241,19 +7279,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); + old_ref_mod = new_ref_mod = 0; ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, - offset, 0, - BTRFS_DROP_DELAYED_REF); + num_bytes, parent, + root_objectid, owner, offset, + 
0, BTRFS_DROP_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); + return ret; } @@ -7956,7 +8000,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 flags; int ret; - flags = btrfs_get_alloc_profile(root, is_data); + flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, @@ -8200,9 +8244,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, - root_objectid, owner, offset, - ram_bytes, BTRFS_ADD_DELAYED_EXTENT); + ins->offset, 0, root_objectid, owner, + offset, ram_bytes, + BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; } @@ -8422,11 +8466,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - ins.objectid, ins.offset, - parent, root_objectid, level, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, + ins.offset, parent, + root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, NULL, NULL); if (ret) goto out_free_delayed; } @@ -10059,19 +10103,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } trace_btrfs_add_block_group(info, cache, 0); - ret = update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&info->block_group_cache_lock); - btrfs_put_block_group(cache); - goto error; - } + update_space_info(info, 
cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); cache->space_info = space_info; @@ -10203,16 +10237,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } #endif /* - * Call to ensure the corresponding space_info object is created and - * assigned to our block group, but don't update its counters just yet. - * We want our bg to be added to the rbtree with its ->space_info set. + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. */ - ret = update_space_info(fs_info, cache->flags, 0, 0, 0, - &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; + cache->space_info = __find_space_info(fs_info, cache->flags); + if (!cache->space_info) { + ret = create_space_info(fs_info, cache->flags, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } } ret = btrfs_add_block_group_cache(fs_info, cache); @@ -10227,18 +10264,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * the rbtree, update the space info's counters. 
*/ trace_btrfs_add_block_group(fs_info, cache, 1); - ret = update_space_info(fs_info, cache->flags, size, bytes_used, + update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &fs_info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&fs_info->block_group_cache_lock); - btrfs_put_block_group(cache); - return ret; - } update_global_block_rsv(fs_info); __link_block_group(cache->space_info, cache); @@ -10786,21 +10813,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } out: return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d3619e010005..556484cf5d93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void) static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode; - u64 isize; - - if (!tree->mapping) - return; - - inode = tree->mapping->host; - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, 
- "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } + if (tree->ops && tree->ops->check_extent_io_range) + tree->ops->check_extent_io_range(tree->private_data, caller, + start, end); } #else #define btrfs_leak_debug_add(new, head) do {} while (0) @@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { - if (!tree->mapping) - return NULL; - return btrfs_sb(tree->mapping->host->i_sb); + if (tree->ops) + return tree->ops->tree_fs_info(tree->private_data); + return NULL; } int __init extent_io_init(void) @@ -174,7 +164,8 @@ int __init extent_io_init(void) goto free_state_cache; btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio)); + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS); if (!btrfs_bioset) goto free_buffer_cache; @@ -213,13 +204,13 @@ void extent_io_exit(void) } void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping) + void *private_data) { tree->state = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - tree->mapping = mapping; + tree->private_data = private_data; } static struct extent_state *alloc_extent_state(gfp_t mask) @@ -369,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->mapping->host, new, - other); + tree->ops->merge_extent_hook(tree->private_data, new, other); } /* @@ -421,15 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->mapping->host, state, bits); + tree->ops->set_bit_hook(tree->private_data, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { 
if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host), - state, bits); + tree->ops->clear_bit_hook(tree->private_data, state, bits); } static void set_state_bits(struct extent_io_tree *tree, @@ -478,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, u64 split) { if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->mapping->host, orig, split); + tree->ops->split_extent_hook(tree->private_data, orig, split); } /* @@ -1402,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) */ static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); - put_page(page); - index++; - } + tree->ops->set_range_writeback(tree->private_data, start, end); } /* find the first state struct with 'bits' set after 'start', and @@ -1961,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec) { int ret; int err = 0; - struct extent_io_tree *failure_tree = &inode->io_failure_tree; set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, @@ -1974,7 +1954,7 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) if (ret) err = ret; - ret = clear_extent_bits(&inode->io_tree, rec->start, + ret = clear_extent_bits(io_tree, rec->start, rec->start + rec->len - 1, EXTENT_DAMAGED); if (ret && !err) @@ -1994,11 +1974,10 @@ int 
free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, - u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; @@ -2009,9 +1988,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; @@ -2070,7 +2047,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - btrfs_ino(inode), start, + ino, start, rcu_str_deref(dev->name), sector); btrfs_bio_counter_dec(fs_info); bio_put(bio); @@ -2090,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; - ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start, - PAGE_SIZE, start, p, + ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, start - page_offset(p), mirror_num); if (ret) break; @@ -2105,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, - unsigned int pg_offset) +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + 
struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset) { u64 private; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *state; int num_copies; int ret; private = 0; - ret = count_range_bits(&inode->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0); + ret = count_range_bits(failure_tree, &private, (u64)-1, 1, + EXTENT_DIRTY, 0); if (!ret) return 0; - ret = get_state_failrec(&inode->io_failure_tree, start, - &failrec); + ret = get_state_failrec(failure_tree, start, &failrec); if (ret) return 0; @@ -2138,25 +2114,25 @@ int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, if (fs_info->sb->s_flags & MS_RDONLY) goto out; - spin_lock(&inode->io_tree.lock); - state = find_first_extent_bit_state(&inode->io_tree, + spin_lock(&io_tree->lock); + state = find_first_extent_bit_state(io_tree, failrec->start, EXTENT_LOCKED); - spin_unlock(&inode->io_tree.lock); + spin_unlock(&io_tree->lock); if (state && state->start <= failrec->start && state->end >= failrec->start + failrec->len - 1) { num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { - repair_io_failure(inode, start, failrec->len, - failrec->logical, page, - pg_offset, failrec->failed_mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, + failrec->failed_mirror); } } out: - free_io_failure(inode, failrec); + free_io_failure(failure_tree, io_tree, failrec); return 0; } @@ -2356,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct btrfs_io_bio *btrfs_failed_bio; struct btrfs_io_bio *btrfs_bio; - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return NULL; - + bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = fs_info->fs_devices->latest_bdev; @@ -2397,8 +2370,10 @@ static int 
bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct io_failure_record *failrec; struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int read_mode = 0; + blk_status_t status; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2409,7 +2384,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, tree, failrec); return -EIO; } @@ -2422,7 +2397,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, (int)phy_offset, failed_bio->bi_end_io, NULL); if (!bio) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, tree, failrec); return -EIO; } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); @@ -2431,11 +2406,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, + status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, failrec->bio_flags, 0); - if (ret) { - free_io_failure(BTRFS_I(inode), failrec); + if (status) { + free_io_failure(failure_tree, tree, failrec); bio_put(bio); + ret = blk_status_to_errno(status); } return ret; @@ -2474,6 +2450,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio) { + int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; u64 end; @@ -2503,7 +2480,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - end_extent_writepage(page, 
bio->bi_error, start, end); + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2536,9 +2513,9 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct extent_io_tree *tree; + struct extent_io_tree *tree, *failure_tree; u64 offset = 0; u64 start; u64 end; @@ -2556,9 +2533,10 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_error, + (u64)bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will @@ -2588,8 +2566,10 @@ static void end_bio_extent_readpage(struct bio *bio) if (ret) uptodate = 0; else - clean_io_failure(BTRFS_I(inode), start, - page, 0); + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, tree, start, + page, + btrfs_ino(BTRFS_I(inode)), 0); } if (likely(uptodate)) @@ -2615,7 +2595,7 @@ static void end_bio_extent_readpage(struct bio *bio) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = !bio->bi_error; + uptodate = !bio->bi_status; offset += len; continue; } @@ -2673,77 +2653,80 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, bio->bi_error); + io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); bio_put(bio); } /* - * this allocates from the btrfs_bioset. We're returning a bio right now - * but you can call btrfs_io_bio for the appropriate container_of magic + * Initialize the members up to but not including 'bio'. 
Use after allocating a + * new bio by bio_alloc_bioset as it does not initialize the bytes outside of + * 'bio' because use of __GFP_ZERO is not supported. */ -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) { - struct btrfs_io_bio *btrfs_bio; - struct bio *bio; - - bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); + memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); +} - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) { - bio = bio_alloc_bioset(gfp_flags, - nr_vecs, btrfs_bioset); - } - } +/* + * The following helpers allocate a bio. As it's backed by a bioset, it'll + * never fail. We're returning a bio right now but you can call btrfs_io_bio + * for the appropriate container_of magic + */ +struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +{ + struct bio *bio; - if (bio) { - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = first_sector; - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = first_byte >> 9; + btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } -struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +struct bio *btrfs_bio_clone(struct bio *bio) { struct btrfs_io_bio *btrfs_bio; struct bio *new; - new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); - if (new) { - btrfs_bio = btrfs_io_bio(new); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + /* Bio allocation backed by a bioset does not fail */ + new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset); + btrfs_bio = btrfs_io_bio(new); + btrfs_io_bio_init(btrfs_bio); + btrfs_bio->iter = bio->bi_iter; return new; } -/* this also allocates from the btrfs_bioset */ -struct 
bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) { - struct btrfs_io_bio *btrfs_bio; struct bio *bio; - bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); - if (bio) { - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + /* Bio allocation backed by a bioset does not fail */ + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset); + btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) +{ + struct bio *bio; + struct btrfs_io_bio *btrfs_bio; + + /* this will never fail when it's backed by a bioset */ + bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + ASSERT(bio); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_io_bio_init(btrfs_bio); + + bio_trim(bio, offset >> 9, size >> 9); + btrfs_bio->iter = bio->bi_iter; + return bio; +} static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { - int ret = 0; + blk_status_t ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; @@ -2755,13 +2738,13 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio_get(bio); if (tree->ops) - ret = tree->ops->submit_bio_hook(page->mapping->host, bio, + ret = tree->ops->submit_bio_hook(tree->private_data, bio, mirror_num, bio_flags, start); else btrfsic_submit_bio(bio); bio_put(bio); - return ret; + return blk_status_to_errno(ret); } static int merge_bio(struct extent_io_tree *tree, struct page *page, @@ -2818,14 +2801,11 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, - GFP_NOFS | __GFP_HIGH); - if (!bio) - return -ENOMEM; - + bio = btrfs_bio_alloc(bdev, sector << 9); bio_add_page(bio, page, page_size, offset); 
bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio->bi_write_hint = page->mapping->host->i_write_hint; bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); @@ -3597,9 +3577,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - -eb->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); ret = 1; } else { spin_unlock(&eb->refs_lock); @@ -3707,7 +3687,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (bio->bi_error || + if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); @@ -3757,7 +3737,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb); memzero_extent_buffer(eb, start, end - start); } @@ -4463,29 +4443,25 @@ try_submit_last: } /* - * Sanity check for fiemap cache + * Emit last fiemap cache * - * All fiemap cache should be submitted by emit_fiemap_extent() - * Iteration should be terminated either by last fiemap extent or - * fieinfo->fi_extents_max. - * So no cached fiemap should exist. + * The last fiemap cache may still be cached in the following case: + * 0 4k 8k + * |<- Fiemap range ->| + * |<------------ First extent ----------->| + * + * In this case, the first extent range will be cached but not emitted. + * So we must emit it before ending extent_fiemap(). 
*/ -static int check_fiemap_cache(struct btrfs_fs_info *fs_info, - struct fiemap_extent_info *fieinfo, - struct fiemap_cache *cache) +static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) { int ret; if (!cache->cached) return 0; - /* Small and recoverbale problem, only to info developer */ -#ifdef CONFIG_BTRFS_DEBUG - WARN_ON(1); -#endif - btrfs_warn(fs_info, - "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x", - cache->offset, cache->phys, cache->len, cache->flags); ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, cache->len, cache->flags); cache->cached = false; @@ -4701,7 +4677,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } out_free: if (!ret) - ret = check_fiemap_cache(root->fs_info, fieinfo, &cache); + ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache); free_extent_map(em); out: btrfs_free_path(path); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1eafa2f0ede3..3fb8513bf02e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -92,7 +92,7 @@ struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, +typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { @@ -108,32 +108,36 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); + struct btrfs_fs_info *(*tree_fs_info)(void *private_data); + void (*set_range_writeback)(void *private_data, u64 start, u64 end); /* * Optional hooks, called if the pointer is not NULL */ - int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + int (*fill_delalloc)(void *private_data, struct page *locked_page, u64 start, u64 end, int 
*page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - void (*set_bit_hook)(struct inode *inode, struct extent_state *state, + void (*set_bit_hook)(void *private_data, struct extent_state *state, unsigned *bits); - void (*clear_bit_hook)(struct btrfs_inode *inode, + void (*clear_bit_hook)(void *private_data, struct extent_state *state, unsigned *bits); - void (*merge_extent_hook)(struct inode *inode, + void (*merge_extent_hook)(void *private_data, struct extent_state *new, struct extent_state *other); - void (*split_extent_hook)(struct inode *inode, + void (*split_extent_hook)(void *private_data, struct extent_state *orig, u64 split); + void (*check_extent_io_range)(void *private_data, const char *caller, + u64 start, u64 end); }; struct extent_io_tree { struct rb_root state; - struct address_space *mapping; + void *private_data; u64 dirty_bytes; int track_uptodate; spinlock_t lock; @@ -205,12 +209,46 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - u64 bytes_changed; + unsigned int bytes_changed; /* Changed ranges */ struct ulist range_changed; }; +static inline void extent_changeset_init(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + ulist_init(&changeset->range_changed); +} + +static inline struct extent_changeset *extent_changeset_alloc(void) +{ + struct extent_changeset *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + extent_changeset_init(ret); + return ret; +} + +static inline void extent_changeset_release(struct extent_changeset *changeset) +{ + if (!changeset) + return; + changeset->bytes_changed = 0; + ulist_release(&changeset->range_changed); +} + +static inline void extent_changeset_free(struct extent_changeset *changeset) +{ + if (!changeset) + return; + 
extent_changeset_release(changeset); + kfree(changeset); +} + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -230,8 +268,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, u64 start, u64 len, int create); -void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping); +void extent_io_tree_init(struct extent_io_tree *tree, void *private_data); int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); @@ -459,20 +496,21 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, u64 delalloc_end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags); -struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); +struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); +struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio); +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); struct btrfs_fs_info; struct btrfs_inode; -int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, - u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); -int clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset); +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int 
repair_eb_io_failure(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int mirror_num); @@ -507,7 +545,9 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, bio_end_io_t *endio_func, void *data); -int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec); +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 64fcb31d7163..fdcb41002623 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -160,11 +160,12 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) kfree(bio->csum_allocated); } -static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -177,12 +178,12 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 page_bytes_left; u32 diff; int nblocks; - int count = 0, i; + int count = 0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + return BLK_STS_RESOURCE; nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { @@ -191,7 +192,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); - return -ENOMEM; + return 
BLK_STS_RESOURCE; } btrfs_bio->csum = btrfs_bio->csum_allocated; btrfs_bio->end_io = btrfs_io_bio_endio_readpage; @@ -206,8 +207,6 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (bio->bi_iter.bi_size > PAGE_SIZE * 8) path->reada = READA_FORWARD; - WARN_ON(bio->bi_vcnt <= 0); - /* * the free space stuff is only read when it hasn't been * updated in the current transaction. So, we can safely @@ -223,13 +222,13 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (dio) offset = logical_offset; - bio_for_each_segment_all(bvec, bio, i) { - page_bytes_left = bvec->bv_len; + bio_for_each_segment(bvec, bio, iter) { + page_bytes_left = bvec.bv_len; if (count) goto next; if (!dio) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, (u32 *)csum, nblocks); if (count) @@ -303,12 +302,12 @@ next: return 0; } -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) { return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); } @@ -433,26 +432,26 @@ fail: return ret; } -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; - struct bio_vec *bvec; + struct bvec_iter iter; + struct bio_vec bvec; int index; int nr_sectors; - int i, j; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; + int i; u64 offset; - 
WARN_ON(bio->bi_vcnt <= 0); sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) - return -ENOMEM; + return BLK_STS_RESOURCE; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); @@ -465,19 +464,19 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; - bio_for_each_segment_all(bvec, bio, j) { + bio_for_each_segment(bvec, bio, iter) { if (!contig) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ } - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, - bvec->bv_len + fs_info->sectorsize + bvec.bv_len + fs_info->sectorsize - 1); for (i = 0; i < nr_sectors; i++) { @@ -504,12 +503,12 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, + total_bytes; index = 0; - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); } sums->sums[index] = ~(u32)0; sums->sums[index] - = btrfs_csum_data(data + bvec->bv_offset + = btrfs_csum_data(data + bvec.bv_offset + (i * fs_info->sectorsize), sums->sums[index], fs_info->sectorsize); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da1096eb1a40..9e75d8a39aac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1581,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; @@ -1628,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); - ret = btrfs_check_data_free_space(inode, pos, write_bytes); + 
extent_changeset_release(data_reserved); + ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + write_bytes); if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && @@ -1657,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, pos, - write_bytes); + btrfs_free_reserved_data_space(inode, + data_reserved, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1740,8 +1744,9 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, __pos, - release_bytes); + btrfs_delalloc_release_space(inode, + data_reserved, __pos, + release_bytes); } } @@ -1796,12 +1801,13 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { - btrfs_delalloc_release_space(inode, - round_down(pos, fs_info->sectorsize), - release_bytes); + btrfs_delalloc_release_space(inode, data_reserved, + round_down(pos, fs_info->sectorsize), + release_bytes); } } + extent_changeset_free(data_reserved); return num_written ? 
num_written : ret; } @@ -1876,17 +1882,36 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); ssize_t err; loff_t pos; - size_t count; + size_t count = iov_iter_count(from); loff_t oldsize; int clean_page = 0; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + err = generic_write_checks(iocb, from); if (err <= 0) { inode_unlock(inode); return err; } + pos = iocb->ki_pos; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* + * We will allocate space in case nodatacow is not set, + * so bail + */ + if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) || + check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + } + current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { @@ -1914,8 +1939,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); - pos = iocb->ki_pos; - count = iov_iter_count(from); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { @@ -2011,7 +2034,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; - int ret = 0; + int ret = 0, err; bool full_sync = 0; u64 len; @@ -2030,7 +2053,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) - return ret; + goto out; inode_lock(inode); atomic_inc(&root->log_batch); @@ -2135,10 +2158,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * An ordered extent might have started before and completed * already with io errors, in which case the inode was not * updated and we end up here. 
So check the inode's mapping - * flags for any errors that might have happened while doing - * writeback of file data. + * for any errors that might have happened since we last + * checked called fsync. */ - ret = filemap_check_errors(inode->i_mapping); + ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); inode_unlock(inode); goto out; } @@ -2227,6 +2250,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + err = file_check_and_advance_wb_err(file); + if (!ret) + ret = err; return ret > 0 ? -EIO : ret; } @@ -2390,10 +2416,13 @@ out: */ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; int ret = 0; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + round_down(*start, fs_info->sectorsize), + round_up(*len, fs_info->sectorsize), 0); if (IS_ERR(em)) return PTR_ERR(em); @@ -2769,6 +2798,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; struct list_head reserve_list; @@ -2898,8 +2928,8 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, cur_offset, - last_byte - cur_offset); + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + cur_offset, last_byte - cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -2910,8 +2940,8 @@ static long btrfs_fallocate(struct file *file, int mode, * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. 
*/ - btrfs_free_reserved_data_space(inode, cur_offset, - last_byte - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2930,8 +2960,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, range->start, - range->len); + btrfs_free_reserved_data_space(inode, + data_reserved, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2969,8 +3000,9 @@ out: inode_unlock(inode); /* Let go of our reservation. */ if (ret != 0) - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, alloc_end - cur_offset); + extent_changeset_free(data_reserved); return ret; } @@ -3071,13 +3103,19 @@ out: return offset; } +static int btrfs_file_open(struct inode *inode, struct file *filp) +{ + filp->f_mode |= FMODE_AIO_NOWAIT; + return generic_file_open(inode, filp); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, - .open = generic_file_open, + .open = btrfs_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index fc0bd8406758..a5e34de06c2f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -17,7 +17,7 @@ */ #include <linux/kernel.h> -#include <linux/vmalloc.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "locking.h" @@ -153,21 +153,21 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) static u8 *alloc_bitmap(u32 bitmap_size) { - void *mem; + u8 *ret; + unsigned int nofs_flag; /* - * The allocation 
size varies, observed numbers were < 4K up to 16K. - * Using vmalloc unconditionally would be too heavy, we'll try - * contiguous allocations first. + * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse + * into the filesystem as the free space bitmap can be modified in the + * critical section of a transaction commit. + * + * TODO: push the memalloc_nofs_{save,restore}() to the caller where we + * know that recursion is unsafe. */ - if (bitmap_size <= PAGE_SIZE) - return kzalloc(bitmap_size, GFP_NOFS); - - mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); - if (mem) - return mem; - - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL); + nofs_flag = memalloc_nofs_save(); + ret = kvzalloc(bitmap_size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + return ret; } int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, @@ -1188,11 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - return 0; + return btrfs_commit_transaction(trans); abort: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); @@ -1277,11 +1273,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) free_extent_buffer(free_space_root->commit_root); kfree(free_space_root); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - return 0; + return btrfs_commit_transaction(trans); abort: btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 5c6c20ec64d8..d02019747d00 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_path *path; struct inode *inode; struct btrfs_block_rsv *rsv; + struct extent_changeset *data_reserved = NULL; u64 num_bytes; u64 alloc_hint = 0; int ret; @@ 
-492,7 +493,7 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); if (ret) goto out_put; @@ -516,6 +517,7 @@ out: trans->bytes_reserved = num_bytes; btrfs_free_path(path); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef3c98c527c1..06dea7c89bbd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -86,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; @@ -178,7 +177,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; - int err = 0; int ret; size_t cur_size = size; unsigned long offset; @@ -200,10 +198,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - if (ret) { - err = ret; + if (ret) goto fail; - } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -258,9 +254,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, BTRFS_I(inode)->disk_i_size = inode->i_size; ret = btrfs_update_inode(trans, root, inode); - return ret; fail: - return err; + return ret; } @@ -350,7 +345,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. 
*/ - btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -608,12 +603,11 @@ cont: /* * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk + * win, compare the page count read with the blocks on disk, + * compression must free at least one sector size */ total_in = ALIGN(total_in, PAGE_SIZE); - if (total_compressed >= total_in) { - will_compress = 0; - } else { + if (total_compressed + blocksize <= total_in) { num_bytes = total_in; *num_added += 1; @@ -842,13 +836,12 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - ret = btrfs_submit_compressed_write(inode, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages); - if (ret) { + async_extent->nr_pages)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -1569,10 +1562,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) /* * extent_io.c call back to do delayed allocation processing */ -static int run_delalloc_range(struct inode *inode, struct page *locked_page, +static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { + struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); @@ -1596,9 +1590,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, return ret; } -static void btrfs_split_extent_hook(struct inode *inode, +static void btrfs_split_extent_hook(void *private_data, struct extent_state *orig, u64 split) { + struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1633,10 +1628,11 @@ 
static void btrfs_split_extent_hook(struct inode *inode, * extents, such as when we are doing sequential writes, so we can properly * account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(struct inode *inode, +static void btrfs_merge_extent_hook(void *private_data, struct extent_state *new, struct extent_state *other) { + struct inode *inode = private_data; u64 new_size, old_size; u32 num_extents; @@ -1736,9 +1732,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ -static void btrfs_set_bit_hook(struct inode *inode, +static void btrfs_set_bit_hook(void *private_data, struct extent_state *state, unsigned *bits) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1766,8 +1763,8 @@ static void btrfs_set_bit_hook(struct inode *inode, if (btrfs_is_testing(fs_info)) return; - __percpu_counter_add(&fs_info->delalloc_bytes, len, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->delalloc_bytes, len, + fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; if (*bits & EXTENT_DEFRAG) @@ -1790,10 +1787,11 @@ static void btrfs_set_bit_hook(struct inode *inode, /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static void btrfs_clear_bit_hook(struct btrfs_inode *inode, +static void btrfs_clear_bit_hook(void *private_data, struct extent_state *state, unsigned *bits) { + struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -1840,8 +1838,8 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, &inode->vfs_inode, state->start, len); - __percpu_counter_add(&fs_info->delalloc_bytes, -len, - fs_info->delalloc_batch); + 
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, + fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; if (do_list && inode->delalloc_bytes == 0 && @@ -1901,11 +1899,12 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { - int ret = 0; + struct inode *inode = private_data; + blk_status_t ret = 0; ret = btrfs_csum_one_bio(inode, bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -1920,16 +1919,17 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1939,14 +1939,15 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, * extent_io.c submission hook. 
This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - int ret = 0; + blk_status_t ret = 0; int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -1976,8 +1977,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, - bio_flags, bio_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, + bio_offset, inode, __btrfs_submit_bio_start, __btrfs_submit_bio_done); goto out; @@ -1991,8 +1992,8 @@ mapit: ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); out: - if (ret < 0) { - bio->bi_error = ret; + if (ret) { + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -2035,6 +2036,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct page *page; struct inode *inode; u64 page_start; @@ -2072,7 +2074,7 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); @@ -2092,6 +2094,7 @@ out_page: unlock_page(page); put_page(page); kfree(fixup); + 
extent_changeset_free(data_reserved); } /* @@ -2143,6 +2146,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + u64 qg_released; int extent_inserted = 0; int ret; @@ -2198,13 +2202,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins); + /* * Release the reserved range from inode dirty range map, as it is * already moved into delayed_ref_head */ - btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + if (ret < 0) + goto out; + qg_released = ret; + ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2926,7 +2934,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. 
* As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) @@ -4762,6 +4770,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4776,7 +4785,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4784,7 +4793,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, round_down(from, blocksize), blocksize); ret = -ENOMEM; @@ -4856,11 +4865,12 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, block_start, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); unlock_page(page); put_page(page); out: + extent_changeset_free(data_reserved); return ret; } @@ -5255,7 +5265,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. 
*/ if (state->state & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, start, end - start + 1); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -5868,7 +5878,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; @@ -5919,7 +5928,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) continue; } - item = btrfs_item_nr(slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) @@ -5934,7 +5942,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) ctx->pos = found_key.offset; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, di)) + if (verify_dir_item(fs_info, leaf, slot, di)) goto next; name_len = btrfs_dir_name_len(leaf, di); @@ -7480,7 +7488,7 @@ out: bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) { struct radix_tree_root *root = &inode->i_mapping->page_tree; - int found = false; + bool found = false; void **pagep = NULL; struct page *page = NULL; unsigned long start_idx; @@ -7978,9 +7986,12 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int isector; int read_mode = 0; + int segs; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7992,13 +8003,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, 
failed_mirror); if (!ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); return -EIO; } - if ((failed_bio->bi_vcnt > 1) - || (failed_bio->bi_io_vec->bv_len - > btrfs_inode_sectorsize(inode))) + segs = bio_segments(failed_bio); + if (segs > 1 || + (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; @@ -8006,7 +8017,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, pgoff, isector, repair_endio, repair_arg); if (!bio) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); return -EIO; } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); @@ -8017,7 +8028,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); bio_put(bio); } @@ -8034,19 +8045,24 @@ struct btrfs_retry_complete { static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; + struct inode *inode = done->inode; struct bio_vec *bvec; + struct extent_io_tree *io_tree, *failure_tree; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; ASSERT(bio->bi_vcnt == 1); - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + io_tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; + ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); done->uptodate = 1; bio_for_each_segment_all(bvec, bio, i) - clean_io_failure(BTRFS_I(done->inode), done->start, - bvec->bv_page, 0); + clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, + io_tree, done->start, bvec->bv_page, + btrfs_ino(BTRFS_I(inode)), 0); end: complete(&done->done); bio_put(bio); @@ 
-8056,36 +8072,40 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, struct btrfs_io_bio *io_bio) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; unsigned int pgoff; u32 sectorsize; int nr_sectors; - int i; int ret; + int err = 0; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); + pgoff = bvec.bv_offset; next_block_or_try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio_nocsum, &done); - if (ret) - return ret; + if (ret) { + err = ret; + goto next; + } wait_for_completion(&done.done); @@ -8094,6 +8114,7 @@ next_block_or_try_again: goto next_block_or_try_again; } +next: start += sectorsize; nr_sectors--; @@ -8104,19 +8125,21 @@ next_block_or_try_again: } } - return 0; + return err; } static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct extent_io_tree *io_tree, *failure_tree; + struct inode *inode = done->inode; struct bio_vec *bvec; int uptodate; int ret; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; uptodate = 1; @@ -8124,13 +8147,19 @@ static void btrfs_retry_endio(struct bio *bio) ASSERT(bio->bi_vcnt == 1); ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + io_tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; + 
bio_for_each_segment_all(bvec, bio, i) { - ret = __readpage_endio_check(done->inode, io_bio, i, - bvec->bv_page, bvec->bv_offset, - done->start, bvec->bv_len); + ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, + bvec->bv_offset, done->start, + bvec->bv_len); if (!ret) - clean_io_failure(BTRFS_I(done->inode), done->start, - bvec->bv_page, bvec->bv_offset); + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, io_tree, done->start, + bvec->bv_page, + btrfs_ino(BTRFS_I(inode)), + bvec->bv_offset); else uptodate = 0; } @@ -8141,11 +8170,12 @@ end: bio_put(bio); } -static int __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; u64 offset = 0; @@ -8153,7 +8183,7 @@ static int __btrfs_subio_endio_read(struct inode *inode, int nr_sectors; unsigned int pgoff; int csum_pos; - int i; + bool uptodate = (err == 0); int ret; fs_info = BTRFS_I(inode)->root->fs_info; @@ -8162,29 +8192,31 @@ static int __btrfs_subio_endio_read(struct inode *inode, err = 0; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec->bv_offset; + pgoff = bvec.bv_offset; next_block: - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec->bv_page, pgoff, start, - sectorsize); - if (likely(!ret)) - goto next; + if (uptodate) { + csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); + ret = __readpage_endio_check(inode, io_bio, csum_pos, + bvec.bv_page, pgoff, start, 
sectorsize); + if (likely(!ret)) + goto next; + } try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio, &done); if (ret) { - err = ret; + err = errno_to_blk_status(ret); goto next; } @@ -8211,8 +8243,8 @@ next: return err; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8232,10 +8264,13 @@ static void btrfs_endio_direct_read(struct bio *bio) struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - int err = bio->bi_error; + blk_status_t err = bio->bi_status; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { err = btrfs_subio_endio_read(inode, io_bio, err); + if (!err) + bio->bi_status = 0; + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8243,11 +8278,11 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, blk_status_to_errno(err)); bio_put(bio); } @@ -8299,20 +8334,21 @@ static void btrfs_endio_direct_write(struct bio *bio) struct bio *dio_bio = dip->dio_bio; __endio_write_update_ordered(dip->inode, dip->logical_offset, - dip->bytes, !bio->bi_error); + dip->bytes, !bio->bi_status); kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); bio_put(bio); 
} -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, +static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { - int ret; + struct inode *inode = private_data; + blk_status_t ret; ret = btrfs_csum_one_bio(inode, bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; @@ -8321,7 +8357,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -8351,31 +8387,21 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_error = 0; + dip->dio_bio->bi_status = 0; bio_endio(dip->orig_bio); } out: bio_put(bio); } -static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, - u64 first_sector, gfp_t gfp_flags) -{ - struct bio *bio; - bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); - if (bio) - bio_associate_current(bio); - return bio; -} - -static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, +static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 file_offset) { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - int ret; + blk_status_t ret; /* * We load all the csum data we need when we submit @@ -8406,7 +8432,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = bio_op(bio) == REQ_OP_WRITE; - int ret; + blk_status_t ret; if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -8423,8 +8449,8 @@ static inline int 
__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0, - file_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, + file_offset, inode, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; @@ -8454,103 +8480,83 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *bio; struct bio *orig_bio = dip->orig_bio; - struct bio_vec *bvec; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 submit_len = 0; u64 map_length; - u32 blocksize = fs_info->sectorsize; int async_submit = 0; - int nr_sectors; + u64 submit_len; + int clone_offset = 0; + int clone_len; int ret; - int i, j; map_length = orig_bio->bi_iter.bi_size; + submit_len = map_length; ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; - if (map_length >= orig_bio->bi_iter.bi_size) { + if (map_length >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; } /* async crcs make it difficult to collect full stripe writes. 
*/ - if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) + if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); - if (!bio) - return -ENOMEM; - - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + /* bio split */ + ASSERT(map_length <= INT_MAX); atomic_inc(&dip->pending_bios); + do { + clone_len = min_t(int, submit_len, map_length); - bio_for_each_segment_all(bvec, orig_bio, j) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - i = 0; -next_block: - if (unlikely(map_length < submit_len + blocksize || - bio_add_page(bio, bvec->bv_page, blocksize, - bvec->bv_offset + (i * blocksize)) < blocksize)) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the dip might get freed - * before we're done setting it up - */ - atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, - file_offset, skip_sum, - async_submit); - if (ret) { - bio_put(bio); - atomic_dec(&dip->pending_bios); - goto out_err; - } - - start_sector += submit_len >> 9; - file_offset += submit_len; + /* + * This will never fail as it's passing GPF_NOFS and + * the allocation is backed by btrfs_bioset. + */ + bio = btrfs_bio_clone_partial(orig_bio, clone_offset, + clone_len); + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; + + ASSERT(submit_len >= clone_len); + submit_len -= clone_len; + if (submit_len == 0) + break; - submit_len = 0; + /* + * Increase the count before we submit the bio so we know + * the end IO handler won't happen before we increase the + * count. Otherwise, the dip might get freed before we're + * done setting it up. 
+ */ + atomic_inc(&dip->pending_bios); - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, - start_sector, GFP_NOFS); - if (!bio) - goto out_err; - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } - map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, - &map_length, NULL, 0); - if (ret) { - bio_put(bio); - goto out_err; - } + clone_offset += clone_len; + start_sector += clone_len >> 9; + file_offset += clone_len; - goto next_block; - } else { - submit_len += blocksize; - if (--nr_sectors) { - i++; - goto next_block; - } - } - } + map_length = submit_len; + ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); + if (ret) + goto out_err; + } while (submit_len > 0); submit: ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, @@ -8577,19 +8583,15 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { struct btrfs_dio_private *dip = NULL; - struct bio *io_bio = NULL; - struct btrfs_io_bio *btrfs_bio; + struct bio *bio = NULL; + struct btrfs_io_bio *io_bio; int skip_sum; bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); - if (!io_bio) { - ret = -ENOMEM; - goto free_ordered; - } + bio = btrfs_bio_clone(dio_bio); dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { @@ -8602,17 +8604,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, dip->logical_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - io_bio->bi_private = dip; - dip->orig_bio = io_bio; + bio->bi_private 
= dip; + dip->orig_bio = bio; dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); - btrfs_bio = btrfs_io_bio(io_bio); - btrfs_bio->logical = file_offset; + io_bio = btrfs_io_bio(bio); + io_bio->logical = file_offset; if (write) { - io_bio->bi_end_io = btrfs_endio_direct_write; + bio->bi_end_io = btrfs_endio_direct_write; } else { - io_bio->bi_end_io = btrfs_endio_direct_read; + bio->bi_end_io = btrfs_endio_direct_read; dip->subio_endio = btrfs_subio_endio_read; } @@ -8635,8 +8637,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (btrfs_bio->end_io) - btrfs_bio->end_io(btrfs_bio, ret); + if (io_bio->end_io) + io_bio->end_io(io_bio, ret); free_ordered: /* @@ -8648,16 +8650,15 @@ free_ordered: * same as btrfs_endio_direct_[write|read] because we can't call these * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (io_bio && dip) { - io_bio->bi_error = -EIO; - bio_endio(io_bio); + if (bio && dip) { + bio_io_error(bio); /* - * The end io callbacks free our dip, do the final put on io_bio + * The end io callbacks free our dip, do the final put on bio * and all the cleanup and final put for dio_bio (through * dio_end_io()). */ dip = NULL; - io_bio = NULL; + bio = NULL; } else { if (write) __endio_write_update_ordered(inode, @@ -8668,15 +8669,15 @@ free_ordered: unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - dio_bio->bi_error = -EIO; + dio_bio->bi_status = BLK_STS_IOERR; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. 
*/ - dio_end_io(dio_bio, ret); + dio_end_io(dio_bio); } - if (io_bio) - bio_put(io_bio); + if (bio) + bio_put(bio); kfree(dip); } @@ -8720,6 +8721,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; @@ -8755,8 +8757,12 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; } - ret = btrfs_delalloc_reserve_space(inode, offset, count); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); if (ret) goto out; dio_data.outstanding_extents = count_max_extents(count); @@ -8788,8 +8794,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, offset, - dio_data.reserve); + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8804,8 +8810,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.unsubmitted_oe_range_start, false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, offset, - count - (size_t)ret); + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret); } out: if (wakeup) @@ -8813,6 +8819,7 @@ out: if (relock) inode_lock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -9003,7 +9010,7 @@ again: * free the entire extent. 
*/ if (PageDirty(page)) - btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -9045,6 +9052,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; unsigned long zero_start; loff_t size; @@ -9070,7 +9078,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); if (!ret) { ret = file_update_time(vmf->vma->vm_file); @@ -9124,8 +9132,8 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, page_start, - PAGE_SIZE - reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE - reserved_space); } } @@ -9176,13 +9184,16 @@ again: out_unlock: if (!ret) { sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return ret; } @@ -9404,8 +9415,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(&ei->io_tree, &inode->i_data); - extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + extent_io_tree_init(&ei->io_tree, inode); + 
extent_io_tree_init(&ei->io_failure_tree, inode); ei->io_tree.track_uptodate = 1; ei->io_failure_tree.track_uptodate = 1; atomic_set(&ei->sync_writers, 0); @@ -9514,7 +9525,6 @@ void btrfs_destroy_cachep(void) rcu_barrier(); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); - kmem_cache_destroy(btrfs_transaction_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); } @@ -9534,12 +9544,6 @@ int btrfs_init_cachep(void) if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", - sizeof(struct btrfs_transaction), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_transaction_cachep) - goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); @@ -9564,6 +9568,24 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, u64 delalloc_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; + u32 bi_flags = BTRFS_I(inode)->flags; + + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + if (bi_flags & BTRFS_INODE_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (bi_flags & BTRFS_INODE_COMPRESS) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (bi_flags & BTRFS_INODE_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (bi_flags & BTRFS_INODE_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; @@ -10538,7 +10560,7 @@ next: btrfs_end_transaction(trans); } if (cur_offset < end) - btrfs_free_reserved_data_space(inode, cur_offset, + btrfs_free_reserved_data_space(inode, NULL, cur_offset, end - cur_offset + 1); return ret; } @@ -10659,6 
+10681,42 @@ static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) return -EAGAIN; } +static struct btrfs_fs_info *iotree_fs_info(void *private_data) +{ + struct inode *inode = private_data; + return btrfs_sb(inode->i_sb); +} + +static void btrfs_check_extent_io_range(void *private_data, const char *caller, + u64 start, u64 end) +{ + struct inode *inode = private_data; + u64 isize; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } +} + +void btrfs_set_range_writeback(void *private_data, u64 start, u64 end) +{ + struct inode *inode = private_data; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + ASSERT(page); /* Pages should be in the extent_io_tree */ + set_page_writeback(page); + put_page(page); + index++; + } +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10702,6 +10760,8 @@ static const struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, + .tree_fs_info = iotree_fs_info, + .set_range_writeback = btrfs_set_range_writeback, /* optional callbacks */ .fill_delalloc = run_delalloc_range, @@ -10711,6 +10771,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = { .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, .split_extent_hook = btrfs_split_extent_hook, + .check_extent_io_range = btrfs_check_extent_io_range, }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e176375f374f..fa1b78cf25f6 100644 --- a/fs/btrfs/ioctl.c +++ 
b/fs/btrfs/ioctl.c @@ -37,7 +37,7 @@ #include <linux/bit_spinlock.h> #include <linux/security.h> #include <linux/xattr.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/uuid.h> @@ -689,7 +689,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; + struct extent_changeset *data_reserved = NULL; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); file_end = (isize - 1) >> PAGE_SHIFT; @@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) @@ -1226,7 +1227,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT); } @@ -1247,15 +1248,17 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + extent_changeset_free(data_reserved); return i_done; out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + extent_changeset_free(data_reserved); return ret; } @@ -4588,7 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, out: 
btrfs_free_path(path); - vfree(inodes); + kvfree(inodes); kfree(loi); return ret; @@ -4897,7 +4900,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->assign) { ret = btrfs_add_qgroup_relation(trans, fs_info, sa->src, sa->dst); @@ -4956,7 +4958,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->create) { ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid); } else { @@ -5010,7 +5011,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = root->root_key.objectid; } - /* FIXME: check if the IDs really exist */ ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim); err = btrfs_end_transaction(trans); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index f48c8c14dc14..d433e75d489a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -18,13 +18,14 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/lzo.h> +#include <linux/refcount.h> #include "compression.h" #define LZO_LEN 4 @@ -40,9 +41,9 @@ static void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->buf); - vfree(workspace->cbuf); - vfree(workspace->mem); + kvfree(workspace->buf); + kvfree(workspace->cbuf); + kvfree(workspace->mem); kfree(workspace); } @@ -50,13 +51,13 @@ static struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); - workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); - workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); - workspace->cbuf = 
vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); + workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -141,7 +142,7 @@ static int lzo_compress_pages(struct list_head *ws, ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); if (ret != LZO_E_OK) { - pr_debug("BTRFS: deflate in loop returned %d\n", + pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; goto out; @@ -229,8 +230,10 @@ static int lzo_compress_pages(struct list_head *ws, in_len = min(bytes_left, PAGE_SIZE); } - if (tot_out > tot_in) + if (tot_out >= tot_in) { + ret = -E2BIG; goto out; + } /* store the size of all chunks of compressed data */ cpage_out = kmap(pages[0]); @@ -254,16 +257,13 @@ out: return ret; } -static int lzo_decompress_bio(struct list_head *ws, - struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen) +static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; char *data_in; unsigned long page_in_index = 0; + size_t srclen = cb->compressed_len; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; @@ -278,6 +278,9 @@ static int lzo_decompress_bio(struct list_head *ws, unsigned long tot_len; char *buf; bool may_late_unmap, need_unmap; + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7b40e2e7292a..a3aca495e33e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -663,7 +663,7 @@ static void 
btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -671,7 +671,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, LIST_HEAD(skipped); LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; - int count = 0; + u64 count = 0; const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); @@ -701,7 +701,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, cond_resched(); spin_lock(&root->ordered_extent_lock); - if (nr != -1) + if (nr != U64_MAX) nr--; count++; } @@ -720,13 +720,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, return count; } -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, - const u64 range_start, const u64 range_len) +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; - int done; - int total_done = 0; + u64 total_done = 0; + u64 done; INIT_LIST_HEAD(&splice); @@ -748,9 +748,8 @@ int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, total_done += done; spin_lock(&fs_info->ordered_root_lock); - if (nr != -1) { + if (nr != U64_MAX) { nr -= done; - WARN_ON(nr < 0); } } list_splice_tail(&splice, &fs_info->ordered_roots); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e0c1d5b8d859..56c4c0ee6381 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -200,9 +200,9 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); 
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct btrfs_inode *inode, struct list_head *logged_list, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index cdafbf92ef0c..fcae61e175f3 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -261,8 +261,11 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l) case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); - pr_info("\t\tblock group used %llu\n", - btrfs_disk_block_group_used(l, bi)); + pr_info( + "\t\tblock group used %llu chunk_objectid %llu flags %llu\n", + btrfs_disk_block_group_used(l, bi), + btrfs_disk_block_group_chunk_objectid(l, bi), + btrfs_disk_block_group_flags(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: print_chunk(l, btrfs_item_ptr(l, i, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index d6cb155ef7a1..4b23ae5d0e5c 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -164,6 +164,7 @@ static int iterate_object_props(struct btrfs_root *root, size_t), void *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *name_buf = NULL; char *value_buf = NULL; @@ -214,6 +215,12 @@ static int iterate_object_props(struct btrfs_root *root, name_ptr = (unsigned long)(di + 1); data_ptr = name_ptr + name_len; + if (verify_dir_item(fs_info, leaf, + path->slots[0], di)) { + ret = -EIO; + goto out; + } + if (name_len <= XATTR_BTRFS_PREFIX_LEN || memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, name_ptr, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index deffbeb74a0b..4ce351efe281 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1406,38 +1406,6 @@ out: return ret; 
} -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - u64 qgroup_to_skip; - int ret = 0; - - delayed_refs = &trans->transaction->delayed_refs; - qgroup_to_skip = delayed_refs->qgroup_to_skip; - - /* - * No need to do lock, since this function will only be called in - * btrfs_commit_transaction(). - */ - node = rb_first(&delayed_refs->dirty_extent_root); - while (node) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - if (WARN_ON(!record->old_roots)) - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, &record->old_roots); - if (ret < 0) - break; - if (qgroup_to_skip) - ulist_del(record->old_roots, qgroup_to_skip, 0); - node = rb_next(node); - } - return ret; -} - int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) @@ -1559,6 +1527,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, if (ret) return ret; } + cond_resched(); return 0; } @@ -1918,6 +1887,35 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, return 0; } +/* + * Check if the @roots potentially is a list of fs tree roots + * + * Return 0 for definitely not a fs/subvol tree roots ulist + * Return 1 for possible fs/subvol tree roots in the list (considering an empty + * one as well) + */ +static int maybe_fs_roots(struct ulist *roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + + /* Empty one, still possible for fs roots */ + if (!roots || roots->nnodes == 0) + return 1; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); + if (!unode) + return 1; + + /* + * If it contains fs tree roots, then it must belong to fs/subvol + * trees. + * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 
+ */ + return is_fstree(unode->val); +} + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -1934,10 +1932,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - if (new_roots) + if (new_roots) { + if (!maybe_fs_roots(new_roots)) + goto out_free; nr_new_roots = new_roots->nnodes; - if (old_roots) + } + if (old_roots) { + if (!maybe_fs_roots(old_roots)) + goto out_free; nr_old_roots = old_roots->nnodes; + } + + /* Quick exit, either not fs tree roots, or won't affect any qgroup */ + if (nr_old_roots == 0 && nr_new_roots == 0) + goto out_free; BUG_ON(!fs_info->quota_root); @@ -2017,6 +2025,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, if (!ret) { /* + * Old roots should be searched when inserting qgroup + * extent record + */ + if (WARN_ON(!record->old_roots)) { + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(NULL, fs_info, + record->bytenr, 0, + &record->old_roots); + if (ret < 0) + goto cleanup; + } + + /* * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current * root. It's safe inside commit_transaction(). 
@@ -2025,8 +2046,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record->bytenr, SEQ_LAST, &new_roots); if (ret < 0) goto cleanup; - if (qgroup_to_skip) + if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); + ulist_del(record->old_roots, qgroup_to_skip, + 0); + } ret = btrfs_qgroup_account_extent(trans, fs_info, record->bytenr, record->num_bytes, record->old_roots, new_roots); @@ -2338,6 +2362,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) if (num_bytes == 0) return 0; + + if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && + capable(CAP_SYS_RESOURCE)) + enforce = false; + retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -2376,7 +2405,7 @@ retry: ret = btrfs_start_delalloc_inodes(root, 0); if (ret) return ret; - btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -2806,55 +2835,130 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. + * if btrfs_qgroup_reserve_data() is called multiple times with + * same @reserved, caller must ensure when error happens it's OK + * to free *ALL* reserved space. 
*/ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator uiter; + struct extent_changeset *reserved; + u64 orig_reserved; + u64 to_reserve; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || !is_fstree(root->objectid) || len == 0) return 0; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* @reserved parameter is mandatory for qgroup */ + if (WARN_ON(!reserved_ret)) + return -EINVAL; + if (!*reserved_ret) { + *reserved_ret = extent_changeset_alloc(); + if (!*reserved_ret) + return -ENOMEM; + } + reserved = *reserved_ret; + /* Record already reserved space */ + orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, reserved); + + /* Newly reserved space */ + to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(inode, start, len, - changeset.bytes_changed, - QGROUP_RESERVE); + to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, changeset.bytes_changed, true); + ret = qgroup_reserve(root, to_reserve, true); if (ret < 0) goto cleanup; - ulist_release(&changeset.range_changed); return ret; cleanup: - /* cleanup already reserved ranges */ + /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&changeset.range_changed, &uiter))) + while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, GFP_NOFS); - ulist_release(&changeset.range_changed); + extent_changeset_release(reserved); + return ret; +} + +/* Free 
ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; + int freed = 0; + int ret; + + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. 
+ */ + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + ret = freed; +out: + extent_changeset_release(&changeset); return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, - int free) +static int __btrfs_qgroup_release_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) @@ -2868,8 +2972,9 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, changeset.bytes_changed); + ret = changeset.bytes_changed; out: - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); return ret; } @@ -2878,14 +2983,17 @@ out: * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. 
*/ -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* @@ -2905,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, @@ -2969,8 +3077,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) struct ulist_iterator iter; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); @@ -2987,5 +3094,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) changeset.bytes_changed); } - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index fe04d3f295c6..d9984e87cddf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -134,8 +134,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); + /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. 
@@ -243,9 +242,11 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, bool enforce); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8ea0eb76325..6f845d219cd6 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -31,7 +31,7 @@ #include <linux/hash.h> #include <linux/list_sort.h> #include <linux/raid/xor.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -218,12 +218,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) * of a failing mount. 
*/ table_size = sizeof(*table) + sizeof(*h) * num_entries; - table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!table) { - table = vzalloc(table_size); - if (!table) - return -ENOMEM; - } + table = kvzalloc(table_size, GFP_KERNEL); + if (!table) + return -ENOMEM; spin_lock_init(&table->cache_lock); INIT_LIST_HEAD(&table->stripe_cache); @@ -871,7 +868,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -884,7 +881,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_error = err; + cur->bi_status = err; bio_endio(cur); cur = next; } @@ -897,7 +894,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; int max_errors; if (err) @@ -914,7 +911,7 @@ static void raid_write_end_io(struct bio *bio) max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
0 : rbio->bbio->max_errors; if (atomic_read(&rbio->error) > max_errors) - err = -EIO; + err = BLK_STS_IOERR; rbio_orig_end_io(rbio, err); } @@ -1092,7 +1089,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && + !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) @@ -1101,10 +1098,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); - if (!bio) - return -ENOMEM; - + bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; @@ -1448,7 +1442,7 @@ static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1991,7 +1985,7 @@ static void raid_recover_end_io(struct bio *bio) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2530,7 +2524,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a17e775a4a89..ab852b8e3e37 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - int err; struct list_head extctl; int refcnt; spinlock_t lock; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d60df51959f7..65661d1aae4e 100644 --- a/fs/btrfs/relocation.c +++ 
b/fs/btrfs/relocation.c @@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset; + struct extent_changeset *data_reserved = NULL; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, prealloc_start, + ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, prealloc_end + 1 - prealloc_start); if (ret) goto out; @@ -3113,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; if (cur_offset < start) - btrfs_free_reserved_data_space(inode, cur_offset, - start - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); @@ -3125,10 +3126,11 @@ int prealloc_file_extent_cluster(struct inode *inode, nr++; } if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space(inode, cur_offset, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -4269,8 +4271,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&rc->processed_blocks, NULL); return rc; } @@ -4372,7 +4373,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); - btrfs_wait_ordered_roots(fs_info, -1, + btrfs_wait_ordered_roots(fs_info, U64_MAX, 
rc->block_group->key.objectid, rc->block_group->key.offset); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7d6bc308bf43..460db0cb2d07 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -390,6 +390,13 @@ again: WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); + ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr, + name_len); + if (!ret) { + err = -EIO; + goto out; + } + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); *sequence = btrfs_root_ref_sequence(leaf, ref); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c7b45eb2403d..6f1e4c984b94 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -18,6 +18,7 @@ #include <linux/blkdev.h> #include <linux/ratelimit.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -95,7 +96,7 @@ struct scrub_bio { struct scrub_ctx *sctx; struct btrfs_device *dev; struct bio *bio; - int err; + blk_status_t status; u64 logical; u64 physical; #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO @@ -161,14 +162,6 @@ struct scrub_parity { unsigned long bitmap[0]; }; -struct scrub_wr_ctx { - struct scrub_bio *wr_curr_bio; - struct btrfs_device *tgtdev; - int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ - atomic_t flush_all_writes; - struct mutex wr_lock; -}; - struct scrub_ctx { struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; struct btrfs_fs_info *fs_info; @@ -183,11 +176,14 @@ struct scrub_ctx { atomic_t cancel_req; int readonly; int pages_per_rd_bio; - u32 sectorsize; - u32 nodesize; int is_dev_replace; - struct scrub_wr_ctx wr_ctx; + + struct scrub_bio *wr_curr_bio; + struct mutex wr_lock; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct btrfs_device *wr_tgtdev; /* * statistics @@ -289,10 +285,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, u64 *extent_physical, struct btrfs_device 
**extent_dev, int *extent_mirror_num); -static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx, - struct btrfs_device *dev, - int is_dev_replace); -static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); @@ -643,8 +635,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - scrub_free_wr_ctx(&sctx->wr_ctx); - /* this can happen when scrub is cancelled */ if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; @@ -664,6 +654,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sbio); } + kfree(sctx->wr_curr_bio); scrub_free_csums(sctx); kfree(sctx); } @@ -680,7 +671,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->fs_info; - int ret; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) @@ -710,8 +700,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->bios[i]->next_free = -1; } sctx->first_free = 0; - sctx->nodesize = fs_info->nodesize; - sctx->sectorsize = fs_info->sectorsize; atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); @@ -722,12 +710,16 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); - ret = scrub_setup_wr_ctx(&sctx->wr_ctx, - fs_info->dev_replace.tgtdev, is_dev_replace); - if (ret) { - scrub_free_ctx(sctx); - return ERR_PTR(ret); + WARN_ON(sctx->wr_curr_bio != NULL); + mutex_init(&sctx->wr_lock); + sctx->wr_curr_bio = NULL; + if (is_dev_replace) { + WARN_ON(!fs_info->dev_replace.tgtdev); + sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; + sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; + 
atomic_set(&sctx->flush_all_writes, 0); } + return sctx; nomem: @@ -742,6 +734,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, u32 nlink; int ret; int i; + unsigned nofs_flag; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; @@ -780,7 +773,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(swarn->path); + /* + * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub + * uses GFP_NOFS in this context, so we keep it consistent but it does + * not seem to be strictly necessary. + */ + nofs_flag = memalloc_nofs_save(); ipath = init_ipath(4096, local_root, swarn->path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { ret = PTR_ERR(ipath); ipath = NULL; @@ -954,7 +954,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) ret = -EIO; goto out; } - ret = repair_io_failure(BTRFS_I(inode), offset, PAGE_SIZE, + ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE, fixup->logical, page, offset - page_offset(page), fixup->mirror_num); @@ -1668,14 +1668,14 @@ leave_nomem: struct scrub_bio_ret { struct completion event; - int error; + blk_status_t status; }; static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->status = bio->bi_status; complete(&ret->event); } @@ -1693,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, int ret; init_completion(&done.event); - done.error = 0; + done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; @@ -1705,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return ret; wait_for_completion(&done.event); - if (done.error) + if (done.status) return -EIO; return 0; @@ -1737,12 +1737,7 @@ static void 
scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - continue; - } + bio = btrfs_io_bio_alloc(1); bio->bi_bdev = page->dev->bdev; bio_add_page(bio, page->page, PAGE_SIZE, 0); @@ -1830,9 +1825,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + bio = btrfs_io_bio_alloc(1); bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -1898,37 +1891,31 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { - struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; struct scrub_bio *sbio; int ret; - mutex_lock(&wr_ctx->wr_lock); + mutex_lock(&sctx->wr_lock); again: - if (!wr_ctx->wr_curr_bio) { - wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + if (!sctx->wr_curr_bio) { + sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), GFP_KERNEL); - if (!wr_ctx->wr_curr_bio) { - mutex_unlock(&wr_ctx->wr_lock); + if (!sctx->wr_curr_bio) { + mutex_unlock(&sctx->wr_lock); return -ENOMEM; } - wr_ctx->wr_curr_bio->sctx = sctx; - wr_ctx->wr_curr_bio->page_count = 0; + sctx->wr_curr_bio->sctx = sctx; + sctx->wr_curr_bio->page_count = 0; } - sbio = wr_ctx->wr_curr_bio; + sbio = sctx->wr_curr_bio; if (sbio->page_count == 0) { struct bio *bio; sbio->physical = spage->physical_for_dev_replace; sbio->logical = spage->logical; - sbio->dev = wr_ctx->tgtdev; + sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_KERNEL, - wr_ctx->pages_per_wr_bio); - if (!bio) { - mutex_unlock(&wr_ctx->wr_lock); - return -ENOMEM; - } + bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); sbio->bio = bio; } @@ -1937,7 +1924,7 @@ again: bio->bi_bdev = 
sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -1951,7 +1938,7 @@ again: if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; - mutex_unlock(&wr_ctx->wr_lock); + mutex_unlock(&sctx->wr_lock); return -EIO; } scrub_wr_submit(sctx); @@ -1961,23 +1948,22 @@ again: sbio->pagev[sbio->page_count] = spage; scrub_page_get(spage); sbio->page_count++; - if (sbio->page_count == wr_ctx->pages_per_wr_bio) + if (sbio->page_count == sctx->pages_per_wr_bio) scrub_wr_submit(sctx); - mutex_unlock(&wr_ctx->wr_lock); + mutex_unlock(&sctx->wr_lock); return 0; } static void scrub_wr_submit(struct scrub_ctx *sctx) { - struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; struct scrub_bio *sbio; - if (!wr_ctx->wr_curr_bio) + if (!sctx->wr_curr_bio) return; - sbio = wr_ctx->wr_curr_bio; - wr_ctx->wr_curr_bio = NULL; + sbio = sctx->wr_curr_bio; + sctx->wr_curr_bio = NULL; WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. 
Then the block layer @@ -1992,7 +1978,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -2007,7 +1993,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) int i; WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); - if (sbio->err) { + if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -2081,7 +2067,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) page = sblock->pagev[0]->page; buffer = kmap_atomic(page); - len = sctx->sectorsize; + len = sctx->fs_info->sectorsize; index = 0; for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); @@ -2146,7 +2132,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) sblock->header_error = 1; - len = sctx->nodesize - BTRFS_CSUM_SIZE; + len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; index = 0; @@ -2329,10 +2315,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_KERNEL, - sctx->pages_per_rd_bio); - if (!bio) - return -ENOMEM; + bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio); sbio->bio = bio; } @@ -2341,7 +2324,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -2377,7 +2360,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) sblock->no_io_error_seen = 0; bio_put(bio); @@ -2420,10 
+2403,10 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) scrub_block_put(sblock); if (sctx->is_dev_replace && - atomic_read(&sctx->wr_ctx.flush_all_writes)) { - mutex_lock(&sctx->wr_ctx.wr_lock); + atomic_read(&sctx->flush_all_writes)) { + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); } scrub_pending_bio_dec(sctx); @@ -2458,10 +2441,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) goto bbio_out; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - goto bbio_out; - + bio = btrfs_io_bio_alloc(0); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; @@ -2588,7 +2568,7 @@ static void scrub_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2601,7 +2581,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); - if (sbio->err) { + if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -2628,10 +2608,10 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) spin_unlock(&sctx->list_lock); if (sctx->is_dev_replace && - atomic_read(&sctx->wr_ctx.flush_all_writes)) { - mutex_lock(&sctx->wr_ctx.wr_lock); + atomic_read(&sctx->flush_all_writes)) { + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); } scrub_pending_bio_dec(sctx); @@ -2726,8 +2706,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) if (!sum) return 0; - index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; - num_sectors = sum->len / sctx->sectorsize; + index = ((u32)(logical - sum->bytenr)) / 
sctx->fs_info->sectorsize; + num_sectors = sum->len / sctx->fs_info->sectorsize; memcpy(csum, sum->sums + index, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); @@ -2746,19 +2726,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed++; sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->nodesize; + blocksize = sctx->fs_info->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; sctx->stat.tree_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; WARN_ON(1); } @@ -2892,11 +2872,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, } if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->nodesize; + blocksize = sctx->fs_info->nodesize; } else { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; WARN_ON(1); } @@ -3004,7 +2984,7 @@ static void scrub_parity_bio_endio(struct bio *bio) struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -3037,10 +3017,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (ret || !bbio || !bbio->raid_map) goto bbio_out; - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - goto bbio_out; - + bio = btrfs_io_bio_alloc(0); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = 
scrub_parity_bio_endio; @@ -3305,9 +3282,9 @@ out: logic_end - logic_start); scrub_parity_put(sparity); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); btrfs_release_path(path); return ret < 0 ? ret : 0; @@ -3463,14 +3440,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ - atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + atomic_set(&sctx->flush_all_writes, 1); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_set(&sctx->flush_all_writes, 0); scrub_blocked_if_needed(fs_info); } @@ -3677,9 +3654,9 @@ skip: out: /* push queued extents */ scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); blk_finish_plug(&plug); btrfs_free_path(path); @@ -3859,7 +3836,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ btrfs_wait_block_group_reservations(cache); btrfs_wait_nocow_writers(cache); - ret = btrfs_wait_ordered_roots(fs_info, -1, + ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->key.objectid, cache->key.offset); if (ret > 0) { @@ -3916,11 +3893,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * write requests are really completed when bios_in_flight * changes to 0. 
*/ - atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + atomic_set(&sctx->flush_all_writes, 1); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); @@ -3934,7 +3911,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_set(&sctx->flush_all_writes, 0); scrub_pause_off(fs_info); @@ -4337,32 +4314,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, btrfs_put_bbio(bbio); } -static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx, - struct btrfs_device *dev, - int is_dev_replace) -{ - WARN_ON(wr_ctx->wr_curr_bio != NULL); - - mutex_init(&wr_ctx->wr_lock); - wr_ctx->wr_curr_bio = NULL; - if (!is_dev_replace) - return 0; - - WARN_ON(!dev->bdev); - wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; - wr_ctx->tgtdev = dev; - atomic_set(&wr_ctx->flush_all_writes, 0); - return 0; -} - -static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) -{ - mutex_lock(&wr_ctx->wr_lock); - kfree(wr_ctx->wr_curr_bio); - wr_ctx->wr_curr_bio = NULL; - mutex_unlock(&wr_ctx->wr_lock); -} - static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace) { @@ -4665,7 +4616,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, struct btrfs_device *dev; int ret; - dev = sctx->wr_ctx.tgtdev; + dev = sctx->wr_tgtdev; if (!dev) return -EIO; if (!dev->bdev) { @@ -4673,13 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } + bio = btrfs_io_bio_alloc(1); 
bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fc496a6f842a..e937c10b8287 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1069,6 +1069,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } } + ret = btrfs_is_name_len_valid(eb, path->slots[0], + (unsigned long)(di + 1), name_len + data_len); + if (!ret) { + ret = -EIO; + goto out; + } if (name_len + data_len > buf_len) { buf_len = name_len + data_len; if (is_vmalloc_addr(buf)) { @@ -1083,7 +1089,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, buf = tmp; } if (!buf) { - buf = vmalloc(buf_len); + buf = kvmalloc(buf_len, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; @@ -2769,15 +2775,20 @@ out: struct recorded_ref { struct list_head list; - char *dir_path; char *name; struct fs_path *full_path; u64 dir; u64 dir_gen; - int dir_path_len; int name_len; }; +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; +} + /* * We need to process new refs before deleted refs, but compare_tree gives us * everything mixed. So we first record all refs and later process them. 
@@ -2794,17 +2805,7 @@ static int __record_ref(struct list_head *head, u64 dir, ref->dir = dir; ref->dir_gen = dir_gen; - ref->full_path = path; - - ref->name = (char *)kbasename(ref->full_path->start); - ref->name_len = ref->full_path->end - ref->name; - ref->dir_path = ref->full_path->start; - if (ref->name == ref->full_path->start) - ref->dir_path_len = 0; - else - ref->dir_path_len = ref->full_path->end - - ref->full_path->start - 1 - ref->name_len; - + set_ref_path(ref, path); list_add_tail(&ref->list, head); return 0; } @@ -3546,9 +3547,17 @@ static int is_ancestor(struct btrfs_root *root, struct fs_path *fs_path) { u64 ino = ino2; + bool free_path = false; + int ret = 0; + + if (!fs_path) { + fs_path = fs_path_alloc(); + if (!fs_path) + return -ENOMEM; + free_path = true; + } while (ino > BTRFS_FIRST_FREE_OBJECTID) { - int ret; u64 parent; u64 parent_gen; @@ -3557,13 +3566,18 @@ static int is_ancestor(struct btrfs_root *root, if (ret < 0) { if (ret == -ENOENT && ino == ino2) ret = 0; - return ret; + goto out; + } + if (parent == ino1) { + ret = parent_gen == ino1_gen ? 1 : 0; + goto out; } - if (parent == ino1) - return parent_gen == ino1_gen ? 1 : 0; ino = parent; } - return 0; + out: + if (free_path) + fs_path_free(fs_path); + return ret; } static int wait_for_parent_move(struct send_ctx *sctx, @@ -3686,6 +3700,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int is_orphan = 0; u64 last_dir_ino_rm = 0; bool can_rename = true; + bool orphanized_ancestor = false; btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); @@ -3837,9 +3852,16 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * might contain the pre-orphanization name of * ow_inode, which is no longer valid. 
*/ - fs_path_reset(valid_path); - ret = get_cur_path(sctx, sctx->cur_ino, - sctx->cur_inode_gen, valid_path); + ret = is_ancestor(sctx->parent_root, + ow_inode, ow_gen, + sctx->cur_ino, NULL); + if (ret > 0) { + orphanized_ancestor = true; + fs_path_reset(valid_path); + ret = get_cur_path(sctx, sctx->cur_ino, + sctx->cur_inode_gen, + valid_path); + } if (ret < 0) goto out; } else { @@ -3960,6 +3982,43 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (!ret) { + /* + * If we orphanized any ancestor before, we need + * to recompute the full path for deleted names, + * since any such path was computed before we + * processed any references and orphanized any + * ancestor inode. + */ + if (orphanized_ancestor) { + struct fs_path *new_path; + + /* + * Our reference's name member points to + * its full_path member string, so we + * use here a new path. + */ + new_path = fs_path_alloc(); + if (!new_path) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, + new_path); + if (ret < 0) { + fs_path_free(new_path); + goto out; + } + ret = fs_path_add(new_path, + cur->name, + cur->name_len); + if (ret < 0) { + fs_path_free(new_path); + goto out; + } + fs_path_free(cur->full_path); + set_ref_path(cur, new_path); + } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; @@ -6397,13 +6456,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); if (!sctx->clone_roots) { - sctx->clone_roots = vzalloc(alloc_size); - if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4f1cdd5058f1..74e47794e63f 100644 --- 
a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -601,18 +601,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_alloc_start: - num = match_strdup(&args[0]); - if (num) { - mutex_lock(&info->chunk_mutex); - info->alloc_start = memparse(num, NULL); - mutex_unlock(&info->chunk_mutex); - kfree(num); - btrfs_info(info, "allocations start at %llu", - info->alloc_start); - } else { - ret = -ENOMEM; - goto out; - } + btrfs_info(info, + "option alloc_start is obsolete, ignored"); break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL @@ -1187,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1232,8 +1222,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); - if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); @@ -1716,7 +1704,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_opts = fs_info->mount_opt; unsigned long old_compress_type = fs_info->compress_type; u64 old_max_inline = fs_info->max_inline; - u64 old_alloc_start = fs_info->alloc_start; int old_thread_pool_size = fs_info->thread_pool_size; unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; @@ -1855,9 +1842,6 @@ restore: fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; - mutex_lock(&fs_info->chunk_mutex); - fs_info->alloc_start = old_alloc_start; - mutex_unlock(&fs_info->chunk_mutex); btrfs_resize_thread_pool(fs_info, 
old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; @@ -1898,18 +1882,15 @@ static inline void btrfs_descending_sort_devices( static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 skip_space; u64 type; u64 avail_space; - u64 used_space; u64 min_stripe_size; int min_stripes = 1, num_stripes = 1; int i = 0, nr_devices; - int ret; /* * We aren't under the device list lock, so this is racy-ish, but good @@ -1927,12 +1908,12 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, } devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), - GFP_NOFS); + GFP_KERNEL); if (!devices_info) return -ENOMEM; /* calc min stripe number for data space allocation */ - type = btrfs_get_alloc_profile(root, 1); + type = btrfs_data_alloc_profile(fs_info); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; num_stripes = nr_devices; @@ -1949,8 +1930,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, else min_stripe_size = BTRFS_STRIPE_LEN; - if (fs_info->alloc_start) - mutex_lock(&fs_devices->device_list_mutex); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (!device->in_fs_metadata || !device->bdev || @@ -1973,34 +1952,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, */ skip_space = SZ_1M; - /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start && - fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) { - rcu_read_unlock(); - skip_space = max(fs_info->alloc_start, skip_space); - - /* - * btrfs can not use the free space in - * [0, skip_space - 1], we must subtract it from the - * total. In order to implement it, we account the used - * space in this range first. 
- */ - ret = btrfs_account_dev_extents_size(device, 0, - skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - mutex_unlock(&fs_devices->device_list_mutex); - return ret; - } - - rcu_read_lock(); - - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; - } - /* * we can use the free space in [0, skip_space - 1], subtract * it from the total. @@ -2019,8 +1970,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, i++; } rcu_read_unlock(); - if (fs_info->alloc_start) - mutex_unlock(&fs_devices->device_list_mutex); nr_devices = i; @@ -2057,10 +2006,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, * multiplier to scale the sizes. * * Unused device space usage is based on simulating the chunk allocator - * algorithm that respects the device sizes, order of allocations and the - * 'alloc_start' value, this is a close approximation of the actual use but - * there are other factors that may change the result (like a new metadata - * chunk). + * algorithm that respects the device sizes and order of allocations. This is + * a close approximation of the actual use but there are other factors that may + * change the result (like a new metadata chunk). * * If metadata is exhausted, f_bavail will be 0. 
*/ @@ -2243,7 +2191,7 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - fs_info->fs_frozen = 1; + set_bit(BTRFS_FS_FROZEN, &fs_info->flags); /* * We don't need a barrier here, we'll wait for any transaction that * could be in progress on other threads (and do delayed iputs that @@ -2262,7 +2210,9 @@ static int btrfs_freeze(struct super_block *sb) static int btrfs_unfreeze(struct super_block *sb) { - btrfs_sb(sb)->fs_frozen = 0; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1f157fba8940..c2d5f3580b4c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +static ssize_t quota_override_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int quota_override; + + quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); +} + +static ssize_t quota_override_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long knob; + int err; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + if (knob) + set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + else + clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + + return len; +} + +BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), BTRFS_ATTR_PTR(clone_alignment), 
+ BTRFS_ATTR_PTR(quota_override), NULL, }; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 133753232a94..d06b1c931d05 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize) return -ENOMEM; } - extent_io_tree_init(&tmp, &inode->i_data); + extent_io_tree_init(&tmp, inode); /* * First go through and create and mark all of our pages dirty, we pin diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2168654c90a1..f615d59b0489 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) btrfs_put_block_group_trimming(cache); btrfs_put_block_group(cache); } - kmem_cache_free(btrfs_transaction_cachep, transaction); + kfree(transaction); } } @@ -228,7 +228,7 @@ loop: */ BUG_ON(type == TRANS_JOIN_NOLOCK); - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -238,11 +238,11 @@ loop: * someone started a transaction after we unlocked. 
Make sure * to redo the checks above */ - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); goto loop; } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); return -EROFS; } @@ -294,7 +294,7 @@ loop: spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, - fs_info->btree_inode->i_mapping); + fs_info->btree_inode); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -1374,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans, fs_info); if (ret) goto out; - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret < 0) - goto out; ret = btrfs_qgroup_account_extents(trans, fs_info); if (ret < 0) goto out; @@ -1926,7 +1923,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static inline void @@ -2180,13 +2177,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } - /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. @@ -2314,7 +2304,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * it'll result in deadlock about SB_FREEZE_FS. 
*/ if (current != fs_info->transaction_kthread && - current != fs_info->cleaner_kthread && !fs_info->fs_frozen) + current != fs_info->cleaner_kthread && + !test_bit(BTRFS_FS_FROZEN, &fs_info->flags)) btrfs_run_delayed_iputs(fs_info); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ccfe9fe7754a..f20ef211a73d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1175,15 +1175,19 @@ next: return 0; } -static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, - u64 *parent_objectid) +static int extref_get_fields(struct extent_buffer *eb, int slot, + unsigned long ref_ptr, u32 *namelen, char **name, + u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ref_ptr; *namelen = btrfs_inode_extref_name_len(eb, extref); + if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name, + *namelen)) + return -EIO; + *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1198,14 +1202,19 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, return 0; } -static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) +static int ref_get_fields(struct extent_buffer *eb, int slot, + unsigned long ref_ptr, u32 *namelen, char **name, + u64 *index) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ref_ptr; *namelen = btrfs_inode_ref_name_len(eb, ref); + if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1), + *namelen)) + return -EIO; + *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1280,8 +1289,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index, &parent_objectid); + ret = extref_get_fields(eb, slot, ref_ptr, &namelen, + &name, &ref_index, 
&parent_objectid); /* * parent object can change from one array * item to another. @@ -1293,8 +1302,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, slot, ref_ptr, &namelen, + &name, &ref_index); } if (ret) goto out; @@ -1841,7 +1850,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, di)) + if (verify_dir_item(fs_info, eb, slot, di)) return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); @@ -2017,7 +2026,7 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, di)) { + if (verify_dir_item(fs_info, eb, slot, di)) { ret = -EIO; goto out; } @@ -2102,6 +2111,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_path *path, const u64 ino) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key search_key; struct btrfs_path *log_path; int i; @@ -2143,6 +2153,12 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; + ret = verify_dir_item(fs_info, path->nodes[0], + path->slots[0], di); + if (ret) { + ret = -EIO; + goto out; + } name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; @@ -4546,6 +4562,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, this_len = sizeof(*extref) + this_name_len; } + ret = btrfs_is_name_len_valid(eb, slot, name_ptr, + this_name_len); + if (!ret) { + ret = -EIO; + goto out; + } if (this_name_len > name_len) { char *new_name; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 017b67daa3bb..5eb7217738ed 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -242,6 +242,17 @@ static struct btrfs_device *__alloc_device(void) if 
(!dev) return ERR_PTR(-ENOMEM); + /* + * Preallocate a bio that's always going to be used for flushing device + * barriers and matches the device lifespan + */ + dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); + if (!dev->flush_bio) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + bio_get(dev->flush_bio); + INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->resized_list); @@ -838,6 +849,7 @@ static void __free_device(struct work_struct *work) device = container_of(work, struct btrfs_device, rcu_work); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -1353,15 +1365,13 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, int ret; int slot; struct extent_buffer *l; - u64 min_search_start; /* * We don't want to overwrite the superblock on the drive nor any area * used by the boot loader (grub for example), so we make sure to start * at an offset of at least 1MB. */ - min_search_start = max(fs_info->alloc_start, 1024ull * 1024); - search_start = max(search_start, min_search_start); + search_start = max_t(u64, search_start, SZ_1M); path = btrfs_alloc_path(); if (!path) @@ -2387,7 +2397,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = i_size_read(bdev->bd_inode); + device->total_bytes = round_down(i_size_read(bdev->bd_inode), + fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; device->fs_info = fs_info; @@ -2417,16 +2428,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->fs_devices->total_devices++; fs_info->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes; - 
spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!blk_queue_nonrot(q)) fs_info->fs_devices->rotating = 1; tmp = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, - tmp + device->total_bytes); + round_down(tmp + device->total_bytes, fs_info->sectorsize)); tmp = btrfs_super_num_devices(fs_info->super_copy); btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); @@ -2574,7 +2583,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } - name = rcu_string_strdup(device_path, GFP_NOFS); + name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { kfree(device); ret = -ENOMEM; @@ -2689,6 +2698,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, if (!device->writeable) return -EACCES; + new_size = round_down(new_size, fs_info->sectorsize); + mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); diff = new_size - device->total_bytes; @@ -2701,7 +2712,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, fs_devices = fs_info->fs_devices; - btrfs_set_super_total_bytes(super_copy, old_total + diff); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; btrfs_device_set_total_bytes(device, new_size); @@ -2874,9 +2886,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_bytes_used(device, device->bytes_used - dev_extent_len); - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += dev_extent_len; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } @@ -4393,7 +4403,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = 
btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); - u64 diff = old_size - new_size; + u64 diff; + + new_size = round_down(new_size, fs_info->sectorsize); + diff = old_size - new_size; if (device->is_tgtdev_for_dev_replace) return -EINVAL; @@ -4409,9 +4422,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space -= diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_sub(diff, &fs_info->free_chunk_space); } mutex_unlock(&fs_info->chunk_mutex); @@ -4522,7 +4533,8 @@ again: &fs_info->fs_devices->resized_devices); WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); /* Now btrfs_update_device() will change the on-disk size. 
*/ @@ -4535,9 +4547,7 @@ done: btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -4882,9 +4892,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); } - spin_lock(&info->free_chunk_lock); - info->free_chunk_space -= (stripe_size * map->num_stripes); - spin_unlock(&info->free_chunk_lock); + atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); @@ -5029,20 +5037,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_root *extent_root = fs_info->extent_root; u64 chunk_offset; u64 sys_chunk_offset; u64 alloc_profile; int ret; chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(extent_root, 0); + alloc_profile = btrfs_metadata_alloc_profile(fs_info); ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); if (ret) return ret; sys_chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); + alloc_profile = btrfs_system_alloc_profile(fs_info); ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); return ret; } @@ -6042,9 +6049,10 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (bio->bi_error) { + if (bio->bi_status) { atomic_inc(&bbio->error); - if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { + if (bio->bi_status == BLK_STS_IOERR || + bio->bi_status == BLK_STS_TARGET) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; 
struct btrfs_device *dev; @@ -6082,13 +6090,13 @@ static void btrfs_end_bio(struct bio *bio) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_error = 0; + bio->bi_status = 0; } btrfs_end_bbio(bbio, bio); @@ -6199,7 +6207,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; btrfs_end_bbio(bbio, bio); } } @@ -6266,10 +6274,9 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, continue; } - if (dev_nr < total_devs - 1) { - bio = btrfs_bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); /* -ENOMEM */ - } else + if (dev_nr < total_devs - 1) + bio = btrfs_bio_clone(first_bio); + else bio = first_bio; submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, @@ -6684,10 +6691,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes - - device->bytes_used; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes - device->bytes_used, + &fs_info->free_chunk_space); } ret = 0; return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c7d0fbc915ca..6f45fd60d15a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -74,6 +74,8 @@ struct btrfs_device { int missing; int can_discard; int is_tgtdev_for_dev_replace; + int last_flush_error; + int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -279,6 +281,11 @@ struct btrfs_io_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; 
u8 *csum_allocated; btrfs_io_bio_end_io_t *end_io; + struct bvec_iter iter; + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_io_bio but relies on bio being last. + */ struct bio bio; }; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b3cbf80c5acf..2c7e53f9ff1b 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -336,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); - if (verify_dir_item(fs_info, leaf, di)) { + if (verify_dir_item(fs_info, leaf, slot, di)) { ret = -EIO; goto err; } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 135b10823c6d..c248f9286366 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -24,12 +24,13 @@ #include <linux/slab.h> #include <linux/zlib.h> #include <linux/zutil.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> +#include <linux/refcount.h> #include "compression.h" struct workspace { @@ -42,7 +43,7 @@ static void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->strm.workspace); + kvfree(workspace->strm.workspace); kfree(workspace->buf); kfree(workspace); } @@ -52,14 +53,14 @@ static struct list_head *zlib_alloc_workspace(void) struct workspace *workspace; int workspacesize; - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = vmalloc(workspacesize); - workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS); + workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->buf = kmalloc(PAGE_SIZE, 
GFP_KERNEL); if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -211,10 +212,7 @@ out: return ret; } -static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen) +static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; @@ -222,8 +220,12 @@ static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, char *data_in; size_t total_out = 0; unsigned long page_in_index = 0; + size_t srclen = cb->compressed_len; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; diff --git a/fs/buffer.c b/fs/buffer.c index 161be58c5cb0..ea0e05ec2916 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,7 +49,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -178,7 +178,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost sync page write"); - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } unlock_buffer(bh); @@ -352,8 +352,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); - mapping_set_error(page->mapping, -EIO); - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); SetPageError(page); } @@ -481,8 +480,6 @@ static void 
__remove_assoc_queue(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); WARN_ON(!bh->b_assoc_map); - if (buffer_write_io_error(bh)) - set_bit(AS_EIO, &bh->b_assoc_map->flags); bh->b_assoc_map = NULL; } @@ -1181,6 +1178,17 @@ void mark_buffer_dirty(struct buffer_head *bh) } EXPORT_SYMBOL(mark_buffer_dirty); +void mark_buffer_write_io_error(struct buffer_head *bh) +{ + set_buffer_write_io_error(bh); + /* FIXME: do we need to set this in both places? */ + if (bh->b_page && bh->b_page->mapping) + mapping_set_error(bh->b_page->mapping, -EIO); + if (bh->b_assoc_map) + mapping_set_error(bh->b_assoc_map, -EIO); +} +EXPORT_SYMBOL(mark_buffer_write_io_error); + /* * Decrement a buffer_head's reference count. If all buffers against a page * have zero reference count, are clean and unlocked, and if the page is clean @@ -1829,7 +1837,8 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1883,7 +1892,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -3021,11 +3031,11 @@ EXPORT_SYMBOL(block_write_full_page); sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { - struct buffer_head tmp; struct inode *inode = mapping->host; - tmp.b_state = 0; - tmp.b_blocknr = 0; - tmp.b_size = i_blocksize(inode); + struct buffer_head tmp = { + .b_size = i_blocksize(inode), + }; + get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } @@ -3038,7 +3048,7 @@ static void end_bio_bh_io_sync(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, 
&bh->b_state); - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } @@ -3091,7 +3101,7 @@ void guard_bio_eod(int op, struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - struct writeback_control *wbc) + enum rw_hint write_hint, struct writeback_control *wbc) { struct bio *bio; @@ -3120,6 +3130,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; + bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); BUG_ON(bio->bi_iter.bi_size != bh->b_size); @@ -3142,7 +3153,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, NULL); + return submit_bh_wbc(op, op_flags, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); @@ -3279,8 +3290,6 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = head; do { - if (buffer_write_io_error(bh) && page->mapping) - mapping_set_error(page->mapping, -EIO); if (buffer_busy(bh)) goto failed; bh = bh->b_this_page; @@ -3492,6 +3501,130 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); +/* + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. + * + * Returns the offset within the file on success, and -ENOENT otherwise. + */ +static loff_t +page_seek_hole_data(struct page *page, loff_t lastoff, int whence) +{ + loff_t offset = page_offset(page); + struct buffer_head *bh, *head; + bool seek_data = whence == SEEK_DATA; + + if (lastoff < offset) + lastoff = offset; + + bh = head = page_buffers(page); + do { + offset += bh->b_size; + if (lastoff >= offset) + continue; + + /* + * Unwritten extents that have data in the page cache covering + * them can be identified by the BH_Unwritten state flag. 
+ * Pages with multiple buffers might have a mix of holes, data + * and unwritten extents - any buffer with valid data in it + * should have BH_Uptodate flag set on it. + */ + + if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data) + return lastoff; + + lastoff = offset; + } while ((bh = bh->b_this_page) != head); + return -ENOENT; +} + +/* + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * + * Within unwritten extents, the page cache determines which parts are holes + * and which are data: unwritten and uptodate buffer heads count as data; + * everything else counts as a hole. + * + * Returns the resulting offset on success, and -ENOENT otherwise. + */ +loff_t +page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, + int whence) +{ + pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); + loff_t lastoff = offset; + struct pagevec pvec; + + if (length <= 0) + return -ENOENT; + + pagevec_init(&pvec, 0); + + do { + unsigned want, nr_pages, i; + + want = min_t(unsigned, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + * + * If current page offset is beyond where we've ended, + * we've found a hole. + */ + if (whence == SEEK_HOLE && + lastoff < page_offset(page)) + goto check_range; + + /* Searching done if the page index is out of range. 
*/ + if (page->index >= end) + goto not_found; + + lock_page(page); + if (likely(page->mapping == inode->i_mapping) && + page_has_buffers(page)) { + lastoff = page_seek_hole_data(page, lastoff, whence); + if (lastoff >= 0) { + unlock_page(page); + goto check_range; + } + } + unlock_page(page); + lastoff = page_offset(page) + PAGE_SIZE; + } + + /* Searching done if fewer pages returned than wanted. */ + if (nr_pages < want) + break; + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index < end); + + /* When no page at lastoff and we are not done, we found a hole. */ + if (whence != SEEK_HOLE) + goto not_found; + +check_range: + if (lastoff < offset + length) + goto out; +not_found: + lastoff = -ENOENT; +out: + pagevec_release(&pvec); + return lastoff; +} + void __init buffer_init(void) { unsigned long nrpages; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 9bf90bcc56ac..bb3a02ca9da4 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -18,7 +18,7 @@ #include <linux/fscache-cache.h> #include <linux/timer.h> -#include <linux/wait.h> +#include <linux/wait_bit.h> #include <linux/cred.h> #include <linux/workqueue.h> #include <linux/security.h> @@ -97,7 +97,7 @@ struct cachefiles_cache { * backing file read tracking */ struct cachefiles_one_read { - wait_queue_t monitor; /* link into monitored waitqueue */ + wait_queue_entry_t monitor; /* link into monitored waitqueue */ struct page *back_page; /* backing file page we're waiting for */ struct page *netfs_page; /* netfs page we're going to fill */ struct fscache_retrieval *op; /* retrieval op covering this */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 41df8a27d7eb..3978b324cbca 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -204,7 +204,7 @@ wait_for_old_object: wait_queue_head_t *wq; signed long timeout = 60 * HZ; - wait_queue_t wait; + wait_queue_entry_t wait; bool requeue; /* if the object we're waiting for is 
queued for processing, diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index afbdc418966d..18d7aa61ef0f 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -21,7 +21,7 @@ * - we use this to detect read completion of backing pages * - the caller holds the waitqueue lock */ -static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, +static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, int sync, void *_key) { struct cachefiles_one_read *monitor = @@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, } /* remove from the waitqueue */ - list_del(&wait->task_list); + list_del(&wait->entry); /* move onto the action list and queue for FS-Cache thread pool */ ASSERT(monitor->op); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 034f00f21390..afeefe79c25e 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -146,6 +146,15 @@ config CIFS_DEBUG2 option can be turned off unless you are debugging cifs problems. If unsure, say N. +config CIFS_DEBUG_DUMP_KEYS + bool "Dump encryption keys for offline decryption (Unsafe)" + depends on CIFS_DEBUG && CIFS_SMB2 + help + Enabling this will dump the encryption and decryption keys + used to communicate on an encrypted share connection on the + console. This allows Wireshark to decrypt and dissect + encrypted network captures. Enable this carefully. 
+ config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS && KEYS diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index a0b3e7d1be48..e0445e2075b2 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -79,6 +79,10 @@ convert_sfu_char(const __u16 src_char, char *target) static bool convert_sfm_char(const __u16 src_char, char *target) { + if (src_char >= 0xF001 && src_char <= 0xF01F) { + *target = src_char - 0xF000; + return true; + } switch (src_char) { case SFM_COLON: *target = ':'; @@ -417,6 +421,10 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string) { __le16 dest_char; + if (src_char >= 0x01 && src_char <= 0x1F) { + dest_char = cpu_to_le16(src_char + 0xF000); + return dest_char; + } switch (src_char) { case ':': dest_char = cpu_to_le16(SFM_COLON); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fcef70602b27..bc09df6b473a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2234,14 +2234,16 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc) set_page_writeback(page); retry_write: rc = cifs_partialpagewrite(page, 0, PAGE_SIZE); - if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL) - goto retry_write; - else if (rc == -EAGAIN) + if (rc == -EAGAIN) { + if (wbc->sync_mode == WB_SYNC_ALL) + goto retry_write; redirty_page_for_writepage(wbc, page); - else if (rc != 0) + } else if (rc != 0) { SetPageError(page); - else + mapping_set_error(page->mapping, rc); + } else { SetPageUptodate(page); + } end_page_writeback(page); put_page(page); free_xid(xid); @@ -2810,12 +2812,12 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; ssize_t rc; + inode_lock(inode); /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. 
*/ down_read(&cinode->lock_sem); - inode_lock(inode); rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2828,11 +2830,11 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) else rc = -EACCES; out: + up_read(&cinode->lock_sem); inode_unlock(inode); if (rc > 0) rc = generic_write_sync(iocb, rc); - up_read(&cinode->lock_sem); return rc; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4d1fcd76d022..a8693632235f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -24,6 +24,7 @@ #include <linux/pagemap.h> #include <linux/freezer.h> #include <linux/sched/signal.h> +#include <linux/wait_bit.h> #include <asm/div64.h> #include "cifsfs.h" diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 7e48561abd29..ccbb397debbc 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1288,6 +1288,108 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +#ifdef CONFIG_CIFS_ACL +static struct cifs_ntsd * +get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, + const struct cifs_fid *cifsfid, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + unsigned int xid; + int rc = -EOPNOTSUPP; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + xid = get_xid(); + cifs_dbg(FYI, "trying to get acl\n"); + + rc = SMB2_query_acl(xid, tlink_tcon(tlink), cifsfid->persistent_fid, + cifsfid->volatile_fid, (void **)&pntsd, pacllen); + free_xid(xid); + + cifs_put_tlink(tlink); + + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; + +} + +static struct cifs_ntsd * +get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, + const char *path, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + unsigned int xid; + int rc; + struct cifs_tcon *tcon; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; + __le16 *utf16_path; + + cifs_dbg(FYI, "get smb3 acl for 
path %s\n", path); + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + tcon = tlink_tcon(tlink); + xid = get_xid(); + + if (backup_cred(cifs_sb)) + oparms.create_options = CREATE_OPEN_BACKUP_INTENT; + else + oparms.create_options = 0; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return ERR_PTR(-ENOMEM); + + oparms.tcon = tcon; + oparms.desired_access = READ_CONTROL; + oparms.disposition = FILE_OPEN; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + kfree(utf16_path); + if (!rc) { + rc = SMB2_query_acl(xid, tlink_tcon(tlink), fid.persistent_fid, + fid.volatile_fid, (void **)&pntsd, pacllen); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + } + + cifs_put_tlink(tlink); + free_xid(xid); + + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; +} + +/* Retrieve an ACL from the server */ +static struct cifs_ntsd * +get_smb2_acl(struct cifs_sb_info *cifs_sb, + struct inode *inode, const char *path, + u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + struct cifsFileInfo *open_file = NULL; + + if (inode) + open_file = find_readable_file(CIFS_I(inode), true); + if (!open_file) + return get_smb2_acl_by_path(cifs_sb, path, pacllen); + + pntsd = get_smb2_acl_by_fid(cifs_sb, &open_file->fid, pacllen); + cifsFileInfo_put(open_file); + return pntsd; +} +#endif + static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len, bool keep_size) { @@ -2393,6 +2495,11 @@ struct smb_version_operations smb20_operations = { .dir_needs_close = smb2_dir_needs_close, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_ACL + .get_acl = get_smb2_acl, + .get_acl_by_fid = get_smb2_acl_by_fid, +/* .set_acl = set_smb3_acl, */ +#endif /* CIFS_ACL */ }; struct smb_version_operations smb21_operations = { @@ -2477,6 +2584,11 @@ struct 
smb_version_operations smb21_operations = { .enum_snapshots = smb3_enum_snapshots, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_ACL + .get_acl = get_smb2_acl, + .get_acl_by_fid = get_smb2_acl_by_fid, +/* .set_acl = set_smb3_acl, */ +#endif /* CIFS_ACL */ }; struct smb_version_operations smb30_operations = { @@ -2571,6 +2683,11 @@ struct smb_version_operations smb30_operations = { .receive_transform = smb3_receive_transform, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_ACL + .get_acl = get_smb2_acl, + .get_acl_by_fid = get_smb2_acl_by_fid, +/* .set_acl = set_smb3_acl, */ +#endif /* CIFS_ACL */ }; #ifdef CONFIG_CIFS_SMB311 @@ -2753,7 +2870,7 @@ struct smb_version_values smb302_values = { struct smb_version_values smb311_values = { .version_string = SMB311_VERSION_STRING, .protocol_id = SMB311_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index e4afdaae743f..4938e8b6d32f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2081,8 +2081,9 @@ validate_and_copy_buf(unsigned int offset, unsigned int buffer_length, static int query_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, u8 info_class, - size_t output_len, size_t min_len, void *data) + u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type, + u32 additional_info, size_t output_len, size_t min_len, void **data, + u32 *dlen) { struct smb2_query_info_req *req; struct smb2_query_info_rsp *rsp = NULL; @@ -2108,10 +2109,11 @@ 
query_info(const unsigned int xid, struct cifs_tcon *tcon, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->InfoType = SMB2_O_INFO_FILE; + req->InfoType = info_type; req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; + req->AdditionalInformation = cpu_to_le32(additional_info); /* 4 for rfc1002 length field and 1 for Buffer */ req->InputBufferOffset = cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); @@ -2130,24 +2132,51 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, goto qinf_exit; } + if (dlen) { + *dlen = le32_to_cpu(rsp->OutputBufferLength); + if (!*data) { + *data = kmalloc(*dlen, GFP_KERNEL); + if (!*data) { + cifs_dbg(VFS, + "Error %d allocating memory for acl\n", + rc); + *dlen = 0; + goto qinf_exit; + } + } + } + rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp->hdr, min_len, data); + &rsp->hdr, min_len, *data); qinf_exit: free_rsp_buf(resp_buftype, rsp); return rc; } +int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data) +{ + return query_info(xid, tcon, persistent_fid, volatile_fid, + FILE_ALL_INFORMATION, SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + PATH_MAX * 2, + sizeof(struct smb2_file_all_info), (void **)&data, + NULL); +} + int -SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, +SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, - struct smb2_file_all_info *data) + void **data, u32 *plen) { + __u32 additional_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; + *plen = 0; + return query_info(xid, tcon, persistent_fid, volatile_fid, - FILE_ALL_INFORMATION, - sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - sizeof(struct smb2_file_all_info), data); + 0, SMB2_O_INFO_SECURITY, additional_info, + SMB2_MAX_BUFFER_SIZE, + 
sizeof(struct smb2_file_all_info), data, plen); } int @@ -2155,9 +2184,10 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid) { return query_info(xid, tcon, persistent_fid, volatile_fid, - FILE_INTERNAL_INFORMATION, + FILE_INTERNAL_INFORMATION, SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_internal_info), sizeof(struct smb2_file_internal_info), - sizeof(struct smb2_file_internal_info), uniqueid); + (void **)&uniqueid, NULL); } /* diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 6853454fc871..3595cd755147 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -135,6 +135,9 @@ extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct smb2_file_all_info *data); +extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, + void **data, unsigned int *plen); extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index c69ec96e92ac..67367cf1f8cd 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -335,9 +335,31 @@ generate_smb3signingkey(struct cifs_ses *ses, if (rc) return rc; - return generate_key(ses, ptriplet->decryption.label, - ptriplet->decryption.context, - ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + rc = generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + + if (rc) + return rc; + +#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS + cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__); + /* + * The session id is opaque in terms of endianness, so we can't + * print it as a long long. 
we dump it as we got it on the wire + */ + cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid), + &ses->Suid); + cifs_dbg(VFS, "Session Key %*ph\n", + SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response); + cifs_dbg(VFS, "Signing Key %*ph\n", + SMB3_SIGN_KEY_SIZE, ses->smb3signingkey); + cifs_dbg(VFS, "ServerIn Key %*ph\n", + SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey); + cifs_dbg(VFS, "ServerOut Key %*ph\n", + SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey); +#endif + return rc; } int diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 47a125ece11e..7efbab013957 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -536,11 +536,14 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, list_add_tail(&mid->qhead, &server->pending_mid_q); spin_unlock(&GlobalMid_Lock); - + /* + * Need to store the time in mid before calling I/O. For call_async, + * I/O response may come back and free the mid entry on another thread. + */ + cifs_save_when_sent(mid); cifs_in_send_inc(server); rc = smb_send_rqst(server, rqst, flags); cifs_in_send_dec(server); - cifs_save_when_sent(mid); if (rc < 0) { server->sequence_number -= 2; diff --git a/fs/coda/file.c b/fs/coda/file.c index 9d956cd6d46f..363402fcb3ed 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -34,7 +34,7 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos); + return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); } static ssize_t @@ -51,7 +51,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); inode_lock(coda_inode); - ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); + ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = 
coda_inode->i_ctime = current_time(coda_inode); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6116d5275a3e..2dd4a7af7dd7 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -739,23 +739,22 @@ static int do_i2c_smbus_ioctl(struct file *file, unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; - compat_caddr_t datap; + union { + /* beginnings of those have identical layouts */ + struct i2c_smbus_ioctl_data32 data32; + struct i2c_smbus_ioctl_data data; + } v; tdata = compat_alloc_user_space(sizeof(*tdata)); if (tdata == NULL) return -ENOMEM; - if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) - return -EFAULT; - if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) + memset(&v, 0, sizeof(v)); + if (copy_from_user(&v.data32, udata, sizeof(v.data32))) return -EFAULT; + v.data.data = compat_ptr(v.data32.data); - if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8))) - return -EFAULT; - if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32))) - return -EFAULT; - if (__get_user(datap, &udata->data) || - __put_user(compat_ptr(datap), &tdata->data)) + if (copy_to_user(tdata, &v.data, sizeof(v.data))) return -EFAULT; return do_ioctl(file, cmd, (unsigned long)tdata); @@ -866,8 +865,6 @@ COMPATIBLE_IOCTL(TIOCGDEV) COMPATIBLE_IOCTL(TIOCCBRK) COMPATIBLE_IOCTL(TIOCGSID) COMPATIBLE_IOCTL(TIOCGICOUNT) -COMPATIBLE_IOCTL(TIOCGPKT) -COMPATIBLE_IOCTL(TIOCGPTLCK) COMPATIBLE_IOCTL(TIOCGEXCL) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) @@ -883,16 +880,12 @@ COMPATIBLE_IOCTL(TIOCMGET) COMPATIBLE_IOCTL(TIOCMBIC) COMPATIBLE_IOCTL(TIOCMBIS) COMPATIBLE_IOCTL(TIOCMSET) -COMPATIBLE_IOCTL(TIOCPKT) COMPATIBLE_IOCTL(TIOCNOTTY) COMPATIBLE_IOCTL(TIOCSTI) COMPATIBLE_IOCTL(TIOCOUTQ) COMPATIBLE_IOCTL(TIOCSPGRP) COMPATIBLE_IOCTL(TIOCGPGRP) -COMPATIBLE_IOCTL(TIOCGPTN) -COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) -COMPATIBLE_IOCTL(TIOCSIG) #ifdef TIOCSRS485 COMPATIBLE_IOCTL(TIOCSRS485) 
#endif diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 08b46e6e3995..02b7d91c9231 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -7,6 +7,7 @@ config FS_ENCRYPTION select CRYPTO_XTS select CRYPTO_CTS select CRYPTO_CTR + select CRYPTO_SHA256 select KEYS help Enable encryption of files and directories. This diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a409a84f1bca..6181e9526860 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, goto errout; } err = submit_bio_wait(bio); - if ((err == 0) && bio->bi_error) + if (err == 0 && bio->bi_status) err = -EIO; bio_put(bio); if (err) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 6d6eca394d4d..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/dcache.h> #include <linux/namei.h> +#include <crypto/aes.h> #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; @@ -147,8 +148,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -158,6 +159,16 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, BUG_ON(len == 0); + BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -170,15 +181,11 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, req, CRYPTO_TFM_REQ_MAY_BACKLOG | 
CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(lblk_num); - memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); sg_set_page(&src, src_page, len, offs); - skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -477,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d1bb02b1ee58..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -453,12 +453,3 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 1e1f8a361b75..a1d5021c31ef 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,10 +12,13 @@ #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt_supp.h> +#include <crypto/hash.h> /* Encryption parameters */ -#define FS_XTS_TWEAK_SIZE 16 +#define FS_IV_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 #define FS_AES_256_GCM_KEY_SIZE 32 #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 @@ -54,6 +57,7 @@ struct fscrypt_info { u8 ci_filename_mode; u8 ci_flags; struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; }; @@ -87,4 +91,7 
@@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 179e578b875b..018c588c7ac3 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,8 +10,13 @@ #include <keys/user-type.h> #include <linux/scatterlist.h> +#include <linux/ratelimit.h> +#include <crypto/aes.h> +#include <crypto/sha.h> #include "fscrypt_private.h" +static struct crypto_shash *essiv_hash_tfm; + static void derive_crypt_complete(struct crypto_async_request *req, int rc) { struct fscrypt_completion_result *ecr = req->data; @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. 
*/ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,7 +82,7 @@ out: static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - const char *prefix) + const char *prefix, int min_keysize) { char *description; struct key *keyring_key; @@ -111,50 +116,60 @@ static int validate_user_key(struct fscrypt_info *crypt_info, master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE 
}, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { - if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; + if (S_ISREG(inode->i_mode)) { + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + return 0; } static void put_crypt_info(struct 
fscrypt_info *ci) @@ -163,9 +178,76 @@ static void put_crypt_info(struct fscrypt_info *ci) return; crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. 
+ */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; @@ -212,6 +294,7 @@ int fscrypt_get_encryption_info(struct inode *inode) crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -228,10 +311,12 @@ int fscrypt_get_encryption_info(struct inode *inode) if (!raw_key) goto out; - res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { int res2 = validate_user_key(crypt_info, &ctx, raw_key, - inode->i_sb->s_cop->key_prefix); + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -243,18 +328,30 @@ int fscrypt_get_encryption_info(struct inode *inode) ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); + pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", + __func__, res, inode->i_ino); goto out; } crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + /* + * if the provided key is longer than keysize, we use the first + * keysize bytes of the derived key only + */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; + if (S_ISREG(inode->i_mode) && + crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { + res = init_essiv_generator(crypt_info, raw_key, keysize); + if (res) { + pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", + __func__, res, inode->i_ino); + goto out; + } + } if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) crypt_info = NULL; out: diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 210976e7a269..ce07a86200f3 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -38,12 +38,8 @@ static int create_encryption_context_from_policy(struct inode *inode, memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) - return -EINVAL; - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; if (policy->flags & ~FS_POLICY_FLAGS_VALID) @@ -260,6 +256,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, memcpy(ctx.master_key_descriptor, ci->ci_master_key, FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); if (res) @@ -25,7 +25,6 @@ #include 
<linux/mm.h> #include <linux/mutex.h> #include <linux/pagevec.h> -#include <linux/pmem.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/uio.h> @@ -84,7 +83,7 @@ struct exceptional_entry_key { }; struct wait_exceptional_entry_queue { - wait_queue_t wait; + wait_queue_entry_t wait; struct exceptional_entry_key key; }; @@ -108,7 +107,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; @@ -784,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev, } dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - wb_cache_pmem(kaddr, size); + dax_flush(dax_dev, pgoff, kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -856,8 +855,10 @@ int dax_writeback_mapping_range(struct address_space *mapping, ret = dax_writeback_one(bdev, dax_dev, mapping, indices[i], pvec.pages[i]); - if (ret < 0) + if (ret < 0) { + mapping_set_error(mapping, ret); goto out; + } } start_index = indices[pvec.nr - 1] + 1; } @@ -976,7 +977,8 @@ int __dax_zero_page_range(struct block_device *bdev, dax_read_unlock(id); return rc; } - clear_pmem(kaddr + offset, size); + memset(kaddr + offset, 0, size); + dax_flush(dax_dev, pgoff, kaddr + offset, size); dax_read_unlock(id); } return 0; @@ -1055,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, map_len = end - pos; if (iov_iter_rw(iter) == WRITE) - map_len = copy_from_iter_pmem(kaddr, map_len, iter); + map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, + map_len, iter); else map_len = copy_to_iter(kaddr, map_len, iter); if (map_len <= 0) { @@ -1213,7 +1216,7 @@ static int 
dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, diff --git a/fs/dcache.c b/fs/dcache.c index a9f995f6859e..7ece68d0d4db 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -277,6 +277,33 @@ static inline int dname_external(const struct dentry *dentry) return dentry->d_name.name != dentry->d_iname; } +void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (unlikely(dname_external(dentry))) { + struct external_name *p = external_name(dentry); + atomic_inc(&p->u.count); + spin_unlock(&dentry->d_lock); + name->name = p->name; + } else { + memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + spin_unlock(&dentry->d_lock); + name->name = name->inline_name; + } +} +EXPORT_SYMBOL(take_dentry_name_snapshot); + +void release_dentry_name_snapshot(struct name_snapshot *name) +{ + if (unlikely(name->name != name->inline_name)) { + struct external_name *p; + p = container_of(name->name, struct external_name, name[0]); + if (unlikely(atomic_dec_and_test(&p->u.count))) + kfree_rcu(p, u.head); + } +} +EXPORT_SYMBOL(release_dentry_name_snapshot); + static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) @@ -3546,8 +3573,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. 
*/ @@ -3559,24 +3584,19 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache. + * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); @@ -3590,14 +3610,11 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ @@ -3608,6 +3625,11 @@ EXPORT_SYMBOL(d_genocide); void __init vfs_caches_init_early(void) { + int i; + + for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) + INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); + dcache_init_early(); inode_init_early(); } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 354e2ab62031..6dabc4a10396 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -9,7 +9,7 @@ * 2 as published by the Free Software Foundation. * * debugfs is for people to use instead of /proc or /sys. - * See Documentation/DocBook/filesystems for more details. + * See Documentation/filesystems/ for more details. * */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e892ae7d89f8..a0e4e2f7e0be 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -9,7 +9,7 @@ * 2 as published by the Free Software Foundation. * * debugfs is for people to use instead of /proc or /sys. - * See Documentation/DocBook/kernel-api for more details. + * See ./Documentation/core-api/kernel-api.rst for more details. 
* */ @@ -766,7 +766,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, { int error; struct dentry *dentry = NULL, *trap; - const char *old_name; + struct name_snapshot old_name; trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ @@ -781,19 +781,19 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + take_dentry_name_snapshot(&old_name, old_dentry); error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), dentry, 0); if (error) { - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); goto exit; } d_move(old_dentry, dentry); - fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name, + fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name.name, d_is_dir(old_dentry), NULL, old_dentry); - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); unlock_rename(new_dir, old_dir); dput(dentry); return old_dentry; diff --git a/fs/direct-io.c b/fs/direct-io.c index a04ebea77de8..08cf27811e5a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work) dio_complete(dio, 0, true); } -static int dio_bio_complete(struct dio *dio, struct bio *bio); +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. @@ -348,13 +348,12 @@ static void dio_bio_end_io(struct bio *bio) /** * dio_end_io - handle the end io action for the given bio * @bio: The direct io bio thats being completed - * @error: Error if there was one * * This is meant to be called by any filesystem that uses their own dio_submit_t * so that the DIO specific endio actions are dealt with after the filesystem * has done it's completion work. 
*/ -void dio_end_io(struct bio *bio, int error) +void dio_end_io(struct bio *bio) { struct dio *dio = bio->bi_private; @@ -386,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, else bio->bi_end_io = dio_bio_end_io; + bio->bi_write_hint = dio->iocb->ki_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } @@ -474,17 +475,20 @@ static struct bio *dio_await_one(struct dio *dio) /* * Process one completed BIO. No locks are held. */ -static int dio_bio_complete(struct dio *dio, struct bio *bio) +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { struct bio_vec *bvec; unsigned i; - int err; + blk_status_t err = bio->bi_status; - if (bio->bi_error) - dio->io_error = -EIO; + if (err) { + if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) + dio->io_error = -EAGAIN; + else + dio->io_error = -EIO; + } if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { - err = bio->bi_error; bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { @@ -495,7 +499,6 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) set_page_dirty_lock(page); put_page(page); } - err = bio->bi_error; bio_put(bio); } return err; @@ -539,7 +542,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) bio = dio->bio_list; dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); - ret2 = dio_bio_complete(dio, bio); + ret2 = blk_status_to_errno(dio_bio_complete(dio, bio)); if (ret == 0) ret = ret2; } @@ -1197,6 +1200,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; dio->op_flags = REQ_SYNC | REQ_IDLE; + if (iocb->ki_flags & IOCB_NOWAIT) + dio->op_flags |= REQ_NOWAIT; } else { dio->op = REQ_OP_READ; } diff --git a/fs/eventfd.c b/fs/eventfd.c index 68b9fffcb2c8..2fb4eadaa118 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -191,7 +191,7 @@ static void 
eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. */ -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; @@ -215,8 +215,8 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); * * Returns %0 if successful, or the following error codes: * - * -EAGAIN : The operation would have blocked but @no_wait was non-zero. - * -ERESTARTSYS : A signal interrupted the wait operation. + * - -EAGAIN : The operation would have blocked but @no_wait was non-zero. + * - -ERESTARTSYS : A signal interrupted the wait operation. * * If @no_wait is zero, the function might sleep until the eventfd internal * counter becomes greater than zero. diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5420767c9b68..b1c8e23ddf65 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -244,7 +244,7 @@ struct eppoll_entry { * Wait queue item that will be linked to the target file wait * queue head. */ - wait_queue_t wait; + wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; @@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } -static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ -static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } @@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) * mechanism. 
It is called by the stored file descriptors when they * have events to report. */ -static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; unsigned long flags; @@ -1094,7 +1094,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * can't use __remove_wait_queue(). whead->lock is held by * the caller. */ - list_del_init(&wait->task_list); + list_del_init(&wait->entry); } spin_lock_irqsave(&ep->lock, flags); @@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int res = 0, eavail, timed_out = 0; unsigned long flags; u64 slack = 0; - wait_queue_t wait; + wait_queue_entry_t wait; ktime_t expires, *to = NULL; if (timeout > 0) { diff --git a/fs/exec.c b/fs/exec.c index 904199086490..62175cbcc801 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -220,8 +220,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (write) { unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; - unsigned long ptr_size; - struct rlimit *rlim; + unsigned long ptr_size, limit; /* * Since the stack will hold pointers to the strings, we @@ -250,14 +249,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return page; /* - * Limit to 1/4-th the stack size for the argv+env strings. + * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM + * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. 
*/ - rlim = current->signal->rlim; - if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) + limit = _STK_LIM / 4 * 3; + limit = min(limit, rlimit(RLIMIT_STACK) / 4); + if (size > limit) goto fail; } diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 8eeb694332fe..98233a97b7b8 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -72,7 +72,7 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) set_page_dirty(page); if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index d9650c9508e4..e2709695b177 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -100,7 +100,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) } if (IS_DIRSYNC(dir)) { - err = write_one_page(page, 1); + err = write_one_page(page); if (!err) err = sync_inode_metadata(dir, 1); } else { diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 03f5ce1d3dbe..23ebb92484c6 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -113,7 +113,7 @@ struct ext2_sb_info { * of the mount options. */ spinlock_t s_lock; - struct mb_cache *s_mb_cache; + struct mb_cache *s_ea_block_cache; }; static inline spinlock_t * diff --git a/fs/ext2/file.c b/fs/ext2/file.c index b21891a6bfca..d34d32bdc944 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -174,15 +174,12 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int ret; struct super_block *sb = file->f_mapping->host->i_sb; - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; ret = generic_file_fsync(file, start, end, datasync); - if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { + if (ret == -EIO) /* We don't really know where the IO error happened... 
*/ ext2_error(sb, __func__, "detected IO error when writing metadata buffers"); - ret = -EIO; - } return ret; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9c2028b50e5c..7b1bc9059863 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -147,9 +147,9 @@ static void ext2_put_super (struct super_block * sb) ext2_quota_off_umount(sb); - if (sbi->s_mb_cache) { - ext2_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_block_cache) { + ext2_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -1131,9 +1131,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } #ifdef CONFIG_EXT2_FS_XATTR - sbi->s_mb_cache = ext2_xattr_create_cache(); - if (!sbi->s_mb_cache) { - ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + sbi->s_ea_block_cache = ext2_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { + ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); goto failed_mount3; } #endif @@ -1182,8 +1182,8 @@ cantfind_ext2: sb->s_id); goto failed_mount; failed_mount3: - if (sbi->s_mb_cache) - ext2_xattr_destroy_cache(sbi->s_mb_cache); + if (sbi->s_ea_block_cache) + ext2_xattr_destroy_cache(sbi->s_ea_block_cache); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index fbdb8f171893..1b9b1268d418 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -121,6 +121,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = { NULL }; +#define EA_BLOCK_CACHE(inode) (EXT2_SB(inode->i_sb)->s_ea_block_cache) + static inline const struct xattr_handler * ext2_xattr_handler(int name_index) { @@ -150,7 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; - struct mb_cache *ext2_mb_cache = 
EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -195,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; goto cleanup; @@ -208,7 +210,7 @@ found: le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) goto bad_block; - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); if (buffer) { error = -ERANGE; @@ -246,7 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; - struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -281,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", goto bad_block; entry = next; } - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); /* list the attribute names */ @@ -493,8 +495,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set", * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect modified block */ - mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash, + bh->b_blocknr); /* keep the buffer locked while modifying it. 
*/ } else { @@ -627,7 +629,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; - struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -655,7 +657,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, don't need to change the reference count. */ new_bh = old_bh; get_bh(new_bh); - ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + ext2_xattr_cache_insert(ea_block_cache, new_bh); } else { /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, @@ -676,7 +678,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, memcpy(new_bh->b_data, header, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + ext2_xattr_cache_insert(ea_block_cache, new_bh); ext2_xattr_update_super_block(sb); } @@ -721,8 +723,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect freed block */ - mb_cache_entry_delete_block(ext2_mb_cache, - hash, old_bh->b_blocknr); + mb_cache_entry_delete(ea_block_cache, hash, + old_bh->b_blocknr); /* Free the old block. 
*/ ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); @@ -795,8 +797,8 @@ ext2_xattr_delete_inode(struct inode *inode) * This must happen under buffer lock for ext2_xattr_set2() to * reliably detect freed block */ - mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash, + bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); @@ -897,21 +899,21 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; - struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext2_mb_cache, hash); + ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_block); + bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_cache_find", "inode %ld: block %ld read error", - inode->i_ino, (unsigned long) ce->e_block); + inode->i_ino, (unsigned long) ce->e_value); } else { lock_buffer(bh); /* @@ -924,27 +926,27 @@ again: * entry is still hashed is reliable. 
*/ if (hlist_bl_unhashed(&ce->e_hash_list)) { - mb_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); unlock_buffer(bh); brelse(bh); goto again; } else if (le32_to_cpu(HDR(bh)->h_refcount) > EXT2_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %ld refcount %d>%d", - (unsigned long) ce->e_block, + (unsigned long) ce->e_value, le32_to_cpu(HDR(bh)->h_refcount), EXT2_XATTR_REFCOUNT_MAX); } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb_cache_entry_touch(ext2_mb_cache, ce); - mb_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_touch(ea_block_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb_cache_entry_find_next(ext2_mb_cache, ce); + ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 3ec0e46de95f..09441ae07a5b 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type) */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, - struct posix_acl *acl) + struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; @@ -218,7 +218,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } error = ext4_xattr_set_handle(handle, inode, name_index, "", - value, size, 0); + value, size, xattr_flags); kfree(value); if (!error) @@ -231,18 +231,23 @@ int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; - int error, retries = 0; + int error, credits, retries = 0; + size_t acl_size = acl ? 
ext4_acl_size(acl->a_count) : 0; error = dquot_initialize(inode); if (error) return error; retry: - handle = ext4_journal_start(inode, EXT4_HT_XATTR, - ext4_jbd2_credits_xattr(inode)); + error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, + &credits); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); - error = __ext4_set_acl(handle, inode, type, acl); + error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -267,13 +272,13 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, - default_acl); + default_acl, XATTR_CREATE); posix_acl_release(default_acl); } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, - acl); + acl, XATTR_CREATE); posix_acl_release(acl); } return error; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 32191548abed..9ebde0cd632e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1114,6 +1114,7 @@ struct ext4_inode_info { /* * Mount flags set via mount options or defaults */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ @@ -1444,6 +1445,8 @@ struct ext4_sb_info { unsigned int *s_mb_maxs; unsigned int s_group_info_size; unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ /* tunables */ unsigned long s_stripe; @@ -1516,7 +1519,8 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb_cache 
*s_mb_cache; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ @@ -1797,10 +1801,12 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ - EXT4_FEATURE_INCOMPAT_CSUM_SEED) + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -2098,6 +2104,12 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + /* * This structure is stuffed into the struct file's private_data field * for directories. It is where we put information so that we can do @@ -2126,6 +2138,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + /* * Timeout and state flag for lazy initialization inode thread. 
*/ @@ -2389,16 +2411,17 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, - uid_t *owner, int handle_type, - unsigned int line_no, int nblocks); + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); -#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ - 0, 0, 0) + i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ - (type), __LINE__, (nblocks)) + 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); @@ -2433,6 +2456,7 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb, extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); /* inode.c */ int ext4_inode_is_fast_symlink(struct inode *inode); @@ -2704,19 +2728,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_group_desc_csum(struct super_block *sb) -{ - return ext4_has_feature_gdt_csum(sb) || - EXT4_SB(sb)->s_chksum_driver != NULL; -} - static inline int ext4_has_metadata_csum(struct super_block *sb) { WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && !EXT4_SB(sb)->s_chksum_driver); - return (EXT4_SB(sb)->s_chksum_driver != NULL); + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); } + +static inline int ext4_has_group_desc_csum(struct super_block 
*sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | @@ -2756,13 +2781,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) { - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); - else - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index f97611171023..dabad1bc8617 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -77,7 +77,14 @@ #define EXT4_RESERVE_TRANS_BLOCKS 12U -#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 +/* + * Number of credits needed if we need to insert an entry into a + * directory. For each new index block, we need 4 blocks (old index + * block, new index block, bitmap block, bg summary). For normal + * htree directories there are 2 levels; if the largedir feature + * enabled it's 3 levels. 
+ */ +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was @@ -104,20 +111,6 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) -static inline int ext4_jbd2_credits_xattr(struct inode *inode) -{ - int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); - - /* - * In case of inline data, we may push out the data to a block, - * so we need to reserve credits for this eventuality - */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - return credits; -} - - /* * Ext4 handle operation types -- for logging purposes */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3e36508610b7..e0a8425ff74d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) static inline int get_default_free_blocks_flags(struct inode *inode) { - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 02ce7e7bbdf5..58294c9a7e1d 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -37,7 +37,11 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock_shared(inode); + if (!inode_trylock_shared(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock_shared(inode); + } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore @@ -179,7 +183,11 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = 
file_inode(iocb->ki_filp); ssize_t ret; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -216,7 +224,12 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_dax_write_iter(iocb, from); #endif - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -235,9 +248,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->private = &overwrite; /* Check whether we do a DIO overwrite or not */ - if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && - ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) - overwrite = 1; + if (o_direct && !unaligned_aio) { + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + if (ext4_should_dioread_nolock(inode)) + overwrite = 1; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + } ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -345,13 +364,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (ext4_encrypted_inode(inode)) { - int err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -435,6 +447,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } + + /* Set the flags to support nowait AIO */ + filp->f_mode |= FMODE_AIO_NOWAIT; + return dquot_file_open(inode, filp); } diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b19436098837..7ec340898598 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -480,6 +480,7 @@ 
static int ext4_getfsmap_datadev(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start_fsb; ext4_fsblk_t end_fsb; + ext4_fsblk_t bofs; ext4_fsblk_t eofs; ext4_group_t start_ag; ext4_group_t end_ag; @@ -487,9 +488,12 @@ static int ext4_getfsmap_datadev(struct super_block *sb, ext4_grpblk_t last_cluster; int error = 0; + bofs = le32_to_cpu(sbi->s_es->s_first_data_block); eofs = ext4_blocks_count(sbi->s_es); if (keys[0].fmr_physical >= eofs) return 0; + else if (keys[0].fmr_physical < bofs) + keys[0].fmr_physical = bofs; if (keys[1].fmr_physical >= eofs) keys[1].fmr_physical = eofs - 1; start_fsb = keys[0].fmr_physical; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 9d549608fd30..aae2c3971cef 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -124,7 +124,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 98ac2f1f23b3..507bfb3344d4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * as writing the quota to disk may need the lock as well. 
*/ dquot_initialize(inode); - ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); @@ -743,8 +742,9 @@ out: */ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, - __u32 goal, uid_t *owner, int handle_type, - unsigned int line_no, int nblocks) + __u32 goal, uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -766,30 +766,69 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + sb = dir->i_sb; + sbi = EXT4_SB(sb); + + if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && + !(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_get_encryption_info(dir); if (err) return ERR_PTR(err); if (!fscrypt_has_encryption_key(dir)) return ERR_PTR(-ENOKEY); - if (!handle) - nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); encrypt = 1; } - sb = dir->i_sb; + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never + * more than 1k. In practice they are under + * 128 bytes. 
+ */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + } + ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); /* * Initialize owners and quota early so that we don't have to account @@ -1053,6 +1092,7 @@ got: /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; @@ -1109,13 +1149,15 @@ got: goto fail_free_drop; } - err = ext4_init_acl(handle, inode, dir); - if (err) - goto fail_free_drop; + if (!(ei->i_flags & EXT4_EA_INODE_FL)) { + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; - err = ext4_init_security(handle, inode, dir, qstr); - if (err) - goto fail_free_drop; + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bc15c2c17633..7ffa290cbb8e 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 
8d141c0c8ff9..28c5c3abddb3 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, /* Compute min_offs. */ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_block && entry->e_value_size) { + if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5cf82d03968c..3c600f02673f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -144,16 +144,12 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, /* * Test whether an inode is a fast symlink. + * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? - EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; - - if (ext4_has_inline_data(inode)) - return 0; - - return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + return S_ISLNK(inode->i_mode) && inode->i_size && + (inode->i_size < EXT4_N_BLOCKS * 4); } /* @@ -189,6 +185,8 @@ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; + int extra_credits = 3; + struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -213,7 +211,8 @@ void ext4_evict_inode(struct inode *inode) */ if (inode->i_ino != EXT4_JOURNAL_INO && ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_data.nrpages) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -238,8 +237,12 @@ void ext4_evict_inode(struct inode *inode) * protection against it */ sb_start_intwrite(inode->i_sb); + + if (!IS_NOQUOTA(inode)) + extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + handle = ext4_journal_start(inode, 
EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+3); + ext4_blocks_for_truncate(inode)+extra_credits); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -254,6 +257,16 @@ void ext4_evict_inode(struct inode *inode) if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* + * Set inode->i_size to 0 before calling ext4_truncate(). We need + * special handling of symlinks here because i_size is used to + * determine whether ext4_inode_info->i_data contains symlink data or + * block mappings. Setting i_size to 0 will remove its fast symlink + * status. Erase i_data so that it becomes a valid empty block map. + */ + if (ext4_inode_is_fast_symlink(inode)) + memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -271,25 +284,17 @@ void ext4_evict_inode(struct inode *inode) } } - /* - * ext4_ext_truncate() doesn't reserve any slop when it - * restarts journal transactions; therefore there may not be - * enough credits left in the handle to remove the inode from - * the orphan list and set the dtime field. - */ - if (!ext4_handle_has_enough_credits(handle, 3)) { - err = ext4_journal_extend(handle, 3); - if (err > 0) - err = ext4_journal_restart(handle, 3); - if (err != 0) { - ext4_warning(inode->i_sb, - "couldn't extend journal (err %d)", err); - stop_handle: - ext4_journal_stop(handle); - ext4_orphan_del(NULL, inode); - sb_end_intwrite(inode->i_sb); - goto no_delete; - } + /* Remove xattr references. 
*/ + err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, + extra_credits); + if (err) { + ext4_warning(inode->i_sb, "xattr delete (err %d)", err); +stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); + goto no_delete; } /* @@ -317,6 +322,7 @@ void ext4_evict_inode(struct inode *inode) ext4_free_inode(handle, inode); ext4_journal_stop(handle); sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -710,7 +716,7 @@ out_sem: if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && - !IS_NOQUOTA(inode) && + !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode); @@ -4712,7 +4718,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - inode->i_size = ext4_isize(raw_inode); + inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -4846,6 +4852,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } brelse(iloc.bh); ext4_set_inode_flags(inode); + + if (ei->i_flags & EXT4_EA_INODE_FL) { + ext4_xattr_inode_set_class(inode); + + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + } + unlock_new_inode(inode); return inode; @@ -5037,7 +5052,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (ei->i_disksize != ext4_isize(raw_inode)) { + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { 
ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } @@ -5287,7 +5302,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } + + /* dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. + */ + down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(inode, attr); + up_read(&EXT4_I(inode)->xattr_sem); + if (error) { ext4_journal_stop(handle); return error; @@ -5307,6 +5329,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; int shrink = (attr->ia_size <= inode->i_size); + if (ext4_encrypted_inode(inode)) { + error = fscrypt_get_encryption_info(inode); + if (error) + return error; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0c21e22acd74..42b3a73143cf 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -218,7 +218,7 @@ static int ext4_ioctl_setflags(struct inode *inode, unsigned int jflag; /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) + if (ext4_is_quota_file(inode)) goto flags_out; oldflags = ei->i_flags; @@ -342,7 +342,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) err = -EPERM; inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) + if (ext4_is_quota_file(inode)) goto out_unlock; err = ext4_get_inode_loc(inode, &iloc); @@ -373,7 +373,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { + + /* __dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. 
+ */ + down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); + up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b7928cddd539..581e357e8406 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -367,8 +367,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -2639,6 +2637,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; + INIT_LIST_HEAD(&sbi->s_freed_data_list); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -2782,7 +2781,8 @@ int ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count) + ext4_group_t block_group, ext4_grpblk_t cluster, int count, + struct bio **biop) { ext4_fsblk_t discard_block; @@ -2791,18 +2791,18 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + if (biop) { + return __blkdev_issue_discard(sb->s_bdev, + (sector_t)discard_block << (sb->s_blocksize_bits - 9), + (sector_t)count << (sb->s_blocksize_bits - 9), + GFP_NOFS, 0, biop); + } else + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } -/* - * This function is called by the jbd2 layer once the commit has finished, - * so we know we can free the blocks that were released with that commit. 
- */ -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc) +static void ext4_free_data_in_buddy(struct super_block *sb, + struct ext4_free_data *entry) { - struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; @@ -2810,18 +2810,6 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count); - if (err && err != -EOPNOTSUPP) - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } - err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); @@ -2862,6 +2850,56 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. 
+ */ +void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_free_data *entry, *tmp; + struct bio *discard_bio = NULL; + struct list_head freed_data_list; + struct list_head *cut_pos = NULL; + int err; + + INIT_LIST_HEAD(&freed_data_list); + + spin_lock(&sbi->s_md_lock); + list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { + if (entry->efd_tid != commit_tid) + break; + cut_pos = &entry->efd_list; + } + if (cut_pos) + list_cut_position(&freed_data_list, &sbi->s_freed_data_list, + cut_pos); + spin_unlock(&sbi->s_md_lock); + + if (test_opt(sb, DISCARD)) { + list_for_each_entry(entry, &freed_data_list, efd_list) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, + &discard_bio); + if (err && err != -EOPNOTSUPP) { + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } else if (err == -EOPNOTSUPP) + break; + } + + if (discard_bio) + submit_bio_wait(discard_bio); + } + + list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + ext4_free_data_in_buddy(sb, entry); +} + int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, @@ -3529,7 +3567,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "prellocated %u for group %u\n", preallocated, group); + mb_debug(1, "preallocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -4464,7 +4502,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); /* Allow to use superuser reservation for quota file */ - if (IS_NOQUOTA(ar->inode)) + if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { @@ 
-4583,14 +4621,28 @@ out: * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ -static int can_merge(struct ext4_free_data *entry1, - struct ext4_free_data *entry2) +static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, + struct ext4_free_data *entry, + struct ext4_free_data *new_entry, + struct rb_root *entry_rb_root) { - if ((entry1->efd_tid == entry2->efd_tid) && - (entry1->efd_group == entry2->efd_group) && - ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) - return 1; - return 0; + if ((entry->efd_tid != new_entry->efd_tid) || + (entry->efd_group != new_entry->efd_group)) + return; + if (entry->efd_start_cluster + entry->efd_count == + new_entry->efd_start_cluster) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; + } else if (new_entry->efd_start_cluster + new_entry->efd_count == + entry->efd_start_cluster) { + new_entry->efd_count += entry->efd_count; + } else + return; + spin_lock(&sbi->s_md_lock); + list_del(&entry->efd_list); + spin_unlock(&sbi->s_md_lock); + rb_erase(&entry->efd_node, entry_rb_root); + kmem_cache_free(ext4_free_data_cachep, entry); } static noinline_for_stack int @@ -4646,29 +4698,19 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry) && - ext4_journal_callback_try_del(handle, &entry->efd_jce)) { - new_entry->efd_start_cluster = entry->efd_start_cluster; - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - kmem_cache_free(ext4_free_data_cachep, entry); - } + ext4_try_merge_freed_extent(sbi, entry, new_entry, + &(db->bb_free_root)); } node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry) && - 
ext4_journal_callback_try_del(handle, &entry->efd_jce)) { - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - kmem_cache_free(ext4_free_data_cachep, entry); - } + ext4_try_merge_freed_extent(sbi, entry, new_entry, + &(db->bb_free_root)); } - /* Add the extent to transaction's private list */ - new_entry->efd_jce.jce_func = ext4_free_data_callback; + spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, &new_entry->efd_jce); + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); return 0; @@ -4871,7 +4913,8 @@ do_more: * them with group lock_held */ if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count); + err = ext4_issue_discard(sb, block_group, bit, count, + NULL); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%d block:%d count:%lu failed" @@ -5094,7 +5137,7 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count); + ret = ext4_issue_discard(sb, group, start, count, NULL); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 2bed62084a8c..009300ee1561 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -78,10 +78,8 @@ do { \ struct ext4_free_data { - /* MUST be the first member */ - struct ext4_journal_cb_entry efd_jce; - - /* ext4_free_data private data starts from here */ + /* this links the free block information from sb_info */ + struct list_head efd_list; /* this links the free block information from group_info */ struct rb_node efd_node; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 364ea4d4a943..cf5181b62df1 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode) owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode 
= ext4_new_inode(handle, d_inode(inode->i_sb->s_root), - S_IFREG, NULL, goal, owner); + S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c992ef2c2f94..9bb36909ec92 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -484,7 +484,7 @@ mext_check_arguments(struct inode *orig_inode, return -EBUSY; } - if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) { + if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " "not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 404256caf9cf..13f0cadb1238 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -513,7 +513,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { - return le32_to_cpu(entry->block) & 0x00ffffff; + return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) @@ -739,6 +739,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; @@ -768,9 +769,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } indirect = root->info.indirect_levels; - if (indirect > 1) { - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", - root->info.indirect_levels); + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed" + "supported value", dir->i_ino, + ext4_dir_htree_level(dir->i_sb)); + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { + 
ext4_warning(dir->i_sb, "Enable large directory " + "feature to access it"); + } goto fail; } @@ -859,12 +866,19 @@ fail: static void dx_release(struct dx_frame *frames) { + struct dx_root_info *info; + int i; + if (frames[0].bh == NULL) return; - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); + info = &((struct dx_root *)frames[0].bh->b_data)->info; + for (i = 0; i <= info->indirect_levels; i++) { + if (frames[i].bh == NULL) + break; + brelse(frames[i].bh); + frames[i].bh = NULL; + } } /* @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; @@ -1428,11 +1442,11 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); - goto next; + ret = ERR_PTR(-EIO); + goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, @@ -1442,7 +1456,8 @@ restart: EXT4_ERROR_INODE(dir, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); - goto next; + ret = ERR_PTR(-EFSBADCRC); + goto cleanup_and_exit; } set_buffer_verified(bh); i = search_dirblock(bh, dir, &fname, @@ -1485,7 +1500,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1889,7 +1904,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); - dir->i_version++; + inode_inc_iversion(dir); 
ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirent_node(handle, dir, bh); @@ -1908,7 +1923,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, { struct buffer_head *bh2; struct dx_root *root; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; struct ext4_dir_entry_tail *t; @@ -2127,13 +2142,16 @@ out: static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; + int restart; int err; +again: + restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); @@ -2155,24 +2173,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, if (err != -ENOSPC) goto cleanup; + err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? 
*/ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; + int levels = frame - frames + 1; + unsigned int icount; + int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext4_warning_inode(dir, "Directory index full!"); + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { + add_level = 0; + break; + } + frame--; /* split higher index block */ + at = frame->at; + entries = frame->entries; + restart = 1; + } + if (add_level && levels == ext4_dir_htree_level(sb)) { + ext4_warning(sb, "Directory (ino: %lu) index full, " + "reach max htree level :%d", + dir->i_ino, levels); + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { + ext4_warning(sb, "Large directory feature is " + "not enabled on this " + "filesystem"); + } err = -ENOSPC; goto cleanup; } + icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); @@ -2187,7 +2225,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, err = ext4_journal_get_write_access(handle, frame->bh); if (err) goto journal_error; - if (levels) { + if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", @@ -2195,7 +2233,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, - frames[0].bh); + (frame - 1)->bh); if (err) goto journal_error; @@ -2211,17 +2249,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block(frames + 0, hash2, 
newblock); - dxtrace(dx_show_index("node", frames[1].entries)); + dx_insert_block((frame - 1), hash2, newblock); + dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); + err = ext4_handle_dirty_dx_node(handle, dir, + (frame - 1)->bh); + if (err) + goto journal_error; + if (restart) { + err = ext4_handle_dirty_dx_node(handle, dir, + frame->bh); + goto journal_error; + } } else { - dxtrace(printk(KERN_DEBUG - "Creating second level index...\n")); + struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -2229,22 +2275,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext4_journal_get_write_access(handle, - frame->bh); + dxroot = (struct dx_root *)frames[0].bh->b_data; + dxroot->info.indirect_levels += 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + info->indirect_levels)); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; - } - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); - if (err) { - ext4_std_error(inode->i_sb, err); - goto cleanup; + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + brelse(bh2); + restart = 1; + goto journal_error; } } de = do_split(handle, dir, &bh, frame, &fname->hinfo); @@ -2256,10 +2298,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, goto cleanup; journal_error: - ext4_std_error(dir->i_sb, err); + ext4_std_error(dir->i_sb, err); /* this 
is a no-op if err == 0 */ cleanup: brelse(bh); dx_release(frames); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path + */ + if (restart && err == 0) + goto again; return err; } @@ -2296,7 +2343,7 @@ int ext4_generic_delete_entry(handle_t *handle, blocksize); else de->inode = 0; - dir->i_version++; + inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1a82138ba739..c2fce4478cca 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -85,7 +85,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); mapping_set_error(page->mapping, -EIO); } @@ -104,7 +104,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_error) + if (bio->bi_status) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); @@ -303,24 +303,25 @@ static void ext4_end_bio(struct bio *bio) bdevname(bio->bi_bdev, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), - bio->bi_error)) { + bio->bi_status)) { ext4_finish_bio(bio); bio_put(bio); return; } bio->bi_end_io = NULL; - if (bio->bi_error) { + if (bio->bi_status) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - bio->bi_error, inode->i_ino, + bio->bi_status, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, bio->bi_error); + mapping_set_error(inode->i_mapping, + blk_status_to_errno(bio->bi_status)); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -349,6 +350,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? 
REQ_SYNC : 0; + io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -396,6 +398,7 @@ submit_and_retry: ret = io_submit_init_bio(io, bh); if (ret) return ret; + io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a81b829d56de..40a5497b0f60 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -73,7 +73,7 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -83,7 +83,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { SetPageUptodate(page); } else { ClearPageUptodate(page); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d37c81f327e7..0886fe82e9c4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -373,6 +373,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); + + ext4_process_freed_data(sb, txn->t_tid); + spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = list_entry(txn->t_private_list.next, @@ -927,9 +930,13 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_mb_cache) { - ext4_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_inode_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + } + if (sbi->s_ea_block_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); @@ 
-1143,7 +1150,16 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { handle_t *handle = fs_data; - int res, res2, retries = 0; + int res, res2, credits, retries = 0; + + /* + * Encrypting the root directory is not allowed because e2fsck expects + * lost+found to exist and be unencrypted, and encrypting the root + * directory would imply encrypting the lost+found directory as well as + * the filename "lost+found" itself. + */ + if (inode->i_ino == EXT4_ROOT_INO) + return -EPERM; res = ext4_convert_inline_data(inode); if (res) @@ -1178,8 +1194,12 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (res) return res; retry: - handle = ext4_journal_start(inode, EXT4_HT_MISC, - ext4_jbd2_credits_xattr(inode)); + res = ext4_xattr_set_credits(inode, len, false /* is_create */, + &credits); + if (res) + return res; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1203,7 +1223,7 @@ retry: return res; } -static int ext4_dummy_context(struct inode *inode) +static bool ext4_dummy_context(struct inode *inode) { return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); } @@ -1256,16 +1276,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode) } static const struct dquot_operations ext4_quota_operations = { - .get_reserved_space = ext4_get_reserved_space, - .write_dquot = ext4_write_dquot, - .acquire_dquot = ext4_acquire_dquot, - .release_dquot = ext4_release_dquot, - .mark_dirty = ext4_mark_dquot_dirty, - .write_info = ext4_write_info, - .alloc_dquot = dquot_alloc, - .destroy_dquot = dquot_destroy, - .get_projid = ext4_get_projid, - .get_next_id = ext4_get_next_id, + .get_reserved_space = ext4_get_reserved_space, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + 
.destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, + .get_inode_usage = ext4_get_inode_usage, + .get_next_id = ext4_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -1328,7 +1349,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, - Opt_max_dir_size_kb, Opt_nojournal_checksum, + Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, }; static const match_table_t tokens = { @@ -1411,6 +1432,8 @@ static const match_table_t tokens = { {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_nombcache, "nombcache"}, + {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1618,6 +1641,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -3445,7 +3469,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Load the checksum driver */ - if (ext4_has_feature_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) { sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); @@ -3467,7 +3492,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb)) + else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) 
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); @@ -3597,6 +3622,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "The Hurd can't support 64-bit file systems"); goto failed_mount; } + + /* + * ea_inode feature uses l_i_version field which is not + * available in HURD_COMPAT mode. + */ + if (ext4_has_feature_ea_inode(sb)) { + ext4_msg(sb, KERN_ERR, + "ea_inode feature is not supported for Hurd"); + goto failed_mount; + } } if (IS_EXT2_SB(sb)) { @@ -3950,7 +3985,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); @@ -4061,10 +4096,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; no_journal: - sbi->s_mb_cache = ext4_xattr_create_cache(); - if (!sbi->s_mb_cache) { - ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); - goto failed_mount_wq; + if (!test_opt(sb, NO_MBCACHE)) { + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_block_cache"); + goto failed_mount_wq; + } + + if (ext4_has_feature_ea_inode(sb)) { + sbi->s_ea_inode_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_inode_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_inode_cache"); + goto failed_mount_wq; + } + } } if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && @@ -4296,9 +4343,13 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: - if (sbi->s_mb_cache) { - ext4_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if 
(sbi->s_ea_inode_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + } + if (sbi->s_ea_block_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -4957,6 +5008,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { + ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); + err = -EINVAL; + goto restore_opts; + } + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d74dc5f81a04..48c7a7d55ed3 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -100,7 +100,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (!ret || val >= clusters) + if (ret || val >= clusters) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5d3c2536641c..cff4f41ced61 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -72,12 +72,14 @@ # define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); -static struct buffer_head *ext4_xattr_cache_find(struct inode *, - struct ext4_xattr_header *, - struct mb_cache_entry **); -static void ext4_xattr_rehash(struct ext4_xattr_header *, - struct ext4_xattr_entry *); +static void ext4_xattr_block_cache_insert(struct mb_cache *, + struct buffer_head *); +static struct buffer_head * +ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, + struct mb_cache_entry **); +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count); +static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, @@ -104,8 +106,22 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; -#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ - inode->i_sb->s_fs_info)->s_mb_cache) +#define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_block_cache) + +#define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_inode_cache) + +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode); + +#ifdef CONFIG_LOCKDEP +void ext4_xattr_inode_set_class(struct inode *ea_inode) +{ + lockdep_set_subclass(&ea_inode->i_rwsem, 1); +} +#endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, @@ -177,9 +193,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, /* Check the values */ while (!IS_LAST_ENTRY(entry)) { - if (entry->e_value_block != 0) - return -EFSCORRUPTED; - if (entry->e_value_size != 0) { + if (entry->e_value_size != 0 && + entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); u32 size = le32_to_cpu(entry->e_value_size); void *value; @@ -269,6 +284,185 @@ 
ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, return cmp ? -ENODATA : 0; } +static u32 +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) +{ + return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); +} + +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) +{ + return ((u64)ea_inode->i_ctime.tv_sec << 32) | + ((u32)ea_inode->i_version); +} + +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) +{ + ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); + ea_inode->i_version = (u32)ref_count; +} + +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) +{ + return (u32)ea_inode->i_atime.tv_sec; +} + +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) +{ + ea_inode->i_atime.tv_sec = hash; +} + +/* + * Read the EA value from an inode. + */ +static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) +{ + unsigned long block = 0; + struct buffer_head *bh; + int blocksize = ea_inode->i_sb->s_blocksize; + size_t csize, copied = 0; + void *copy_pos = buf; + + while (copied < size) { + csize = (size - copied) > blocksize ? 
blocksize : size - copied; + bh = ext4_bread(NULL, ea_inode, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) + return -EFSCORRUPTED; + + memcpy(copy_pos, bh->b_data, csize); + brelse(bh); + + copy_pos += csize; + block += 1; + copied += csize; + } + return 0; +} + +static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + struct inode **ea_inode) +{ + struct inode *inode; + int err; + + inode = ext4_iget(parent->i_sb, ea_ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error(parent->i_sb, + "error while reading EA inode %lu err=%d", ea_ino, + err); + return err; + } + + if (is_bad_inode(inode)) { + ext4_error(parent->i_sb, + "error while reading EA inode %lu is_bad_inode", + ea_ino); + err = -EIO; + goto error; + } + + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + ext4_error(parent->i_sb, + "EA inode %lu does not have EXT4_EA_INODE_FL flag", + ea_ino); + err = -EINVAL; + goto error; + } + + *ea_inode = inode; + return 0; +error: + iput(inode); + return err; +} + +static int +ext4_xattr_inode_verify_hashes(struct inode *ea_inode, + struct ext4_xattr_entry *entry, void *buffer, + size_t size) +{ + u32 hash; + + /* Verify stored hash matches calculated hash. */ + hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); + if (hash != ext4_xattr_inode_get_hash(ea_inode)) + return -EFSCORRUPTED; + + if (entry) { + __le32 e_hash, tmp_data; + + /* Verify entry hash. */ + tmp_data = cpu_to_le32(hash); + e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, + &tmp_data, 1); + if (e_hash != entry->e_hash) + return -EFSCORRUPTED; + } + return 0; +} + +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) + +/* + * Read xattr value from the EA inode. 
+ */ +static int +ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, + void *buffer, size_t size) +{ + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + struct inode *ea_inode; + int err; + + err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), + &ea_inode); + if (err) { + ea_inode = NULL; + goto out; + } + + if (i_size_read(ea_inode) != size) { + ext4_warning_inode(ea_inode, + "ea_inode file size=%llu entry size=%zu", + i_size_read(ea_inode), size); + err = -EFSCORRUPTED; + goto out; + } + + err = ext4_xattr_inode_read(ea_inode, buffer, size); + if (err) + goto out; + + err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); + /* + * Compatibility check for old Lustre ea_inode implementation. Old + * version does not have hash validation, but it has a backpointer + * from ea_inode to the parent inode. + */ + if (err == -EFSCORRUPTED) { + if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino || + ea_inode->i_generation != inode->i_generation) { + ext4_warning_inode(ea_inode, + "EA inode hash validation failed"); + goto out; + } + /* Do not add ea_inode to the cache. 
*/ + ea_inode_cache = NULL; + } else if (err) + goto out; + + if (ea_inode_cache) + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, + ext4_xattr_inode_get_hash(ea_inode), + ea_inode->i_ino, true /* reusable */); +out: + iput(ea_inode); + return err; +} + static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) @@ -277,7 +471,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -298,7 +492,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -EFSCORRUPTED; goto cleanup; } - ext4_xattr_cache_insert(ext4_mb_cache, bh); + ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, 1); if (error) @@ -308,8 +502,15 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), - size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, + size); + if (error) + goto cleanup; + } else { + memcpy(buffer, bh->b_data + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -350,8 +551,15 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, + size); + if (error) + goto cleanup; + } else { + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; 
@@ -428,7 +636,6 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -450,7 +657,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = -EFSCORRUPTED; goto cleanup; } - ext4_xattr_cache_insert(ext4_mb_cache, bh); + ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -539,15 +746,445 @@ static void ext4_xattr_update_super_block(handle_t *handle, } } +int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) +{ + struct ext4_iloc iloc = { .bh = NULL }; + struct buffer_head *bh = NULL; + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + qsize_t ea_inode_refs = 0; + void *end; + int ret; + + lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); + + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + ret = xattr_check_inode(inode, header, end); + if (ret) + goto out; + + for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + ret = -EIO; + goto out; + } + + if (ext4_xattr_check_block(inode, bh)) { + ret = -EFSCORRUPTED; + goto out; + } + + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + *usage = ea_inode_refs + 1; + ret = 0; +out: + brelse(iloc.bh); + brelse(bh); + 
return ret; +} + +static inline size_t round_up_cluster(struct inode *inode, size_t length) +{ + struct super_block *sb = inode->i_sb; + size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + + inode->i_blkbits); + size_t mask = ~(cluster_size - 1); + + return (length + cluster_size - 1) & mask; +} + +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) +{ + int err; + + err = dquot_alloc_inode(inode); + if (err) + return err; + err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); + if (err) + dquot_free_inode(inode); + return err; +} + +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len) +{ + dquot_free_space_nodirty(inode, round_up_cluster(inode, len)); + dquot_free_inode(inode); +} + +int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create) +{ + int credits; + int blocks; + + /* + * 1) Owner inode update + * 2) Ref count update on old xattr block + * 3) new xattr block + * 4) block bitmap update for new xattr block + * 5) group descriptor for new xattr block + * 6) block bitmap update for old xattr block + * 7) group descriptor for old block + * + * 6 & 7 can happen if we have two racing threads T_a and T_b + * which are each trying to set an xattr on inodes I_a and I_b + * which were both initially sharing an xattr block. + */ + credits = 7; + + /* Quota updates. */ + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (inode && ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + + /* We are done if ea_inode feature is not enabled. */ + if (!ext4_has_feature_ea_inode(sb)) + return credits; + + /* New ea_inode, inode map, block bitmap, group descriptor. */ + credits += 4; + + /* Data blocks. 
*/ + blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree. */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + + /* Blocks themselves. */ + credits += blocks; + + if (!is_create) { + /* Dereference ea_inode holding old xattr value. + * Old ea_inode, inode map, block bitmap, group descriptor. + */ + credits += 4; + + /* Data blocks for old ea_inode. */ + blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree for old + * ea_inode. + */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + } + + /* We may need to clone the existing xattr block in which case we need + * to increment ref counts for existing ea_inodes referenced by it. + */ + if (block_bh) { + struct ext4_xattr_entry *entry = BFIRST(block_bh); + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + /* Ref count update on ea_inode. 
*/ + credits += 1; + } + return credits; +} + +static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, + int credits, struct buffer_head *bh, + bool dirty, bool block_csum) +{ + int error; + + if (!ext4_handle_valid(handle)) + return 0; + + if (handle->h_buffer_credits >= credits) + return 0; + + error = ext4_journal_extend(handle, credits - handle->h_buffer_credits); + if (!error) + return 0; + if (error < 0) { + ext4_warning(inode->i_sb, "Extend journal (error %d)", error); + return error; + } + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + + error = ext4_journal_restart(handle, credits); + if (error) { + ext4_warning(inode->i_sb, "Restart journal (error %d)", error); + return error; + } + + if (bh) { + error = ext4_journal_get_write_access(handle, bh); + if (error) { + ext4_warning(inode->i_sb, + "Get write access failed (error %d)", + error); + return error; + } + } + return 0; +} + +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, + int ref_change) +{ + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode); + struct ext4_iloc iloc; + s64 ref_count; + u32 hash; + int ret; + + inode_lock(ea_inode); + + ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); + if (ret) { + iloc.bh = NULL; + goto out; + } + + ref_count = ext4_xattr_inode_get_ref(ea_inode); + ref_count += ref_change; + ext4_xattr_inode_set_ref(ea_inode, ref_count); + + if (ref_change > 0) { + WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", + ea_inode->i_ino, ref_count); + + if (ref_count == 1) { + WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + set_nlink(ea_inode, 1); + ext4_orphan_del(handle, ea_inode); + + if (ea_inode_cache) { + hash = ext4_xattr_inode_get_hash(ea_inode); + 
mb_cache_entry_create(ea_inode_cache, + GFP_NOFS, hash, + ea_inode->i_ino, + true /* reusable */); + } + } + } else { + WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", + ea_inode->i_ino, ref_count); + + if (ref_count == 0) { + WARN_ONCE(ea_inode->i_nlink != 1, + "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + clear_nlink(ea_inode); + ext4_orphan_add(handle, ea_inode); + + if (ea_inode_cache) { + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_delete(ea_inode_cache, hash, + ea_inode->i_ino); + } + } + } + + ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); + iloc.bh = NULL; + if (ret) + ext4_warning_inode(ea_inode, + "ext4_mark_iloc_dirty() failed ret=%d", ret); +out: + brelse(iloc.bh); + inode_unlock(ea_inode); + return ret; +} + +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, 1); +} + +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, -1); +} + +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, + struct ext4_xattr_entry *first) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *failed_entry; + unsigned int ea_ino; + int err, saved_err; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) + goto cleanup; + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "inc ref error %d", err); + iput(ea_inode); + goto cleanup; + } + iput(ea_inode); + } + return 0; + +cleanup: + saved_err = err; + failed_entry = entry; + + for (entry = first; entry != failed_entry; + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = 
le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) { + ext4_warning(parent->i_sb, + "cleanup ea_ino %u iget error %d", ea_ino, + err); + continue; + } + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", + err); + iput(ea_inode); + } + return saved_err; +} + +static void +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, + struct buffer_head *bh, + struct ext4_xattr_entry *first, bool block_csum, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits, bool skip_quota) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + bool dirty = false; + unsigned int ea_ino; + int err; + int credits; + + /* One credit for dec ref on ea_inode, one for orphan list addition, */ + credits = 2 + extra_credits; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) + continue; + + err = ext4_expand_inode_array(ea_inode_array, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, + "Expand inode array err=%d", err); + iput(ea_inode); + continue; + } + + err = ext4_xattr_ensure_credits(handle, parent, credits, bh, + dirty, block_csum); + if (err) { + ext4_warning_inode(ea_inode, "Ensure credits err=%d", + err); + continue; + } + + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", + err); + continue; + } + + if (!skip_quota) + ext4_xattr_inode_free_quota(parent, + le32_to_cpu(entry->e_value_size)); + + /* + * Forget about ea_inode within the same transaction that + * decrements the ref count. This avoids duplicate decrements in + * case the rest of the work spills over to subsequent + * transactions. 
+ */ + entry->e_value_inum = 0; + entry->e_value_size = 0; + + dirty = true; + } + + if (dirty) { + /* + * Note that we are deliberately skipping csum calculation for + * the final update because we do not expect any journal + * restarts until xattr block is freed. + */ + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + ext4_warning_inode(parent, + "handle dirty metadata err=%d", err); + } +} + /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, - struct buffer_head *bh) + struct buffer_head *bh, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; @@ -565,9 +1202,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); + if (ea_block_cache) + mb_cache_entry_delete(ea_block_cache, hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); + + if (ext4_has_feature_ea_inode(inode->i_sb)) + ext4_xattr_inode_dec_ref_all(handle, inode, bh, + BFIRST(bh), + true /* block_csum */, + ea_inode_array, + extra_credits, + true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -577,11 +1224,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; - ce = mb_cache_entry_get(ext4_mb_cache, hash, - bh->b_blocknr); - if (ce) { - ce->e_reusable = 1; - mb_cache_entry_put(ext4_mb_cache, ce); + if (ea_block_cache) { + ce = mb_cache_entry_get(ea_block_cache, hash, + bh->b_blocknr); + if (ce) { + ce->e_reusable = 1; + mb_cache_entry_put(ea_block_cache, 
ce); + } } } @@ -620,7 +1269,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -631,113 +1280,454 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); } -static int -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +/* + * Write the value of the EA in an inode. + */ +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, + const void *buf, int bufsize) +{ + struct buffer_head *bh = NULL; + unsigned long block = 0; + int blocksize = ea_inode->i_sb->s_blocksize; + int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; + int csize, wsize = 0; + int ret = 0; + int retries = 0; + +retry: + while (ret >= 0 && ret < max_blocks) { + struct ext4_map_blocks map; + map.m_lblk = block += ret; + map.m_len = max_blocks -= ret; + + ret = ext4_map_blocks(handle, ea_inode, &map, + EXT4_GET_BLOCKS_CREATE); + if (ret <= 0) { + ext4_mark_inode_dirty(handle, ea_inode); + if (ret == -ENOSPC && + ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + break; + } + } + + if (ret < 0) + return ret; + + block = 0; + while (wsize < bufsize) { + if (bh != NULL) + brelse(bh); + csize = (bufsize - wsize) > blocksize ? 
blocksize : + bufsize - wsize; + bh = ext4_getblk(handle, ea_inode, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (ret) + goto out; + + memcpy(bh->b_data, buf, csize); + set_buffer_uptodate(bh); + ext4_handle_dirty_metadata(handle, ea_inode, bh); + + buf += csize; + wsize += csize; + block += 1; + } + + inode_lock(ea_inode); + i_size_write(ea_inode, wsize); + ext4_update_i_disksize(ea_inode, wsize); + inode_unlock(ea_inode); + + ext4_mark_inode_dirty(handle, ea_inode); + +out: + brelse(bh); + + return ret; +} + +/* + * Create an inode to store the value of a large EA. + */ +static struct inode *ext4_xattr_inode_create(handle_t *handle, + struct inode *inode, u32 hash) +{ + struct inode *ea_inode = NULL; + uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; + int err; + + /* + * Let the next inode be the goal, so we try and allocate the EA inode + * in the same group, or nearby one. + */ + ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG | 0600, NULL, inode->i_ino + 1, owner, + EXT4_EA_INODE_FL); + if (!IS_ERR(ea_inode)) { + ea_inode->i_op = &ext4_file_inode_operations; + ea_inode->i_fop = &ext4_file_operations; + ext4_set_aops(ea_inode); + ext4_xattr_inode_set_class(ea_inode); + unlock_new_inode(ea_inode); + ext4_xattr_inode_set_ref(ea_inode, 1); + ext4_xattr_inode_set_hash(ea_inode, hash); + err = ext4_mark_inode_dirty(handle, ea_inode); + if (!err) + err = ext4_inode_attach_jinode(ea_inode); + if (err) { + iput(ea_inode); + return ERR_PTR(err); + } + + /* + * Xattr inodes are shared therefore quota charging is performed + * at a higher level. 
+ */ + dquot_free_inode(ea_inode); + dquot_drop(ea_inode); + inode_lock(ea_inode); + ea_inode->i_flags |= S_NOQUOTA; + inode_unlock(ea_inode); + } + + return ea_inode; +} + +static struct inode * +ext4_xattr_inode_cache_find(struct inode *inode, const void *value, + size_t value_len, u32 hash) +{ + struct inode *ea_inode; + struct mb_cache_entry *ce; + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + void *ea_data; + + if (!ea_inode_cache) + return NULL; + + ce = mb_cache_entry_find_first(ea_inode_cache, hash); + if (!ce) + return NULL; + + ea_data = ext4_kvmalloc(value_len, GFP_NOFS); + if (!ea_data) { + mb_cache_entry_put(ea_inode_cache, ce); + return NULL; + } + + while (ce) { + ea_inode = ext4_iget(inode->i_sb, ce->e_value); + if (!IS_ERR(ea_inode) && + !is_bad_inode(ea_inode) && + (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && + i_size_read(ea_inode) == value_len && + !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && + !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, + value_len) && + !memcmp(value, ea_data, value_len)) { + mb_cache_entry_touch(ea_inode_cache, ce); + mb_cache_entry_put(ea_inode_cache, ce); + kvfree(ea_data); + return ea_inode; + } + + if (!IS_ERR(ea_inode)) + iput(ea_inode); + ce = mb_cache_entry_find_next(ea_inode_cache, ce); + } + kvfree(ea_data); + return NULL; +} + +/* + * Add value of the EA in an inode. 
+ */ +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, + const void *value, size_t value_len, + struct inode **ret_inode) +{ + struct inode *ea_inode; + u32 hash; + int err; + + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); + ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); + if (ea_inode) { + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + iput(ea_inode); + return err; + } + + *ret_inode = ea_inode; + return 0; + } + + /* Create an inode for the EA value */ + ea_inode = ext4_xattr_inode_create(handle, inode, hash); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + + err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); + if (err) { + ext4_xattr_inode_dec_ref(handle, ea_inode); + iput(ea_inode); + return err; + } + + if (EA_INODE_CACHE(inode)) + mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, + ea_inode->i_ino, true /* reusable */); + + *ret_inode = ea_inode; + return 0; +} + +/* + * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode + * feature is enabled. + */ +#define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) + +static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + struct ext4_xattr_search *s, + handle_t *handle, struct inode *inode, + bool is_block) { struct ext4_xattr_entry *last; - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + struct ext4_xattr_entry *here = s->here; + size_t min_offs = s->end - s->base, name_len = strlen(i->name); + int in_inode = i->in_inode; + struct inode *old_ea_inode = NULL; + struct inode *new_ea_inode = NULL; + size_t old_size, new_size; + int ret; + + /* Space used by old and new values. */ + old_size = (!s->not_found && !here->e_value_inum) ? + EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; + new_size = (i->value && !in_inode) ? 
EXT4_XATTR_SIZE(i->value_len) : 0; + + /* + * Optimization for the simple case when old and new values have the + * same padded sizes. Not applicable if external inodes are involved. + */ + if (new_size && new_size == old_size) { + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + here->e_value_size = cpu_to_le32(i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, new_size); + } else { + memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, new_size - i->value_len); + } + return 0; + } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } - free = min_offs - ((void *)last - s->base) - sizeof(__u32); - if (!s->not_found) { - if (s->here->e_value_size) { - size_t size = le32_to_cpu(s->here->e_value_size); - free += EXT4_XATTR_SIZE(size); - } - free += EXT4_XATTR_LEN(name_len); - } + + /* Check whether we have enough space. */ if (i->value) { - if (free < EXT4_XATTR_LEN(name_len) + - EXT4_XATTR_SIZE(i->value_len)) - return -ENOSPC; + size_t free; + + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) + free += EXT4_XATTR_LEN(name_len) + old_size; + + if (free < EXT4_XATTR_LEN(name_len) + new_size) { + ret = -ENOSPC; + goto out; + } + + /* + * If storing the value in an external inode is an option, + * reserve space for xattr entries/names in the external + * attribute block so that a long value does not occupy the + * whole space and prevent futher entries being added. + */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + new_size && is_block && + (min_offs + old_size - new_size) < + EXT4_XATTR_BLOCK_RESERVE(inode)) { + ret = -ENOSPC; + goto out; + } } - if (i->value && s->not_found) { - /* Insert the new name. 
*/ - size_t size = EXT4_XATTR_LEN(name_len); - size_t rest = (void *)last - (void *)s->here + sizeof(__u32); - memmove((void *)s->here + size, s->here, rest); - memset(s->here, 0, size); - s->here->e_name_index = i->name_index; - s->here->e_name_len = name_len; - memcpy(s->here->e_name, i->name, name_len); - } else { - if (s->here->e_value_size) { - void *first_val = s->base + min_offs; - size_t offs = le16_to_cpu(s->here->e_value_offs); - void *val = s->base + offs; - size_t size = EXT4_XATTR_SIZE( - le32_to_cpu(s->here->e_value_size)); - - if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { - /* The old and the new value have the same - size. Just replace. */ - s->here->e_value_size = - cpu_to_le32(i->value_len); - if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); - } else { - /* Clear pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); - memcpy(val, i->value, i->value_len); - } - return 0; - } + /* + * Getting access to old and new ea inodes is subject to failures. + * Finish that work before doing any modifications to the xattr data. + */ + if (!s->not_found && here->e_value_inum) { + ret = ext4_xattr_inode_iget(inode, + le32_to_cpu(here->e_value_inum), + &old_ea_inode); + if (ret) { + old_ea_inode = NULL; + goto out; + } + } + if (i->value && in_inode) { + WARN_ON_ONCE(!i->value_len); + + ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); + if (ret) + goto out; + + ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, + i->value_len, + &new_ea_inode); + if (ret) { + new_ea_inode = NULL; + ext4_xattr_inode_free_quota(inode, i->value_len); + goto out; + } + } - /* Remove the old value. */ - memmove(first_val + size, first_val, val - first_val); - memset(first_val, 0, size); - s->here->e_value_size = 0; - s->here->e_value_offs = 0; - min_offs += size; - - /* Adjust all value offsets. 
*/ - last = s->first; - while (!IS_LAST_ENTRY(last)) { - size_t o = le16_to_cpu(last->e_value_offs); - if (last->e_value_size && o < offs) - last->e_value_offs = - cpu_to_le16(o + size); - last = EXT4_XATTR_NEXT(last); + if (old_ea_inode) { + /* We are ready to release ref count on the old_ea_inode. */ + ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); + if (ret) { + /* Release newly required ref count on new_ea_inode. */ + if (new_ea_inode) { + int err; + + err = ext4_xattr_inode_dec_ref(handle, + new_ea_inode); + if (err) + ext4_warning_inode(new_ea_inode, + "dec ref new_ea_inode err=%d", + err); + ext4_xattr_inode_free_quota(inode, + i->value_len); } + goto out; } - if (!i->value) { - /* Remove the old name. */ - size_t size = EXT4_XATTR_LEN(name_len); - last = ENTRY((void *)last - size); - memmove(s->here, (void *)s->here + size, - (void *)last - (void *)s->here + sizeof(__u32)); - memset(last, 0, size); + + ext4_xattr_inode_free_quota(inode, + le32_to_cpu(here->e_value_size)); + } + + /* No failures allowed past this point. */ + + if (!s->not_found && here->e_value_offs) { + /* Remove the old value. */ + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + memmove(first_val + old_size, first_val, val - first_val); + memset(first_val, 0, old_size); + min_offs += old_size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + + if (!last->e_value_inum && + last->e_value_size && o < offs) + last->e_value_offs = cpu_to_le16(o + old_size); + last = EXT4_XATTR_NEXT(last); } } + if (!i->value) { + /* Remove old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + + last = ENTRY((void *)last - size); + memmove(here, (void *)here + size, + (void *)last - (void *)here + sizeof(__u32)); + memset(last, 0, size); + } else if (s->not_found) { + /* Insert new name. 
*/ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)here + sizeof(__u32); + + memmove((void *)here + size, here, rest); + memset(here, 0, size); + here->e_name_index = i->name_index; + here->e_name_len = name_len; + memcpy(here->e_name, i->name, name_len); + } else { + /* This is an update, reset value info. */ + here->e_value_inum = 0; + here->e_value_offs = 0; + here->e_value_size = 0; + } + if (i->value) { - /* Insert the new value. */ - s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value_len) { - size_t size = EXT4_XATTR_SIZE(i->value_len); - void *val = s->base + min_offs - size; - s->here->e_value_offs = cpu_to_le16(min_offs - size); + /* Insert new value. */ + if (in_inode) { + here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); + } else if (i->value_len) { + void *val = s->base + min_offs - new_size; + + here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); + memset(val, 0, new_size); } else { - /* Clear the pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, + new_size - i->value_len); } } + here->e_value_size = cpu_to_le32(i->value_len); } - return 0; + + if (i->value) { + __le32 hash = 0; + + /* Entry hash calculation. */ + if (in_inode) { + __le32 crc32c_hash; + + /* + * Feed crc32c hash instead of the raw value for entry + * hash calculation. This is to avoid walking + * potentially long value buffer again. 
+ */ + crc32c_hash = cpu_to_le32( + ext4_xattr_inode_get_hash(new_ea_inode)); + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, + &crc32c_hash, 1); + } else if (is_block) { + __le32 *value = s->base + min_offs - new_size; + + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, value, + new_size >> 2); + } + here->e_hash = hash; + } + + if (is_block) + ext4_xattr_rehash((struct ext4_xattr_header *)s->base); + + ret = 0; +out: + iput(old_ea_inode); + iput(new_ea_inode); + return ret; } struct ext4_xattr_block_find { @@ -794,15 +1784,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; - struct ext4_xattr_search *s = &bs->s; + struct ext4_xattr_search s_copy = bs->s; + struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + struct inode *ea_inode = NULL; + size_t old_ea_inode_size = 0; #define header(x) ((struct ext4_xattr_header *)(x)) - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; if (s->base) { BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); @@ -818,17 +1809,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - mb_cache_entry_delete_block(ext4_mb_cache, hash, - bs->bh->b_blocknr); + if (ea_block_cache) + mb_cache_entry_delete(ea_block_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); - error = ext4_xattr_set_entry(i, s); - if (!error) { - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), - s->here); - ext4_xattr_cache_insert(ext4_mb_cache, - bs->bh); - } + error = ext4_xattr_set_entry(i, s, handle, inode, + true /* is_block */); + if (!error) + ext4_xattr_block_cache_insert(ea_block_cache, + bs->bh); 
ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -854,6 +1843,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; + + /* + * If existing entry points to an xattr inode, we need + * to prevent ext4_xattr_set_entry() from decrementing + * ref count on it because the reference belongs to the + * original block. In this case, make the entry look + * like it has an empty value. + */ + if (!s->not_found && s->here->e_value_inum) { + /* + * Defer quota free call for previous inode + * until success is guaranteed. + */ + old_ea_inode_size = le32_to_cpu( + s->here->e_value_size); + s->here->e_value_inum = 0; + s->here->e_value_size = 0; + } } } else { /* Allocate a buffer where we construct the new block. */ @@ -870,17 +1877,33 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), s->here); + + if (i->value && s->here->e_value_inum) { + unsigned int ea_ino; + + /* + * A ref count on ea_inode has been taken as part of the call to + * ext4_xattr_set_entry() above. We would like to drop this + * extra ref but we have to wait until the xattr block is + * initialized and has its own ref count on the ea_inode. + */ + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + if (error) { + ea_inode = NULL; + goto cleanup; + } + } inserted: if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), + &ce); if (new_bh) { /* We found an identical block in the cache. 
*/ if (new_bh == bs->bh) @@ -925,7 +1948,7 @@ inserted: EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; @@ -944,8 +1967,8 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_touch(ext4_mb_cache, ce); - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_touch(ea_block_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -984,6 +2007,22 @@ getblk_failed: EXT4_FREE_BLOCKS_METADATA); goto cleanup; } + error = ext4_xattr_inode_inc_ref_all(handle, inode, + ENTRY(header(s->base)+1)); + if (error) + goto getblk_failed; + if (ea_inode) { + /* Drop the extra ref on ea_inode. */ + error = ext4_xattr_inode_dec_ref(handle, + ea_inode); + if (error) + ext4_warning_inode(ea_inode, + "dec ref error=%d", + error); + iput(ea_inode); + ea_inode = NULL; + } + lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, new_bh); if (error) { @@ -995,7 +2034,7 @@ getblk_failed: ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(ext4_mb_cache, new_bh); + ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) @@ -1003,17 +2042,40 @@ getblk_failed: } } + if (old_ea_inode_size) + ext4_xattr_inode_free_quota(inode, old_ea_inode_size); + /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. 
*/ - if (bs->bh && bs->bh != new_bh) - ext4_xattr_release_block(handle, inode, bs->bh); + if (bs->bh && bs->bh != new_bh) { + struct ext4_xattr_inode_array *ea_inode_array = NULL; + + ext4_xattr_release_block(handle, inode, bs->bh, + &ea_inode_array, + 0 /* extra_credits */); + ext4_xattr_inode_array_free(ea_inode_array); + } error = 0; cleanup: + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + /* If there was an error, revert the quota charge. */ + if (error) + ext4_xattr_inode_free_quota(inode, + i_size_read(ea_inode)); + iput(ea_inode); + } if (ce) - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1070,7 +2132,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) { if (error == -ENOSPC && ext4_has_inline_data(inode)) { @@ -1082,7 +2144,8 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_find(inode, i, is); if (error) return error; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, + false /* is_block */); } if (error) return error; @@ -1098,7 +2161,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(struct inode *inode, +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1108,7 +2171,7 @@ static int ext4_xattr_ibody_set(struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, false /* 
is_block */); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); @@ -1127,12 +2190,31 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s, { void *value; + /* When e_value_inum is set the value is stored externally. */ + if (s->here->e_value_inum) + return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } +static struct buffer_head *ext4_xattr_get_block(struct inode *inode) +{ + struct buffer_head *bh; + int error; + + if (!EXT4_I(inode)->i_file_acl) + return NULL; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + return ERR_PTR(-EIO); + error = ext4_xattr_check_block(inode, bh); + if (error) + return ERR_PTR(error); + return bh; +} + /* * ext4_xattr_set_handle() * @@ -1155,7 +2237,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, .name = name, .value = value, .value_len = value_len, - + .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, @@ -1173,6 +2255,28 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_write_lock_xattr(inode, &no_expand); + /* Check journal credits under write lock. 
*/ + if (ext4_handle_valid(handle)) { + struct buffer_head *bh; + int credits; + + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + goto cleanup; + } + + credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, + flags & XATTR_CREATE); + brelse(bh); + + if (!ext4_handle_has_enough_credits(handle, credits)) { + error = -ENOSPC; + goto cleanup; + } + } + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; @@ -1202,9 +2306,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } + if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1215,7 +2320,12 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(inode, &i, &is); + if (ext4_has_feature_ea_inode(inode->i_sb) && + (EXT4_XATTR_SIZE(i.value_len) > + EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) + i.in_inode = 1; +retry_inode: + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1226,11 +2336,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); - if (error) - goto cleanup; - if (!is.s.not_found) { + if (!error && !is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, + &is); + } else if (error == -ENOSPC) { + /* + * Xattr does not fit in the block, store at + * external inode if possible. 
+ */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + !i.in_inode) { + i.in_inode = 1; + goto retry_inode; + } } } } @@ -1256,6 +2375,33 @@ cleanup: return error; } +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits) +{ + struct buffer_head *bh; + int err; + + *credits = 0; + + if (!EXT4_SB(inode->i_sb)->s_journal) + return 0; + + down_read(&EXT4_I(inode)->xattr_sem); + + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + } else { + *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, is_create); + brelse(bh); + err = 0; + } + + up_read(&EXT4_I(inode)->xattr_sem); + return err; +} + /* * ext4_xattr_set() * @@ -1269,13 +2415,20 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; + struct super_block *sb = inode->i_sb; int error, retries = 0; - int credits = ext4_jbd2_credits_xattr(inode); + int credits; error = dquot_initialize(inode); if (error) return error; + retry: + error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, + &credits); + if (error) + return error; + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -1286,7 +2439,7 @@ retry: value, value_len, flags); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; @@ -1311,7 +2464,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); @@ -1331,18 +2484,16 @@ static int ext4_xattr_move_to_block(handle_t *handle, 
struct inode *inode, struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; - size_t value_offs, value_size; + size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, + .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int error; - value_offs = le16_to_cpu(entry->e_value_offs); - value_size = le32_to_cpu(entry->e_value_size); - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); buffer = kmalloc(value_size, GFP_NOFS); @@ -1358,7 +2509,15 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, bs->bh = NULL; /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, value_size); + if (error) + goto out; + } else { + size_t value_offs = le16_to_cpu(entry->e_value_offs); + memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + } + memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; @@ -1372,11 +2531,10 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(inode, &i, is); + error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto out; - i.name = b_entry_name; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); @@ -1420,9 +2578,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - 
EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); + total_size = EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { @@ -1441,8 +2600,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, } entry_size = EXT4_XATTR_LEN(entry->e_name_len); - total_size = entry_size + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + total_size = entry_size; + if (!entry->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) @@ -1571,51 +2732,172 @@ cleanup: return error; } +#define EIA_INCR 16 /* must be 2^n */ +#define EIA_MASK (EIA_INCR - 1) +/* Add the large xattr @inode into @ea_inode_array for deferred iput(). + * If @ea_inode_array is new or full it will be grown and the old + * contents copied over. + */ +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode) +{ + if (*ea_inode_array == NULL) { + /* + * Start with 15 inodes, so it fits into a power-of-two size. 
+ * If *ea_inode_array is NULL, this is essentially offsetof() + */ + (*ea_inode_array) = + kmalloc(offsetof(struct ext4_xattr_inode_array, + inodes[EIA_MASK]), + GFP_NOFS); + if (*ea_inode_array == NULL) + return -ENOMEM; + (*ea_inode_array)->count = 0; + } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { + /* expand the array once all 15 + n * 16 slots are full */ + struct ext4_xattr_inode_array *new_array = NULL; + int count = (*ea_inode_array)->count; + + /* if new_array is NULL, this is essentially offsetof() */ + new_array = kmalloc( + offsetof(struct ext4_xattr_inode_array, + inodes[count + EIA_INCR]), + GFP_NOFS); + if (new_array == NULL) + return -ENOMEM; + memcpy(new_array, *ea_inode_array, + offsetof(struct ext4_xattr_inode_array, inodes[count])); + kfree(*ea_inode_array); + *ea_inode_array = new_array; + } + (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; + return 0; +} /* * ext4_xattr_delete_inode() * - * Free extended attribute resources associated with this inode. This - * is called immediately before an inode is freed. We have exclusive - * access to the inode. + * Free extended attribute resources associated with this inode. Traverse + * all entries and decrement reference on any xattr inodes associated with this + * inode. This is called immediately before an inode is freed. We have exclusive + * access to the inode. If an orphan inode is deleted it will also release its + * references on xattr block and xattr inodes. 
*/ -void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_header *header; + struct ext4_iloc iloc = { .bh = NULL }; + struct ext4_xattr_entry *entry; + int error; - if (!EXT4_I(inode)->i_file_acl) - goto cleanup; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); + error = ext4_xattr_ensure_credits(handle, inode, extra_credits, + NULL /* bh */, + false /* dirty */, + false /* block_csum */); + if (error) { + EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - goto cleanup; + + if (ext4_has_feature_ea_inode(inode->i_sb) && + ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); + goto cleanup; + } + + error = ext4_journal_get_write_access(handle, iloc.bh); + if (error) { + EXT4_ERROR_INODE(inode, "write access (error %d)", + error); + goto cleanup; + } + + header = IHDR(inode, ext4_raw_inode(&iloc)); + if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, + IFIRST(header), + false /* block_csum */, + ea_inode_array, + extra_credits, + false /* skip_quota */); } - ext4_xattr_release_block(handle, inode, bh); - EXT4_I(inode)->i_file_acl = 0; + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + error = 
ext4_xattr_check_block(inode, bh); + if (error) { + EXT4_ERROR_INODE(inode, "bad block %llu (error %d)", + EXT4_I(inode)->i_file_acl, error); + goto cleanup; + } + + if (ext4_has_feature_ea_inode(inode->i_sb)) { + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ext4_xattr_inode_free_quota(inode, + le32_to_cpu(entry->e_value_size)); + + } + + ext4_xattr_release_block(handle, inode, bh, ea_inode_array, + extra_credits); + /* + * Update i_file_acl value in the same transaction that releases + * block. + */ + EXT4_I(inode)->i_file_acl = 0; + error = ext4_mark_inode_dirty(handle, inode); + if (error) { + EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", + error); + goto cleanup; + } + } + error = 0; cleanup: + brelse(iloc.bh); brelse(bh); + return error; +} + +void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) +{ + int idx; + + if (ea_inode_array == NULL) + return; + + for (idx = 0; idx < ea_inode_array->count; ++idx) + iput(ea_inode_array->inodes[idx]); + kfree(ea_inode_array); } /* - * ext4_xattr_cache_insert() + * ext4_xattr_block_cache_insert() * - * Create a new entry in the extended attribute cache, and insert + * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. * * Returns 0, or a negative error number on failure. 
*/ static void -ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, + struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); @@ -1623,7 +2905,9 @@ ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) EXT4_XATTR_REFCOUNT_MAX; int error; - error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + if (!ea_block_cache) + return; + error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) @@ -1655,11 +2939,11 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || + entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; - if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EFSCORRUPTED; - if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + if (!entry1->e_value_inum && + memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) return 1; @@ -1673,7 +2957,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, } /* - * ext4_xattr_cache_find() + * ext4_xattr_block_cache_find() * * Find an identical extended attribute block. * @@ -1681,30 +2965,33 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, * not found or an error occurred. 
*/ static struct buffer_head * -ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb_cache_entry **pce) +ext4_xattr_block_cache_find(struct inode *inode, + struct ext4_xattr_header *header, + struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + if (!ea_block_cache) + return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); - ce = mb_cache_entry_find_first(ext4_mb_cache, hash); + ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_block); + bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", - (unsigned long) ce->e_block); + (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ext4_mb_cache, ce); + ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } @@ -1717,30 +3004,22 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, * * Compute the hash of an extended attribute. 
*/ -static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count) { __u32 hash = 0; - char *name = entry->e_name; - int n; - for (n = 0; n < entry->e_name_len; n++) { + while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; } - - if (entry->e_value_size != 0) { - __le32 *value = (__le32 *)((char *)header + - le16_to_cpu(entry->e_value_offs)); - for (n = (le32_to_cpu(entry->e_value_size) + - EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { - hash = (hash << VALUE_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ - le32_to_cpu(*value++); - } + while (value_count--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); } - entry->e_hash = cpu_to_le32(hash); + return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT @@ -1753,13 +3032,11 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, * * Re-compute the extended attribute hash value after an entry has changed. 
*/ -static void ext4_xattr_rehash(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) +static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; - ext4_xattr_hash_entry(header, entry); here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 099c8b670ef5..0d2dde1fa87a 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -44,7 +44,7 @@ struct ext4_xattr_entry { __u8 e_name_len; /* length of name */ __u8 e_name_index; /* attribute name index */ __le16 e_value_offs; /* offset in disk block of value */ - __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ char e_name[0]; /* attribute name */ @@ -69,6 +69,13 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) +/* + * The minimum size of EA value when you start storing it in an external inode + * size of block - size of header - size of 1 entry - 4 null bytes +*/ +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ + ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) + #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) #define BFIRST(bh) ENTRY(BHDR(bh)+1) @@ -77,10 +84,11 @@ struct ext4_xattr_entry { #define EXT4_ZERO_XATTR_VALUE ((void *)-1) struct ext4_xattr_info { - int name_index; const char *name; const void *value; size_t value_len; + int name_index; + int in_inode; }; struct ext4_xattr_search { @@ -96,6 +104,11 @@ struct ext4_xattr_ibody_find { struct ext4_iloc iloc; }; +struct ext4_xattr_inode_array { + unsigned int count; /* # of used items in the array */ + struct inode *inodes[0]; +}; + extern const struct xattr_handler ext4_xattr_user_handler; extern const 
struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; @@ -139,8 +152,16 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits); +extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create); -extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **array, + int extra_credits); +extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); @@ -169,3 +190,11 @@ static inline int ext4_init_security(handle_t *handle, struct inode *inode, return 0; } #endif + +#ifdef CONFIG_LOCKDEP +extern void ext4_xattr_inode_set_class(struct inode *ea_inode); +#else +static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { } +#endif + +extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage); diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ca949ea7c02f..a0dc559b1b47 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o extent_cache.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o 
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 8f487692c21f..a140c5e3dc54 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -233,7 +233,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); - return (int)PTR_ERR(value); + return PTR_ERR(value); } } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ea9c317b5916..56bbf592e487 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -31,7 +31,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); } /* @@ -162,6 +162,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op = REQ_OP_READ, .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, + .in_list = false, }; struct blk_plug plug; @@ -207,12 +208,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - fio.old_blkaddr = fio.new_blkaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_bio(&fio); f2fs_put_page(page, 0); } out: - f2fs_submit_merged_bio(sbi, META, READ); blk_finish_plug(&plug); return blkno - start; } @@ -249,13 +248,13 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, - 0, page->index, META, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, META); return 0; @@ -270,6 +269,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; + if 
(unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) @@ -358,7 +360,7 @@ continue_unlock: } stop: if (nwritten) - f2fs_submit_merged_bio(sbi, type, WRITE); + f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); @@ -906,7 +908,7 @@ retry: * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; @@ -1051,8 +1053,9 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if ((cpc->reason & CP_UMOUNT) && le32_to_cpu(ckpt->cp_pack_total_block_count) > @@ -1083,14 +1086,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1132,12 +1135,12 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) 
__set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -1295,7 +1298,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); /* this is the case of multiple fstrims without any changes */ if (cpc->reason & CP_DISCARD) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c0f6bdf817d..87c1f4150c64 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -58,12 +58,12 @@ static void f2fs_read_end_io(struct bio *bio) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } #endif if (f2fs_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -74,7 +74,7 @@ static void f2fs_read_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { if (!PageUptodate(page)) SetPageUptodate(page); } else { @@ -102,14 +102,14 @@ static void f2fs_write_end_io(struct bio *bio) unlock_page(page); mempool_free(page, sbi->write_io_dummy); - if (unlikely(bio->bi_error)) + if (unlikely(bio->bi_status)) f2fs_stop_checkpoint(sbi, true); continue; } fscrypt_pullback_bio_page(&page, true); - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } @@ -282,29 +282,32 @@ static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, enum page_type type) { enum 
page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - bool ret; + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, ino, idx); - up_read(&io->io_rwsem); + /* TODO: use HOT temp only for meta pages now. */ + if (ret || btype == META) + break; + } return ret; } -static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw) +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io; - - io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, ino, idx)) - goto out; - /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -314,29 +317,45 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); -out: up_write(&io->io_rwsem); } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw) +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) { - __f2fs_submit_merged_bio(sbi, NULL, 0, 0, type, rw); + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. 
*/ + if (type >= META) + break; + } } -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) +{ + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); +} + +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw) + enum page_type type) { - if (has_merged_page(sbi, inode, ino, idx, type)) - __f2fs_submit_merged_bio(sbi, inode, ino, idx, type, rw); + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); } -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); } /* @@ -368,16 +387,29 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -int f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->op); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; int err = 0; - io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; + f2fs_bug_on(sbi, is_read_io(fio->op)); + + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out_fail; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); @@ -388,10 +420,7 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) /* set submitted = 1 as a return value */ fio->submitted = 1; - if (!is_read) - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); - - down_write(&io->io_rwsem); + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || @@ -402,26 +431,28 @@ alloc_new: if ((fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { err = -EAGAIN; - if (!is_read) - dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, - BIO_MAX_PAGES, is_read); + BIO_MAX_PAGES, false); io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); + + trace_f2fs_submit_page_write(fio->page, fio); + + if (fio->in_list) + goto next; out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(fio->page, fio); return err; } @@ -460,14 +491,15 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if 
(unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); @@ -718,6 +750,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; pgoff_t fofs; blkcnt_t count = 1; + int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; @@ -726,15 +759,15 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA); + &sum, CURSEG_WARM_DATA, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -1321,7 +1354,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { - f2fs_flush_merged_bios(fio->sbi); + f2fs_flush_merged_writes(fio->sbi); congestion_wait(BLK_RW_ASYNC, HZ/50); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; @@ -1368,13 +1401,14 @@ int do_write_data_page(struct f2fs_io_info *fio) if (valid_ipu_blkaddr(fio)) { ipu_force = true; - fio->need_lock = false; + fio->need_lock = LOCK_DONE; goto got_it; } } - if (fio->need_lock) - f2fs_lock_op(fio->sbi); + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) @@ -1388,19 +1422,18 @@ int do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; } got_it: - err = encrypt_one_page(fio); - if (err) - goto out_writepage; - - set_page_writeback(page); - /* * If current 
allocation needs SSR, * it had better in-place writes for updated data. */ if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); f2fs_put_dnode(&dn); - if (fio->need_lock) + if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = rewrite_data_page(fio); trace_f2fs_do_write_data_page(fio->page, IPU); @@ -1408,6 +1441,20 @@ got_it: return err; } + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + /* LFS mode write path */ write_data_page(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); @@ -1417,7 +1464,7 @@ got_it: out_writepage: f2fs_put_dnode(&dn); out: - if (fio->need_lock) + if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); return err; } @@ -1443,11 +1490,14 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, - .need_lock = true, + .need_lock = LOCK_RETRY, }; trace_f2fs_writepage(page, DATA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (page->index < end_index) goto write; @@ -1461,8 +1511,6 @@ static int __write_data_page(struct page *page, bool *submitted, zero_user_segment(page, offset, PAGE_SIZE); write: - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; /* we should not write 0'th page having journal header */ @@ -1479,7 +1527,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - fio.need_lock = false; + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); goto done; } @@ -1498,8 +1546,13 @@ write: goto out; } - if (err == -EAGAIN) + if (err == -EAGAIN) { err = do_write_data_page(&fio); + if (err == -EAGAIN) { + 
fio.need_lock = LOCK_REQ; + err = do_write_data_page(&fio); + } + } if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; @@ -1513,8 +1566,7 @@ out: ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, - DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); submitted = NULL; @@ -1525,7 +1577,7 @@ out: f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); submitted = NULL; } @@ -1618,7 +1670,7 @@ retry: } done_index = page->index; - +retry_write: lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1654,6 +1706,15 @@ continue_unlock: unlock_page(page); ret = 0; continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; } done_index = page->index + 1; done = 1; @@ -1684,8 +1745,8 @@ continue_unlock: mapping->writeback_index = done_index; if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - 0, last_idx, DATA, WRITE); + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); return ret; } @@ -1706,6 +1767,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + /* during POR, we don't need to trigger writepage at all. 
*/ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1715,10 +1780,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; - /* during POR, we don't need to trigger writepage at all. */ - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, DATA); /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ @@ -1753,8 +1814,10 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -2152,8 +2215,12 @@ int f2fs_migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written && !mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } /* * A reference is expected if PagePrivate set when move mapping, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 94756f55a97e..37f9c7f55605 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -415,7 +415,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { - file_lost_pino(inode); + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. 
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2f98d7039701..ff2352a0ed15 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -320,7 +320,7 @@ static void __drop_largest_extent(struct inode *inode, } /* return true, if inode page is changed */ -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; @@ -358,6 +358,16 @@ out: return false; } +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fd2e651bad6d..94a88b233e98 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,6 +22,7 @@ #include <linux/vmalloc.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/quotaops.h> #ifdef CONFIG_F2FS_FS_ENCRYPTION #include <linux/fscrypt_supp.h> #else @@ -88,6 +89,8 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -303,6 +306,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, struct f2fs_move_range) #define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define 
F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -327,6 +332,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + struct f2fs_defragment { u64 start; u64 len; @@ -513,12 +524,19 @@ struct f2fs_inode_info { nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + struct rw_semaphore i_mmap_sem; }; static inline void get_extent_info(struct extent_info *ext, @@ -792,17 +810,33 @@ enum page_type { OPU, }; +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs */ bool submitted; /* indicate IO submission */ - bool need_lock; /* indicate we need to lock cp_rwsem */ + int need_lock; /* indicate we need to lock cp_rwsem 
*/ + bool in_list; /* indicate fio is in io_list */ }; #define is_read_io(rw) ((rw) == READ) @@ -812,6 +846,8 @@ struct f2fs_bio_info { sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ }; #define FDEV(i) (sbi->devs[i]) @@ -879,9 +915,9 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info read_io; /* for read bios */ - struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ @@ -939,6 +975,8 @@ struct f2fs_sb_info { block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ @@ -1228,9 +1266,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) @@ -1244,22 +1284,26 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) 
{ - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { + unsigned long flags; + set_sbi_flag(sbi, SBI_NEED_FSCK); if (lock) - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); kfree(NM_I(sbi)->nat_bits); NM_I(sbi)->nat_bits = NULL; if (lock) - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, @@ -1275,6 +1319,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) down_read(&sbi->cp_rwsem); } +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { up_read(&sbi->cp_rwsem); @@ -1324,17 +1373,14 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 - /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; - else - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 
1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1342,16 +1388,23 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - blkcnt_t diff; + blkcnt_t diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); - return false; + release = *count; + goto enospc; } #endif /* @@ -1362,32 +1415,42 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { - diff = sbi->total_valid_block_count - sbi->user_block_count; + avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; - sbi->total_valid_block_count = sbi->user_block_count; + release = diff; + sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); - return false; + goto enospc; } } spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, *count, true); - return true; + if (release) + dquot_release_reservation_block(inode, release); + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + dquot_release_reservation_block(inode, 
release); + return -ENOSPC; } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < count); + f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, count, false); + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1516,51 +1579,70 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count > sbi->user_block_count)) { + if (unlikely(valid_block_count + sbi->reserved_blocks > + sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } - if (inode) - f2fs_i_blocks_write(inode, 1, true); - sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + percpu_counter_inc(&sbi->alloc_valid_block_count); - return true; + return 0; + 
+enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, !sbi->total_valid_block_count); f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !inode->i_blocks); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) @@ -1835,13 +1917,21 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - blkcnt_t diff, bool add) + block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); - inode->i_blocks = add ? 
inode->i_blocks + diff : - inode->i_blocks - diff; + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); @@ -2236,6 +2326,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void stop_discard_thread(struct f2fs_sb_info *sbi); void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); @@ -2258,7 +2349,8 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, bool recover_newaddr); void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type); + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, @@ -2308,14 +2400,13 @@ void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw); -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi); + enum page_type type); +void f2fs_flush_merged_writes(struct 
f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); -int f2fs_submit_page_mbio(struct f2fs_io_info *fio); +int f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); @@ -2633,6 +2724,14 @@ int __init create_extent_cache(void); void destroy_extent_cache(void); /* + * sysfs.c + */ +int __init f2fs_register_sysfs(void); +void f2fs_unregister_sysfs(void); +int f2fs_init_sysfs(struct f2fs_sb_info *sbi); +void f2fs_exit_sysfs(struct f2fs_sb_info *sbi); + +/* * crypto support */ static inline bool f2fs_encrypted_inode(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 61af721329fa..a0e6d2c65a9e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -33,6 +33,18 @@ #include "trace.h" #include <trace/events/f2fs.h> +static int f2fs_filemap_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + int err; + + down_read(&F2FS_I(inode)->i_mmap_sem); + err = filemap_fault(vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return err; +} + static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; @@ -59,13 +71,14 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_balance_fs(sbi, dn.node_changed); file_update_time(vmf->vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; - goto out; + goto out_sem; } /* @@ -94,6 +107,8 @@ mapped: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -101,7 +116,7 @@ out: } static const struct vm_operations_struct 
f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, }; @@ -415,14 +430,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!f2fs_encrypted_inode(inode)) - return -ENOKEY; - } - /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -435,11 +442,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - int ret = generic_file_open(inode, filp); struct dentry *dir; - if (!ret && f2fs_encrypted_inode(inode)) { - ret = fscrypt_get_encryption_info(inode); + if (f2fs_encrypted_inode(inode)) { + int ret = fscrypt_get_encryption_info(inode); if (ret) return -EACCES; if (!fscrypt_has_encryption_key(inode)) @@ -452,7 +458,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return -EPERM; } dput(dir); - return ret; + return dquot_file_open(inode, filp); } int truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -527,8 +533,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || - !S_ISREG(inode->i_mode)) + + /* An encrypted inode should have a key and truncate the last page. 
*/ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -633,11 +641,31 @@ int f2fs_truncate(struct inode *inode) } int f2fs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) + u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int flags; + + flags = fi->i_flags & FS_FL_USER_VISIBLE; + if (flags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (flags & FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (f2fs_encrypted_inode(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + if (flags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(inode, stat); - stat->blocks <<= 3; return 0; } @@ -681,14 +709,34 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } + if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } if (attr->ia_size <= i_size_read(inode)) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); err = f2fs_truncate(inode); + 
up_write(&F2FS_I(inode)->i_mmap_sem); if (err) return err; } else { @@ -696,7 +744,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. */ + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); + up_write(&F2FS_I(inode)->i_mmap_sem); /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { @@ -839,12 +889,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -957,9 +1009,9 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (do_replace[i]) { f2fs_i_blocks_write(src_inode, - 1, false); + 1, false, false); f2fs_i_blocks_write(dst_inode, - 1, true); + 1, true, false); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, blkaddr[i], ni.version, true, false); @@ -1083,16 +1135,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - return ret; + goto out; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1105,6 +1158,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1169,9 +1224,10 @@ 
static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - return ret; + goto out_sem; truncate_pagecache_range(inode, offset, offset + len - 1); @@ -1185,7 +1241,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, offset + len); } else { @@ -1193,7 +1249,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1242,6 +1298,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, out: if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); +out_sem: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1271,14 +1329,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) - return ret; + goto out; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); @@ -1307,6 +1366,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1475,6 +1536,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode_lock(inode); + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + inode_unlock(inode); + ret = -EPERM; + goto unlock_out; + } + flags = f2fs_mask_flags(inode->i_mode, flags); oldflags = fi->i_flags; @@ -1493,7 +1561,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); - + f2fs_mark_inode_dirty_sync(inode, false); +unlock_out: inode_unlock(inode); out: mnt_drop_write_file(filp); @@ -1862,6 +1931,50 @@ out: return ret; } +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) + return -EINVAL; +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + range.start += sbi->blocks_per_seg; + if (range.start <= end) + goto do_more; +out: + mnt_drop_write_file(filp); + return ret; +} + static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2306,6 +2419,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: @@ -2326,11 +2441,6 @@ static ssize_t 
f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; - if (f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { @@ -2379,6 +2489,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_MOVE_RANGE: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 026522107ca3..fa3d2e2df8e7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -32,13 +32,14 @@ static int gc_thread_func(void *data) wait_ms = gc_th->min_sleep_time; + set_freezable(); do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current), + msecs_to_jiffies(wait_ms)); + if (try_to_freeze()) continue; - else - wait_event_interruptible_timeout(*wq, - kthread_should_stop(), - msecs_to_jiffies(wait_ms)); if (kthread_should_stop()) break; @@ -258,11 +259,20 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, valid_blocks * 2 : valid_blocks; } +static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + + return se->ckpt_valid_blocks > se->valid_blocks ? 
+ se->ckpt_valid_blocks : se->valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) - return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + return get_ssr_cost(sbi, segno); /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) @@ -586,9 +596,11 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_READ, .op_flags = 0, .encrypted_page = NULL, + .in_list = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -632,7 +644,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA); + &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -670,7 +682,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -712,12 +724,13 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, - .need_lock = true, + .need_lock = LOCK_REQ, }; bool is_dirty = PageDirty(page); int err; @@ -936,8 +949,8 @@ next: } if (gc_type == FG_GC) - f2fs_submit_merged_bio(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? 
NODE : DATA); blk_finish_plug(&plug); @@ -955,7 +968,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, { int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; - int ret = -EINVAL; + int ret; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -965,8 +978,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); gc_more: - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + ret = -EINVAL; goto stop; + } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; @@ -987,6 +1002,7 @@ gc_more: gc_type = FG_GC; } + ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ if (gc_type == BG_GC && !background) goto stop; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e4c527c4e7d0..e0fd4376e6fb 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -316,12 +316,12 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *dentry_blk; + struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; - dentry_blk = inline_data_addr(ipage); + inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, dentry_blk); + make_dentry_ptr_inline(NULL, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -500,7 +500,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *dentry_blk = NULL; + struct f2fs_inline_dentry *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,11 +510,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - dentry_blk = inline_data_addr(ipage); - bit_pos = 
room_for_filename(&dentry_blk->dentry_bitmap, + inline_dentry = inline_data_addr(ipage); + bit_pos = room_for_filename(&inline_dentry->dentry_bitmap, slots, NR_INLINE_DENTRY); if (bit_pos >= NR_INLINE_DENTRY) { - err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; @@ -534,7 +534,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr_inline(NULL, &d, dentry_blk); + make_dentry_ptr_inline(NULL, &d, inline_dentry); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -586,14 +586,14 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *dentry_blk; + struct f2fs_inline_dentry *inline_dentry; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - dentry_blk = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + inline_dentry = inline_data_addr(ipage); + bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap, NR_INLINE_DENTRY, bit_pos); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 518f49643092..6cd312a17c69 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include <trace/events/f2fs.h> @@ -44,7 +45,6 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -130,7 +130,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = 
le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -226,6 +226,7 @@ make_now: ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -267,7 +268,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); if (et) { read_lock(&et->lock); @@ -372,6 +373,8 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; + dquot_initialize(inode); + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -404,8 +407,11 @@ retry: if (err) update_inode_page(inode); + dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); @@ -425,9 +431,10 @@ no_delete: if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); + } else { + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index c31b40e5f9cf..760d85223c81 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/dcache.h> #include <linux/namei.h> +#include <linux/quotaops.h> #include "f2fs.h" #include "node.h" @@ -42,6 +43,8 @@ static struct inode *f2fs_new_inode(struct 
inode *dir, umode_t mode) } f2fs_unlock_op(sbi); + nid_free = true; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; @@ -52,10 +55,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; goto fail; } + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + /* If the directory encrypted, then we should encrypt the inode. */ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); @@ -85,6 +95,16 @@ fail: set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int is_multimedia_file(const unsigned char *s, const char *sub) @@ -136,6 +156,10 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -180,6 +204,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); inode->i_ctime = current_time(inode); @@ -347,6 +375,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + err = dquot_initialize(dir); + if (err) + return err; + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { if (IS_ERR(page)) @@ -413,6 +445,10 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + err = dquot_initialize(dir); + if (err) + return 
err; + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -500,6 +536,10 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -548,6 +588,10 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -583,6 +627,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -676,6 +724,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -772,7 +828,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + F2FS_I(old_inode)->i_pino = new_dir->i_ino; up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -853,6 +912,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) diff --git 
a/fs/f2fs/node.c b/fs/f2fs/node.c index 4547c5c5cd98..d53fe620939e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -158,9 +158,6 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; - if (get_nat_flag(ne, IS_DIRTY)) - return; - head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); @@ -171,10 +168,18 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } - list_move_tail(&ne->list, &head->entry_list); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + nm_i->dirty_nat_cnt++; head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + if (nat_get_blkaddr(ne) == NEW_ADDR) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, @@ -673,15 +678,11 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); - goto invalidate; - } f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { @@ -689,7 +690,7 @@ static void truncate_node(struct dnode_of_data *dn) dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } -invalidate: + clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1006,7 +1007,7 @@ int remove_inode_page(struct inode *inode) /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && inode->i_blocks != 1); + inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode 
& node pages */ truncate_node(&dn); @@ -1039,10 +1040,9 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { - err = -ENOSPC; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; - } + #ifdef CONFIG_F2FS_CHECK_FS get_node_info(sbi, dn->nid, &new_ni); f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); @@ -1152,6 +1152,7 @@ repeat: f2fs_put_page(page, 1); return ERR_PTR(err); } else if (err == LOCKED_PAGE) { + err = 0; goto page_hit; } @@ -1165,15 +1166,22 @@ repeat: goto repeat; } - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + err = -EIO; goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { - f2fs_bug_on(sbi, 1); + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); ClearPageUptodate(page); + err = -EINVAL; out_err: f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + return ERR_PTR(err); } return page; } @@ -1373,15 +1381,15 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, up_read(&sbi->node_write); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, 0, - page->index, NODE, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); submitted = NULL; } unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) { - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); submitted = NULL; } if (submitted) @@ -1518,8 +1526,7 @@ continue_unlock: } out: if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(sbi, NULL, ino, last_idx, - NODE, WRITE); + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); return ret ? 
-EIO: 0; } @@ -1625,7 +1632,7 @@ continue_unlock: } out: if (nwritten) - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); return ret; } @@ -1675,6 +1682,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct blk_plug plug; long diff; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -2192,14 +2202,14 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, inode); + dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: /* 2: update xattr nid in inode */ remove_free_nid(sbi, new_xnid); f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(!inc_valid_node_count(sbi, inode))) + if (unlikely(inc_valid_node_count(sbi, inode, false))) f2fs_bug_on(sbi, 1); update_inode_page(inode); @@ -2257,7 +2267,7 @@ retry: new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL))) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); @@ -2424,8 +2434,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nid_t nid = nat_get_nid(ne); int offset; - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { offset = lookup_journal_in_cursum(journal, @@ -2553,7 +2562,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) return 0; } -inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int i = 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 558048e33cf9..bb53e9955ff2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ 
-224,11 +224,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 96845854e7ee..f964b68718c1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include <linux/kthread.h> #include <linux/swap.h> #include <linux/timer.h> +#include <linux/freezer.h> #include "f2fs.h" #include "segment.h" @@ -312,7 +313,7 @@ static int __commit_inmem_pages(struct inode *inode, fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; - fio.need_lock = false, + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { unlock_page(page); @@ -328,8 +329,7 @@ static int __commit_inmem_pages(struct inode *inode, } if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(sbi, inode, 0, last_idx, - DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -555,6 +555,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; goto init_thread; } @@ -566,6 +568,9 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); @@ -736,12 +741,15 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + f2fs_bug_on(sbi, dc->ref); + if (dc->error == -EOPNOTSUPP) dc->error = 0; if 
(dc->error) f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", dc->error); + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); __detach_discard_cmd(dcc, dc); } @@ -749,12 +757,36 @@ static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - dc->error = bio->bi_error; + dc->error = blk_status_to_errno(bio->bi_status); dc->state = D_DONE; - complete(&dc->wait); + complete_all(&dc->wait); bio_put(bio); } +void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + size = min((unsigned long)(end - blk), max_blocks); + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk += size; + } +#endif +} + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) @@ -782,6 +814,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_opf |= REQ_SYNC; submit_bio(bio); list_move_tail(&dc->list, &dcc->wait_list); + __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); } } else { __remove_discard_cmd(sbi, dc); @@ -838,7 +871,6 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dc->len = blkaddr - dc->lstart; dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); modified = true; } @@ -848,16 +880,12 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr, NULL, 
NULL); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } else { dc->lstart++; dc->len--; dc->start++; dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } } } @@ -918,8 +946,6 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); di = prev_dc->di; tdc = prev_dc; merged = true; @@ -935,16 +961,12 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); merged = true; } if (!merged) { __insert_discard_tree(sbi, bdev, di.lstart, di.start, di.len, NULL, NULL); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } next: prev_dc = next_dc; @@ -983,6 +1005,8 @@ static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) int i, iter = 0; mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { pend_list = &dcc->pend_list[i]; @@ -1000,22 +1024,47 @@ out: mutex_unlock(&dcc->cmd_lock); } +static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); +} + static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; + bool need_wait; + +next: + need_wait = false; 
mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (!wait_cond || dc->state == D_DONE) { - if (dc->ref) - continue; + if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; } } mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + __wait_one_discard_bio(sbi, dc); + goto next; + } } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1037,14 +1086,19 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { - wait_for_completion_io(&dc->wait); - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, dc->state != D_DONE); - dc->ref--; - if (!dc->ref) - __remove_discard_cmd(sbi, dc); - mutex_unlock(&dcc->cmd_lock); + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); } } @@ -1060,18 +1114,24 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; -repeat: - if (kthread_should_stop()) - return 0; - __issue_discard_cmd(sbi, true); - __wait_discard_cmd(sbi, true); + set_freezable(); - congestion_wait(BLK_RW_SYNC, HZ/50); + do { + wait_event_interruptible(*q, kthread_should_stop() || + freezing(current) || + atomic_read(&dcc->discard_cmd_cnt)); + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + return 0; - wait_event_interruptible(*q, kthread_should_stop() || - atomic_read(&dcc->discard_cmd_cnt)); - goto repeat; + __issue_discard_cmd(sbi, true); + __wait_discard_cmd(sbi, true); + + congestion_wait(BLK_RW_SYNC, HZ/50); + } 
while (!kthread_should_stop()); + return 0; } #ifdef CONFIG_BLK_DEV_ZONED @@ -1322,7 +1382,8 @@ find_next: sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (force && len < cpc->trim_minlen) + if (f2fs_sb_mounted_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, @@ -1398,12 +1459,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return; - if (dcc->f2fs_issue_discard) { - struct task_struct *discard_thread = dcc->f2fs_issue_discard; - - dcc->f2fs_issue_discard = NULL; - kthread_stop(discard_thread); - } + stop_discard_thread(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; @@ -2040,66 +2096,80 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(page) && is_cold_node(page)) + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_segment_type_6(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; - if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; if 
(is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; return CURSEG_WARM_DATA; } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +static int __get_segment_type(struct f2fs_io_info *fio) { - switch (F2FS_P_SB(page)->active_logs) { + int type = 0; + + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(page, p_type); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(page, p_type); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(F2FS_P_SB(page), - F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type) + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2135,29 +2205,35 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (page && IS_NODESEG(type)) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + mutex_unlock(&curseg->curseg_mutex); } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - 
int type = __get_segment_type(fio->page, fio->type); + int type = __get_segment_type(fio); int err; - if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type); + &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - err = f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_write(fio); if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; } - - if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2171,13 +2247,14 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, + .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -2296,8 +2373,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, - 0, page->index, type, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); if (ordered) wait_on_page_writeback(page); else @@ -2455,6 +2532,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -2481,6 +2560,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + 
sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } @@ -3203,7 +3287,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sm_info->sit_entry_set); - if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) return err; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 010f336a7573..6b871b492fd5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -27,6 +27,10 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) + #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 83355ec4a92c..32e4c025e97e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -22,6 +22,7 @@ #include <linux/random.h> #include <linux/exportfs.h> #include <linux/blkdev.h> +#include <linux/quotaops.h> #include <linux/f2fs_fs.h> #include <linux/sysfs.h> @@ -35,9 +36,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> -static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; -static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -108,6 +107,8 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_usrquota, + Opt_grpquota, Opt_err, }; @@ -143,212 +144,11 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, {Opt_err, NULL}, }; -/* Sysfs support for f2fs */ -enum { - GC_THREAD, /* struct f2fs_gc_thread */ - SM_INFO, /* struct f2fs_sm_info */ - DCC_INFO, 
/* struct discard_cmd_control */ - NM_INFO, /* struct f2fs_nm_info */ - F2FS_SBI, /* struct f2fs_sb_info */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - FAULT_INFO_RATE, /* struct f2fs_fault_info */ - FAULT_INFO_TYPE, /* struct f2fs_fault_info */ -#endif -}; - -struct f2fs_attr { - struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); - int struct_type; - int offset; -}; - -static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) -{ - if (struct_type == GC_THREAD) - return (unsigned char *)sbi->gc_thread; - else if (struct_type == SM_INFO) - return (unsigned char *)SM_I(sbi); - else if (struct_type == DCC_INFO) - return (unsigned char *)SM_I(sbi)->dcc_info; - else if (struct_type == NM_INFO) - return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) - return (unsigned char *)sbi; -#ifdef CONFIG_F2FS_FAULT_INJECTION - else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; -#endif - return NULL; -} - -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); -} - -static ssize_t f2fs_sbi_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - unsigned char *ptr = NULL; - unsigned int *ui; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned char *ptr; - unsigned long t; - unsigned int *ui; - ssize_t ret; - - ptr = 
__struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret < 0) - return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) - return -EINVAL; -#endif - *ui = t; - return count; -} - -static ssize_t f2fs_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->store ? a->store(a, sbi, buf, len) : 0; -} - -static void f2fs_sb_release(struct kobject *kobj) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .struct_type = _struct_type, \ - .offset = _offset \ -} - -#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ - F2FS_ATTR_OFFSET(struct_type, name, 0644, \ - f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) - -#define F2FS_GENERAL_RO_ATTR(name) \ -static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) - -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, 
f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -#ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); -#endif -F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); - -#define ATTR_LIST(name) (&f2fs_attr_##name.attr) -static struct attribute *f2fs_attrs[] = { - ATTR_LIST(gc_min_sleep_time), - ATTR_LIST(gc_max_sleep_time), - ATTR_LIST(gc_no_gc_sleep_time), - ATTR_LIST(gc_idle), - ATTR_LIST(reclaim_segments), - ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), - ATTR_LIST(ipu_policy), - ATTR_LIST(min_ipu_util), - ATTR_LIST(min_fsync_blocks), - ATTR_LIST(min_hot_blocks), - ATTR_LIST(max_victim_search), - ATTR_LIST(dir_level), - ATTR_LIST(ram_thresh), - ATTR_LIST(ra_nid_pages), - ATTR_LIST(dirty_nats_ratio), - ATTR_LIST(cp_interval), - ATTR_LIST(idle_interval), -#ifdef CONFIG_F2FS_FAULT_INJECTION - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), -#endif - 
ATTR_LIST(lifetime_write_kbytes), - NULL, -}; - -static const struct sysfs_ops f2fs_attr_ops = { - .show = f2fs_attr_show, - .store = f2fs_attr_store, -}; - -static struct kobj_type f2fs_ktype = { - .default_attrs = f2fs_attrs, - .sysfs_ops = &f2fs_attr_ops, - .release = f2fs_sb_release, -}; - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -585,6 +385,20 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nolazytime: sb->s_flags &= ~MS_LAZYTIME; break; +#ifdef CONFIG_QUOTA + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; +#else + case Opt_usrquota: + case Opt_grpquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -624,7 +438,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); +#ifdef CONFIG_QUOTA + memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); + fi->i_reserved_quota = 0; +#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; return &fi->vfs_inode; @@ -765,18 +584,13 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } +static void f2fs_quota_off_umount(struct super_block *sb); static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - kobject_del(&sbi->s_kobj); - - stop_gc_thread(sbi); + f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -797,7 +611,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going 
discard commands */ f2fs_wait_discard_bios(sbi); - if (!sbi->discard_blks) { + if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -817,7 +631,7 @@ static void f2fs_put_super(struct super_block *sb) mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -827,8 +641,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + + f2fs_exit_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -838,6 +652,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_device_list(sbi); mempool_destroy(sbi->write_io_dummy); destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); kfree(sbi); } @@ -888,6 +704,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; + u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -898,11 +715,19 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); + buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + sbi->reserved_blocks; - buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = min(buf->f_files - valid_node_count(sbi), - buf->f_bavail); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = 
buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; @@ -980,79 +805,19 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) - seq_puts(seq, ",fault_injection"); + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); #endif return 0; } -static int segment_info_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i; - - seq_puts(seq, "format: segment_type|valid_blocks\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - if ((i % 10) == 0) - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, false)); - if ((i % 10) == 9 || i == (total_segs - 1)) - seq_putc(seq, '\n'); - else - seq_putc(seq, ' '); - } - - return 0; -} - -static int segment_bits_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i, j; - - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, false)); - for (j = 0; j < 
SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, " %.2x", se->cur_valid_map[j]); - seq_putc(seq, '\n'); - } - return 0; -} - -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); - static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ @@ -1089,6 +854,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; @@ -1102,6 +868,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * need to restore them. 
*/ org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; /* recover superblocks we couldn't write due to previous RO mount */ @@ -1113,7 +880,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sbi->mount_opt.opt = 0; default_options(sbi); /* parse mount options */ @@ -1128,6 +894,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else { + /* dquot_resume needs RW */ + sb->s_flags &= ~MS_RDONLY; + dquot_resume(sb, -1); + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1192,12 +968,237 @@ restore_gc: restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; #ifdef CONFIG_F2FS_FAULT_INJECTION sbi->fault_info = ffi; #endif return err; } +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_mapping_page(mapping, blkidx, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + 
f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); + + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, NULL); + if (unlikely(err)) + break; + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, NULL); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return err; + inode->i_version++; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. 
+ */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, -1); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + f2fs_quota_sync(sb, -1); + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +static void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + f2fs_quota_off(sb, type); +} + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .get_next_id = 
dquot_get_next_id, +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +static inline void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + static struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, @@ -1205,6 +1206,11 @@ static struct super_operations f2fs_sops = { .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, @@ -1521,6 +1527,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1542,6 +1550,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1552,7 +1574,7 @@ int 
sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; + int i, j; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1584,8 +1606,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - mutex_init(&sbi->wio_mutex[NODE]); - mutex_init(&sbi->wio_mutex[DATA]); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); } @@ -1908,6 +1931,7 @@ try_onemore: if (f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; goto free_sb_buf; } #endif @@ -1929,6 +1953,12 @@ try_onemore: sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + sb->s_op = &f2fs_sops; sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; @@ -1937,7 +1967,7 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); - memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; @@ -1950,13 +1980,24 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->read_io.io_rwsem); - sbi->read_io.sbi = sbi; - sbi->read_io.bio = NULL; for (i = 0; i < NR_PAGE_TYPE; i++) { - init_rwsem(&sbi->write_io[i].io_rwsem); - sbi->write_io[i].sbi = sbi; - sbi->write_io[i].bio = NULL; + int n = (i == META) ? 
1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); + if (!sbi->write_io[i]) { + err = -ENOMEM; + goto free_options; + } + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + } } init_rwsem(&sbi->cp_rwsem); @@ -1970,8 +2011,10 @@ try_onemore: if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); - if (!sbi->write_io_dummy) + if (!sbi->write_io_dummy) { + err = -ENOMEM; goto free_options; + } } /* get an inode for meta space */ @@ -2003,6 +2046,7 @@ try_onemore: sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -2078,22 +2122,9 @@ try_onemore: goto free_root_inode; } - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); + err = f2fs_init_sysfs(sbi); if (err) - goto free_proc; + goto free_root_inode; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2104,7 +2135,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_kobj; + goto free_sysfs; } if (need_fsck) @@ -2118,7 +2149,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_kobj; + goto 
free_sysfs; } } else { err = recover_fsync_data(sbi, true); @@ -2127,7 +2158,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_kobj; + goto free_sysfs; } } skip_recovery: @@ -2142,7 +2173,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_kobj; + goto free_sysfs; } kfree(options); @@ -2160,17 +2191,9 @@ skip_recovery: f2fs_update_time(sbi, REQ_TIME); return 0; -free_kobj: +free_sysfs: f2fs_sync_inode_meta(sbi); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); -free_proc: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } + f2fs_exit_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2202,6 +2225,8 @@ free_meta_inode: free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_options: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); destroy_percpu_info(sbi); kfree(options); free_sb_buf: @@ -2228,8 +2253,11 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) + if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + stop_gc_thread(F2FS_SB(sb)); + stop_discard_thread(F2FS_SB(sb)); + } kill_block_super(sb); } @@ -2283,30 +2311,26 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) { - err = -ENOMEM; + err = f2fs_register_sysfs(); + if (err) goto free_extent_cache; - } err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_kset; - + goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; err = f2fs_create_root_stats(); if (err) goto free_filesystem; - f2fs_proc_root = 
proc_mkdir("fs/f2fs", NULL); return 0; free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_kset: - kset_unregister(f2fs_kset); +free_sysfs: + f2fs_unregister_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2323,11 +2347,10 @@ fail: static void __exit exit_f2fs_fs(void) { - remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - kset_unregister(f2fs_kset); + f2fs_unregister_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..9adc202fcd6f --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,364 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu <chao@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/proc_fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; +static struct kset *f2fs_kset; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif + return NULL; +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + 
ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if ((unsigned long)sbi->total_valid_block_count + t > + (unsigned long)sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + spin_unlock(&sbi->stat_lock); + return count; + } + *ui = t; + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, 
f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_blocks), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, 
"%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + +int __init f2fs_register_sysfs(void) +{ + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) + return -ENOMEM; + return 0; +} + +void f2fs_unregister_sysfs(void) +{ + kset_unregister(f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); +} + +int f2fs_init_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + 
&f2fs_seq_segment_bits_fops, sb); + } + + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); + if (err) + goto err_out; + return 0; +err_out: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + return err; +} + +void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) +{ + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } +} diff --git a/fs/fcntl.c b/fs/fcntl.c index f4e7267d117f..3b01b646e528 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -109,20 +109,34 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, } EXPORT_SYMBOL(__f_setown); -void f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, unsigned long arg, int force) { enum pid_type type; - struct pid *pid; - int who = arg; + struct pid *pid = NULL; + int who = arg, ret = 0; + type = PIDTYPE_PID; if (who < 0) { + /* avoid overflow below */ + if (who == INT_MIN) + return -EINVAL; + type = PIDTYPE_PGID; who = -who; } + rcu_read_lock(); - pid = find_vpid(who); - __f_setown(filp, pid, type, force); + if (who) { + pid = find_vpid(who); + if (!pid) + ret = -ESRCH; + } + + if (!ret) + __f_setown(filp, pid, type, force); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL(f_setown); @@ -243,9 +257,72 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif +static bool rw_hint_valid(enum rw_hint hint) +{ + switch (hint) { + case RWF_WRITE_LIFE_NOT_SET: + case RWH_WRITE_LIFE_NONE: + case RWH_WRITE_LIFE_SHORT: + case RWH_WRITE_LIFE_MEDIUM: + case RWH_WRITE_LIFE_LONG: + case RWH_WRITE_LIFE_EXTREME: + 
return true; + default: + return false; + } +} + +static long fcntl_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 *argp = (u64 __user *)arg; + enum rw_hint hint; + u64 h; + + switch (cmd) { + case F_GET_FILE_RW_HINT: + h = file_write_hint(file); + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_FILE_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + spin_lock(&file->f_lock); + file->f_write_hint = hint; + spin_unlock(&file->f_lock); + return 0; + case F_GET_RW_HINT: + h = inode->i_write_hint; + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + inode_lock(inode); + inode->i_write_hint = hint; + inode_unlock(inode); + return 0; + default: + return -EINVAL; + } +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { + void __user *argp = (void __user *)arg; + struct flock flock; long err = -EINVAL; switch (cmd) { @@ -273,7 +350,11 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_OFD_GETLK: #endif case F_GETLK: - err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); + if (copy_from_user(&flock, argp, sizeof(flock))) + return -EFAULT; + err = fcntl_getlk(filp, cmd, &flock); + if (!err && copy_to_user(argp, &flock, sizeof(flock))) + return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -283,7 +364,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, /* Fallthrough */ case F_SETLK: case F_SETLKW: - err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); + if (copy_from_user(&flock, argp, sizeof(flock))) + return -EFAULT; + err = fcntl_setlk(fd, filp, cmd, &flock); 
break; case F_GETOWN: /* @@ -297,8 +380,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - f_setown(filp, arg, 1); - err = 0; + err = f_setown(filp, arg, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -337,6 +419,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GET_SEALS: err = shmem_fcntl(filp, cmd, arg); break; + case F_GET_RW_HINT: + case F_SET_RW_HINT: + case F_GET_FILE_RW_HINT: + case F_SET_FILE_RW_HINT: + err = fcntl_rw_hint(filp, cmd, arg); + break; default: break; } @@ -383,7 +471,9 @@ out: SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { + void __user *argp = (void __user *)arg; struct fd f = fdget_raw(fd); + struct flock64 flock; long err = -EBADF; if (!f.file) @@ -401,14 +491,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, switch (cmd) { case F_GETLK64: case F_OFD_GETLK: - err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); + err = -EFAULT; + if (copy_from_user(&flock, argp, sizeof(flock))) + break; + err = fcntl_getlk64(f.file, cmd, &flock); + if (!err && copy_to_user(argp, &flock, sizeof(flock))) + err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: - err = fcntl_setlk64(fd, f.file, cmd, - (struct flock64 __user *) arg); + err = -EFAULT; + if (copy_from_user(&flock, argp, sizeof(flock))) + break; + err = fcntl_setlk64(fd, f.file, cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, f.file); @@ -422,57 +519,56 @@ out: #endif #ifdef CONFIG_COMPAT -static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) -{ - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) +/* careful - don't use 
anywhere else */ +#define copy_flock_fields(dst, src) \ + (dst)->l_type = (src)->l_type; \ + (dst)->l_whence = (src)->l_whence; \ + (dst)->l_start = (src)->l_start; \ + (dst)->l_len = (src)->l_len; \ + (dst)->l_pid = (src)->l_pid; + +static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl) +{ + struct compat_flock fl; + + if (copy_from_user(&fl, ufl, sizeof(struct compat_flock))) return -EFAULT; + copy_flock_fields(kfl, &fl); return 0; } -static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl) { - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock64 fl; + + if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64))) return -EFAULT; + copy_flock_fields(kfl, &fl); return 0; } -#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 -static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl) { - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock fl; + + memset(&fl, 0, sizeof(struct compat_flock)); + copy_flock_fields(&fl, kfl); + if (copy_to_user(ufl, &fl, sizeof(struct compat_flock))) return -EFAULT; return 0; } -#endif -#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 -static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl) { - if (!access_ok(VERIFY_WRITE, ufl, 
sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock64 fl; + + memset(&fl, 0, sizeof(struct compat_flock64)); + copy_flock_fields(&fl, kfl); + if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) return -EFAULT; return 0; } -#endif +#undef copy_flock_fields static unsigned int convert_fcntl_cmd(unsigned int cmd) @@ -489,76 +585,92 @@ convert_fcntl_cmd(unsigned int cmd) return cmd; } +/* + * GETLK was successful and we need to return the data, but it needs to fit in + * the compat structure. + * l_start shouldn't be too big, unless the original start + end is greater than + * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return + * -EOVERFLOW in that case. l_len could be too big, in which case we just + * truncate it, and only allow the app to see that part of the conflicting lock + * that might make sense to it anyway + */ +static int fixup_compat_flock(struct flock *flock) +{ + if (flock->l_start > COMPAT_OFF_T_MAX) + return -EOVERFLOW; + if (flock->l_len > COMPAT_OFF_T_MAX) + flock->l_len = COMPAT_OFF_T_MAX; + return 0; +} + COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { - mm_segment_t old_fs; - struct flock f; - long ret; - unsigned int conv_cmd; + struct fd f = fdget_raw(fd); + struct flock flock; + long err = -EBADF; + + if (!f.file) + return err; + + if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) + goto out_put; + } + + err = security_file_fcntl(f.file, cmd, arg); + if (err) + goto out_put; switch (cmd) { case F_GETLK: + err = get_compat_flock(&flock, compat_ptr(arg)); + if (err) + break; + err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + if (err) + break; + err = fixup_compat_flock(&flock); + if (err) + break; /* exit via out_put so fdput(f) runs */ + err = put_compat_flock(&flock, 
compat_ptr(arg)); + break; + case F_GETLK64: + case F_OFD_GETLK: + err = get_compat_flock64(&flock, compat_ptr(arg)); + if (err) + break; + err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + if (err) + break; + err = fixup_compat_flock(&flock); + if (err) + break; /* exit via out_put so fdput(f) runs */ + err = put_compat_flock64(&flock, compat_ptr(arg)); + break; case F_SETLK: case F_SETLKW: - ret = get_compat_flock(&f, compat_ptr(arg)); - if (ret != 0) + err = get_compat_flock(&flock, compat_ptr(arg)); + if (err) break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_fcntl(fd, cmd, (unsigned long)&f); - set_fs(old_fs); - if (cmd == F_GETLK && ret == 0) { - /* GETLK was successful and we need to return the data... - * but it needs to fit in the compat structure. - * l_start shouldn't be too big, unless the original - * start + end is greater than COMPAT_OFF_T_MAX, in which - * case the app was asking for trouble, so we return - * -EOVERFLOW in that case. - * l_len could be too big, in which case we just truncate it, - * and only allow the app to see that part of the conflicting - * lock that might make sense to it anyway - */ - - if (f.l_start > COMPAT_OFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_OFF_T_MAX) - f.l_len = COMPAT_OFF_T_MAX; - if (ret == 0) - ret = put_compat_flock(&f, compat_ptr(arg)); - } + err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); break; - - case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: - ret = get_compat_flock64(&f, compat_ptr(arg)); - if (ret != 0) + err = get_compat_flock64(&flock, compat_ptr(arg)); + if (err) break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - conv_cmd = convert_fcntl_cmd(cmd); - ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); - set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { - /* need to return lock information - see above for commentary */ - if (f.l_start > COMPAT_LOFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > 
COMPAT_LOFF_T_MAX) - f.l_len = COMPAT_LOFF_T_MAX; - if (ret == 0) - ret = put_compat_flock64(&f, compat_ptr(arg)); - } + err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); break; - default: - ret = sys_fcntl(fd, cmd, arg); + err = do_fcntl(fd, cmd, arg, f.file); break; } - return ret; +out_put: + fdput(f); + return err; } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, diff --git a/fs/file.c b/fs/file.c index 1c2972e3a405..1fc7fbbb4510 100644 --- a/fs/file.c +++ b/fs/file.c @@ -30,21 +30,6 @@ unsigned int sysctl_nr_open_min = BITS_PER_LONG; unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; -static void *alloc_fdmem(size_t size) -{ - /* - * Very large allocations can stress page reclaim, so fall back to - * vmalloc() if the allocation size will be considered "large" by the VM. - */ - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | - __GFP_NOWARN | __GFP_NORETRY); - if (data != NULL) - return data; - } - return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); -} - static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); @@ -131,13 +116,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (!fdt) goto out; fdt->max_fds = nr; - data = alloc_fdmem(nr * sizeof(struct file *)); + data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; - data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); + data = kvmalloc(max_t(size_t, + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), + GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; diff --git a/fs/file_table.c b/fs/file_table.c index 954d510b765a..72e861a35a7f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -168,6 +168,7 @@ struct file *alloc_file(const struct path *path, fmode_t mode, file->f_path = *path; file->f_inode = path->dentry->d_inode; 
file->f_mapping = path->dentry->d_inode->i_mapping; + file->f_wb_err = filemap_sample_wb_err(file->f_mapping); if ((mode & FMODE_READ) && likely(fop->read || fop->read_iter)) mode |= FMODE_CAN_READ; diff --git a/fs/filesystems.c b/fs/filesystems.c index cac75547d35c..8b99955e3504 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -275,8 +275,10 @@ struct file_system_type *get_fs_type(const char *name) int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); - if (!fs && (request_module("fs-%.*s", len, name) == 0)) + if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); + WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name); + } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 63ee2940775c..8b426f83909f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2052,11 +2052,13 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) } /** - * __mark_inode_dirty - internal function - * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. + * __mark_inode_dirty - internal function + * + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. * * Put the inode on the super block's dirty list. 
* diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 611b5408f6ec..e747b3d720ee 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) void pin_kill(struct fs_pin *p) { - wait_queue_t wait; + wait_queue_entry_t wait; if (!p) { rcu_read_unlock(); @@ -61,7 +61,7 @@ void pin_kill(struct fs_pin *p) rcu_read_unlock(); schedule(); rcu_read_lock(); - if (likely(list_empty(&wait.task_list))) + if (likely(list_empty(&wait.entry))) break; /* OK, we know p couldn't have been freed yet */ spin_lock_irq(&p->wait.lock); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 4d810be532dd..9fa3aef9a5b3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -970,7 +970,7 @@ more_rgrps: continue; bn = be64_to_cpu(*p); if (gfs2_holder_initialized(rd_gh)) { - rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object; + rgd = gfs2_glock2rgrp(rd_gh->gh_gl); gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); } else { diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 79113219be5f..db427658ccd9 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1444,7 +1444,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, "g.offset (%u)\n", (unsigned long long)bh->b_blocknr, entries2, g.offset); - + gfs2_consist_inode(ip); error = -EIO; goto out_free; } @@ -1612,6 +1612,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, (unsigned long long)dip->i_no_addr, dip->i_entries, g.offset); + gfs2_consist_inode(dip); error = -EIO; goto out; } @@ -2031,8 +2032,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); + rg_blocks += rgd->rd_length; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 959a19ced4d5..c38ab6c81898 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ 
-80,9 +80,9 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; -void gfs2_glock_free(struct gfs2_glock *gl) +static void gfs2_glock_dealloc(struct rcu_head *rcu) { - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); @@ -90,6 +90,13 @@ void gfs2_glock_free(struct gfs2_glock *gl) kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(gfs2_glock_cachep, gl); } +} + +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); } @@ -152,20 +159,34 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) spin_unlock(&lru_lock); } -/** - * gfs2_glock_put() - Decrement reference count on glock - * @gl: The glock to put - * +/* + * Enqueue the glock on the work queue. Passes one glock reference on to the + * work queue. */ +static void __gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { + if (!queue_delayed_work(glock_workqueue, &gl->gl_work, delay)) { + /* + * We are holding the lockref spinlock, and the work was still + * queued above. The queued work (glock_work_func) takes that + * spinlock before dropping its glock reference(s), so it + * cannot have dropped them in the meantime. 
+ */ + GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2); + gl->gl_lockref.count--; + } +} -void gfs2_glock_put(struct gfs2_glock *gl) +static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { + spin_lock(&gl->gl_lockref.lock); + __gfs2_glock_queue_work(gl, delay); + spin_unlock(&gl->gl_lockref.lock); +} + +static void __gfs2_glock_put(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); - if (lockref_put_or_lock(&gl->gl_lockref)) - return; - lockref_mark_dead(&gl->gl_lockref); gfs2_glock_remove_from_lru(gl); @@ -178,6 +199,20 @@ void gfs2_glock_put(struct gfs2_glock *gl) } /** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +void gfs2_glock_put(struct gfs2_glock *gl) +{ + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + __gfs2_glock_put(gl); +} + +/** * may_grant - check if its ok to grant a new lock * @gl: The glock * @gh: The lock request which we wish to grant @@ -482,8 +517,7 @@ __acquires(&gl->gl_lockref.lock) target == LM_ST_UNLOCKED && test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) { finish_xmote(gl, target); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_queue_work(gl, 0); } else if (ret) { pr_err("lm_lock ret %d\n", ret); @@ -492,8 +526,7 @@ __acquires(&gl->gl_lockref.lock) } } else { /* lock_nolock */ finish_xmote(gl, target); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_queue_work(gl, 0); } spin_lock(&gl->gl_lockref.lock); @@ -565,8 +598,7 @@ out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_atomic(); gl->gl_lockref.count++; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); return; out_unlock: @@ -601,11 +633,11 @@ static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; struct gfs2_glock *gl = 
container_of(work, struct gfs2_glock, gl_work.work); - int drop_ref = 0; + unsigned int drop_refs = 1; if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); - drop_ref = 1; + drop_refs++; } spin_lock(&gl->gl_lockref.lock); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && @@ -623,17 +655,25 @@ static void glock_work_func(struct work_struct *work) } } run_queue(gl, 0); - spin_unlock(&gl->gl_lockref.lock); - if (!delay) - gfs2_glock_put(gl); - else { + if (delay) { + /* Keep one glock reference for the work we requeue. */ + drop_refs--; if (gl->gl_name.ln_type != LM_TYPE_INODE) delay = 0; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); + __gfs2_glock_queue_work(gl, delay); } - if (drop_ref) - gfs2_glock_put(gl); + + /* + * Drop the remaining glock references manually here. (Mind that + * __gfs2_glock_queue_work depends on the lockref spinlock being held + * here as well.) + */ + gl->gl_lockref.count -= drop_refs; + if (!gl->gl_lockref.count) { + __gfs2_glock_put(gl); + return; + } + spin_unlock(&gl->gl_lockref.lock); } /** @@ -986,8 +1026,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gl->gl_lockref.count++; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); } run_queue(gl, 1); spin_unlock(&gl->gl_lockref.lock); @@ -1047,17 +1086,15 @@ void gfs2_glock_dq(struct gfs2_holder *gh) gfs2_glock_add_to_lru(gl); trace_gfs2_glock_queue(gh, 0); + if (unlikely(!fast_path)) { + gl->gl_lockref.count++; + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) + delay = gl->gl_hold_time; + __gfs2_glock_queue_work(gl, delay); + } spin_unlock(&gl->gl_lockref.lock); - if (likely(fast_path)) - return; - - gfs2_glock_hold(gl); - if (test_bit(GLF_PENDING_DEMOTE, 
&gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags) && - gl->gl_name.ln_type == LM_TYPE_INODE) - delay = gl->gl_hold_time; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); } void gfs2_glock_dq_wait(struct gfs2_holder *gh) @@ -1233,9 +1270,8 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) spin_lock(&gl->gl_lockref.lock); handle_callback(gl, state, delay, true); + __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); } /** @@ -1294,10 +1330,8 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gl->gl_lockref.count++; set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); - - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); } static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1355,8 +1389,7 @@ add_back_to_lru: if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); cond_resched_lock(&lru_lock); } @@ -1462,13 +1495,12 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) static void thaw_glock(struct gfs2_glock *gl) { - if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) - goto out; - set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) { -out: + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) { gfs2_glock_put(gl); + return; } + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_queue_work(gl, 0); } /** @@ -1484,9 +1516,8 @@ static void clear_glock(struct gfs2_glock *gl) spin_lock(&gl->gl_lockref.lock); if (gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, 
LM_ST_UNLOCKED, 0, false); + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); } /** diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index ab1ef322f7a5..9ad4a6ac6c84 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -257,4 +257,11 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) return gh->gh_gl; } +static inline void glock_set_object(struct gfs2_glock *gl, void *object) +{ + spin_lock(&gl->gl_lockref.lock); + gl->gl_object = object; + spin_unlock(&gl->gl_lockref.lock); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5db59d444838..5e69636d4dd3 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -137,7 +137,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) * * Called when demoting or unlocking an EX glock. We must flush * to disk all dirty buffers/pages relating to this glock, and must not - * not return to caller to demote/unlock the glock until I/O is complete. + * return to caller to demote/unlock the glock until I/O is complete. 
*/ static void rgrp_go_sync(struct gfs2_glock *gl) @@ -184,7 +184,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; - struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); if (rgd) gfs2_rgrp_brelse(rgd); @@ -197,6 +197,38 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) rgd->rd_flags &= ~GFS2_RDF_UPTODATE; } +static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip) + set_bit(GIF_GLOP_PENDING, &ip->i_flags); + spin_unlock(&gl->gl_lockref.lock); + return ip; +} + +struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(&gl->gl_lockref.lock); + rgd = gl->gl_object; + spin_unlock(&gl->gl_lockref.lock); + + return rgd; +} + +static void gfs2_clear_glop_pending(struct gfs2_inode *ip) +{ + if (!ip) + return; + + clear_bit_unlock(GIF_GLOP_PENDING, &ip->i_flags); + wake_up_bit(&ip->i_flags, GIF_GLOP_PENDING); +} + /** * inode_go_sync - Sync the dirty data and/or metadata for an inode glock * @gl: the glock protecting the inode @@ -205,25 +237,24 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) static void inode_go_sync(struct gfs2_glock *gl) { - struct gfs2_inode *ip = gl->gl_object; + struct gfs2_inode *ip = gfs2_glock2inode(gl); + int isreg = ip && S_ISREG(ip->i_inode.i_mode); struct address_space *metamapping = gfs2_glock2aspace(gl); int error; - if (ip && !S_ISREG(ip->i_inode.i_mode)) - ip = NULL; - if (ip) { + if (isreg) { if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); inode_dio_wait(&ip->i_inode); } if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) - return; + goto out; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH); 
filemap_fdatawrite(metamapping); - if (ip) { + if (isreg) { struct address_space *mapping = ip->i_inode.i_mapping; filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); @@ -238,6 +269,9 @@ static void inode_go_sync(struct gfs2_glock *gl) */ smp_mb__before_atomic(); clear_bit(GLF_DIRTY, &gl->gl_flags); + +out: + gfs2_clear_glop_pending(ip); } /** @@ -253,7 +287,7 @@ static void inode_go_sync(struct gfs2_glock *gl) static void inode_go_inval(struct gfs2_glock *gl, int flags) { - struct gfs2_inode *ip = gl->gl_object; + struct gfs2_inode *ip = gfs2_glock2inode(gl); gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count)); @@ -274,6 +308,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); + + gfs2_clear_glop_pending(ip); } /** @@ -541,7 +577,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl) */ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) { - struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + struct gfs2_inode *ip = gl->gl_object; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b7cf65d13561..73fce76e67ee 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -336,7 +336,6 @@ enum { }; struct gfs2_glock { - struct hlist_bl_node gl_list; unsigned long gl_flags; /* GLF_... 
*/ struct lm_lockname gl_name; @@ -374,6 +373,7 @@ struct gfs2_glock { loff_t end; } gl_vm; }; + struct rcu_head gl_rcu; struct rhash_head gl_node; }; @@ -386,6 +386,7 @@ enum { GIF_SW_PAGED = 3, GIF_ORDERED = 4, GIF_FREE_VFS_INODE = 5, + GIF_GLOP_PENDING = 6, }; struct gfs2_inode { @@ -815,13 +816,11 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; - int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; unsigned int sd_log_flush_head; - u64 sd_log_flush_wrapped; spinlock_t sd_ail_lock; struct list_head sd_ail1_list; @@ -858,5 +857,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) preempt_enable(); } +extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); + #endif /* __INCORE_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9f605ea4810c..acca501f8110 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -144,7 +144,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) goto fail; - ip->i_gl->gl_object = ip; + flush_delayed_work(&ip->i_gl->gl_work); + glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) @@ -173,8 +174,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_put; - - ip->i_iopen_gh.gh_gl->gl_object = ip; + flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work); + glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -201,14 +202,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - ip->i_iopen_gh.gh_gl->gl_object = NULL; + glock_set_object(ip->i_iopen_gh.gh_gl, NULL); 
gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - ip->i_gl->gl_object = NULL; + glock_set_object(ip->i_gl, NULL); fail: iget_failed(inode); return ERR_PTR(error); @@ -607,6 +608,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; + gfs2_holder_mark_uninitialized(ghs + 1); error = create_ok(dip, name, mode); if (error) @@ -705,7 +707,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; - ip->i_gl->gl_object = ip; + glock_set_object(ip->i_gl, ip); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; @@ -731,7 +733,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; - ip->i_iopen_gh.gh_gl->gl_object = ip; + glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -778,7 +780,6 @@ fail_gunlock3: fail_gunlock2: if (io_gl) clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); - gfs2_glock_dq_uninit(ghs + 1); fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); @@ -799,6 +800,8 @@ fail_gunlock: &GFS2_I(inode)->i_flags); iput(inode); } + if (gfs2_holder_initialized(ghs + 1)) + gfs2_glock_dq_uninit(ghs + 1); fail: return error; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index d2955daf17a4..9a624f694400 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -722,7 +722,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; tr = sdp->sd_log_tr; if (tr) { sdp->sd_log_tr = NULL; @@ -775,7 +774,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, } atomic_dec(&sdp->sd_log_blks_free); /* Adjust for 
unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); - sdp->sd_log_flush_wrapped = 0; log_write_header(sdp, 0); sdp->sd_log_head = sdp->sd_log_flush_head; } @@ -880,7 +878,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index b1f9144b42c7..3010f9edd177 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -71,7 +71,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) { struct gfs2_glock *gl = bd->bd_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; struct gfs2_bitmap *bi = rgd->rd_bits + index; @@ -134,10 +134,8 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp) BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && (sdp->sd_log_flush_head != sdp->sd_log_head)); - if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) sdp->sd_log_flush_head = 0; - sdp->sd_log_flush_wrapped = 1; - } } static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) @@ -170,7 +168,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) */ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, - int error) + blk_status_t error) { struct buffer_head *bh, *next; struct page *page = bvec->bv_page; @@ -182,7 +180,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, bh = bh->b_this_page; do { if (error) - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); unlock_buffer(bh); next = bh->b_this_page; size -= bh->b_size; @@ -209,15 +207,13 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_error) { - sdp->sd_log_error = bio->bi_error; - fs_err(sdp, "Error %d 
writing to log\n", bio->bi_error); - } + if (bio->bi_status) + fs_err(sdp, "Error %d writing to log\n", bio->bi_status); bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_error); + gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); else mempool_free(page, gfs2_page_pool); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 67d1fc4668f7..0a89e6f7a314 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -52,7 +52,6 @@ static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - INIT_HLIST_BL_NODE(&gl->gl_list); spin_lock_init(&gl->gl_lockref.lock); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 663ffc135ef3..fabe1614f879 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,7 +201,7 @@ static void gfs2_meta_read_endio(struct bio *bio) do { struct buffer_head *next = bh->b_this_page; len -= bh->b_size; - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bh = next; } while (bh && len); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ed67548b286c..e76058d34b74 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -176,10 +176,10 @@ static void end_bio_io_page(struct bio *bio) { struct page *page = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) SetPageUptodate(page); else - pr_warn("error %d reading superblock\n", bio->bi_error); + pr_warn("error %d reading superblock\n", bio->bi_status); unlock_page(page); } @@ -203,7 +203,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); - memcpy(s->s_uuid, str->sb_uuid, 16); + memcpy(&s->s_uuid, str->sb_uuid, 16); } /** diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 83c9909ff14a..836e38ba5d0a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -705,9 
+705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - spin_lock(&gl->gl_lockref.lock); - gl->gl_object = NULL; - spin_unlock(&gl->gl_lockref.lock); + glock_set_object(gl, NULL); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } @@ -917,7 +915,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) error = rgd_insert(rgd); spin_unlock(&sdp->sd_rindex_spin); if (!error) { - rgd->rd_gl->gl_object = rgd; + glock_set_object(rgd->rd_gl, rgd); rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 29b0473f6e74..fdedec379b78 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1105,9 +1105,12 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host gfs2_holder_uninit(gh); error = err; } else { - if (!error) - error = statfs_slow_fill( - gh->gh_gl->gl_object, sc); + if (!error) { + struct gfs2_rgrpd *rgd = + gfs2_glock2rgrp(gh->gh_gl); + + error = statfs_slow_fill(rgd, sc); + } gfs2_glock_dq_uninit(gh); } } @@ -1535,6 +1538,12 @@ static void gfs2_evict_inode(struct inode *inode) if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) goto out; + if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); + gfs2_holder_mark_uninitialized(&gh); + goto alloc_failed; + } + /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { @@ -1543,11 +1552,9 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } - if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; - } + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; if (test_bit(GIF_INVALID, &ip->i_flags)) { error = 
gfs2_inode_refresh(ip); @@ -1555,6 +1562,7 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } +alloc_failed: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; @@ -1621,7 +1629,8 @@ out_unlock: } gfs2_holder_uninit(&ip->i_iopen_gh); } - gfs2_glock_dq_uninit(&gh); + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: @@ -1631,13 +1640,13 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - ip->i_gl->gl_object = NULL; - flush_delayed_work(&ip->i_gl->gl_work); + glock_set_object(ip->i_gl, NULL); + wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - ip->i_iopen_gh.gh_gl->gl_object = NULL; + glock_set_object(ip->i_iopen_gh.gh_gl, NULL); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7a515345610c..ca1f97ff898c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -71,25 +71,14 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); } -static int gfs2_uuid_valid(const u8 *uuid) -{ - int i; - - for (i = 0; i < 16; i++) { - if (uuid[i]) - return 1; - } - return 0; -} - static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) { struct super_block *s = sdp->sd_vfs; - const u8 *uuid = s->s_uuid; + buf[0] = '\0'; - if (!gfs2_uuid_valid(uuid)) + if (uuid_is_null(&s->s_uuid)) return 0; - return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); + return snprintf(buf, PAGE_SIZE, "%pUB\n", &s->s_uuid); } static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) @@ -637,12 +626,12 @@ static struct attribute *tune_attrs[] = { NULL, }; -static struct 
attribute_group tune_group = { +static const struct attribute_group tune_group = { .name = "tune", .attrs = tune_attrs, }; -static struct attribute_group lock_module_group = { +static const struct attribute_group lock_module_group = { .name = "lock_module", .attrs = lock_module_attrs, }; @@ -712,14 +701,13 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); struct super_block *s = sdp->sd_vfs; - const u8 *uuid = s->s_uuid; add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); - if (gfs2_uuid_valid(uuid)) - add_uevent_var(env, "UUID=%pUB", uuid); + if (!uuid_is_null(&s->s_uuid)) + add_uevent_var(env, "UUID=%pUB", &s->s_uuid); return 0; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index d87721aeb575..54179554c7d2 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1327,8 +1327,8 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); + rg_blocks += rgd->rd_length; } diff --git a/fs/inode.c b/fs/inode.c index db5914783a71..50370599e371 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; + inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; @@ -1891,11 +1892,11 @@ static void __wait_on_freeing_inode(struct inode *inode) wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wait, 
TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); spin_lock(&inode_hash_lock); } @@ -1914,8 +1915,6 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -1927,20 +1926,15 @@ void __init inode_init_early(void) sizeof(struct hlist_head), ihash_entries, 14, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - unsigned int loop; - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1958,14 +1952,11 @@ void __init inode_init(void) sizeof(struct hlist_head), ihash_entries, 14, - 0, + HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) @@ -2023,7 +2014,7 @@ bool inode_owner_or_capable(const struct inode *inode) return true; ns = current_user_ns(); - if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) + if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } @@ -2038,11 +2029,11 @@ static void __inode_dio_wait(struct inode *inode) DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); do { - prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); if (atomic_read(&inode->i_dio_count)) schedule(); } while (atomic_read(&inode->i_dio_count)); - finish_wait(wq, &q.wait); + finish_wait(wq, &q.wq_entry); } /** diff --git a/fs/iomap.c b/fs/iomap.c index 
4b10892967a5..173222863aca 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -584,6 +584,100 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } EXPORT_SYMBOL_GPL(iomap_fiemap); +static loff_t +iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, + void *data, struct iomap *iomap) +{ + switch (iomap->type) { + case IOMAP_UNWRITTEN: + offset = page_cache_seek_hole_data(inode, offset, length, + SEEK_HOLE); + if (offset < 0) + return length; + /* fall through */ + case IOMAP_HOLE: + *(loff_t *)data = offset; + return 0; + default: + return length; + } +} + +loff_t +iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +{ + loff_t size = i_size_read(inode); + loff_t length = size - offset; + loff_t ret; + + /* Nothing to be found beyond the end of the file. */ + if (offset >= size) + return -ENXIO; + + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, + &offset, iomap_seek_hole_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + offset += ret; + length -= ret; + } + + return offset; +} +EXPORT_SYMBOL_GPL(iomap_seek_hole); + +static loff_t +iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, + void *data, struct iomap *iomap) +{ + switch (iomap->type) { + case IOMAP_HOLE: + return length; + case IOMAP_UNWRITTEN: + offset = page_cache_seek_hole_data(inode, offset, length, + SEEK_DATA); + if (offset < 0) + return length; + /*FALLTHRU*/ + default: + *(loff_t *)data = offset; + return 0; + } +} + +loff_t +iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +{ + loff_t size = i_size_read(inode); + loff_t length = size - offset; + loff_t ret; + + /* Nothing to be found beyond the end of the file. 
*/ + if (offset >= size) + return -ENXIO; + + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, + &offset, iomap_seek_data_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + offset += ret; + length -= ret; + } + + if (length <= 0) + return -ENXIO; + return offset; +} +EXPORT_SYMBOL_GPL(iomap_seek_data); + /* * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: @@ -672,8 +766,8 @@ static void iomap_dio_bio_end_io(struct bio *bio) struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); - if (bio->bi_error) - iomap_dio_set_error(dio, bio->bi_error); + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); if (atomic_dec_and_test(&dio->ref)) { if (is_sync_kiocb(dio->iocb)) { @@ -793,6 +887,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_bdev = iomap->bdev; bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -881,6 +976,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, flags |= IOMAP_WRITE; } + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, start, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + flags |= IOMAP_NOWAIT; + } + ret = filemap_write_and_wait_range(mapping, start, end); if (ret) goto out_free_dio; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b6b194ec1b4f..3c1c31321d9b 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -263,18 +263,10 @@ static int journal_finish_inode_data_buffers(journal_t *journal, continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); - err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); - if (err) { - /* - * Because AS_EIO is cleared by - * filemap_fdatawait_range(), set it again so - * that user process can get -EIO from fsync(). 
- */ - mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO); - - if (!ret) - ret = err; - } + err = filemap_fdatawait_keep_errors( + jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; spin_lock(&journal->j_list_lock); jinode->i_flags &= ~JI_COMMIT_RUNNING; smp_mb(); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ebad34266bcf..7d5ef3bf3f3e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2579,10 +2579,10 @@ restart: wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&journal->j_list_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); goto restart; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2d30a6da7013..8b08044b3120 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -409,25 +409,6 @@ static handle_t *new_handle(int nblocks) return handle; } -/** - * handle_t *jbd2_journal_start() - Obtain a new handle. - * @journal: Journal to start transaction on. - * @nblocks: number of block buffer we might modify - * - * We make sure that the transaction can guarantee at least nblocks of - * modified buffers in the log. We block until the log can guarantee - * that much space. Additionally, if rsv_blocks > 0, we also create another - * handle with rsv_blocks reserved blocks in the journal. This handle is - * is stored in h_rsv_handle. It is not attached to any particular transaction - * and thus doesn't block transaction commit. If the caller uses this reserved - * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() - * on the parent handle will dispose the reserved one. Reserved handle has to - * be converted to a normal handle using jbd2_journal_start_reserved() before - * it can be used. 
- * - * Return a pointer to a newly allocated handle, or an ERR_PTR() value - * on failure. - */ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, gfp_t gfp_mask, unsigned int type, unsigned int line_no) @@ -478,6 +459,25 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, EXPORT_SYMBOL(jbd2__journal_start); +/** + * handle_t *jbd2_journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. Additionally, if rsv_blocks > 0, we also create another + * handle with rsv_blocks reserved blocks in the journal. This handle is + * is stored in h_rsv_handle. It is not attached to any particular transaction + * and thus doesn't block transaction commit. If the caller uses this reserved + * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() + * on the parent handle will dispose the reserved one. Reserved handle has to + * be converted to a normal handle using jbd2_journal_start_reserved() before + * it can be used. + * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); @@ -1072,10 +1072,10 @@ out: * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes * - * Returns an error code or 0 on success. + * Returns: error code or 0 on success. * * In full data journalling mode the buffer may be of type BJ_AsyncData, - * because we're write()ing a buffer which is also part of a shared mapping. + * because we're ``write()ing`` a buffer which is also part of a shared mapping. 
*/ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index bb1da1feafeb..a21f0e9eecd4 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2205,7 +2205,7 @@ static void lbmIODone(struct bio *bio) bp->l_flag |= lbmDONE; - if (bio->bi_error) { + if (bio->bi_status) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 489aaa1403e5..65120a471729 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -280,7 +280,7 @@ static void metapage_read_end_io(struct bio *bio) { struct page *page = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_read_end_io: I/O error\n"); SetPageError(page); } @@ -337,7 +337,7 @@ static void metapage_write_end_io(struct bio *bio) BUG_ON(!PagePrivate(page)); - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_write_end_io: I/O error\n"); SetPageError(page); } @@ -664,6 +664,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, INCREMENT(mpStat.pagealloc); mp = alloc_metapage(GFP_NOFS); mp->page = page; + mp->sb = inode->i_sb; mp->flag = 0; mp->xflag = COMMIT_PAGE; mp->count = 1; @@ -711,7 +712,8 @@ void force_metapage(struct metapage *mp) get_page(page); lock_page(page); set_page_dirty(page); - write_one_page(page, 1); + if (write_one_page(page)) + jfs_error(mp->sb, "write_one_page() failed\n"); clear_bit(META_forcewrite, &mp->flag); put_page(page); } @@ -756,7 +758,8 @@ void release_metapage(struct metapage * mp) set_page_dirty(page); if (test_bit(META_sync, &mp->flag)) { clear_bit(META_sync, &mp->flag); - write_one_page(page, 1); + if (write_one_page(page)) + jfs_error(mp->sb, "write_one_page() failed\n"); lock_page(page); /* write_one_page unlocks the page */ } } else if (mp->lsn) /* discard_metapage doesn't remove it */ diff --git a/fs/jfs/jfs_metapage.h 
b/fs/jfs/jfs_metapage.h index a869fb4a20d6..8b0ee514eb84 100644 --- a/fs/jfs/jfs_metapage.h +++ b/fs/jfs/jfs_metapage.h @@ -38,6 +38,7 @@ struct metapage { /* implementation */ struct page *page; + struct super_block *sb; unsigned int logical_size; /* Journal management */ diff --git a/fs/libfs.c b/fs/libfs.c index a04395334bb1..3aabe553fc45 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -974,7 +974,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, int err; int ret; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; @@ -991,6 +991,10 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, out: inode_unlock(inode); + /* check and advance again to catch errors after syncing out buffers */ + err = file_check_and_advance_wb_err(file); + if (ret == 0) + ret = err; return ret; } EXPORT_SYMBOL(__generic_file_fsync); diff --git a/fs/locks.c b/fs/locks.c index af2031a1fcff..afefeb4ad6de 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1858,8 +1858,8 @@ EXPORT_SYMBOL(generic_setlease); * * Call this to establish a lease on the file. The "lease" argument is not * used for F_UNLCK requests and may be NULL. For commands that set or alter - * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set; - * if not, this function will return -ENOLCK (and generate a scary-looking + * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be + * set; if not, this function will return -ENOLCK (and generate a scary-looking * stack trace). * * The "priv" pointer is passed directly to the lm_setup function as-is. It @@ -1972,15 +1972,13 @@ EXPORT_SYMBOL(locks_lock_inode_wait); * @cmd: the type of lock to apply. * * Apply a %FL_FLOCK style lock to an open file descriptor. - * The @cmd can be one of + * The @cmd can be one of: * - * %LOCK_SH -- a shared lock. - * - * %LOCK_EX -- an exclusive lock. 
- * - * %LOCK_UN -- remove an existing lock. - * - * %LOCK_MAND -- a `mandatory' flock. This exists to emulate Windows Share Modes. + * - %LOCK_SH -- a shared lock. + * - %LOCK_EX -- an exclusive lock. + * - %LOCK_UN -- remove an existing lock. + * - %LOCK_MAND -- a 'mandatory' flock. + * This exists to emulate Windows Share Modes. * * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other * processes read and write access respectively. @@ -2086,26 +2084,22 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) +int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock file_lock; - struct flock flock; int error; - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; error = -EINVAL; - if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock_to_posix_lock(filp, &file_lock, &flock); + error = flock_to_posix_lock(filp, &file_lock, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_GETLK; @@ -2117,15 +2111,12 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - flock.l_type = file_lock.fl_type; + flock->l_type = file_lock.fl_type; if (file_lock.fl_type != F_UNLCK) { - error = posix_lock_to_flock(&flock, &file_lock); + error = posix_lock_to_flock(flock, &file_lock); if (error) goto rel_priv; } - error = -EFAULT; - if (!copy_to_user(l, &flock, sizeof(flock))) - error = 0; rel_priv: locks_release_private(&file_lock); out: @@ -2218,26 +2209,16 @@ check_fmode_for_setlk(struct file_lock *fl) * This implements both the F_SETLK and F_SETLKW commands of fcntl(). 
*/ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock __user *l) + struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct flock flock; - struct inode *inode; + struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - inode = locks_inode(filp); - - /* - * This might block, so we do it before checking the inode. - */ - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; - /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ @@ -2246,7 +2227,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } - error = flock_to_posix_lock(filp, file_lock, &flock); + error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2261,7 +2242,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, switch (cmd) { case F_OFD_SETLK: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLK; @@ -2270,7 +2251,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, break; case F_OFD_SETLKW: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLKW; @@ -2315,26 +2296,22 @@ out: /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). 
*/ -int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) +int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock file_lock; - struct flock64 flock; int error; - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; error = -EINVAL; - if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock64_to_posix_lock(filp, &file_lock, &flock); + error = flock64_to_posix_lock(filp, &file_lock, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_GETLK64; @@ -2346,13 +2323,9 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - flock.l_type = file_lock.fl_type; + flock->l_type = file_lock.fl_type; if (file_lock.fl_type != F_UNLCK) - posix_lock_to_flock64(&flock, &file_lock); - - error = -EFAULT; - if (!copy_to_user(l, &flock, sizeof(flock))) - error = 0; + posix_lock_to_flock64(flock, &file_lock); locks_release_private(&file_lock); out: @@ -2363,26 +2336,16 @@ out: * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock64 __user *l) + struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct flock64 flock; - struct inode *inode; + struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - /* - * This might block, so we do it before checking the inode. - */ - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; - - inode = locks_inode(filp); - /* Don't allow mandatory locks on files that may be memory mapped * and shared. 
*/ @@ -2391,7 +2354,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } - error = flock64_to_posix_lock(filp, file_lock, &flock); + error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2406,7 +2369,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, switch (cmd) { case F_OFD_SETLK: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLK64; @@ -2415,7 +2378,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, break; case F_OFD_SETLKW: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLKW64; diff --git a/fs/mbcache.c b/fs/mbcache.c index b19be429d655..d818fd236787 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -10,13 +10,14 @@ /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in - * mb_cache_entry_delete_block()). + * mb_cache_entry_delete()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. - * They use hash of a block contents as a key and block number as a value. - * That's why keys need not be unique (different xattr blocks may end up having - * the same hash). However block number always uniquely identifies a cache - * entry. + * Ext4 also uses it for deduplication of xattr values stored in inodes. + * They use hash of data as a key and provide a value that may represent a + * block or inode number. That's why keys need not be unique (hash of different + * data may be the same). However user provided value always uniquely + * identifies a cache entry. * * We provide functions for creation and removal of entries, search by key, * and a special "delete entry with given key-value pair" operation. 
Fixed @@ -62,15 +63,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, * @cache - cache where the entry should be created * @mask - gfp mask with which the entry should be allocated * @key - key of the entry - * @block - block that contains data - * @reusable - is the block reusable by other inodes? + * @value - value of the entry + * @reusable - is the entry reusable by others? * - * Creates entry in @cache with key @key and records that data is stored in - * block @block. The function returns -EBUSY if entry with the same key - * and for the same block already exists in cache. Otherwise 0 is returned. + * Creates entry in @cache with key @key and value @value. The function returns + * -EBUSY if entry with the same key and value already exists in cache. + * Otherwise 0 is returned. */ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - sector_t block, bool reusable) + u64 value, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; @@ -91,12 +92,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, /* One ref for hash, one ref returned */ atomic_set(&entry->e_refcnt, 1); entry->e_key = key; - entry->e_block = block; + entry->e_value = value; entry->e_reusable = reusable; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { - if (dup->e_key == key && dup->e_block == block) { + if (dup->e_key == key && dup->e_value == value) { hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); return -EBUSY; @@ -187,13 +188,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, EXPORT_SYMBOL(mb_cache_entry_find_next); /* - * mb_cache_entry_get - get a cache entry by block number (and key) + * mb_cache_entry_get - get a cache entry by value (and key) * @cache - cache we work with - * @key - key of block number @block - * @block - block number + * @key - key + * @value - 
value */ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, - sector_t block) + u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; @@ -202,7 +203,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_block == block) { + if (entry->e_key == key && entry->e_value == value) { atomic_inc(&entry->e_refcnt); goto out; } @@ -214,15 +215,14 @@ out: } EXPORT_SYMBOL(mb_cache_entry_get); -/* mb_cache_entry_delete_block - remove information about block from cache +/* mb_cache_entry_delete - remove a cache entry * @cache - cache we work with - * @key - key of block @block - * @block - block number + * @key - key + * @value - value * - * Remove entry from cache @cache with key @key with data stored in @block. + * Remove entry from cache @cache with key @key and value @value. */ -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, - sector_t block) +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; @@ -231,7 +231,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_block == block) { + if (entry->e_key == key && entry->e_value == value) { /* We keep hash list reference to keep entry alive */ hlist_bl_del_init(&entry->e_hash_list); hlist_bl_unlock(head); @@ -248,7 +248,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, } hlist_bl_unlock(head); } -EXPORT_SYMBOL(mb_cache_entry_delete_block); +EXPORT_SYMBOL(mb_cache_entry_delete); /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 
7edc9b395700..baa9721f1299 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -57,7 +57,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 4c57c9af6946..2d1ca08870f7 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -142,7 +142,7 @@ changed: return -EAGAIN; } -static inline int get_block(struct inode * inode, sector_t block, +static int get_block(struct inode * inode, sector_t block, struct buffer_head *bh, int create) { int err = -EIO; diff --git a/fs/mount.h b/fs/mount.h index bf1fda6eed8f..de45d9e76748 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -58,6 +58,7 @@ struct mount { struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ + struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; diff --git a/fs/mpage.c b/fs/mpage.c index baff8f820c29..2e4c41ccb5c9 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,8 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, op_is_write(bio_op(bio)), bio->bi_error); + page_endio(page, op_is_write(bio_op(bio)), + blk_status_to_errno(bio->bi_status)); } bio_put(bio); @@ -344,6 +345,7 @@ confused: * * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be * submitted in the following order: + * * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 * * because the indirect block has to be read to get the mappings of blocks @@ -614,6 +616,7 @@ alloc_new: goto confused; wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; 
} /* diff --git a/fs/namei.c b/fs/namei.c index 6571a5f5112e..e0b46eb0e212 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1008,7 +1008,7 @@ static int may_linkat(struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (inode_owner_or_capable(inode) || safe_hardlink_source(inode)) + if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) return 0; audit_log_link_denied("linkat", link); @@ -4332,6 +4332,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: + * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on @@ -4362,11 +4363,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, { int error; bool is_dir = d_is_dir(old_dentry); - const unsigned char *old_name; struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; + struct name_snapshot old_name; if (source == target) return 0; @@ -4413,7 +4414,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); @@ -4468,14 +4469,14 @@ out: inode_unlock(target); dput(new_dentry); if (!error) { - fsnotify_move(old_dir, new_dir, old_name, is_dir, + fsnotify_move(old_dir, new_dir, old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? 
target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, new_is_dir, NULL, new_dentry); } } - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); return error; } diff --git a/fs/namespace.c b/fs/namespace.c index 5a4438445bf7..81f934b5d571 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); + INIT_LIST_HEAD(&mnt->mnt_umounting); init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; @@ -3238,7 +3239,6 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { - unsigned u; int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), @@ -3247,22 +3247,17 @@ void __init mnt_init(void) mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, - 0, + HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, - 0, + HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - for (u = 0; u <= m_hash_mask; u++) - INIT_HLIST_HEAD(&mount_hashtable[u]); - for (u = 0; u <= mp_hash_mask; u++) - INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - kernfs_init(); err = sysfs_init(); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 0c3905e0542e..6719c0be674d 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_fault *vmf) * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 0ca370d23ddb..d8863a804b15 
100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -188,7 +188,7 @@ static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) @@ -319,7 +319,7 @@ static void bl_end_io_write(struct bio *bio) struct parallel_io *par = bio->bi_private; struct nfs_pgio_header *header = par->data; - if (bio->bi_error) { + if (bio->bi_status) { if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c14758e08d73..390ac9c39c59 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -753,7 +753,6 @@ static void nfs4_callback_free_slot(struct nfs4_session *session, * A single slot, so highest used slotid is either 0 or -1 */ nfs4_free_slot(tbl, slot); - nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32ccd7754f8a..2ac00bf4ecf1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1946,29 +1946,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); -static void -nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data) -{ - struct dentry *old_dentry = data->old_dentry; - struct dentry *new_dentry = data->new_dentry; - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - - nfs_mark_for_revalidate(old_inode); - - switch (task->tk_status) { - case 0: - if (new_inode != NULL) - nfs_drop_nlink(new_inode); - d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, - nfs_save_change_attribute(data->new_dir)); - break; - case -ENOENT: - nfs_dentry_handle_enoent(old_dentry); - } -} - /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -1999,7 +1976,7 @@ int nfs_rename(struct inode *old_dir, struct 
dentry *old_dentry, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL; + struct dentry *dentry = NULL, *rehash = NULL; struct rpc_task *task; int error = -EBUSY; @@ -2022,8 +1999,10 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the * rename, we unhash the dentry in advance. */ - if (!d_unhashed(new_dentry)) + if (!d_unhashed(new_dentry)) { d_drop(new_dentry); + rehash = new_dentry; + } if (d_count(new_dentry) > 2) { int err; @@ -2040,6 +2019,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; + rehash = NULL; new_inode = NULL; } } @@ -2048,8 +2028,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, - nfs_complete_rename); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2059,9 +2038,27 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error == 0) error = task->tk_status; rpc_put_task(task); + nfs_mark_for_revalidate(old_inode); out: + if (rehash) + d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + /* + * The d_move() should be here instead of in an async RPC completion + * handler because we need the proper locks to move the dentry. If + * we're interrupted by a signal, the async RPC completion handler + * should mark the directories for revalidation. + */ + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + /* new dentry created? 
*/ if (dentry) dput(dentry); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 3e24392f2caa..8701d7617964 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -7,6 +7,7 @@ #include <linux/security.h> #include <linux/crc32.h> #include <linux/nfs_page.h> +#include <linux/wait_bit.h> #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c08c46a3b8cd..98b0b662af09 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2589,7 +2589,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, /* Except MODE, it seems harmless of setting twice. */ if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - attrset[1] & FATTR4_WORD1_MODE) + (attrset[1] & FATTR4_WORD1_MODE || + attrset[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_valid &= ~ATTR_MODE; if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) @@ -6372,7 +6373,7 @@ struct nfs4_lock_waiter { }; static int -nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; struct cb_notify_lock_args *cbnl = key; @@ -6415,7 +6416,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) .inode = state->inode, .owner = &owner, .notified = false }; - wait_queue_t wait; + wait_queue_entry_t wait; /* Don't bother with waitqueue if we don't expect a callback */ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) @@ -8416,6 +8417,7 @@ static void nfs4_layoutget_release(void *calldata) size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); + nfs4_sequence_free_slot(&lgp->res.seq_res); nfs4_free_pages(lgp->args.layout.pages, max_pages); pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); @@ -8490,7 +8492,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, 
nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); - nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b34de036501b..cbf82b0d4467 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2134,6 +2134,8 @@ again: put_rpccred(cred); switch (status) { case 0: + case -EINTR: + case -ERESTARTSYS: break; case -ETIMEDOUT: if (clnt->cl_softrtry) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index eceb4eabb064..c5334c0e23a1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2545,10 +2545,25 @@ EXPORT_SYMBOL_GPL(nfs_set_sb_security); int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { + int error; + unsigned long kflags = 0, kflags_out = 0; + /* clone any lsm security options from the parent to the new sb */ if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; - return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + + error = security_sb_clone_mnt_opts(mount_info->cloned->sb, s, kflags, + &kflags_out); + if (error) + return error; + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; + return 0; } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index fb5213afc854..c862c2489df0 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -219,6 +219,9 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, u8 *buf, *d, type, assoc; int error; + if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) + return -EINVAL; + buf = kzalloc(bufflen, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -229,7 +232,6 @@ static int 
nfsd4_scsi_identify_device(struct block_device *bdev, goto out_free_buf; } req = scsi_req(rq); - scsi_req_init(rq); error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); if (error) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index e71f11b1a180..3bc08c394a3f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -486,7 +486,7 @@ secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif static inline int -uuid_parse(char **mesg, char *buf, unsigned char **puuid) +nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { int len; @@ -586,7 +586,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) - err = uuid_parse(&mesg, buf, &exp.ex_uuid); + err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2be32955d7f2..38d0383dc7f9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -911,24 +911,13 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - mm_segment_t oldfs; + struct iov_iter iter; int host_err; - oldfs = get_fs(); - set_fs(KERNEL_DS); - host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0); - set_fs(oldfs); - return nfsd_finish_read(file, count, host_err); -} + iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count); + host_err = vfs_iter_read(file, &iter, &offset, 0); -static __be32 -nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) -{ - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - return nfsd_splice_read(rqstp, file, offset, count); - else - return nfsd_readv(file, offset, vec, vlen, count); + return nfsd_finish_read(file, count, 
host_err); } /* @@ -974,7 +963,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, unsigned long *cnt, int stable) { struct svc_export *exp; - mm_segment_t oldfs; + struct iov_iter iter; __be32 err = 0; int host_err; int use_wgather; @@ -1000,10 +989,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (stable && !use_wgather) flags |= RWF_SYNC; - /* Write the data. */ - oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, flags); - set_fs(oldfs); + iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt); + host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) goto out_nfserr; *cnt = host_err; @@ -1044,7 +1031,12 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ra = nfsd_init_raparms(file); trace_read_opened(rqstp, fhp, offset, vlen); - err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) + err = nfsd_splice_read(rqstp, file, offset, count); + else + err = nfsd_readv(file, offset, vec, vlen, count); + trace_read_io_done(rqstp, fhp, offset, vlen); if (ra) @@ -1464,41 +1456,34 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) { - mm_segment_t oldfs; __be32 err; - int host_err; + const char *link; struct path path; + DEFINE_DELAYED_CALL(done); + int len; err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); - if (err) - goto out; + if (unlikely(err)) + return err; path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - err = nfserr_inval; - if (!d_is_symlink(path.dentry)) - goto out; + if (unlikely(!d_is_symlink(path.dentry))) + return nfserr_inval; touch_atime(&path); - /* N.B. Why does this call need a get_fs()?? 
- * Remove the set_fs and watch the fireworks:-) --okir - */ - oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_readlink(path.dentry, (char __user *)buf, *lenp); - set_fs(oldfs); - - if (host_err < 0) - goto out_nfserr; - *lenp = host_err; - err = 0; -out: - return err; + link = vfs_get_link(path.dentry, &done); + if (IS_ERR(link)) + return nfserrno(PTR_ERR(link)); -out_nfserr: - err = nfserrno(host_err); - goto out; + len = strlen(link); + if (len < *lenp) + *lenp = len; + memcpy(buf, link, *lenp); + do_delayed_call(&done); + return 0; } /* diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6f87b2ac1aeb..e73c86d9855c 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,7 +338,7 @@ static void nilfs_end_bio_write(struct bio *bio) { struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&segbuf->sb_err); bio_put(bio); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index febed1217b3f..70ded52dc1dd 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino) } struct nilfs_segctor_wait_request { - wait_queue_t wq; + wait_queue_entry_t wq; __u32 seq; int err; atomic_t done; @@ -2206,8 +2206,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) unsigned long flags; spin_lock_irqsave(&sci->sc_wait_request.lock, flags); - list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, - wq.task_list) { + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) { if (!atomic_read(&wrq->done) && nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { wrq->err = err; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 01a9f0f007d4..0c4583b61717 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -161,16 +161,20 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask if (unlikely(!fsnotify_inode_watches_children(p_inode))) 
__fsnotify_update_child_dentry_flags(p_inode); else if (p_inode->i_fsnotify_mask & mask) { + struct name_snapshot name; + /* we are notifying a parent so come up with the new mask which * specifies these are events which came from a child. */ mask |= FS_EVENT_ON_CHILD; + take_dentry_name_snapshot(&name, dentry); if (path) ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, - dentry->d_name.name, 0); + name.name, 0); else ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name, 0); + name.name, 0); + release_dentry_name_snapshot(&name); } dput(parent); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0da0332725aa..ffe003982d95 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -516,9 +516,9 @@ static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (bio->bi_error) { - mlog(ML_ERROR, "IO Error %d\n", bio->bi_error); - wc->wc_error = bio->bi_error; + if (bio->bi_status) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_status); + wc->wc_error = blk_status_to_errno(bio->bi_status); } o2hb_bio_wait_dec(wc, 1); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 564c504d6efd..74a21f6695c8 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -426,6 +426,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); + kfree(dummy_sc); return seq_release_private(inode, file); } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 382401d3e88f..1a1e0078ab38 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -136,7 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { - int rc = 0; + int rc = -ESTALE; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct 
ocfs2_find_inode_args args; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 44d178b8d1aa..5bb4a89f9045 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -25,6 +25,8 @@ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H +#include <linux/magic.h> + /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 @@ -56,9 +58,6 @@ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE -/* Filesystem magic number */ -#define OCFS2_SUPER_MAGIC 0x7461636f - /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 820359096c7a..d6c350ba25b9 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -631,7 +631,7 @@ static struct attribute *ocfs2_attrs[] = { NULL, }; -static struct attribute_group ocfs2_attr_group = { +static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ca1646fbcaef..83005f486451 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2062,7 +2062,7 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); - memcpy(sb->s_uuid, di->id2.i_super.s_uuid, + memcpy(&sb->s_uuid, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; diff --git a/fs/open.c b/fs/open.c index cd0c5be8d012..35bb784763a4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -707,6 +707,9 @@ static int do_dentry_open(struct file *f, f->f_inode = inode; f->f_mapping = inode->i_mapping; + /* Ensure that we skip any errors that predate opening of the file */ + f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; @@ -759,6 +762,7 @@ static int 
do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 83b506020718..038d67545d9f 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -46,8 +46,8 @@ static void run_down(struct slot_map *m) spin_lock(&m->q.lock); if (m->c != -1) { for (;;) { - if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail(&m->q, &wait); + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail(&m->q, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (m->c == -1) @@ -84,8 +84,8 @@ static int wait_for_free(struct slot_map *m) do { long n = left, t; - if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail_exclusive(&m->q, &wait); + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail_exclusive(&m->q, &wait); set_current_state(TASK_INTERRUPTIBLE); if (m->c > 0) @@ -108,8 +108,8 @@ static int wait_for_free(struct slot_map *m) left = -EINTR; } while (left > 0); - if (!list_empty(&wait.task_list)) - list_del(&wait.task_list); + if (!list_empty(&wait.entry)) + list_del(&wait.entry); else if (left <= 0 && waitqueue_active(&m->q)) __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL); __set_current_state(TASK_RUNNING); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 7a44533f4bbf..e5869f91b3ab 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -233,7 +233,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } -static struct ovl_fh *ovl_encode_fh(struct dentry *lower, uuid_be *uuid) +static struct ovl_fh *ovl_encode_fh(struct dentry *lower, uuid_t *uuid) { struct ovl_fh *fh; int fh_type, fh_len, dwords; @@ -284,7 +284,6 @@ static int ovl_set_origin(struct dentry *dentry, 
struct dentry *lower, struct dentry *upper) { struct super_block *sb = lower->d_sb; - uuid_be *uuid = (uuid_be *) &sb->s_uuid; const struct ovl_fh *fh = NULL; int err; @@ -294,8 +293,8 @@ static int ovl_set_origin(struct dentry *dentry, struct dentry *lower, * up and a pure upper inode. */ if (sb->s_export_op && sb->s_export_op->fh_to_dentry && - uuid_be_cmp(*uuid, NULL_UUID_BE)) { - fh = ovl_encode_fh(lower, uuid); + !uuid_is_null(&sb->s_uuid)) { + fh = ovl_encode_fh(lower, &sb->s_uuid); if (IS_ERR(fh)) return PTR_ERR(fh); } @@ -330,15 +329,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, .link = link }; - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto out; - err = security_inode_copy_up(dentry, &new_creds); if (err < 0) - goto out1; + goto out; if (new_creds) old_creds = override_creds(new_creds); @@ -362,7 +355,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, } if (err) - goto out2; + goto out; if (S_ISREG(stat->mode)) { struct path upperpath; @@ -398,10 +391,23 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, /* * Store identifier of lower inode in upper inode xattr to * allow lookup of the copy up origin inode. + * + * Don't set origin when we are breaking the association with a lower + * hard link. 
*/ - err = ovl_set_origin(dentry, lowerpath->dentry, temp); - if (err) + if (S_ISDIR(stat->mode) || stat->nlink == 1) { + err = ovl_set_origin(dentry, lowerpath->dentry, temp); + if (err) + goto out_cleanup; + } + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + if (IS_ERR(upper)) { + err = PTR_ERR(upper); + upper = NULL; goto out_cleanup; + } if (tmpfile) err = ovl_do_link(temp, udir, upper, true); @@ -416,17 +422,15 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, /* Restore timestamps on parent (best effort) */ ovl_set_timestamps(upperdir, pstat); -out2: +out: dput(temp); -out1: dput(upper); -out: return err; out_cleanup: if (!tmpfile) ovl_cleanup(wdir, temp); - goto out2; + goto out; } /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index f3136c31e72a..de0d4f742f36 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -135,7 +135,7 @@ static struct dentry *ovl_get_origin(struct dentry *dentry, * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. 
*/ - if (uuid_be_cmp(fh->uuid, *(uuid_be *) &mnt->mnt_sb->s_uuid)) + if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) goto out; origin = exportfs_decode_fh(mnt, (struct fid *)fh->fid, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0623cebeefff..10863b4105fa 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -56,7 +56,7 @@ struct ovl_fh { u8 len; /* size of this header + size of fid */ u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ - uuid_be uuid; /* uuid of filesystem */ + uuid_t uuid; /* uuid of filesystem */ u8 fid[0]; /* file identifier */ } __packed; diff --git a/fs/pnode.c b/fs/pnode.c index 5bc7896d122a..53d411a371ce 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p) return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } +static inline struct mount *last_slave(struct mount *p) +{ + return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); +} + static inline struct mount *next_slave(struct mount *p) { return list_entry(p->mnt_slave.next, struct mount, mnt_slave); @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m, } } +static struct mount *skip_propagation_subtree(struct mount *m, + struct mount *origin) +{ + /* + * Advance m such that propagation_next will not return + * the slaves of m. + */ + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + m = last_slave(m); + + return m; +} + static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { @@ -413,65 +431,104 @@ void propagate_mount_unlock(struct mount *mnt) } } -/* - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. 
- */ -static void mark_umount_candidates(struct mount *mnt) +static void umount_one(struct mount *mnt, struct list_head *to_umount) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; - - BUG_ON(parent == mnt); - - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint); - if (!child || (child->mnt.mnt_flags & MNT_UMOUNT)) - continue; - if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) { - SET_MNT_MARK(child); - } - } + CLEAR_MNT_MARK(mnt); + mnt->mnt.mnt_flags |= MNT_UMOUNT; + list_del_init(&mnt->mnt_child); + list_del_init(&mnt->mnt_umounting); + list_move_tail(&mnt->mnt_list, to_umount); } /* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ -static void __propagate_umount(struct mount *mnt) +static bool __propagate_umount(struct mount *mnt, + struct list_head *to_umount, + struct list_head *to_restore) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; + bool progress = false; + struct mount *child; - BUG_ON(parent == mnt); + /* + * The state of the parent won't change if this mount is + * already unmounted or marked as without children. + */ + if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) + goto out; - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *topper; - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint); - /* - * umount the child only if the child has no children - * and the child is marked safe to unmount. - */ - if (!child || !IS_MNT_MARKED(child)) + /* Verify topper is the only grandchild that has not been + * speculatively unmounted. 
+ */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (child->mnt_mountpoint == mnt->mnt.mnt_root) continue; - CLEAR_MNT_MARK(child); + if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) + continue; + /* Found a mounted child */ + goto children; + } - /* If there is exactly one mount covering all of child - * replace child with that mount. - */ - topper = find_topper(child); - if (topper) - mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, - topper); + /* Mark mounts that can be unmounted if not locked */ + SET_MNT_MARK(mnt); + progress = true; + + /* If a mount is without children and not locked umount it. */ + if (!IS_MNT_LOCKED(mnt)) { + umount_one(mnt, to_umount); + } else { +children: + list_move_tail(&mnt->mnt_umounting, to_restore); + } +out: + return progress; +} + +static void umount_list(struct list_head *to_umount, + struct list_head *to_restore) +{ + struct mount *mnt, *child, *tmp; + list_for_each_entry(mnt, to_umount, mnt_list) { + list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { + /* topper? */ + if (child->mnt_mountpoint == mnt->mnt.mnt_root) + list_move_tail(&child->mnt_umounting, to_restore); + else + umount_one(child, to_umount); + } + } +} - if (list_empty(&child->mnt_mounts)) { - list_del_init(&child->mnt_child); - child->mnt.mnt_flags |= MNT_UMOUNT; - list_move_tail(&child->mnt_list, &mnt->mnt_list); +static void restore_mounts(struct list_head *to_restore) +{ + /* Restore mounts to a clean working state */ + while (!list_empty(to_restore)) { + struct mount *mnt, *parent; + struct mountpoint *mp; + + mnt = list_first_entry(to_restore, struct mount, mnt_umounting); + CLEAR_MNT_MARK(mnt); + list_del_init(&mnt->mnt_umounting); + + /* Should this mount be reparented? 
*/ + mp = mnt->mnt_mp; + parent = mnt->mnt_parent; + while (parent->mnt.mnt_flags & MNT_UMOUNT) { + mp = parent->mnt_mp; + parent = parent->mnt_parent; } + if (parent != mnt->mnt_parent) + mnt_change_mountpoint(parent, mp, mnt); + } +} + +static void cleanup_umount_visitations(struct list_head *visited) +{ + while (!list_empty(visited)) { + struct mount *mnt = + list_first_entry(visited, struct mount, mnt_umounting); + list_del_init(&mnt->mnt_umounting); } } @@ -485,11 +542,68 @@ static void __propagate_umount(struct mount *mnt) int propagate_umount(struct list_head *list) { struct mount *mnt; + LIST_HEAD(to_restore); + LIST_HEAD(to_umount); + LIST_HEAD(visited); + + /* Find candidates for unmounting */ + list_for_each_entry_reverse(mnt, list, mnt_list) { + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + /* + * If this mount has already been visited it is known that it's + * entire peer group and all of their slaves in the propagation + * tree for the mountpoint has already been visited and there is + * no need to visit them again. + */ + if (!list_empty(&mnt->mnt_umounting)) + continue; + + list_add_tail(&mnt->mnt_umounting, &visited); + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt(&m->mnt, + mnt->mnt_mountpoint); + if (!child) + continue; + + if (!list_empty(&child->mnt_umounting)) { + /* + * If the child has already been visited it is + * know that it's entire peer group and all of + * their slaves in the propgation tree for the + * mountpoint has already been visited and there + * is no need to visit this subtree again. + */ + m = skip_propagation_subtree(m, parent); + continue; + } else if (child->mnt.mnt_flags & MNT_UMOUNT) { + /* + * We have come accross an partially unmounted + * mount in list that has not been visited yet. + * Remember it has been visited and continue + * about our merry way. 
+ */ + list_add_tail(&child->mnt_umounting, &visited); + continue; + } + + /* Check the child and parents while progress is made */ + while (__propagate_umount(child, + &to_umount, &to_restore)) { + /* Is the parent a umount candidate? */ + child = child->mnt_parent; + if (list_empty(&child->mnt_umounting)) + break; + } + } + } - list_for_each_entry_reverse(mnt, list, mnt_list) - mark_umount_candidates(mnt); + umount_list(&to_umount, &to_restore); + restore_mounts(&to_restore); + cleanup_umount_visitations(&visited); + list_splice_tail(&to_umount, list); - list_for_each_entry(mnt, list, mnt_list) - __propagate_umount(mnt); return 0; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4ee55274f155..45629f4b5402 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -504,7 +504,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (&m->list == &kclist_head) { if (clear_user(buffer, tsz)) return -EFAULT; - } else if (is_vmalloc_or_module_addr((void *)start)) { + } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ if (copy_to_user(buffer, buf, tsz)) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 792a4e5f9226..4d02c3b65061 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -349,48 +349,48 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) switch (record->type) { case PSTORE_TYPE_DMESG: - scnprintf(name, sizeof(name), "dmesg-%s-%lld%s", + scnprintf(name, sizeof(name), "dmesg-%s-%llu%s", record->psi->name, record->id, record->compressed ? 
".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: - scnprintf(name, sizeof(name), "console-%s-%lld", + scnprintf(name, sizeof(name), "console-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_FTRACE: - scnprintf(name, sizeof(name), "ftrace-%s-%lld", + scnprintf(name, sizeof(name), "ftrace-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_MCE: - scnprintf(name, sizeof(name), "mce-%s-%lld", + scnprintf(name, sizeof(name), "mce-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_RTAS: - scnprintf(name, sizeof(name), "rtas-%s-%lld", + scnprintf(name, sizeof(name), "rtas-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OF: - scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_COMMON: - scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-common-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PMSG: - scnprintf(name, sizeof(name), "pmsg-%s-%lld", + scnprintf(name, sizeof(name), "pmsg-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OPAL: - scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_UNKNOWN: - scnprintf(name, sizeof(name), "unknown-%s-%lld", + scnprintf(name, sizeof(name), "unknown-%s-%llu", record->psi->name, record->id); break; default: - scnprintf(name, sizeof(name), "type%d-%s-%lld", + scnprintf(name, sizeof(name), "type%d-%s-%llu", record->type, record->psi->name, record->id); break; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index c416e653dc4f..58051265626f 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -30,5 +30,7 @@ extern void pstore_get_backend_records(struct pstore_info *psi, extern int pstore_mkfile(struct dentry *root, struct pstore_record 
*record); extern bool pstore_is_mounted(void); +extern void pstore_record_init(struct pstore_record *record, + struct pstore_info *psi); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index d468eec9b8a6..1b6e0ff6bff5 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -474,6 +474,20 @@ static size_t copy_kmsg_to_buffer(int hsize, size_t len) return total_len; } +void pstore_record_init(struct pstore_record *record, + struct pstore_info *psinfo) +{ + memset(record, 0, sizeof(*record)); + + record->psi = psinfo; + + /* Report zeroed timestamp if called before timekeeping has resumed. */ + if (__getnstimeofday(&record->time)) { + record->time.tv_sec = 0; + record->time.tv_nsec = 0; + } +} + /* * callback from kmsg_dump. (s2,l2) has the most recently * written bytes, older bytes are in (s1,l1). Save as much @@ -509,15 +523,14 @@ static void pstore_dump(struct kmsg_dumper *dumper, int header_size; int zipped_len = -1; size_t dump_size; - struct pstore_record record = { - .type = PSTORE_TYPE_DMESG, - .count = oopscount, - .reason = reason, - .part = part, - .compressed = false, - .buf = psinfo->buf, - .psi = psinfo, - }; + struct pstore_record record; + + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_DMESG; + record.count = oopscount; + record.reason = reason; + record.part = part; + record.buf = psinfo->buf; if (big_oops_buf && is_locked) { dst = big_oops_buf; @@ -587,12 +600,12 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) const char *e = s + c; while (s < e) { - struct pstore_record record = { - .type = PSTORE_TYPE_CONSOLE, - .psi = psinfo, - }; + struct pstore_record record; unsigned long flags; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_CONSOLE; + if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -640,19 +653,16 @@ static int pstore_write_user_compat(struct pstore_record *record, if (record->buf) return -EINVAL; - record->buf = kmalloc(record->size, 
GFP_KERNEL); - if (!record->buf) - return -ENOMEM; - - if (unlikely(copy_from_user(record->buf, buf, record->size))) { - ret = -EFAULT; + record->buf = memdup_user(buf, record->size); + if (unlikely(IS_ERR(record->buf))) { + ret = PTR_ERR(record->buf); goto out; } ret = record->psi->write(record); -out: kfree(record->buf); +out: record->buf = NULL; return unlikely(ret < 0) ? ret : record->size; @@ -770,8 +780,11 @@ static void decompress_record(struct pstore_record *record) int unzipped_len; char *decompressed; + if (!record->compressed) + return; + /* Only PSTORE_TYPE_DMESG support compression. */ - if (!record->compressed || record->type != PSTORE_TYPE_DMESG) { + if (record->type != PSTORE_TYPE_DMESG) { pr_warn("ignored compressed record type %d\n", record->type); return; } @@ -819,6 +832,7 @@ void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet) { int failed = 0; + unsigned int stop_loop = 65536; if (!psi || !root) return; @@ -832,7 +846,7 @@ void pstore_get_backend_records(struct pstore_info *psi, * may reallocate record.buf. On success, pstore_mkfile() will keep * the record.buf, so free it only on failure. */ - for (;;) { + for (; stop_loop; stop_loop--) { struct pstore_record *record; int rc; @@ -841,13 +855,15 @@ void pstore_get_backend_records(struct pstore_info *psi, pr_err("out of memory creating record\n"); break; } - record->psi = psi; + pstore_record_init(record, psi); record->size = psi->read(record); /* No more records left in backend? */ - if (record->size <= 0) + if (record->size <= 0) { + kfree(record); break; + } decompress_record(record); rc = pstore_mkfile(root, record); @@ -865,8 +881,11 @@ out: mutex_unlock(&psi->read_mutex); if (failed) - pr_warn("failed to load %d record(s) from '%s'\n", + pr_warn("failed to create %d record(s) from '%s'\n", failed, psi->name); + if (!stop_loop) + pr_err("looping? 
Too many records seen from '%s'\n", + psi->name); } static void pstore_dowork(struct work_struct *work) diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 209755e0d7c8..24db02de1787 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -22,16 +22,16 @@ static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct pstore_record record = { - .type = PSTORE_TYPE_PMSG, - .size = count, - .psi = psinfo, - }; + struct pstore_record record; int ret; if (!count) return 0; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_PMSG; + record.size = count; + /* check outside lock, page in any data. write_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 5cb022c8cd33..7125b398d312 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -27,7 +27,6 @@ #include <linux/module.h> #include <linux/version.h> #include <linux/pstore.h> -#include <linux/time.h> #include <linux/io.h> #include <linux/ioport.h> #include <linux/platform_device.h> @@ -356,20 +355,15 @@ out: } static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, - bool compressed) + struct pstore_record *record) { char *hdr; - struct timespec timestamp; size_t len; - /* Report zeroed timestamp if called before timekeeping has resumed. */ - if (__getnstimeofday(&timestamp)) { - timestamp.tv_sec = 0; - timestamp.tv_nsec = 0; - } hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", - (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000), - compressed ? 'C' : 'D'); + record->time.tv_sec, + record->time.tv_nsec / 1000, + record->compressed ? 'C' : 'D'); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); @@ -440,7 +434,7 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) prz = cxt->dprzs[cxt->dump_write_cnt]; /* Build header and append record contents. 
*/ - hlen = ramoops_write_kmsg_hdr(prz, record->compressed); + hlen = ramoops_write_kmsg_hdr(prz, record); size = record->size; if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 48813aeaab80..53a17496c5c5 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1910,6 +1910,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { qsize_t space, cur_space; qsize_t rsv_space = 0; + qsize_t inode_usage = 1; struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, ret = 0; char is_valid[MAXQUOTAS] = {}; @@ -1919,6 +1920,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) if (IS_NOQUOTA(inode)) return 0; + + if (inode->i_sb->dq_op->get_inode_usage) { + ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage); + if (ret) + return ret; + } + /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { warn_to[cnt].w_type = QUOTA_NL_NOWARN; @@ -1946,7 +1954,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; is_valid[cnt] = 1; transfer_from[cnt] = i_dquot(inode)[cnt]; - ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]); + ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]); if (ret) goto over_quota; ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]); @@ -1963,7 +1971,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; - wtype = info_idq_free(transfer_from[cnt], 1); + wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); @@ -1971,13 +1979,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); - dquot_decr_inodes(transfer_from[cnt], 1); + 
dquot_decr_inodes(transfer_from[cnt], inode_usage); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); } - dquot_incr_inodes(transfer_to[cnt], 1); + dquot_incr_inodes(transfer_to[cnt], inode_usage); dquot_incr_space(transfer_to[cnt], cur_space); dquot_resv_space(transfer_to[cnt], rsv_space); diff --git a/fs/read_write.c b/fs/read_write.c index 19d4d88fa285..0cc7033aa413 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -356,46 +356,6 @@ out_putf: } #endif -ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - if (!file->f_op->read_iter) - return -EINVAL; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - - iter->type |= READ; - ret = call_read_iter(file, &kiocb, iter); - BUG_ON(ret == -EIOCBQUEUED); - if (ret > 0) - *ppos = kiocb.ki_pos; - return ret; -} -EXPORT_SYMBOL(vfs_iter_read); - -ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - if (!file->f_op->write_iter) - return -EINVAL; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - - iter->type |= WRITE; - ret = call_write_iter(file, &kiocb, iter); - BUG_ON(ret == -EIOCBQUEUED); - if (ret > 0) - *ppos = kiocb.ki_pos; - return ret; -} -EXPORT_SYMBOL(vfs_iter_write); - int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { struct inode *inode; @@ -678,16 +638,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, struct kiocb kiocb; ssize_t ret; - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)) - return -EOPNOTSUPP; - init_sync_kiocb(&kiocb, filp); - if (flags & RWF_HIPRI) - kiocb.ki_flags |= IOCB_HIPRI; - if (flags & RWF_DSYNC) - kiocb.ki_flags |= IOCB_DSYNC; - if (flags & RWF_SYNC) - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + ret = kiocb_set_rw_flags(&kiocb, flags); + if (ret) + return ret; kiocb.ki_pos = *ppos; if (type == READ) 
@@ -916,86 +870,114 @@ out: } #endif -static ssize_t __do_readv_writev(int type, struct file *file, - struct iov_iter *iter, loff_t *pos, int flags) +static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, + loff_t *pos, int flags) { size_t tot_len; ssize_t ret = 0; + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(type, file, pos, tot_len); + ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) - goto out; - - if (type != READ) - file_start_write(file); + return ret; - if ((type == READ && file->f_op->read_iter) || - (type == WRITE && file->f_op->write_iter)) - ret = do_iter_readv_writev(file, iter, pos, type, flags); + if (file->f_op->read_iter) + ret = do_iter_readv_writev(file, iter, pos, READ, flags); else - ret = do_loop_readv_writev(file, iter, pos, type, flags); - - if (type != READ) - file_end_write(file); - + ret = do_loop_readv_writev(file, iter, pos, READ, flags); out: - if ((ret + (type == READ)) > 0) { - if (type == READ) - fsnotify_access(file); - else - fsnotify_modify(file); - } + if (ret >= 0) + fsnotify_access(file); return ret; } -static ssize_t do_readv_writev(int type, struct file *file, - const struct iovec __user *uvector, - unsigned long nr_segs, loff_t *pos, - int flags) +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, + int flags) { - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; + if (!file->f_op->read_iter) + return -EINVAL; + return do_iter_read(file, iter, ppos, flags); +} +EXPORT_SYMBOL(vfs_iter_read); + +static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, + loff_t *pos, int flags) +{ + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if 
(!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + return 0; + ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) return ret; - ret = __do_readv_writev(type, file, &iter, pos, flags); - kfree(iov); - + if (file->f_op->write_iter) + ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); + else + ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); + if (ret > 0) + fsnotify_modify(file); return ret; } +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, + int flags) +{ + if (!file->f_op->write_iter) + return -EINVAL; + return do_iter_write(file, iter, ppos, flags); +} +EXPORT_SYMBOL(vfs_iter_write); + ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - if (!(file->f_mode & FMODE_READ)) - return -EBADF; - if (!(file->f_mode & FMODE_CAN_READ)) - return -EINVAL; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; - return do_readv_writev(READ, file, vec, vlen, pos, flags); -} + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret >= 0) { + ret = do_iter_read(file, &iter, pos, flags); + kfree(iov); + } + return ret; +} EXPORT_SYMBOL(vfs_readv); ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - if (!(file->f_mode & FMODE_WRITE)) - return -EBADF; - if (!(file->f_mode & FMODE_CAN_WRITE)) - return -EINVAL; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; - return do_readv_writev(WRITE, file, vec, vlen, pos, flags); + ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret >= 0) { + file_start_write(file); + ret = do_iter_write(file, &iter, pos, flags); + file_end_write(file); + kfree(iov); + } + return ret; } - EXPORT_SYMBOL(vfs_writev); static ssize_t do_readv(unsigned 
long fd, const struct iovec __user *vec, @@ -1143,44 +1125,20 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, } #ifdef CONFIG_COMPAT - -static ssize_t compat_do_readv_writev(int type, struct file *file, - const struct compat_iovec __user *uvector, - unsigned long nr_segs, loff_t *pos, - int flags) +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos, int flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; - ret = compat_import_iovec(type, uvector, nr_segs, - UIO_FASTIOV, &iov, &iter); - if (ret < 0) - return ret; - - ret = __do_readv_writev(type, file, &iter, pos, flags); - kfree(iov); - - return ret; -} - -static size_t compat_readv(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_READ)) - goto out; - - ret = -EINVAL; - if (!(file->f_mode & FMODE_CAN_READ)) - goto out; - - ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags); - -out: + ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter); + if (ret >= 0) { + ret = do_iter_read(file, &iter, pos, flags); + kfree(iov); + } if (ret > 0) add_rchar(current, ret); inc_syscr(current); @@ -1276,18 +1234,18 @@ static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_WRITE)) - goto out; - - ret = -EINVAL; - if (!(file->f_mode & FMODE_CAN_WRITE)) - goto out; - - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags); + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; -out: + ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter); + if (ret >= 0) { + file_start_write(file); + ret = do_iter_write(file, &iter, pos, flags); 
+ file_end_write(file); + kfree(iov); + } if (ret > 0) add_wchar(current, ret); inc_syscw(current); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 39bb1e838d8d..a11d773e5ff3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s) static void queue_log_writer(struct super_block *s) { - wait_queue_t wait; + wait_queue_entry_t wait; struct reiserfs_journal *journal = SB_JOURNAL(s); set_bit(J_WRITERS_QUEUED, &journal->j_state); diff --git a/fs/select.c b/fs/select.c index d6c652a31e99..9d5f15ed87fe 100644 --- a/fs/select.c +++ b/fs/select.c @@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; @@ -1161,59 +1161,25 @@ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); if (ufdset) { - unsigned long odd; - - if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) - return -EFAULT; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - *fdset++ = h << 32 | l; - nr -= 2; - } - if (odd && __get_user(*fdset, ufdset)) - return -EFAULT; + return compat_get_bitmap(fdset, ufdset, nr); } else { /* Tricky, must 
clear full unsigned long in the - * kernel fdset at the end, this makes sure that + * kernel fdset at the end, ALIGN makes sure that * actually happens. */ - memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + memset(fdset, 0, ALIGN(nr, BITS_PER_LONG)); + return 0; } - return 0; } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { - unsigned long odd; - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - if (!ufdset) return 0; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - nr -= 2; - } - if (odd && __put_user(*fdset, ufdset)) - return -EFAULT; - return 0; + return compat_put_bitmap(ufdset, fdset, nr); } diff --git a/fs/signalfd.c b/fs/signalfd.c index 7e3d71109f51..593b022ac11b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -43,7 +43,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) if (likely(!waitqueue_active(wqh))) return; - /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ wake_up_poll(wqh, POLLHUP | POLLFREE); } diff --git a/fs/splice.c b/fs/splice.c index 540c4a44756c..ae41201d0325 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -762,7 +762,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, sd.total_len - left); - ret = vfs_iter_write(out, &from, &sd.pos); + ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; diff --git a/fs/statfs.c b/fs/statfs.c index 4e4623c7a126..fab9b6a3c116 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -38,6 +38,8 @@ static int flags_by_sb(int s_flags) flags |= ST_SYNCHRONOUS; if (s_flags & MS_MANDLOCK) flags |= ST_MANDLOCK; + if (s_flags & MS_RDONLY) + flags |= ST_RDONLY; return flags; } @@ -244,6 +246,7 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat 
__user *, ubuf) #ifdef CONFIG_COMPAT static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) { + struct compat_statfs buf; if (sizeof ubuf->f_blocks == 4) { if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) @@ -257,20 +260,20 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * && (kbuf->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + memset(&buf, 0, sizeof(struct compat_statfs)); + buf.f_type = kbuf->f_type; + buf.f_bsize = kbuf->f_bsize; + buf.f_blocks = kbuf->f_blocks; + buf.f_bfree = kbuf->f_bfree; + buf.f_bavail = kbuf->f_bavail; + buf.f_files = kbuf->f_files; + buf.f_ffree = kbuf->f_ffree; + buf.f_namelen = kbuf->f_namelen; + buf.f_fsid.val[0] = kbuf->f_fsid.val[0]; + buf.f_fsid.val[1] = kbuf->f_fsid.val[1]; + buf.f_frsize = kbuf->f_frsize; + buf.f_flags = kbuf->f_flags; + if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs))) return -EFAULT; return 0; } @@ -299,6 +302,7 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { + struct compat_statfs64 buf; if (sizeof(ubuf->f_bsize) == 4) { if ((kbuf->f_type | 
kbuf->f_bsize | kbuf->f_namelen | kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) @@ -312,20 +316,20 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat && (kbuf->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + memset(&buf, 0, sizeof(struct compat_statfs64)); + buf.f_type = kbuf->f_type; + buf.f_bsize = kbuf->f_bsize; + buf.f_blocks = kbuf->f_blocks; + buf.f_bfree = kbuf->f_bfree; + buf.f_bavail = kbuf->f_bavail; + buf.f_files = kbuf->f_files; + buf.f_ffree = kbuf->f_ffree; + buf.f_namelen = kbuf->f_namelen; + buf.f_fsid.val[0] = kbuf->f_fsid.val[0]; + buf.f_fsid.val[1] = kbuf->f_fsid.val[1]; + buf.f_frsize = kbuf->f_frsize; + buf.f_flags = kbuf->f_flags; + if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs64))) return -EFAULT; return 0; } diff --git a/fs/sync.c b/fs/sync.c index 11ba023434b1..2a54c1f22035 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -192,7 +192,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) spin_unlock(&inode->i_lock); mark_inode_dirty_sync(inode); } - return call_fsync(file, start, end, datasync); + return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 5bdae85ceef7..f5191cb2c947 100644 
--- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -45,7 +45,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/timerfd.c b/fs/timerfd.c index c543cdb5f8ed..ece0c02d7e63 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -169,7 +169,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) } static int timerfd_setup(struct timerfd_ctx *ctx, int flags, - const struct itimerspec *ktmr) + const struct itimerspec64 *ktmr) { enum hrtimer_mode htmode; ktime_t texp; @@ -178,10 +178,10 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; - texp = timespec_to_ktime(ktmr->it_value); + texp = timespec64_to_ktime(ktmr->it_value); ctx->expired = 0; ctx->ticks = 0; - ctx->tintv = timespec_to_ktime(ktmr->it_interval); + ctx->tintv = timespec64_to_ktime(ktmr->it_interval); if (isalarm(ctx)) { alarm_init(&ctx->t.alarm, @@ -432,16 +432,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) } static int do_timerfd_settime(int ufd, int flags, - const struct itimerspec *new, - struct itimerspec *old) + const struct itimerspec64 *new, + struct itimerspec64 *old) { struct fd f; struct timerfd_ctx *ctx; int ret; if ((flags & ~TFD_SETTIME_FLAGS) || - !timespec_valid(&new->it_value) || - !timespec_valid(&new->it_interval)) + !itimerspec64_valid(new)) return -EINVAL; ret = timerfd_fget(ufd, &f); @@ -487,8 +486,8 @@ static int do_timerfd_settime(int ufd, int flags, hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); } - old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - old->it_interval = ktime_to_timespec(ctx->tintv); + old->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); + old->it_interval = ktime_to_timespec64(ctx->tintv); /* * Re-program the timer to the new value ... 
@@ -500,7 +499,7 @@ static int do_timerfd_settime(int ufd, int flags, return ret; } -static int do_timerfd_gettime(int ufd, struct itimerspec *t) +static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) { struct fd f; struct timerfd_ctx *ctx; @@ -525,8 +524,8 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t) hrtimer_restart(&ctx->t.tmr); } } - t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - t->it_interval = ktime_to_timespec(ctx->tintv); + t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); + t->it_interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); fdput(f); return 0; @@ -536,15 +535,15 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct itimerspec __user *, utmr, struct itimerspec __user *, otmr) { - struct itimerspec new, old; + struct itimerspec64 new, old; int ret; - if (copy_from_user(&new, utmr, sizeof(new))) + if (get_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; - if (otmr && copy_to_user(otmr, &old, sizeof(old))) + if (otmr && put_itimerspec64(&old, otmr)) return -EFAULT; return ret; @@ -552,11 +551,11 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) { - struct itimerspec kotmr; + struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; + return put_itimerspec64(&kotmr, otmr) ? 
-EFAULT : 0; } #ifdef CONFIG_COMPAT @@ -564,15 +563,15 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct compat_itimerspec __user *, utmr, struct compat_itimerspec __user *, otmr) { - struct itimerspec new, old; + struct itimerspec64 new, old; int ret; - if (get_compat_itimerspec(&new, utmr)) + if (get_compat_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; - if (otmr && put_compat_itimerspec(otmr, &old)) + if (otmr && put_compat_itimerspec64(&old, otmr)) return -EFAULT; return ret; } @@ -580,10 +579,10 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct compat_itimerspec __user *, otmr) { - struct itimerspec kotmr; + struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0; + return put_compat_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; } #endif diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index de01b8f2aa78..48609f1d9580 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -53,7 +53,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1d622f276e3a..cadcd12a3d35 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx { struct userfaultfd_wait_queue { struct uffd_msg msg; - wait_queue_t wq; + wait_queue_entry_t wq; struct userfaultfd_ctx *ctx; bool waken; }; @@ -91,7 +91,7 @@ struct userfaultfd_wake_range { unsigned long len; }; -static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, +static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { struct userfaultfd_wake_range *range = key; @@ -129,7 +129,7 
@@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, * wouldn't be enough, the smp_mb__before_spinlock is * enough to avoid an explicit smp_mb() here. */ - list_del_init(&wq->task_list); + list_del_init(&wq->entry); out: return ret; } @@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); if (!pte) goto out; @@ -243,6 +244,7 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -448,7 +450,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + vmf->address, vmf->flags, reason); up_read(&mm->mmap_sem); @@ -522,13 +525,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * and it's fine not to block on the spinlock. The uwq on this * kernel stack can be released after the list_del_init. */ - if (!list_empty_careful(&uwq.wq.task_list)) { + if (!list_empty_careful(&uwq.wq.entry)) { spin_lock(&ctx->fault_pending_wqh.lock); /* * No need of list_del_init(), the uwq on the stack * will be freed shortly anyway. 
*/ - list_del(&uwq.wq.task_list); + list_del(&uwq.wq.entry); spin_unlock(&ctx->fault_pending_wqh.lock); } @@ -860,7 +863,7 @@ wakeup: static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_head_t *wqh) { - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; VM_BUG_ON(!spin_is_locked(&wqh->lock)); @@ -869,7 +872,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in( if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list); + wq = list_last_entry(&wqh->head, typeof(*wq), entry); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; @@ -1003,14 +1006,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * changes __remove_wait_queue() to use * list_del_init() in turn breaking the * !list_empty_careful() check in - * handle_userfault(). The uwq->wq.task_list + * handle_userfault(). The uwq->wq.head list * must never be empty at any time during the * refile, or the waitqueue could disappear * from under us. The "wait_queue_head_t" * parameter of __remove_wait_queue() is unused * anyway. 
*/ - list_del(&uwq->wq.task_list); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); @@ -1032,7 +1035,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, fork_nctx = (struct userfaultfd_ctx *) (unsigned long) uwq->msg.arg.reserved.reserved1; - list_move(&uwq->wq.task_list, &fork_event); + list_move(&uwq->wq.entry, &fork_event); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1069,8 +1072,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!list_empty(&fork_event)) { uwq = list_first_entry(&fork_event, typeof(*uwq), - wq.task_list); - list_del(&uwq->wq.task_list); + wq.entry); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->event_wqh, &uwq->wq); userfaultfd_event_complete(ctx, uwq); } @@ -1114,11 +1117,6 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, static void __wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { - unsigned long start, end; - - start = range->start; - end = range->start + range->len; - spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) @@ -1747,17 +1745,17 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; spin_lock(&ctx->fault_pending_wqh.lock); - list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); pending++; total++; } - list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { uwq = container_of(wq, struct 
userfaultfd_wait_queue, wq); total++; } diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 35faf128f36d..1b98cfa342ab 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -96,3 +96,16 @@ config XFS_DEBUG not useful unless you are debugging a particular problem. Say N unless you are an XFS developer, or you play one on TV. + +config XFS_ASSERT_FATAL + bool "XFS fatal asserts" + default y + depends on XFS_FS && XFS_DEBUG + help + Set the default DEBUG mode ASSERT failure behavior. + + Say Y here to cause DEBUG mode ASSERT failures to result in fatal + errors that BUG() the kernel by default. If you say N, ASSERT failures + result in warnings. + + This behavior can be modified at runtime via sysfs. diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 5c90f82b8f6b..a6e955bfead8 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,8 +98,7 @@ xfs-y += xfs_aops.o \ xfs_sysfs.o \ xfs_trans.o \ xfs_xattr.o \ - kmem.o \ - uuid.o + kmem.o # low-level transaction/log code xfs-y += xfs_log.o \ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 33db69be4832..b008ff3250eb 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -111,8 +111,7 @@ xfs_ag_resv_critical( /* Critically low if less than 10% or max btree height remains. */ return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, - pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL, - XFS_RANDOM_AG_RESV_CRITICAL); + pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); } /* diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7486401ccbd3..744dcaec34cc 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -606,7 +606,7 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { /* * Read in the allocation group free block array. 
*/ -STATIC int /* error */ +int /* error */ xfs_alloc_read_agfl( xfs_mount_t *mp, /* mount point structure */ xfs_trans_t *tp, /* transaction pointer */ @@ -2454,8 +2454,7 @@ xfs_agf_read_verify( !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) xfs_buf_ioerror(bp, -EFSBADCRC); else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, - XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF)) + XFS_ERRTAG_ALLOC_READ_AGF)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) @@ -2842,8 +2841,7 @@ xfs_free_extent( ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_FREE_EXTENT, - XFS_RANDOM_FREE_EXTENT)) + XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_free_extent_fix_freelist(tp, agno, &agbp); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 77d9c27330ab..ef26edc2e938 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -213,6 +213,8 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **agbp); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index e1fcfe7f0a9a..cfde0a0f9706 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -253,7 +253,7 @@ xfs_allocbt_init_ptr_from_cur( ptr->s = agf->agf_roots[cur->bc_btnum]; } -STATIC __int64_t +STATIC int64_t xfs_bnobt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) @@ -261,42 +261,42 @@ xfs_bnobt_key_diff( xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; xfs_alloc_key_t *kp = &key->alloc; - return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; + return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -STATIC 
__int64_t +STATIC int64_t xfs_cntbt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) { xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; xfs_alloc_key_t *kp = &key->alloc; - __int64_t diff; + int64_t diff; - diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; + diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; if (diff) return diff; - return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; + return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -STATIC __int64_t +STATIC int64_t xfs_bnobt_diff_two_keys( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) - + return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - be32_to_cpu(k2->alloc.ar_startblock); } -STATIC __int64_t +STATIC int64_t xfs_cntbt_diff_two_keys( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - __int64_t diff; + int64_t diff; diff = be32_to_cpu(k1->alloc.ar_blockcount) - be32_to_cpu(k2->alloc.ar_blockcount); @@ -395,7 +395,6 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_bnobt_keys_inorder( struct xfs_btree_cur *cur, @@ -442,7 +441,6 @@ xfs_cntbt_recs_inorder( be32_to_cpu(r1->alloc.ar_startblock) < be32_to_cpu(r2->alloc.ar_startblock)); } -#endif /* DEBUG */ static const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), @@ -462,10 +460,8 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .key_diff = xfs_bnobt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, .diff_two_keys = xfs_bnobt_diff_two_keys, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, -#endif }; static const struct xfs_btree_ops xfs_cntbt_ops = { @@ -486,10 +482,8 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .key_diff = xfs_cntbt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, 
.diff_two_keys = xfs_cntbt_diff_two_keys, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 6622d46ddec3..ef8a1c75a467 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -114,6 +114,23 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ +/* Retrieve an extended attribute and its value. Must have iolock. */ +int +xfs_attr_get_ilocked( + struct xfs_inode *ip, + struct xfs_da_args *args) +{ + if (!xfs_inode_hasattr(ip)) + return -ENOATTR; + else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + return xfs_attr_shortform_getvalue(args); + else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) + return xfs_attr_leaf_get(args); + else + return xfs_attr_node_get(args); +} + +/* Retrieve an extended attribute by name, and its value. */ int xfs_attr_get( struct xfs_inode *ip, @@ -141,14 +158,7 @@ xfs_attr_get( args.op_flags = XFS_DA_OP_OKNOENT; lock_mode = xfs_ilock_attr_map_shared(ip); - if (!xfs_inode_hasattr(ip)) - error = -ENOATTR; - else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) - error = xfs_attr_shortform_getvalue(&args); - else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) - error = xfs_attr_leaf_get(&args); - else - error = xfs_attr_node_get(&args); + error = xfs_attr_get_ilocked(ip, &args); xfs_iunlock(ip, lock_mode); *valuelenp = args.valuelen; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2852521fc8ec..c6c15e5717e4 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -351,7 +351,7 @@ xfs_attr3_leaf_read( err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); return err; } diff --git 
a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d52f525f5b2d..5236d8e45146 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -253,7 +253,7 @@ xfs_attr_rmtval_copyout( xfs_ino_t ino, int *offset, int *valuelen, - __uint8_t **dst) + uint8_t **dst) { char *src = bp->b_addr; xfs_daddr_t bno = bp->b_bn; @@ -301,7 +301,7 @@ xfs_attr_rmtval_copyin( xfs_ino_t ino, int *offset, int *valuelen, - __uint8_t **src) + uint8_t **src) { char *dst = bp->b_addr; xfs_daddr_t bno = bp->b_bn; @@ -355,7 +355,7 @@ xfs_attr_rmtval_get( struct xfs_mount *mp = args->dp->i_mount; struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; - __uint8_t *dst = args->value; + uint8_t *dst = args->value; int valuelen; int nmap; int error; @@ -386,7 +386,8 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, args->trans, + mp->m_ddev_targp, dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) @@ -395,7 +396,7 @@ xfs_attr_rmtval_get( error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_buf_relse(bp); + xfs_trans_brelse(args->trans, bp); if (error) return error; @@ -421,7 +422,7 @@ xfs_attr_rmtval_set( struct xfs_bmbt_irec map; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; - __uint8_t *src = args->value; + uint8_t *src = args->value; int blkcnt; int valuelen; int nmap; diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 90928bbe693c..afd684ae3136 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -31,10 +31,10 @@ typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; * We generate this then sort it, attr_list() must return things in hash-order. 
*/ typedef struct xfs_attr_sf_sort { - __uint8_t entno; /* entry number in original list */ - __uint8_t namelen; /* length of name value (no null) */ - __uint8_t valuelen; /* length of value */ - __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + uint8_t entno; /* entry number in original list */ + uint8_t namelen; /* length of name value (no null) */ + uint8_t valuelen; /* length of value */ + uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ xfs_dahash_t hash; /* this entry's hash value */ unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; @@ -42,7 +42,7 @@ typedef struct xfs_attr_sf_sort { #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ - ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1) + ((1 << (NBBY*(int)sizeof(uint8_t))) - 1) #define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) #define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h index e1649c0d3e02..61c6b2025d0c 100644 --- a/fs/xfs/libxfs/xfs_bit.h +++ b/fs/xfs/libxfs/xfs_bit.h @@ -25,47 +25,47 @@ /* * masks with n high/low bits set, 64-bit values */ -static inline __uint64_t xfs_mask64hi(int n) +static inline uint64_t xfs_mask64hi(int n) { - return (__uint64_t)-1 << (64 - (n)); + return (uint64_t)-1 << (64 - (n)); } -static inline __uint32_t xfs_mask32lo(int n) +static inline uint32_t xfs_mask32lo(int n) { - return ((__uint32_t)1 << (n)) - 1; + return ((uint32_t)1 << (n)) - 1; } -static inline __uint64_t xfs_mask64lo(int n) +static inline uint64_t xfs_mask64lo(int n) { - return ((__uint64_t)1 << (n)) - 1; + return ((uint64_t)1 << (n)) - 1; } /* Get high bit set out of 32-bit argument, -1 if none set */ -static inline int xfs_highbit32(__uint32_t v) +static inline int 
xfs_highbit32(uint32_t v) { return fls(v) - 1; } /* Get high bit set out of 64-bit argument, -1 if none set */ -static inline int xfs_highbit64(__uint64_t v) +static inline int xfs_highbit64(uint64_t v) { return fls64(v) - 1; } /* Get low bit set out of 32-bit argument, -1 if none set */ -static inline int xfs_lowbit32(__uint32_t v) +static inline int xfs_lowbit32(uint32_t v) { return ffs(v) - 1; } /* Get low bit set out of 64-bit argument, -1 if none set */ -static inline int xfs_lowbit64(__uint64_t v) +static inline int xfs_lowbit64(uint64_t v) { - __uint32_t w = (__uint32_t)v; + uint32_t w = (uint32_t)v; int n = 0; if (w) { /* lower bits */ n = ffs(w); } else { /* upper bits */ - w = (__uint32_t)(v >> 32); + w = (uint32_t)(v >> 32); if (w) { n = ffs(w); if (n) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a7048eafa8e6..0a9880777c9c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3992,7 +3992,7 @@ xfs_bmapi_read( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -4473,7 +4473,7 @@ xfs_bmapi_write( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -4694,7 +4694,7 @@ xfs_bmapi_remap( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_remap", 
XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -5434,6 +5434,7 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ + xfs_fileoff_t max_len; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5455,6 +5456,16 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); + /* + * Guesstimate how many blocks we can unmap without running the risk of + * blowing out the transaction with a mix of EFIs and reflink + * adjustments. + */ + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); + else + max_len = len; + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5499,7 +5510,7 @@ __xfs_bunmapi( extno = 0; while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && - (nexts == 0 || extno < nexts)) { + (nexts == 0 || extno < nexts) && max_len > 0) { /* * Is the found extent after a hole in which bno lives? * Just back up to the previous extent, if so. @@ -5531,6 +5542,15 @@ __xfs_bunmapi( } if (del.br_startoff + del.br_blockcount > bno + 1) del.br_blockcount = bno + 1 - del.br_startoff; + + /* How much can we safely unmap? 
*/ + if (max_len < del.br_blockcount) { + del.br_startoff += del.br_blockcount - max_len; + if (!wasdel) + del.br_startblock += del.br_blockcount - max_len; + del.br_blockcount = max_len; + } + sum = del.br_startblock + del.br_blockcount; if (isrt && (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { @@ -5707,6 +5727,7 @@ __xfs_bunmapi( if (!isrt && wasdel) xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + max_len -= del.br_blockcount; bno = del.br_startoff - 1; nodelete: /* @@ -6077,7 +6098,7 @@ xfs_bmap_shift_extents( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmap_shift_extents", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; @@ -6229,7 +6250,7 @@ xfs_bmap_split_extent_at( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmap_split_extent_at", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; @@ -6472,33 +6493,33 @@ xfs_bmap_finish_one( int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, + xfs_filblks_t *blockcount, xfs_exntst_t state) { - int error = 0, done; + xfs_fsblock_t firstfsb; + int error = 0; trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - ip->i_ino, whichfork, startoff, blockcount, state); + ip->i_ino, whichfork, startoff, *blockcount, state); if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) return -EFSCORRUPTED; if (XFS_TEST_ERROR(false, tp->t_mountp, - XFS_ERRTAG_BMAP_FINISH_ONE, - XFS_RANDOM_BMAP_FINISH_ONE)) + XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (type) { case XFS_BMAP_MAP: - error = 
xfs_bmapi_remap(tp, ip, startoff, blockcount, + error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, startblock, dfops); + *blockcount = 0; break; case XFS_BMAP_UNMAP: - error = xfs_bunmapi(tp, ip, startoff, blockcount, - XFS_BMAPI_REMAP, 1, &startblock, dfops, &done); - ASSERT(done); + error = __xfs_bunmapi(tp, ip, startoff, blockcount, + XFS_BMAPI_REMAP, 1, &firstfsb, dfops); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index c35a14fa1527..851982a5dfbc 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -271,7 +271,7 @@ struct xfs_bmap_intent { int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, enum xfs_bmap_intent_type type, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, xfs_exntst_t state); + xfs_filblks_t *blockcount, xfs_exntst_t state); int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6cba69aff077..85de22513014 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -94,8 +94,8 @@ xfs_bmdr_to_bmbt( */ STATIC void __xfs_bmbt_get_all( - __uint64_t l0, - __uint64_t l1, + uint64_t l0, + uint64_t l1, xfs_bmbt_irec_t *s) { int ext_flag; @@ -573,6 +573,16 @@ xfs_bmbt_init_key_from_rec( } STATIC void +xfs_bmbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->bmbt.br_startoff = cpu_to_be64( + xfs_bmbt_disk_get_startoff(&rec->bmbt) + + xfs_bmbt_disk_get_blockcount(&rec->bmbt) - 1); +} + +STATIC void xfs_bmbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) @@ -588,15 +598,25 @@ xfs_bmbt_init_ptr_from_cur( ptr->l = 0; } -STATIC __int64_t +STATIC int64_t xfs_bmbt_key_diff( struct 
xfs_btree_cur *cur, union xfs_btree_key *key) { - return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) - + return (int64_t)be64_to_cpu(key->bmbt.br_startoff) - cur->bc_rec.b.br_startoff; } +STATIC int64_t +xfs_bmbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - + be64_to_cpu(k2->bmbt.br_startoff); +} + static bool xfs_bmbt_verify( struct xfs_buf *bp) @@ -687,7 +707,6 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_bmbt_keys_inorder( struct xfs_btree_cur *cur, @@ -708,7 +727,6 @@ xfs_bmbt_recs_inorder( xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= xfs_bmbt_disk_get_startoff(&r2->bmbt); } -#endif /* DEBUG */ static const struct xfs_btree_ops xfs_bmbt_ops = { .rec_len = sizeof(xfs_bmbt_rec_t), @@ -722,14 +740,14 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .get_minrecs = xfs_bmbt_get_minrecs, .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, .init_key_from_rec = xfs_bmbt_init_key_from_rec, + .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec, .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, + .diff_two_keys = xfs_bmbt_diff_two_keys, .buf_ops = &xfs_bmbt_buf_ops, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 3a673ba201aa..4da85fff69ad 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -43,7 +43,7 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. 
*/ -static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { +static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, XFS_FIBT_MAGIC, 0 }, { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, @@ -51,12 +51,12 @@ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { XFS_REFC_CRC_MAGIC } }; -__uint32_t +uint32_t xfs_btree_magic( int crc, xfs_btnum_t btnum) { - __uint32_t magic = xfs_magics[crc][btnum]; + uint32_t magic = xfs_magics[crc][btnum]; /* Ensure we asked for crc for crc-only magics. */ ASSERT(magic != 0); @@ -101,8 +101,7 @@ xfs_btree_check_lblock( be64_to_cpu(block->bb_u.l.bb_rightsib))); if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, - XFS_ERRTAG_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_LBLOCK))) { + XFS_ERRTAG_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); @@ -153,8 +152,7 @@ xfs_btree_check_sblock( block->bb_u.s.bb_rightsib; if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, - XFS_ERRTAG_BTREE_CHECK_SBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK))) { + XFS_ERRTAG_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); @@ -568,7 +566,7 @@ xfs_btree_ptr_offset( /* * Return a pointer to the n-th record in the btree block. */ -STATIC union xfs_btree_rec * +union xfs_btree_rec * xfs_btree_rec_addr( struct xfs_btree_cur *cur, int n, @@ -581,7 +579,7 @@ xfs_btree_rec_addr( /* * Return a pointer to the n-th key in the btree block. */ -STATIC union xfs_btree_key * +union xfs_btree_key * xfs_btree_key_addr( struct xfs_btree_cur *cur, int n, @@ -594,7 +592,7 @@ xfs_btree_key_addr( /* * Return a pointer to the n-th high key in the btree block. 
*/ -STATIC union xfs_btree_key * +union xfs_btree_key * xfs_btree_high_key_addr( struct xfs_btree_cur *cur, int n, @@ -607,7 +605,7 @@ xfs_btree_high_key_addr( /* * Return a pointer to the n-th block pointer in the btree block. */ -STATIC union xfs_btree_ptr * +union xfs_btree_ptr * xfs_btree_ptr_addr( struct xfs_btree_cur *cur, int n, @@ -641,7 +639,7 @@ xfs_btree_get_iroot( * Retrieve the block pointer from the cursor at the given level. * This may be an inode btree root or from a buffer. */ -STATIC struct xfs_btree_block * /* generic btree block pointer */ +struct xfs_btree_block * /* generic btree block pointer */ xfs_btree_get_block( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in btree */ @@ -778,14 +776,14 @@ xfs_btree_lastrec( */ void xfs_btree_offsets( - __int64_t fields, /* bitmask of fields */ + int64_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ - __int64_t imask; /* mask for current bit number */ + int64_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* @@ -1756,7 +1754,7 @@ error0: return error; } -STATIC int +int xfs_btree_lookup_get_block( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in the btree */ @@ -1846,7 +1844,7 @@ xfs_btree_lookup( int *stat) /* success/failure */ { struct xfs_btree_block *block; /* current btree block */ - __int64_t diff; /* difference for the current key */ + int64_t diff; /* difference for the current key */ int error; /* error return value */ int keyno; /* current key number */ int level; /* level in the btree */ @@ -4435,7 +4433,7 @@ xfs_btree_visit_blocks( * recovery completion writes the changes to disk. 
*/ struct xfs_btree_block_change_owner_info { - __uint64_t new_owner; + uint64_t new_owner; struct list_head *buffer_list; }; @@ -4481,7 +4479,7 @@ xfs_btree_block_change_owner( int xfs_btree_change_owner( struct xfs_btree_cur *cur, - __uint64_t new_owner, + uint64_t new_owner, struct list_head *buffer_list) { struct xfs_btree_block_change_owner_info bbcoi; @@ -4585,7 +4583,7 @@ xfs_btree_simple_query_range( { union xfs_btree_rec *recp; union xfs_btree_key rec_key; - __int64_t diff; + int64_t diff; int stat; bool firstrec = true; int error; @@ -4682,8 +4680,8 @@ xfs_btree_overlapped_query_range( union xfs_btree_key *hkp; union xfs_btree_rec *recp; struct xfs_btree_block *block; - __int64_t ldiff; - __int64_t hdiff; + int64_t ldiff; + int64_t hdiff; int level; struct xfs_buf *bp; int i; @@ -4849,12 +4847,14 @@ xfs_btree_query_all( xfs_btree_query_range_fn fn, void *priv) { - union xfs_btree_irec low_rec; - union xfs_btree_irec high_rec; + union xfs_btree_key low_key; + union xfs_btree_key high_key; + + memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); + memset(&low_key, 0, sizeof(low_key)); + memset(&high_key, 0xFF, sizeof(high_key)); - memset(&low_rec, 0, sizeof(low_rec)); - memset(&high_rec, 0xFF, sizeof(high_rec)); - return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv); + return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 27bed08261c5..9c95e965cfe5 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -76,7 +76,7 @@ union xfs_btree_rec { #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) #define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) -__uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); +uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. 
@@ -150,20 +150,19 @@ struct xfs_btree_ops { union xfs_btree_rec *rec); /* difference between key value and cursor value */ - __int64_t (*key_diff)(struct xfs_btree_cur *cur, + int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); /* * Difference between key2 and key1 -- positive if key1 > key2, * negative if key1 < key2, and zero if equal. */ - __int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, + int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, union xfs_btree_key *key1, union xfs_btree_key *key2); const struct xfs_buf_ops *buf_ops; -#if defined(DEBUG) || defined(XFS_WARN) /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, union xfs_btree_key *k1, @@ -173,7 +172,6 @@ struct xfs_btree_ops { int (*recs_inorder)(struct xfs_btree_cur *cur, union xfs_btree_rec *r1, union xfs_btree_rec *r2); -#endif }; /* @@ -213,11 +211,11 @@ typedef struct xfs_btree_cur union xfs_btree_irec bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ - __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ + uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ #define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ #define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ - __uint8_t bc_nlevels; /* number of levels in the tree */ - __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ + uint8_t bc_nlevels; /* number of levels in the tree */ + uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btre stats array */ union { @@ -330,7 +328,7 @@ xfs_btree_islastblock( */ void xfs_btree_offsets( - __int64_t fields, /* bitmask of fields */ + int64_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first 
byte offset */ @@ -408,7 +406,7 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); -int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner, +int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner, struct list_head *buffer_list); /* @@ -434,7 +432,7 @@ static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) } static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, - __uint16_t numrecs) + uint16_t numrecs) { block->bb_numrecs = cpu_to_be16(numrecs); } @@ -506,4 +504,17 @@ int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, + struct xfs_btree_block *block); +union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n, + struct xfs_btree_block *block); +union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n, + struct xfs_btree_block *block); +union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n, + struct xfs_btree_block *block); +int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, + union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); +struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, + int level, struct xfs_buf **bpp); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index a416c7cb23ea..8211f48b98e6 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -1,7 +1,7 @@ #ifndef _XFS_CKSUM_H #define _XFS_CKSUM_H 1 -#define XFS_CRC_SEED (~(__uint32_t)0) +#define XFS_CRC_SEED (~(uint32_t)0) /* * Calculate the intermediate checksum for a buffer that has the CRC field @@ -9,11 +9,11 @@ * cksum_offset 
parameter. We do not modify the buffer during verification, * hence we have to split the CRC calculation across the cksum_offset. */ -static inline __uint32_t +static inline uint32_t xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t zero = 0; - __uint32_t crc; + uint32_t zero = 0; + uint32_t crc; /* Calculate CRC up to the checksum. */ crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); @@ -30,7 +30,7 @@ xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) * Fast CRC method where the buffer is modified. Callers must have exclusive * access to the buffer while the calculation takes place. */ -static inline __uint32_t +static inline uint32_t xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) { /* zero the CRC field */ @@ -48,7 +48,7 @@ xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) * so that it is consistent on disk. */ static inline __le32 -xfs_end_cksum(__uint32_t crc) +xfs_end_cksum(uint32_t crc) { return ~cpu_to_le32(crc); } @@ -62,7 +62,7 @@ xfs_end_cksum(__uint32_t crc) static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); + uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -73,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); + uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 1bdf2888295b..6d4335815c3f 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ 
-263,7 +263,7 @@ xfs_da3_node_read( err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, which_fork, &xfs_da3_node_buf_ops); - if (!err && tp) { + if (!err && tp && *bpp) { struct xfs_da_blkinfo *info = (*bpp)->b_addr; int type; @@ -1282,7 +1282,7 @@ xfs_da3_fixhashpath( return; break; case XFS_DIR2_LEAFN_MAGIC: - lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count); + lasthash = xfs_dir2_leaf_lasthash(dp, blk->bp, &count); if (count == 0) return; break; @@ -1502,8 +1502,8 @@ xfs_da3_node_lookup_int( if (blk->magic == XFS_DIR2_LEAFN_MAGIC || blk->magic == XFS_DIR3_LEAFN_MAGIC) { blk->magic = XFS_DIR2_LEAFN_MAGIC; - blk->hashval = xfs_dir2_leafn_lasthash(args->dp, - blk->bp, NULL); + blk->hashval = xfs_dir2_leaf_lasthash(args->dp, + blk->bp, NULL); break; } @@ -1929,8 +1929,8 @@ xfs_da3_path_shift( blk->magic = XFS_DIR2_LEAFN_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - blk->hashval = xfs_dir2_leafn_lasthash(args->dp, - blk->bp, NULL); + blk->hashval = xfs_dir2_leaf_lasthash(args->dp, + blk->bp, NULL); break; default: ASSERT(0); @@ -1952,7 +1952,7 @@ xfs_da3_path_shift( * This is implemented with some source-level loop unrolling. 
*/ xfs_dahash_t -xfs_da_hashname(const __uint8_t *name, int namelen) +xfs_da_hashname(const uint8_t *name, int namelen) { xfs_dahash_t hash; diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 4e29cb6a3627..ae6de17467f2 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -60,10 +60,10 @@ enum xfs_dacmp { */ typedef struct xfs_da_args { struct xfs_da_geometry *geo; /* da block geometry */ - const __uint8_t *name; /* string (maybe not NULL terminated) */ + const uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ - __uint8_t filetype; /* filetype of inode for directories */ - __uint8_t *value; /* set of bytes (maybe contain NULLs) */ + uint8_t filetype; /* filetype of inode for directories */ + uint8_t *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ int flags; /* argument flags (eg: ATTR_NOCREATE) */ xfs_dahash_t hashval; /* hash value of name */ @@ -207,7 +207,7 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); -uint xfs_da_hashname(const __uint8_t *name_string, int name_length); +uint xfs_da_hashname(const uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, const unsigned char *name, int len); diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index f1e8d4dbb600..6d77d1a8498a 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -49,7 +49,7 @@ xfs_dir3_sf_entsize( struct xfs_dir2_sf_hdr *hdr, int len) { - return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t); + return xfs_dir2_sf_entsize(hdr, len) + sizeof(uint8_t); } static struct xfs_dir2_sf_entry * @@ -77,7 +77,7 @@ xfs_dir3_sf_nextentry( * not necessary. For non-filetype enable directories, the type is always * unknown and we never store the value. 
*/ -static __uint8_t +static uint8_t xfs_dir2_sfe_get_ftype( struct xfs_dir2_sf_entry *sfep) { @@ -87,16 +87,16 @@ xfs_dir2_sfe_get_ftype( static void xfs_dir2_sfe_put_ftype( struct xfs_dir2_sf_entry *sfep, - __uint8_t ftype) + uint8_t ftype) { ASSERT(ftype < XFS_DIR3_FT_MAX); } -static __uint8_t +static uint8_t xfs_dir3_sfe_get_ftype( struct xfs_dir2_sf_entry *sfep) { - __uint8_t ftype; + uint8_t ftype; ftype = sfep->name[sfep->namelen]; if (ftype >= XFS_DIR3_FT_MAX) @@ -107,7 +107,7 @@ xfs_dir3_sfe_get_ftype( static void xfs_dir3_sfe_put_ftype( struct xfs_dir2_sf_entry *sfep, - __uint8_t ftype) + uint8_t ftype) { ASSERT(ftype < XFS_DIR3_FT_MAX); @@ -124,7 +124,7 @@ xfs_dir3_sfe_put_ftype( static xfs_ino_t xfs_dir2_sf_get_ino( struct xfs_dir2_sf_hdr *hdr, - __uint8_t *from) + uint8_t *from) { if (hdr->i8count) return get_unaligned_be64(from) & 0x00ffffffffffffffULL; @@ -135,7 +135,7 @@ xfs_dir2_sf_get_ino( static void xfs_dir2_sf_put_ino( struct xfs_dir2_sf_hdr *hdr, - __uint8_t *to, + uint8_t *to, xfs_ino_t ino) { ASSERT((ino & 0xff00000000000000ULL) == 0); @@ -225,7 +225,7 @@ xfs_dir3_sfe_put_ino( #define XFS_DIR3_DATA_ENTSIZE(n) \ round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ - sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \ + sizeof(xfs_dir2_data_off_t) + sizeof(uint8_t)), \ XFS_DIR2_DATA_ALIGN) static int @@ -242,7 +242,7 @@ xfs_dir3_data_entsize( return XFS_DIR3_DATA_ENTSIZE(n); } -static __uint8_t +static uint8_t xfs_dir2_data_get_ftype( struct xfs_dir2_data_entry *dep) { @@ -252,16 +252,16 @@ xfs_dir2_data_get_ftype( static void xfs_dir2_data_put_ftype( struct xfs_dir2_data_entry *dep, - __uint8_t ftype) + uint8_t ftype) { ASSERT(ftype < XFS_DIR3_FT_MAX); } -static __uint8_t +static uint8_t xfs_dir3_data_get_ftype( struct xfs_dir2_data_entry *dep) { - __uint8_t ftype = dep->name[dep->namelen]; + uint8_t ftype = dep->name[dep->namelen]; if (ftype >= XFS_DIR3_FT_MAX) return XFS_DIR3_FT_UNKNOWN; @@ -271,7 +271,7 @@ 
xfs_dir3_data_get_ftype( static void xfs_dir3_data_put_ftype( struct xfs_dir2_data_entry *dep, - __uint8_t type) + uint8_t type) { ASSERT(type < XFS_DIR3_FT_MAX); ASSERT(dep->namelen != 0); diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 9a492a9e19bd..3771edcb301d 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -111,11 +111,11 @@ struct xfs_da3_intnode { * appropriate. */ struct xfs_da3_icnode_hdr { - __uint32_t forw; - __uint32_t back; - __uint16_t magic; - __uint16_t count; - __uint16_t level; + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t level; }; /* @@ -187,14 +187,14 @@ struct xfs_da3_icnode_hdr { /* * Byte offset in data block and shortform entry. */ -typedef __uint16_t xfs_dir2_data_off_t; +typedef uint16_t xfs_dir2_data_off_t; #define NULLDATAOFF 0xffffU typedef uint xfs_dir2_data_aoff_t; /* argument form */ /* * Offset in data space of a data entry. */ -typedef __uint32_t xfs_dir2_dataptr_t; +typedef uint32_t xfs_dir2_dataptr_t; #define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) #define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) @@ -206,7 +206,7 @@ typedef xfs_off_t xfs_dir2_off_t; /* * Directory block number (logical dirblk in file) */ -typedef __uint32_t xfs_dir2_db_t; +typedef uint32_t xfs_dir2_db_t; #define XFS_INO32_SIZE 4 #define XFS_INO64_SIZE 8 @@ -226,9 +226,9 @@ typedef __uint32_t xfs_dir2_db_t; * over them. 
*/ typedef struct xfs_dir2_sf_hdr { - __uint8_t count; /* count of entries */ - __uint8_t i8count; /* count of 8-byte inode #s */ - __uint8_t parent[8]; /* parent dir inode number */ + uint8_t count; /* count of entries */ + uint8_t i8count; /* count of 8-byte inode #s */ + uint8_t parent[8]; /* parent dir inode number */ } __packed xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { @@ -447,11 +447,11 @@ struct xfs_dir3_leaf_hdr { }; struct xfs_dir3_icleaf_hdr { - __uint32_t forw; - __uint32_t back; - __uint16_t magic; - __uint16_t count; - __uint16_t stale; + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t stale; }; /* @@ -538,10 +538,10 @@ struct xfs_dir3_free { * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. */ struct xfs_dir3_icfree_hdr { - __uint32_t magic; - __uint32_t firstdb; - __uint32_t nvalid; - __uint32_t nused; + uint32_t magic; + uint32_t firstdb; + uint32_t nvalid; + uint32_t nused; }; @@ -632,10 +632,10 @@ typedef struct xfs_attr_shortform { __u8 padding; } hdr; struct xfs_attr_sf_entry { - __uint8_t namelen; /* actual length of name (no NULL) */ - __uint8_t valuelen; /* actual length of value (no NULL) */ - __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - __uint8_t nameval[1]; /* name & value bytes concatenated */ + uint8_t namelen; /* actual length of name (no NULL) */ + uint8_t valuelen; /* actual length of value (no NULL) */ + uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + uint8_t nameval[1]; /* name & value bytes concatenated */ } list[1]; /* variable sized array */ } xfs_attr_shortform_t; @@ -725,22 +725,22 @@ struct xfs_attr3_leafblock { * incore, neutral version of the attribute leaf header */ struct xfs_attr3_icleaf_hdr { - __uint32_t forw; - __uint32_t back; - __uint16_t magic; - __uint16_t count; - __uint16_t usedbytes; + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t usedbytes; /* * firstused is 32-bit here instead of 16-bit like the on-disk 
variant * to support maximum fsb size of 64k without overflow issues throughout * the attr code. Instead, the overflow condition is handled on * conversion to/from disk. */ - __uint32_t firstused; + uint32_t firstused; __u8 holes; struct { - __uint16_t base; - __uint16_t size; + uint16_t base; + uint16_t size; } freemap[XFS_ATTR_LEAF_MAPSIZE]; }; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 2f389d366e93..ccf9783fd3f0 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -218,8 +218,7 @@ xfs_dir_ino_validate( agblkno != 0 && ioff < (1 << mp->m_sb.sb_inopblog) && XFS_AGINO_TO_INO(mp, agno, agino) == ino; - if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, - XFS_RANDOM_DIR_INO_VALIDATE))) { + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index d6e6d9d16f6c..21c8f8bf94d5 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -47,9 +47,9 @@ struct xfs_dir_ops { struct xfs_dir2_sf_entry * (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep); - __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); + uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep, - __uint8_t ftype); + uint8_t ftype); xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep); void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr, @@ -60,9 +60,9 @@ struct xfs_dir_ops { xfs_ino_t ino); int (*data_entsize)(int len); - __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); + uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); void (*data_put_ftype)(struct xfs_dir2_data_entry *dep, - __uint8_t ftype); + uint8_t ftype); __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep); 
struct xfs_dir2_data_free * (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index aa17cb788946..43c902f7a68d 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -139,7 +139,7 @@ xfs_dir3_block_read( err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index b887fb2a2bcf..27297a689d9c 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -145,7 +145,7 @@ xfs_dir3_leaf_check_int( static bool xfs_dir3_leaf_verify( struct xfs_buf *bp, - __uint16_t magic) + uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; @@ -154,7 +154,7 @@ xfs_dir3_leaf_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - __uint16_t magic3; + uint16_t magic3; magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? 
XFS_DIR3_LEAF1_MAGIC : XFS_DIR3_LEAFN_MAGIC; @@ -178,7 +178,7 @@ xfs_dir3_leaf_verify( static void __read_verify( struct xfs_buf *bp, - __uint16_t magic) + uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -195,7 +195,7 @@ __read_verify( static void __write_verify( struct xfs_buf *bp, - __uint16_t magic) + uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_fspriv; @@ -256,7 +256,7 @@ const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .verify_write = xfs_dir3_leafn_write_verify, }; -static int +int xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, @@ -268,7 +268,7 @@ xfs_dir3_leaf_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); return err; } @@ -285,7 +285,7 @@ xfs_dir3_leafn_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); return err; } @@ -299,7 +299,7 @@ xfs_dir3_leaf_init( struct xfs_trans *tp, struct xfs_buf *bp, xfs_ino_t owner, - __uint16_t type) + uint16_t type) { struct xfs_dir2_leaf *leaf = bp->b_addr; @@ -343,7 +343,7 @@ xfs_dir3_leaf_get_buf( xfs_da_args_t *args, xfs_dir2_db_t bno, struct xfs_buf **bpp, - __uint16_t magic) + uint16_t magic) { struct xfs_inode *dp = args->dp; struct xfs_trans *tp = args->trans; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index bbd1238852b3..682e2bf370c7 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -528,7 +528,7 @@ xfs_dir2_free_hdr_check( * Stale entries are ok. 
*/ xfs_dahash_t /* hash value */ -xfs_dir2_leafn_lasthash( +xfs_dir2_leaf_lasthash( struct xfs_inode *dp, struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ @@ -540,7 +540,9 @@ xfs_dir2_leafn_lasthash( dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || - leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || + leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); if (count) *count = leafhdr.count; @@ -1405,8 +1407,8 @@ xfs_dir2_leafn_split( /* * Update last hashval in each block since we added the name. */ - oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL); - newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL); + oldblk->hashval = xfs_dir2_leaf_lasthash(dp, oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leaf_lasthash(dp, newblk->bp, NULL); xfs_dir3_leaf_check(dp, oldblk->bp); xfs_dir3_leaf_check(dp, newblk->bp); return error; diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 39f8604f764e..4badd26c47e6 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -58,6 +58,8 @@ extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); /* xfs_dir2_leaf.c */ +extern int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, @@ -69,7 +71,7 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_dir2_leaf_entry *ents, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_buf **bpp, __uint16_t magic); + struct 
xfs_buf **bpp, uint16_t magic); extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, struct xfs_buf *bp, int first, int last); extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, @@ -93,7 +95,7 @@ extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp, +extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp, struct xfs_buf *bp, int *count); extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, struct xfs_da_args *args, int *indexp, @@ -128,7 +130,7 @@ extern int xfs_dir2_sf_replace(struct xfs_da_args *args); extern int xfs_dir2_sf_verify(struct xfs_inode *ip); /* xfs_dir2_readdir.c */ -extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, - size_t bufsize); +extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, + struct dir_context *ctx, size_t bufsize); #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index e84af093b2ab..be8b9755f66a 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -647,7 +647,7 @@ xfs_dir2_sf_verify( int offset; int size; int error; - __uint8_t filetype; + uint8_t filetype; ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index a1dccd8d96bc..23229f0c5b15 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -103,8 +103,8 @@ struct xfs_ifork; * Must be padded to 64 bit alignment. 
*/ typedef struct xfs_sb { - __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ - __uint32_t sb_blocksize; /* logical block size, bytes */ + uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + uint32_t sb_blocksize; /* logical block size, bytes */ xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ xfs_rtblock_t sb_rextents; /* number of realtime extents */ @@ -118,45 +118,45 @@ typedef struct xfs_sb { xfs_agnumber_t sb_agcount; /* number of allocation groups */ xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ xfs_extlen_t sb_logblocks; /* number of log blocks */ - __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ - __uint16_t sb_sectsize; /* volume sector size, bytes */ - __uint16_t sb_inodesize; /* inode size, bytes */ - __uint16_t sb_inopblock; /* inodes per block */ + uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + uint16_t sb_sectsize; /* volume sector size, bytes */ + uint16_t sb_inodesize; /* inode size, bytes */ + uint16_t sb_inopblock; /* inodes per block */ char sb_fname[12]; /* file system name */ - __uint8_t sb_blocklog; /* log2 of sb_blocksize */ - __uint8_t sb_sectlog; /* log2 of sb_sectsize */ - __uint8_t sb_inodelog; /* log2 of sb_inodesize */ - __uint8_t sb_inopblog; /* log2 of sb_inopblock */ - __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ - __uint8_t sb_rextslog; /* log2 of sb_rextents */ - __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ - __uint8_t sb_imax_pct; /* max % of fs for inode space */ + uint8_t sb_blocklog; /* log2 of sb_blocksize */ + uint8_t sb_sectlog; /* log2 of sb_sectsize */ + uint8_t sb_inodelog; /* log2 of sb_inodesize */ + uint8_t sb_inopblog; /* log2 of sb_inopblock */ + uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + uint8_t sb_rextslog; /* log2 of sb_rextents */ + uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + uint8_t sb_imax_pct; /* max % of 
fs for inode space */ /* statistics */ /* * These fields must remain contiguous. If you really * want to change their layout, make sure you fix the * code in xfs_trans_apply_sb_deltas(). */ - __uint64_t sb_icount; /* allocated inodes */ - __uint64_t sb_ifree; /* free inodes */ - __uint64_t sb_fdblocks; /* free data blocks */ - __uint64_t sb_frextents; /* free realtime extents */ + uint64_t sb_icount; /* allocated inodes */ + uint64_t sb_ifree; /* free inodes */ + uint64_t sb_fdblocks; /* free data blocks */ + uint64_t sb_frextents; /* free realtime extents */ /* * End contiguous fields. */ xfs_ino_t sb_uquotino; /* user quota inode */ xfs_ino_t sb_gquotino; /* group quota inode */ - __uint16_t sb_qflags; /* quota flags */ - __uint8_t sb_flags; /* misc. flags */ - __uint8_t sb_shared_vn; /* shared version number */ + uint16_t sb_qflags; /* quota flags */ + uint8_t sb_flags; /* misc. flags */ + uint8_t sb_shared_vn; /* shared version number */ xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ - __uint32_t sb_unit; /* stripe or raid unit */ - __uint32_t sb_width; /* stripe or raid width */ - __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ - __uint8_t sb_logsectlog; /* log2 of the log sector size */ - __uint16_t sb_logsectsize; /* sector size for the log, bytes */ - __uint32_t sb_logsunit; /* stripe unit size for the log */ - __uint32_t sb_features2; /* additional feature bits */ + uint32_t sb_unit; /* stripe or raid unit */ + uint32_t sb_width; /* stripe or raid width */ + uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + uint8_t sb_logsectlog; /* log2 of the log sector size */ + uint16_t sb_logsectsize; /* sector size for the log, bytes */ + uint32_t sb_logsunit; /* stripe unit size for the log */ + uint32_t sb_features2; /* additional feature bits */ /* * bad features2 field as a result of failing to pad the sb structure to @@ -167,17 +167,17 @@ typedef struct xfs_sb { * the value in sb_features2 when formatting the incore 
superblock to * the disk buffer. */ - __uint32_t sb_bad_features2; + uint32_t sb_bad_features2; /* version 5 superblock fields start here */ /* feature masks */ - __uint32_t sb_features_compat; - __uint32_t sb_features_ro_compat; - __uint32_t sb_features_incompat; - __uint32_t sb_features_log_incompat; + uint32_t sb_features_compat; + uint32_t sb_features_ro_compat; + uint32_t sb_features_incompat; + uint32_t sb_features_log_incompat; - __uint32_t sb_crc; /* superblock crc */ + uint32_t sb_crc; /* superblock crc */ xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ xfs_ino_t sb_pquotino; /* project quota inode */ @@ -449,7 +449,7 @@ static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) static inline bool xfs_sb_has_compat_feature( struct xfs_sb *sbp, - __uint32_t feature) + uint32_t feature) { return (sbp->sb_features_compat & feature) != 0; } @@ -465,7 +465,7 @@ xfs_sb_has_compat_feature( static inline bool xfs_sb_has_ro_compat_feature( struct xfs_sb *sbp, - __uint32_t feature) + uint32_t feature) { return (sbp->sb_features_ro_compat & feature) != 0; } @@ -482,7 +482,7 @@ xfs_sb_has_ro_compat_feature( static inline bool xfs_sb_has_incompat_feature( struct xfs_sb *sbp, - __uint32_t feature) + uint32_t feature) { return (sbp->sb_features_incompat & feature) != 0; } @@ -492,7 +492,7 @@ xfs_sb_has_incompat_feature( static inline bool xfs_sb_has_incompat_log_feature( struct xfs_sb *sbp, - __uint32_t feature) + uint32_t feature) { return (sbp->sb_features_log_incompat & feature) != 0; } @@ -594,8 +594,8 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) */ #define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) #define XFS_B_TO_FSB(mp,b) \ - ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) -#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) + ((((uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((uint64_t)(b)) >> 
(mp)->m_sb.sb_blocklog) #define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) /* @@ -1072,7 +1072,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * next agno_log bits - ag number * high agno_log-agblklog-inopblog bits - 0 */ -#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) +#define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1) #define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog #define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog #define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log @@ -1211,6 +1211,7 @@ struct xfs_dsymlink_hdr { #define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) +#define XFS_SYMLINK_MAXLEN 1024 /* * The maximum pathlen is 1024 bytes. Since the minimum file system * blocksize is 512 bytes, we can get a max of 3 extents back from @@ -1269,16 +1270,16 @@ typedef __be32 xfs_alloc_ptr_t; #define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ #define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ -typedef __uint64_t xfs_inofree_t; +typedef uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) #define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) #define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ -#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t)) +#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(uint16_t)) #define XFS_INODES_PER_HOLEMASK_BIT \ - (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t))) + (XFS_INODES_PER_CHUNK / (NBBY * sizeof(uint16_t))) static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { @@ -1312,9 +1313,9 @@ typedef struct xfs_inobt_rec { typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ - __uint16_t ir_holemask; /* hole mask for sparse chunks */ - __uint8_t ir_count; /* total inode count */ - __uint8_t ir_freecount; /* count of free inodes (set bits) */ + uint16_t ir_holemask; /* hole mask for 
sparse chunks */ + uint8_t ir_count; /* total inode count */ + uint8_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ } xfs_inobt_rec_incore_t; @@ -1397,15 +1398,15 @@ struct xfs_rmap_rec { * rm_offset:54-60 aren't used and should be zero * rm_offset:0-53 is the block offset within the inode */ -#define XFS_RMAP_OFF_ATTR_FORK ((__uint64_t)1ULL << 63) -#define XFS_RMAP_OFF_BMBT_BLOCK ((__uint64_t)1ULL << 62) -#define XFS_RMAP_OFF_UNWRITTEN ((__uint64_t)1ULL << 61) +#define XFS_RMAP_OFF_ATTR_FORK ((uint64_t)1ULL << 63) +#define XFS_RMAP_OFF_BMBT_BLOCK ((uint64_t)1ULL << 62) +#define XFS_RMAP_OFF_UNWRITTEN ((uint64_t)1ULL << 61) -#define XFS_RMAP_LEN_MAX ((__uint32_t)~0U) +#define XFS_RMAP_LEN_MAX ((uint32_t)~0U) #define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \ XFS_RMAP_OFF_BMBT_BLOCK | \ XFS_RMAP_OFF_UNWRITTEN) -#define XFS_RMAP_OFF_MASK ((__uint64_t)0x3FFFFFFFFFFFFFULL) +#define XFS_RMAP_OFF_MASK ((uint64_t)0x3FFFFFFFFFFFFFULL) #define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK) @@ -1431,8 +1432,8 @@ struct xfs_rmap_rec { struct xfs_rmap_irec { xfs_agblock_t rm_startblock; /* extent start block */ xfs_extlen_t rm_blockcount; /* extent length */ - __uint64_t rm_owner; /* extent owner */ - __uint64_t rm_offset; /* offset within the owner */ + uint64_t rm_owner; /* extent owner */ + uint64_t rm_offset; /* offset within the owner */ unsigned int rm_flags; /* state flags */ }; @@ -1544,11 +1545,11 @@ typedef struct xfs_bmbt_rec { __be64 l0, l1; } xfs_bmbt_rec_t; -typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; typedef struct xfs_bmbt_rec_host { - __uint64_t l0, l1; + uint64_t l0, l1; } xfs_bmbt_rec_host_t; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 095bdf049a3f..8c61f21535d4 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -302,10 +302,10 @@ typedef 
struct xfs_bstat { * and using two 16bit values to hold new 32bit projid was choosen * to retain compatibility with "old" filesystems). */ -static inline __uint32_t +static inline uint32_t bstat_get_projid(struct xfs_bstat *bs) { - return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; + return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; } /* @@ -446,19 +446,15 @@ typedef struct xfs_handle { } xfs_handle_t; #define ha_fsid ha_u._ha_fsid -#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \ - - (char *) &(handle)) \ - + (handle).ha_fid.fid_len) - /* * Structure passed to XFS_IOC_SWAPEXT */ typedef struct xfs_swapext { - __int64_t sx_version; /* version */ + int64_t sx_version; /* version */ #define XFS_SX_VERSION 0 - __int64_t sx_fdtarget; /* fd of target file */ - __int64_t sx_fdtmp; /* fd of tmp file */ + int64_t sx_fdtarget; /* fd of target file */ + int64_t sx_fdtmp; /* fd of tmp file */ xfs_off_t sx_offset; /* offset into file */ xfs_off_t sx_length; /* leng from offset */ char sx_pad[16]; /* pad space, unused */ @@ -546,7 +542,7 @@ typedef struct xfs_swapext #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) #define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) -#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t) +#define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d41ade5d293e..ffd5a15d1bb6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -46,7 +46,7 @@ /* * Allocation group level functions. */ -static inline int +int xfs_ialloc_cluster_alignment( struct xfs_mount *mp) { @@ -98,24 +98,15 @@ xfs_inobt_update( return xfs_btree_update(cur, &rec); } -/* - * Get the data from the pointed-to record. 
- */ -int /* error */ -xfs_inobt_get_rec( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_inobt_rec_incore_t *irec, /* btree record */ - int *stat) /* output: success/failure */ +/* Convert on-disk btree record to incore inobt record. */ +void +xfs_inobt_btrec_to_irec( + struct xfs_mount *mp, + union xfs_btree_rec *rec, + struct xfs_inobt_rec_incore *irec) { - union xfs_btree_rec *rec; - int error; - - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || *stat == 0) - return error; - irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + if (xfs_sb_version_hassparseinodes(&mp->m_sb)) { irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); irec->ir_count = rec->inobt.ir_u.sp.ir_count; irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; @@ -130,6 +121,25 @@ xfs_inobt_get_rec( be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } irec->ir_free = be64_to_cpu(rec->inobt.ir_free); +} + +/* + * Get the data from the pointed-to record. 
+ */ +int +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || *stat == 0) + return error; + + xfs_inobt_btrec_to_irec(cur->bc_mp, rec, irec); return 0; } @@ -140,9 +150,9 @@ xfs_inobt_get_rec( STATIC int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, - __uint16_t holemask, - __uint8_t count, - __int32_t freecount, + uint16_t holemask, + uint8_t count, + int32_t freecount, xfs_inofree_t free, int *stat) { @@ -2542,8 +2552,7 @@ xfs_agi_read_verify( !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) xfs_buf_ioerror(bp, -EFSBADCRC); else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, - XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI)) + XFS_ERRTAG_IALLOC_READ_AGI)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 0bb89669fc07..b32cfb5aeb5b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -168,5 +168,10 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +union xfs_btree_rec; +void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, + struct xfs_inobt_rec_incore *irec); + +int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 7c471881c9a6..317caba9faa6 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -175,6 +175,18 @@ xfs_inobt_init_key_from_rec( } STATIC void +xfs_inobt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->inobt.ir_startino); + x += XFS_INODES_PER_CHUNK - 1; + key->inobt.ir_startino = cpu_to_be32(x); +} + +STATIC void 
xfs_inobt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) @@ -219,15 +231,25 @@ xfs_finobt_init_ptr_from_cur( ptr->s = agi->agi_free_root; } -STATIC __int64_t +STATIC int64_t xfs_inobt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) { - return (__int64_t)be32_to_cpu(key->inobt.ir_startino) - + return (int64_t)be32_to_cpu(key->inobt.ir_startino) - cur->bc_rec.i.ir_startino; } +STATIC int64_t +xfs_inobt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - + be32_to_cpu(k2->inobt.ir_startino); +} + static int xfs_inobt_verify( struct xfs_buf *bp) @@ -302,7 +324,6 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .verify_write = xfs_inobt_write_verify, }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_inobt_keys_inorder( struct xfs_btree_cur *cur, @@ -322,7 +343,6 @@ xfs_inobt_recs_inorder( return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= be32_to_cpu(r2->inobt.ir_startino); } -#endif /* DEBUG */ static const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), @@ -335,14 +355,14 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .buf_ops = &xfs_inobt_buf_ops, -#if defined(DEBUG) || defined(XFS_WARN) + .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, -#endif }; static const struct xfs_btree_ops xfs_finobt_ops = { @@ -356,14 +376,14 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = 
xfs_inobt_init_key_from_rec, + .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .buf_ops = &xfs_inobt_buf_ops, -#if defined(DEBUG) || defined(XFS_WARN) + .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 09c3d1aecef2..378f8fbc91a7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -105,8 +105,7 @@ xfs_inode_buf_verify( di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && xfs_dinode_good_version(mp, dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { + XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); @@ -381,7 +380,7 @@ xfs_log_dinode_to_disk( } } -static bool +bool xfs_dinode_verify( struct xfs_mount *mp, xfs_ino_t ino, @@ -444,7 +443,7 @@ xfs_dinode_calc_crc( struct xfs_mount *mp, struct xfs_dinode *dip) { - __uint32_t crc; + uint32_t crc; if (dip->di_version < 3) return; diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 6848a0afbce7..a9c97a356c30 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -28,26 +28,26 @@ struct xfs_dinode; * format specific structures at the appropriate time. 
*/ struct xfs_icdinode { - __int8_t di_version; /* inode version */ - __int8_t di_format; /* format of di_c data */ - __uint16_t di_flushiter; /* incremented on flush */ - __uint32_t di_uid; /* owner's user id */ - __uint32_t di_gid; /* owner's group id */ - __uint16_t di_projid_lo; /* lower part of owner's project id */ - __uint16_t di_projid_hi; /* higher part of owner's project id */ + int8_t di_version; /* inode version */ + int8_t di_format; /* format of di_c data */ + uint16_t di_flushiter; /* incremented on flush */ + uint32_t di_uid; /* owner's user id */ + uint32_t di_gid; /* owner's group id */ + uint16_t di_projid_lo; /* lower part of owner's project id */ + uint16_t di_projid_hi; /* higher part of owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ xfs_extnum_t di_nextents; /* number of extents in data fork */ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - __int8_t di_aformat; /* format of attr fork's data */ - __uint32_t di_dmevmask; /* DMIG event mask */ - __uint16_t di_dmstate; /* DMIG state info */ - __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + int8_t di_aformat; /* format of attr fork's data */ + uint32_t di_dmevmask; /* DMIG event mask */ + uint16_t di_dmstate; /* DMIG state info */ + uint16_t di_flags; /* random flags, XFS_DIFLAG_... 
*/ - __uint64_t di_flags2; /* more random flags */ - __uint32_t di_cowextsize; /* basic cow extent size for file */ + uint64_t di_flags2; /* more random flags */ + uint32_t di_cowextsize; /* basic cow extent size for file */ xfs_ictimestamp_t di_crtime; /* time created */ }; @@ -82,4 +82,7 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ +bool xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, + struct xfs_dinode *dip); + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 7ae571f8e34a..8372e9bcd7b6 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -31,7 +31,7 @@ struct xfs_trans_res; * through all the log items definitions and everything they encode into the * log. */ -typedef __uint32_t xlog_tid_t; +typedef uint32_t xlog_tid_t; #define XLOG_MIN_ICLOGS 2 #define XLOG_MAX_ICLOGS 8 @@ -211,7 +211,7 @@ typedef struct xfs_log_iovec { typedef struct xfs_trans_header { uint th_magic; /* magic number */ uint th_type; /* transaction type */ - __int32_t th_tid; /* transaction id (unused) */ + int32_t th_tid; /* transaction id (unused) */ uint th_num_items; /* num items logged by trans */ } xfs_trans_header_t; @@ -265,52 +265,52 @@ typedef struct xfs_trans_header { * must be added on to the end. 
*/ typedef struct xfs_inode_log_format { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ + uint16_t ilf_type; /* inode log item type */ + uint16_t ilf_size; /* size of this item */ + uint32_t ilf_fields; /* flags for fields logged */ + uint16_t ilf_asize; /* size of attr d/ext/root */ + uint16_t ilf_dsize; /* size of data/ext/root */ + uint64_t ilf_ino; /* inode number */ union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uint32_t ilfu_rdev; /* rdev value for dev inode*/ uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ + int64_t ilf_blkno; /* blkno of inode buffer */ + int32_t ilf_len; /* len of inode buffer */ + int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; typedef struct xfs_inode_log_format_32 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ + uint16_t ilf_type; /* inode log item type */ + uint16_t ilf_size; /* size of this item */ + uint32_t ilf_fields; /* flags for fields logged */ + uint16_t ilf_asize; /* size of attr d/ext/root */ + uint16_t ilf_dsize; /* size of data/ext/root */ + uint64_t ilf_ino; /* inode number */ union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uint32_t ilfu_rdev; /* rdev value for dev inode*/ uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer 
*/ - __int32_t ilf_boffset; /* off of inode in buffer */ + int64_t ilf_blkno; /* blkno of inode buffer */ + int32_t ilf_len; /* len of inode buffer */ + int32_t ilf_boffset; /* off of inode in buffer */ } __attribute__((packed)) xfs_inode_log_format_32_t; typedef struct xfs_inode_log_format_64 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint32_t ilf_pad; /* pad for 64 bit boundary */ - __uint64_t ilf_ino; /* inode number */ + uint16_t ilf_type; /* inode log item type */ + uint16_t ilf_size; /* size of this item */ + uint32_t ilf_fields; /* flags for fields logged */ + uint16_t ilf_asize; /* size of attr d/ext/root */ + uint16_t ilf_dsize; /* size of data/ext/root */ + uint32_t ilf_pad; /* pad for 64 bit boundary */ + uint64_t ilf_ino; /* inode number */ union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uint32_t ilfu_rdev; /* rdev value for dev inode*/ uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ + int64_t ilf_blkno; /* blkno of inode buffer */ + int32_t ilf_len; /* len of inode buffer */ + int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_64_t; @@ -379,8 +379,8 @@ static inline int xfs_ilog_fdata(int w) * information. */ typedef struct xfs_ictimestamp { - __int32_t t_sec; /* timestamp seconds */ - __int32_t t_nsec; /* timestamp nanoseconds */ + int32_t t_sec; /* timestamp seconds */ + int32_t t_nsec; /* timestamp nanoseconds */ } xfs_ictimestamp_t; /* @@ -388,18 +388,18 @@ typedef struct xfs_ictimestamp { * kept identical to struct xfs_dinode except for the endianness annotations. 
*/ struct xfs_log_dinode { - __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ - __uint16_t di_mode; /* mode and type of file */ - __int8_t di_version; /* inode version */ - __int8_t di_format; /* format of di_c data */ - __uint8_t di_pad3[2]; /* unused in v2/3 inodes */ - __uint32_t di_uid; /* owner's user id */ - __uint32_t di_gid; /* owner's group id */ - __uint32_t di_nlink; /* number of links to file */ - __uint16_t di_projid_lo; /* lower part of owner's project id */ - __uint16_t di_projid_hi; /* higher part of owner's project id */ - __uint8_t di_pad[6]; /* unused, zeroed space */ - __uint16_t di_flushiter; /* incremented on flush */ + uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + uint16_t di_mode; /* mode and type of file */ + int8_t di_version; /* inode version */ + int8_t di_format; /* format of di_c data */ + uint8_t di_pad3[2]; /* unused in v2/3 inodes */ + uint32_t di_uid; /* owner's user id */ + uint32_t di_gid; /* owner's group id */ + uint32_t di_nlink; /* number of links to file */ + uint16_t di_projid_lo; /* lower part of owner's project id */ + uint16_t di_projid_hi; /* higher part of owner's project id */ + uint8_t di_pad[6]; /* unused, zeroed space */ + uint16_t di_flushiter; /* incremented on flush */ xfs_ictimestamp_t di_atime; /* time last accessed */ xfs_ictimestamp_t di_mtime; /* time last modified */ xfs_ictimestamp_t di_ctime; /* time created/inode modified */ @@ -408,23 +408,23 @@ struct xfs_log_dinode { xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ xfs_extnum_t di_nextents; /* number of extents in data fork */ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - __int8_t di_aformat; /* format of attr fork's data */ - __uint32_t di_dmevmask; /* DMIG event mask */ - __uint16_t di_dmstate; /* DMIG state info */ - __uint16_t di_flags; /* random flags, XFS_DIFLAG_... 
*/ - __uint32_t di_gen; /* generation number */ + uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + int8_t di_aformat; /* format of attr fork's data */ + uint32_t di_dmevmask; /* DMIG event mask */ + uint16_t di_dmstate; /* DMIG state info */ + uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + uint32_t di_gen; /* generation number */ /* di_next_unlinked is the only non-core field in the old dinode */ xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ /* start of the extended dinode, writable fields */ - __uint32_t di_crc; /* CRC of the inode */ - __uint64_t di_changecount; /* number of attribute changes */ + uint32_t di_crc; /* CRC of the inode */ + uint64_t di_changecount; /* number of attribute changes */ xfs_lsn_t di_lsn; /* flush sequence */ - __uint64_t di_flags2; /* more random flags */ - __uint32_t di_cowextsize; /* basic cow extent size for file */ - __uint8_t di_pad2[12]; /* more padding for future expansion */ + uint64_t di_flags2; /* more random flags */ + uint32_t di_cowextsize; /* basic cow extent size for file */ + uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_ictimestamp_t di_crtime; /* time created */ @@ -483,7 +483,7 @@ typedef struct xfs_buf_log_format { unsigned short blf_size; /* size of this item */ unsigned short blf_flags; /* misc state */ unsigned short blf_len; /* number of blocks in this buf */ - __int64_t blf_blkno; /* starting blkno of this buf */ + int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ } xfs_buf_log_format_t; @@ -533,7 +533,7 @@ xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); } -static inline __uint16_t +static inline uint16_t xfs_blft_from_flags(struct xfs_buf_log_format *blf) { return (blf->blf_flags & 
XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; @@ -554,14 +554,14 @@ typedef struct xfs_extent { * conversion routine. */ typedef struct xfs_extent_32 { - __uint64_t ext_start; - __uint32_t ext_len; + uint64_t ext_start; + uint32_t ext_len; } __attribute__((packed)) xfs_extent_32_t; typedef struct xfs_extent_64 { - __uint64_t ext_start; - __uint32_t ext_len; - __uint32_t ext_pad; + uint64_t ext_start; + uint32_t ext_len; + uint32_t ext_pad; } xfs_extent_64_t; /* @@ -570,26 +570,26 @@ typedef struct xfs_extent_64 { * size is given by efi_nextents. */ typedef struct xfs_efi_log_format { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ + uint16_t efi_type; /* efi log item type */ + uint16_t efi_size; /* size of this item */ + uint32_t efi_nextents; /* # extents to free */ + uint64_t efi_id; /* efi identifier */ xfs_extent_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_t; typedef struct xfs_efi_log_format_32 { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ + uint16_t efi_type; /* efi log item type */ + uint16_t efi_size; /* size of this item */ + uint32_t efi_nextents; /* # extents to free */ + uint64_t efi_id; /* efi identifier */ xfs_extent_32_t efi_extents[1]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; typedef struct xfs_efi_log_format_64 { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ + uint16_t efi_type; /* efi log item type */ + uint16_t efi_size; /* size of this item */ + uint32_t efi_nextents; /* # extents to free */ + uint64_t efi_id; /* efi identifier */ xfs_extent_64_t efi_extents[1]; /* array of 
extents to free */ } xfs_efi_log_format_64_t; @@ -599,26 +599,26 @@ typedef struct xfs_efi_log_format_64 { * size is given by efd_nextents; */ typedef struct xfs_efd_log_format { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ + uint16_t efd_type; /* efd log item type */ + uint16_t efd_size; /* size of this item */ + uint32_t efd_nextents; /* # of extents freed */ + uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_t; typedef struct xfs_efd_log_format_32 { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ + uint16_t efd_type; /* efd log item type */ + uint16_t efd_size; /* size of this item */ + uint32_t efd_nextents; /* # of extents freed */ + uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_32_t efd_extents[1]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; typedef struct xfs_efd_log_format_64 { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ + uint16_t efd_type; /* efd log item type */ + uint16_t efd_size; /* size of this item */ + uint32_t efd_nextents; /* # of extents freed */ + uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_64_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_64_t; @@ -626,11 +626,11 @@ typedef struct xfs_efd_log_format_64 { * RUI/RUD (reverse mapping) log format definitions */ struct xfs_map_extent { - __uint64_t me_owner; - __uint64_t me_startblock; - __uint64_t me_startoff; - __uint32_t me_len; - __uint32_t me_flags; + uint64_t 
me_owner; + uint64_t me_startblock; + uint64_t me_startoff; + uint32_t me_len; + uint32_t me_flags; }; /* rmap me_flags: upper bits are flags, lower byte is type code */ @@ -659,10 +659,10 @@ struct xfs_map_extent { * size is given by rui_nextents. */ struct xfs_rui_log_format { - __uint16_t rui_type; /* rui log item type */ - __uint16_t rui_size; /* size of this item */ - __uint32_t rui_nextents; /* # extents to free */ - __uint64_t rui_id; /* rui identifier */ + uint16_t rui_type; /* rui log item type */ + uint16_t rui_size; /* size of this item */ + uint32_t rui_nextents; /* # extents to free */ + uint64_t rui_id; /* rui identifier */ struct xfs_map_extent rui_extents[]; /* array of extents to rmap */ }; @@ -680,19 +680,19 @@ xfs_rui_log_format_sizeof( * size is given by rud_nextents; */ struct xfs_rud_log_format { - __uint16_t rud_type; /* rud log item type */ - __uint16_t rud_size; /* size of this item */ - __uint32_t __pad; - __uint64_t rud_rui_id; /* id of corresponding rui */ + uint16_t rud_type; /* rud log item type */ + uint16_t rud_size; /* size of this item */ + uint32_t __pad; + uint64_t rud_rui_id; /* id of corresponding rui */ }; /* * CUI/CUD (refcount update) log format definitions */ struct xfs_phys_extent { - __uint64_t pe_startblock; - __uint32_t pe_len; - __uint32_t pe_flags; + uint64_t pe_startblock; + uint32_t pe_len; + uint32_t pe_flags; }; /* refcount pe_flags: upper bits are flags, lower byte is type code */ @@ -707,10 +707,10 @@ struct xfs_phys_extent { * size is given by cui_nextents. 
*/ struct xfs_cui_log_format { - __uint16_t cui_type; /* cui log item type */ - __uint16_t cui_size; /* size of this item */ - __uint32_t cui_nextents; /* # extents to free */ - __uint64_t cui_id; /* cui identifier */ + uint16_t cui_type; /* cui log item type */ + uint16_t cui_size; /* size of this item */ + uint32_t cui_nextents; /* # extents to free */ + uint64_t cui_id; /* cui identifier */ struct xfs_phys_extent cui_extents[]; /* array of extents */ }; @@ -728,10 +728,10 @@ xfs_cui_log_format_sizeof( * size is given by cud_nextents; */ struct xfs_cud_log_format { - __uint16_t cud_type; /* cud log item type */ - __uint16_t cud_size; /* size of this item */ - __uint32_t __pad; - __uint64_t cud_cui_id; /* id of corresponding cui */ + uint16_t cud_type; /* cud log item type */ + uint16_t cud_size; /* size of this item */ + uint32_t __pad; + uint64_t cud_cui_id; /* id of corresponding cui */ }; /* @@ -755,10 +755,10 @@ struct xfs_cud_log_format { * size is given by bui_nextents. */ struct xfs_bui_log_format { - __uint16_t bui_type; /* bui log item type */ - __uint16_t bui_size; /* size of this item */ - __uint32_t bui_nextents; /* # extents to free */ - __uint64_t bui_id; /* bui identifier */ + uint16_t bui_type; /* bui log item type */ + uint16_t bui_size; /* size of this item */ + uint32_t bui_nextents; /* # extents to free */ + uint64_t bui_id; /* bui identifier */ struct xfs_map_extent bui_extents[]; /* array of extents to bmap */ }; @@ -776,10 +776,10 @@ xfs_bui_log_format_sizeof( * size is given by bud_nextents; */ struct xfs_bud_log_format { - __uint16_t bud_type; /* bud log item type */ - __uint16_t bud_size; /* size of this item */ - __uint32_t __pad; - __uint64_t bud_bui_id; /* id of corresponding bui */ + uint16_t bud_type; /* bud log item type */ + uint16_t bud_size; /* size of this item */ + uint32_t __pad; + uint64_t bud_bui_id; /* id of corresponding bui */ }; /* @@ -789,12 +789,12 @@ struct xfs_bud_log_format { * 32 bits : log_recovery code assumes 
that. */ typedef struct xfs_dq_logformat { - __uint16_t qlf_type; /* dquot log item type */ - __uint16_t qlf_size; /* size of this item */ + uint16_t qlf_type; /* dquot log item type */ + uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ - __int64_t qlf_blkno; /* blkno of dquot buffer */ - __int32_t qlf_len; /* len of dquot buffer */ - __uint32_t qlf_boffset; /* off of dquot in buffer */ + int64_t qlf_blkno; /* blkno of dquot buffer */ + int32_t qlf_len; /* len of dquot buffer */ + uint32_t qlf_boffset; /* off of dquot in buffer */ } xfs_dq_logformat_t; /* @@ -853,8 +853,8 @@ typedef struct xfs_qoff_logformat { * decoding can be done correctly. */ struct xfs_icreate_log { - __uint16_t icl_type; /* type of log format structure */ - __uint16_t icl_size; /* size of log format structure */ + uint16_t icl_type; /* type of log format structure */ + uint16_t icl_size; /* size of log format structure */ __be32 icl_ag; /* ag being allocated in */ __be32 icl_agbno; /* start block of inode range */ __be32 icl_count; /* number of inodes to initialise */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 29a01ec89dd0..66948a9fd486 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -26,7 +26,7 @@ #define XLOG_RHASH_SIZE 16 #define XLOG_RHASH_SHIFT 2 #define XLOG_RHASH(tid) \ - ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) + ((((uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) #define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 8eed51275bb3..2834574cb6e7 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -27,8 +27,8 @@ * they may need 64-bit accounting. Hence, 64-bit quota-counters, * and quota-limits. This is a waste in the common case, but hey ... 
*/ -typedef __uint64_t xfs_qcnt_t; -typedef __uint16_t xfs_qwarncnt_t; +typedef uint64_t xfs_qcnt_t; +typedef uint16_t xfs_qwarncnt_t; /* * flags for q_flags field in the dquot. @@ -136,6 +136,8 @@ typedef __uint16_t xfs_qwarncnt_t; */ #define XFS_QMOPT_INHERIT 0x1000000 +#define XFS_QMOPT_NOLOCK 0x2000000 /* don't ilock during dqget */ + /* * flags to xfs_trans_mod_dquot. */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 82a38d86ebad..900ea231f9a3 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -784,14 +784,6 @@ xfs_refcount_merge_extents( } /* - * While we're adjusting the refcounts records of an extent, we have - * to keep an eye on the number of extents we're dirtying -- run too - * many in a single transaction and we'll exceed the transaction's - * reservation and crash the fs. Each record adds 12 bytes to the - * log (plus any key updates) so we'll conservatively assume 24 bytes - * per record. We must also leave space for btree splits on both ends - * of the range and space for the CUD and a new CUI. - * * XXX: This is a pretty hand-wavy estimate. The penalty for guessing * true incorrectly is a shutdown FS; the penalty for guessing false * incorrectly is more transaction rolls than might be necessary. 
@@ -813,8 +805,7 @@ xfs_refcount_still_have_space( */ if (cur->bc_private.a.priv.refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, - XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE, - XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE)) + XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; if (cur->bc_private.a.priv.refc.nr_ops == 0) @@ -822,7 +813,7 @@ xfs_refcount_still_have_space( else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * 32; + cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -1076,8 +1067,7 @@ xfs_refcount_finish_one( blockcount); if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_REFCOUNT_FINISH_ONE, - XFS_RANDOM_REFCOUNT_FINISH_ONE)) + XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 098dc668ab2c..eafb9d1f3b37 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, xfs_agnumber_t agno); +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs. Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 32 bytes + * per record. We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. 
+ */ +#define XFS_REFCOUNT_ITEM_OVERHEAD 32 + +static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) +{ + return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; +} + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 50add5272807..3c59dd3d58d7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -202,7 +202,7 @@ xfs_refcountbt_init_ptr_from_cur( ptr->s = agf->agf_refcount_root; } -STATIC __int64_t +STATIC int64_t xfs_refcountbt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) @@ -210,16 +210,16 @@ xfs_refcountbt_key_diff( struct xfs_refcount_irec *rec = &cur->bc_rec.rc; struct xfs_refcount_key *kp = &key->refc; - return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; + return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; } -STATIC __int64_t +STATIC int64_t xfs_refcountbt_diff_two_keys( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) - + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - be32_to_cpu(k2->refc.rc_startblock); } @@ -285,7 +285,6 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .verify_write = xfs_refcountbt_write_verify, }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_refcountbt_keys_inorder( struct xfs_btree_cur *cur, @@ -306,7 +305,6 @@ xfs_refcountbt_recs_inorder( be32_to_cpu(r1->refc.rc_blockcount) <= be32_to_cpu(r2->refc.rc_startblock); } -#endif static const struct xfs_btree_ops xfs_refcountbt_ops = { .rec_len = sizeof(struct xfs_refcount_rec), @@ -325,10 +323,8 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { .key_diff = xfs_refcountbt_key_diff, .buf_ops = &xfs_refcountbt_buf_ops, .diff_two_keys = xfs_refcountbt_diff_two_keys, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_refcountbt_keys_inorder, .recs_inorder = xfs_refcountbt_recs_inorder, -#endif }; 
/* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 06cfb93c2ef9..55c88a732690 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -179,7 +179,8 @@ done: return error; } -static int +/* Convert an internal btree record to an rmap record. */ +int xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, struct xfs_rmap_irec *irec) @@ -2061,7 +2062,7 @@ int xfs_rmap_finish_one( struct xfs_trans *tp, enum xfs_rmap_intent_type type, - __uint64_t owner, + uint64_t owner, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, @@ -2086,8 +2087,7 @@ xfs_rmap_finish_one( startoff, blockcount, state); if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_RMAP_FINISH_ONE, - XFS_RANDOM_RMAP_FINISH_ONE)) + XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; /* @@ -2182,7 +2182,7 @@ __xfs_rmap_add( struct xfs_mount *mp, struct xfs_defer_ops *dfops, enum xfs_rmap_intent_type type, - __uint64_t owner, + uint64_t owner, int whichfork, struct xfs_bmbt_irec *bmap) { @@ -2266,7 +2266,7 @@ xfs_rmap_alloc_extent( xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - __uint64_t owner) + uint64_t owner) { struct xfs_bmbt_irec bmap; @@ -2290,7 +2290,7 @@ xfs_rmap_free_extent( xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - __uint64_t owner) + uint64_t owner) { struct xfs_bmbt_irec bmap; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 98f908fea103..466ede637080 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -179,7 +179,7 @@ enum xfs_rmap_intent_type { struct xfs_rmap_intent { struct list_head ri_list; enum xfs_rmap_intent_type ri_type; - __uint64_t ri_owner; + uint64_t ri_owner; int ri_whichfork; struct xfs_bmbt_irec ri_bmap; }; @@ -196,15 +196,15 @@ int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *imap); int xfs_rmap_alloc_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - __uint64_t 
owner); + uint64_t owner); int xfs_rmap_free_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - __uint64_t owner); + uint64_t owner); void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, - __uint64_t owner, int whichfork, xfs_fileoff_t startoff, + uint64_t owner, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); @@ -216,5 +216,8 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_compare(const struct xfs_rmap_irec *a, const struct xfs_rmap_irec *b); +union xfs_btree_rec; +int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, + struct xfs_rmap_irec *irec); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 74e5a54bc428..9d9c9192584c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -199,7 +199,7 @@ xfs_rmapbt_init_high_key_from_rec( union xfs_btree_key *key, union xfs_btree_rec *rec) { - __uint64_t off; + uint64_t off; int adj; adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; @@ -241,7 +241,7 @@ xfs_rmapbt_init_ptr_from_cur( ptr->s = agf->agf_roots[cur->bc_btnum]; } -STATIC __int64_t +STATIC int64_t xfs_rmapbt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) @@ -249,9 +249,9 @@ xfs_rmapbt_key_diff( struct xfs_rmap_irec *rec = &cur->bc_rec.r; struct xfs_rmap_key *kp = &key->rmap; __u64 x, y; - __int64_t d; + int64_t d; - d = (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; + d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; if (d) return d; @@ -271,7 +271,7 @@ xfs_rmapbt_key_diff( return 0; } -STATIC __int64_t +STATIC int64_t xfs_rmapbt_diff_two_keys( struct xfs_btree_cur *cur, 
union xfs_btree_key *k1, @@ -279,10 +279,10 @@ xfs_rmapbt_diff_two_keys( { struct xfs_rmap_key *kp1 = &k1->rmap; struct xfs_rmap_key *kp2 = &k2->rmap; - __int64_t d; + int64_t d; __u64 x, y; - d = (__int64_t)be32_to_cpu(kp1->rm_startblock) - + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - be32_to_cpu(kp2->rm_startblock); if (d) return d; @@ -377,17 +377,16 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .verify_write = xfs_rmapbt_write_verify, }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_rmapbt_keys_inorder( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - __uint32_t x; - __uint32_t y; - __uint64_t a; - __uint64_t b; + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; x = be32_to_cpu(k1->rmap.rm_startblock); y = be32_to_cpu(k2->rmap.rm_startblock); @@ -414,10 +413,10 @@ xfs_rmapbt_recs_inorder( union xfs_btree_rec *r1, union xfs_btree_rec *r2) { - __uint32_t x; - __uint32_t y; - __uint64_t a; - __uint64_t b; + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; x = be32_to_cpu(r1->rmap.rm_startblock); y = be32_to_cpu(r2->rmap.rm_startblock); @@ -437,7 +436,6 @@ xfs_rmapbt_recs_inorder( return 1; return 0; } -#endif /* DEBUG */ static const struct xfs_btree_ops xfs_rmapbt_ops = { .rec_len = sizeof(struct xfs_rmap_rec), @@ -456,10 +454,8 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .key_diff = xfs_rmapbt_key_diff, .buf_ops = &xfs_rmapbt_buf_ops, .diff_two_keys = xfs_rmapbt_diff_two_keys, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e47b99e59f60..5d4e43ef4eea 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. 
*/ -static int +int xfs_rtbuf_get( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ @@ -1011,7 +1011,7 @@ xfs_rtfree_extent( mp->m_sb.sb_rextents) { if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; - *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; + *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } return 0; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 584ec896a533..9b5aae2bcc0b 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -448,7 +448,7 @@ xfs_sb_quota_to_disk( struct xfs_dsb *to, struct xfs_sb *from) { - __uint16_t qflags = from->sb_qflags; + uint16_t qflags = from->sb_qflags; to->sb_uquotino = cpu_to_be64(from->sb_uquotino); if (xfs_sb_version_has_pquotino(from)) { @@ -756,7 +756,7 @@ xfs_sb_mount_common( mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; mp->m_bsize = XFS_FSB_TO_BB(mp, 1); - mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + mp->m_ialloc_inos = (int)MAX((uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 2e2c6716b623..c484877129a0 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -114,7 +114,7 @@ xfs_symlink_verify( if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) return false; if (be32_to_cpu(dsl->sl_offset) + - be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN) return false; if (dsl->sl_owner == 0) return false; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index b456cca1bfb2..6bd916bd35e2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -477,14 +477,14 @@ xfs_calc_mkdir_reservation( /* * Making a new symplink is the same as creating a new file, but * 
with the added blocks for remote symlink data which can be up to 1kB in - * length (MAXPATHLEN). + * length (XFS_SYMLINK_MAXLEN). */ STATIC uint xfs_calc_symlink_reservation( struct xfs_mount *mp) { return xfs_calc_create_reservation(mp) + - xfs_calc_buf_res(1, MAXPATHLEN); + xfs_calc_buf_res(1, XFS_SYMLINK_MAXLEN); } /* diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 717909f2f7b7..0220159bd463 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -18,34 +18,34 @@ #ifndef __XFS_TYPES_H__ #define __XFS_TYPES_H__ -typedef __uint32_t prid_t; /* project ID */ +typedef uint32_t prid_t; /* project ID */ -typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ -typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ -typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ -typedef __uint32_t xfs_agnumber_t; /* allocation group number */ -typedef __int32_t xfs_extnum_t; /* # of extents in a file */ -typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */ -typedef __int64_t xfs_fsize_t; /* bytes in a file */ -typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ +typedef uint32_t xfs_agblock_t; /* blockno in alloc. 
group */ +typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ +typedef uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef uint32_t xfs_agnumber_t; /* allocation group number */ +typedef int32_t xfs_extnum_t; /* # of extents in a file */ +typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef int64_t xfs_fsize_t; /* bytes in a file */ +typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ -typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */ -typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */ +typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef int32_t xfs_rtword_t; /* word type for bitmap manipulations */ -typedef __int64_t xfs_lsn_t; /* log sequence number */ -typedef __int32_t xfs_tid_t; /* transaction identifier */ +typedef int64_t xfs_lsn_t; /* log sequence number */ +typedef int32_t xfs_tid_t; /* transaction identifier */ -typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ -typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ +typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ +typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ -typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ -typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ -typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ -typedef __uint64_t xfs_fileoff_t; /* block number in a file */ -typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ +typedef uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef uint64_t xfs_fileoff_t; /* block number in a file */ +typedef uint64_t xfs_filblks_t; /* number of blocks in a file */ -typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ -typedef __int64_t 
xfs_sfiloff_t; /* signed block number in a file */ +typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +typedef int64_t xfs_sfiloff_t; /* signed block number in a file */ /* * Null values for the types. @@ -125,7 +125,7 @@ struct xfs_name { * uid_t and gid_t are hard-coded to 32 bits in the inode. * Hence, an 'id' in a dquot is 32 bits.. */ -typedef __uint32_t xfs_dqid_t; +typedef uint32_t xfs_dqid_t; /* * Constants for bit manipulations. diff --git a/fs/xfs/uuid.c b/fs/xfs/uuid.c deleted file mode 100644 index b83f76b6d410..000000000000 --- a/fs/xfs/uuid.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <xfs.h> - -/* IRIX interpretation of an uuid_t */ -typedef struct { - __be32 uu_timelow; - __be16 uu_timemid; - __be16 uu_timehi; - __be16 uu_clockseq; - __be16 uu_node[3]; -} xfs_uu_t; - -/* - * uuid_getnodeuniq - obtain the node unique fields of a UUID. - * - * This is not in any way a standard or condoned UUID function; - * it just something that's needed for user-level file handles. 
- */ -void -uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) -{ - xfs_uu_t *uup = (xfs_uu_t *)uuid; - - fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | - be16_to_cpu(uup->uu_timemid); - fsid[1] = be32_to_cpu(uup->uu_timelow); -} - -int -uuid_is_nil(uuid_t *uuid) -{ - int i; - char *cp = (char *)uuid; - - if (uuid == NULL) - return 0; - /* implied check of version number here... */ - for (i = 0; i < sizeof *uuid; i++) - if (*cp++) return 0; /* not nil */ - return 1; /* is nil */ -} - -int -uuid_equal(uuid_t *uuid1, uuid_t *uuid2) -{ - return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; -} diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h deleted file mode 100644 index 104db0f3bed6..000000000000 --- a/fs/xfs/uuid.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_UUID_H__ -#define __XFS_SUPPORT_UUID_H__ - -typedef struct { - unsigned char __u_bits[16]; -} uuid_t; - -extern int uuid_is_nil(uuid_t *uuid); -extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); -extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); - -static inline void -uuid_copy(uuid_t *dst, uuid_t *src) -{ - memcpy(dst, src, sizeof(uuid_t)); -} - -#endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index a742c47f7d5a..80cd0fd86783 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -24,6 +24,10 @@ #define XFS_BUF_LOCK_TRACKING 1 #endif +#ifdef CONFIG_XFS_ASSERT_FATAL +#define XFS_ASSERT_FATAL 1 +#endif + #ifdef CONFIG_XFS_WARN #define XFS_WARN 1 #endif diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b468e041f207..7034e17535de 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -170,8 +170,8 @@ xfs_get_acl(struct inode *inode, int type) return acl; } -STATIC int -__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int +__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct xfs_inode *ip = XFS_I(inode); unsigned char *ea_name; @@ -268,5 +268,5 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) } set_acl: - return __xfs_set_acl(inode, type, acl); + return __xfs_set_acl(inode, acl, type); } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 286fa89217f5..04327318ef67 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -24,6 +24,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); #else static inline struct 
posix_acl *xfs_get_acl(struct inode *inode, int type) { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3b91faacc1ba..6bf120bb1a17 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -276,7 +276,7 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - int error = ioend->io_bio->bi_error; + int error; /* * Just clean up the in-memory strutures if the fs has been shut down. @@ -289,6 +289,7 @@ xfs_end_io( /* * Clean up any COW blocks on an I/O error. */ + error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { switch (ioend->io_type) { case XFS_IO_COW: @@ -332,7 +333,7 @@ xfs_end_bio( else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); else - xfs_destroy_ioend(ioend, bio->bi_error); + xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } STATIC int @@ -500,11 +501,12 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_bio->bi_error = status; + ioend->io_bio->bi_status = errno_to_blk_status(status); bio_endio(ioend->io_bio); return status; } + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); return 0; } @@ -564,6 +566,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); ioend->io_bio = new; } @@ -836,7 +839,7 @@ xfs_writepage_map( struct inode *inode, struct page *page, loff_t offset, - __uint64_t end_offset) + uint64_t end_offset) { LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; @@ -991,7 +994,7 @@ xfs_do_writepage( struct xfs_writepage_ctx *wpc = data; struct inode *inode = page->mapping->host; loff_t offset; - __uint64_t end_offset; + uint64_t end_offset; pgoff_t end_index; trace_xfs_writepage(inode, page, 0, 0); diff --git a/fs/xfs/xfs_attr.h 
b/fs/xfs/xfs_attr.h index d14691aa02b4..5d5a5e277f35 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -117,6 +117,7 @@ typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); typedef struct xfs_attr_list_context { + struct xfs_trans *tp; struct xfs_inode *dp; /* inode */ struct attrlist_cursor_kern *cursor; /* position in list */ char *alist; /* output buffer */ @@ -140,8 +141,10 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. */ int xfs_attr_inactive(struct xfs_inode *dp); +int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); +int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, unsigned char *value, int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 97c45b6eb91e..545eca508d42 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -230,7 +230,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, + error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if ((error != 0) && (error != -EFSCORRUPTED)) return error; @@ -242,7 +242,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) case XFS_DA_NODE_MAGIC: case XFS_DA3_NODE_MAGIC: trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); bp = NULL; break; case XFS_ATTR_LEAF_MAGIC: @@ -254,18 +254,18 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (cursor->hashval > be32_to_cpu( entries[leafhdr.count - 1].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); bp = NULL; 
} else if (cursor->hashval <= be32_to_cpu( entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); bp = NULL; } break; default: trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); bp = NULL; } } @@ -279,9 +279,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - __uint16_t magic; + uint16_t magic; - error = xfs_da3_node_read(NULL, dp, + error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) @@ -297,7 +297,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) XFS_ERRLEVEL_LOW, context->dp->i_mount, node); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); return -EFSCORRUPTED; } @@ -313,10 +313,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) } } if (i == nodehdr.count) { - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); return 0; } - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); } } ASSERT(bp != NULL); @@ -333,12 +333,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (context->seen_enough || leafhdr.forw == 0) break; cursor->blkno = leafhdr.forw; - xfs_trans_brelse(NULL, bp); - error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp); + xfs_trans_brelse(context->tp, bp); + error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, -1, &bp); if (error) return error; } - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); return 0; } @@ -448,16 +448,34 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(context->tp, context->dp, 0, -1, &bp); if (error) return error; xfs_attr3_leaf_list_int(bp, context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); return 0; } int 
+xfs_attr_list_int_ilocked( + struct xfs_attr_list_context *context) +{ + struct xfs_inode *dp = context->dp; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp)) + return 0; + else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + return xfs_attr_shortform_list(context); + else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + return xfs_attr_leaf_list(context); + return xfs_attr_node_list(context); +} + +int xfs_attr_list_int( xfs_attr_list_context_t *context) { @@ -470,19 +488,8 @@ xfs_attr_list_int( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - /* - * Decide on what work routines to call based on the inode size. - */ lock_mode = xfs_ilock_attr_map_shared(dp); - if (!xfs_inode_hasattr(dp)) { - error = 0; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = xfs_attr_shortform_list(context); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_list(context); - } else { - error = xfs_attr_node_list(context); - } + error = xfs_attr_list_int_ilocked(context); xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index d419d23fa214..88073910fa5d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -396,6 +396,7 @@ xfs_bui_recover( struct xfs_map_extent *bmap; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; + xfs_filblks_t count; bool op_ok; struct xfs_bud_log_item *budp; enum xfs_bmap_intent_type type; @@ -404,6 +405,7 @@ xfs_bui_recover( struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_defer_ops dfops; + struct xfs_bmbt_irec irec; xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -481,13 +483,24 @@ xfs_bui_recover( } xfs_trans_ijoin(tp, ip, 0); + count = bmap->me_len; error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, ip, whichfork, bmap->me_startoff, - bmap->me_startblock, bmap->me_len, - state); + bmap->me_startblock, &count, state); if 
(error) goto err_dfops; + if (count > 0) { + ASSERT(type == XFS_BMAP_UNMAP); + irec.br_startblock = bmap->me_startblock; + irec.br_blockcount = count; + irec.br_startoff = bmap->me_startoff; + irec.br_state = state; + error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + if (error) + goto err_dfops; + } + /* Finish transaction, free inodes. */ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 9e3cc2146d5b..93e955262d07 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -219,20 +219,24 @@ xfs_bmap_eof( */ /* - * Count leaf blocks given a range of extent records. + * Count leaf blocks given a range of extent records. Delayed allocation + * extents are not counted towards the totals. */ STATIC void xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count) + struct xfs_ifork *ifp, + xfs_extnum_t *numrecs, + xfs_filblks_t *count) { - int b; - - for (b = 0; b < numrecs; b++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); - *count += xfs_bmbt_get_blockcount(frp); + xfs_extnum_t i; + xfs_extnum_t nr_exts = xfs_iext_count(ifp); + + for (i = 0; i < nr_exts; i++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(frp))) { + (*numrecs)++; + *count += xfs_bmbt_get_blockcount(frp); + } } } @@ -245,7 +249,7 @@ xfs_bmap_disk_count_leaves( struct xfs_mount *mp, struct xfs_btree_block *block, int numrecs, - int *count) + xfs_filblks_t *count) { int b; xfs_bmbt_rec_t *frp; @@ -260,17 +264,18 @@ xfs_bmap_disk_count_leaves( * Recursively walks each level of a btree * to count total fsblocks in use. 
*/ -STATIC int /* error */ +STATIC int xfs_bmap_count_tree( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fsblock_t blockno, /* file system block number */ - int levelin, /* level in btree */ - int *count) /* Count of blocks */ + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_ifork *ifp, + xfs_fsblock_t blockno, + int levelin, + xfs_extnum_t *nextents, + xfs_filblks_t *count) { int error; - xfs_buf_t *bp, *nbp; + struct xfs_buf *bp, *nbp; int level = levelin; __be64 *pp; xfs_fsblock_t bno = blockno; @@ -303,8 +308,9 @@ xfs_bmap_count_tree( /* Dive to the next level */ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - if (unlikely((error = - xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, nextents, + count); + if (error) { xfs_trans_brelse(tp, bp); XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", XFS_ERRLEVEL_LOW, mp); @@ -316,6 +322,7 @@ xfs_bmap_count_tree( for (;;) { nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); numrecs = be16_to_cpu(block->bb_numrecs); + (*nextents) += numrecs; xfs_bmap_disk_count_leaves(mp, block, numrecs, count); xfs_trans_brelse(tp, bp); if (nextbno == NULLFSBLOCK) @@ -334,46 +341,64 @@ xfs_bmap_count_tree( } /* - * Count fsblocks of the given fork. + * Count fsblocks of the given fork. Delayed allocation extents are + * not counted towards the totals. 
*/ -static int /* error */ +int xfs_bmap_count_blocks( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - int *count) /* out: count of blocks */ + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_extnum_t *nextents, + xfs_filblks_t *count) { + struct xfs_mount *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ struct xfs_btree_block *block; /* current btree block */ + struct xfs_ifork *ifp; /* fork structure */ xfs_fsblock_t bno; /* block # of "block" */ - xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ + int error; bno = NULLFSBLOCK; mp = ip->i_mount; + *nextents = 0; + *count = 0; ifp = XFS_IFORK_PTR(ip, whichfork); - if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count); + if (!ifp) return 0; - } - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, - mp); - return -EFSCORRUPTED; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_EXTENTS: + xfs_bmap_count_leaves(ifp, nextents, count); + return 0; + case XFS_DINODE_FMT_BTREE: + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
+ */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLFSBLOCK); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, + nextents, count); + if (error) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + return 0; } return 0; @@ -389,11 +414,11 @@ xfs_getbmapx_fix_eof_hole( struct getbmapx *out, /* output structure */ int prealloced, /* this is a file with * preallocated data space */ - __int64_t end, /* last block requested */ + int64_t end, /* last block requested */ xfs_fsblock_t startblock, bool moretocome) { - __int64_t fixlen; + int64_t fixlen; xfs_mount_t *mp; /* file system mount point */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_extnum_t lastx; /* last extent pointer */ @@ -455,8 +480,8 @@ xfs_getbmap_adjust_shared( agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); - error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount, - &ebno, &elen, true); + error = xfs_reflink_find_shared(mp, NULL, agno, agbno, + map->br_blockcount, &ebno, &elen, true); if (error) return error; @@ -514,9 +539,9 @@ xfs_getbmap( xfs_bmap_format_t formatter, /* format to user */ void *arg) /* formatter arg */ { - __int64_t bmvend; /* last block requested */ + int64_t bmvend; /* last block requested */ int error = 0; /* return value */ - __int64_t fixlen; /* length for -1 case */ + int64_t fixlen; /* length for -1 case */ int i; /* extent number */ int lock; /* lock state */ xfs_bmbt_irec_t *map; /* buffer for user's data */ @@ -605,7 +630,7 @@ xfs_getbmap( if (bmv->bmv_length == -1) { fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); bmv->bmv_length = - max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + 
max_t(int64_t, fixlen - bmv->bmv_offset, 0); } else if (bmv->bmv_length == 0) { bmv->bmv_entries = 0; return 0; @@ -742,7 +767,7 @@ xfs_getbmap( out[cur_ext].bmv_offset + out[cur_ext].bmv_length; bmv->bmv_length = - max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + max_t(int64_t, 0, bmvend - bmv->bmv_offset); /* * In case we don't want to return the hole, @@ -1617,7 +1642,7 @@ xfs_swap_extents_check_format( * extent format... */ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(ip) && + if (XFS_IFORK_Q(ip) && XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) return -EINVAL; if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= @@ -1627,7 +1652,7 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(tip) && + if (XFS_IFORK_Q(tip) && XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) return -EINVAL; if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= @@ -1676,7 +1701,7 @@ xfs_swap_extent_rmap( xfs_filblks_t ilen; xfs_filblks_t rlen; int nimaps; - __uint64_t tip_flags2; + uint64_t tip_flags2; /* * If the source file has shared blocks, we must flag the donor @@ -1789,10 +1814,11 @@ xfs_swap_extent_forks( int *target_log_flags) { struct xfs_ifork tempifp, *ifp, *tifp; - int aforkblks = 0; - int taforkblks = 0; + xfs_filblks_t aforkblks = 0; + xfs_filblks_t taforkblks = 0; + xfs_extnum_t junk; xfs_extnum_t nextents; - __uint64_t tmp; + uint64_t tmp; int error; /* @@ -1800,14 +1826,14 @@ xfs_swap_extent_forks( */ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + 
error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) return error; @@ -1850,15 +1876,15 @@ xfs_swap_extent_forks( /* * Fix the on-disk inode values */ - tmp = (__uint64_t)ip->i_d.di_nblocks; + tmp = (uint64_t)ip->i_d.di_nblocks; ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; - tmp = (__uint64_t) ip->i_d.di_nextents; + tmp = (uint64_t) ip->i_d.di_nextents; ip->i_d.di_nextents = tip->i_d.di_nextents; tip->i_d.di_nextents = tmp; - tmp = (__uint64_t) ip->i_d.di_format; + tmp = (uint64_t) ip->i_d.di_format; ip->i_d.di_format = tip->i_d.di_format; tip->i_d.di_format = tmp; @@ -1927,7 +1953,7 @@ xfs_swap_extents( int error = 0; int lock_flags; struct xfs_ifork *cowfp; - __uint64_t f; + uint64_t f; int resblks; /* diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 135d8267e284..0cede1043571 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -70,4 +70,8 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, xfs_extnum_t *nextents, + xfs_filblks_t *count); + #endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 16d6a578fc16..72f038492ba8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1194,7 +1194,7 @@ xfs_buf_ioerror_alert( { xfs_alert(bp->b_target->bt_mount, "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", - (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); + (uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); } int @@ -1227,8 +1227,11 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. 
*/ - if (bio->bi_error) - cmpxchg(&bp->b_io_error, 0, bio->bi_error); + if (bio->bi_status) { + int error = blk_status_to_errno(bio->bi_status); + + cmpxchg(&bp->b_io_error, 0, error); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -2047,6 +2050,66 @@ xfs_buf_delwri_submit( return error; } +/* + * Push a single buffer on a delwri queue. + * + * The purpose of this function is to submit a single buffer of a delwri queue + * and return with the buffer still on the original queue. The waiting delwri + * buffer submission infrastructure guarantees transfer of the delwri queue + * buffer reference to a temporary wait list. We reuse this infrastructure to + * transfer the buffer back to the original queue. + * + * Note the buffer transitions from the queued state, to the submitted and wait + * listed state and back to the queued state during this call. The buffer + * locking and queue management logic between _delwri_pushbuf() and + * _delwri_queue() guarantee that the buffer cannot be queued to another list + * before returning. + */ +int +xfs_buf_delwri_pushbuf( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + LIST_HEAD (submit_list); + int error; + + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + + trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); + + /* + * Isolate the buffer to a new local list so we can submit it for I/O + * independently from the rest of the original list. + */ + xfs_buf_lock(bp); + list_move(&bp->b_list, &submit_list); + xfs_buf_unlock(bp); + + /* + * Delwri submission clears the DELWRI_Q buffer flag and returns with + * the buffer on the wait list with an associated reference. Rather than + * bounce the buffer from a local wait list back to the original list + * after I/O completion, reuse the original list as the wait list. 
+ */ + xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); + + /* + * The buffer is now under I/O and wait listed as during typical delwri + * submission. Lock the buffer to wait for I/O completion. Rather than + * remove the buffer from the wait list and release the reference, we + * want to return with the buffer queued to the original list. The + * buffer already sits on the original list with a wait list reference, + * however. If we let the queue inherit that wait list reference, all we + * need to do is reset the DELWRI_Q flag. + */ + xfs_buf_lock(bp); + error = bp->b_error; + bp->b_flags |= _XBF_DELWRI_Q; + xfs_buf_unlock(bp); + + return error; +} + int __init xfs_buf_init(void) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1508121f29f2..20721261dae5 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -332,6 +332,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); +extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 0306168af332..f6a8422e9562 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -636,20 +636,23 @@ xfs_buf_item_unlock( /* * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be dirty and hence in the AIL. Therefore if we are - * aborting a buffer and we've just taken the last refernce away, we - * have to check if it is in the AIL before freeing it. We need to free - * it in this case, because an aborted transaction has already shut the - * filesystem down and this is the last chance we will have to do so. + * buffers may be in the AIL regardless of dirty state. 
An aborted + * transaction that invalidates a buffer already in the AIL may have + * marked it stale and cleared the dirty state, for example. + * + * Therefore if we are aborting a buffer and we've just taken the last + * reference away, we have to check if it is in the AIL before freeing + * it. We need to free it in this case, because an aborted transaction + * has already shut the filesystem down and this is the last chance we + * will have to do so. */ if (atomic_dec_and_test(&bip->bli_refcount)) { - if (clean) - xfs_buf_item_relse(bp); - else if (aborted) { + if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } + } else if (clean) + xfs_buf_item_relse(bp); } if (!(flags & XFS_BLI_HOLD)) diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 20b7a5c6eb2f..ba2638d37031 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -44,7 +44,7 @@ static unsigned char xfs_dir3_filetype_table[] = { static unsigned char xfs_dir3_get_dtype( struct xfs_mount *mp, - __uint8_t filetype) + uint8_t filetype) { if (!xfs_sb_version_hasftype(&mp->m_sb)) return DT_UNKNOWN; @@ -117,7 +117,7 @@ xfs_dir2_sf_getdents( */ sfep = xfs_dir2_sf_firstentry(sfp); for (i = 0; i < sfp->count; i++) { - __uint8_t filetype; + uint8_t filetype; off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, xfs_dir2_sf_get_offset(sfep)); @@ -170,7 +170,7 @@ xfs_dir2_block_getdents( return 0; lock_mode = xfs_ilock_data_map_shared(dp); - error = xfs_dir3_block_read(NULL, dp, &bp); + error = xfs_dir3_block_read(args->trans, dp, &bp); xfs_iunlock(dp, lock_mode); if (error) return error; @@ -194,7 +194,7 @@ xfs_dir2_block_getdents( * Each object is a real entry (dep) or an unused one (dup). 
*/ while (ptr < endptr) { - __uint8_t filetype; + uint8_t filetype; dup = (xfs_dir2_data_unused_t *)ptr; /* @@ -228,7 +228,7 @@ xfs_dir2_block_getdents( if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) { - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(args->trans, bp); return 0; } } @@ -239,218 +239,104 @@ xfs_dir2_block_getdents( */ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(args->trans, bp); return 0; } -struct xfs_dir2_leaf_map_info { - xfs_extlen_t map_blocks; /* number of fsbs in map */ - xfs_dablk_t map_off; /* last mapped file offset */ - int map_size; /* total entries in *map */ - int map_valid; /* valid entries in *map */ - int nmap; /* mappings to ask xfs_bmapi */ - xfs_dir2_db_t curdb; /* db for current block */ - int ra_current; /* number of read-ahead blks */ - int ra_index; /* *map index for read-ahead */ - int ra_offset; /* map entry offset for ra */ - int ra_want; /* readahead count wanted */ - struct xfs_bmbt_irec map[]; /* map vector for blocks */ -}; - +/* + * Read a directory block and initiate readahead for blocks beyond that. + * We maintain a sliding readahead window of the remaining space in the + * buffer rounded up to the nearest block. 
+ */ STATIC int xfs_dir2_leaf_readbuf( struct xfs_da_args *args, size_t bufsize, - struct xfs_dir2_leaf_map_info *mip, - xfs_dir2_off_t *curoff, - struct xfs_buf **bpp, - bool trim_map) + xfs_dir2_off_t *cur_off, + xfs_dablk_t *ra_blk, + struct xfs_buf **bpp) { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; - struct xfs_bmbt_irec *map = mip->map; + struct xfs_da_geometry *geo = args->geo; + struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_bmbt_irec map; struct blk_plug plug; + xfs_dir2_off_t new_off; + xfs_dablk_t next_ra; + xfs_dablk_t map_off; + xfs_dablk_t last_da; + xfs_extnum_t idx; + int ra_want; int error = 0; - int length; - int i; - int j; - struct xfs_da_geometry *geo = args->geo; - - /* - * If the caller just finished processing a buffer, it will tell us - * we need to trim that block out of the mapping now it is done. - */ - if (trim_map) { - mip->map_blocks -= geo->fsbcount; - /* - * Loop to get rid of the extents for the - * directory block. - */ - for (i = geo->fsbcount; i > 0; ) { - j = min_t(int, map->br_blockcount, i); - map->br_blockcount -= j; - map->br_startblock += j; - map->br_startoff += j; - /* - * If mapping is done, pitch it from - * the table. - */ - if (!map->br_blockcount && --mip->map_valid) - memmove(&map[0], &map[1], - sizeof(map[0]) * mip->map_valid); - i -= j; - } - } - /* - * Recalculate the readahead blocks wanted. - */ - mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1; - ASSERT(mip->ra_want >= 0); - - /* - * If we don't have as many as we want, and we haven't - * run out of data blocks, get some more mappings. - */ - if (1 + mip->ra_want > mip->map_blocks && - mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) { - /* - * Get more bmaps, fill in after the ones - * we already have in the table. 
- */ - mip->nmap = mip->map_size - mip->map_valid; - error = xfs_bmapi_read(dp, mip->map_off, - xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) - - mip->map_off, - &map[mip->map_valid], &mip->nmap, 0); - - /* - * Don't know if we should ignore this or try to return an - * error. The trouble with returning errors is that readdir - * will just stop without actually passing the error through. - */ + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK); if (error) - goto out; /* XXX */ - - /* - * If we got all the mappings we asked for, set the final map - * offset based on the last bmap value received. Otherwise, - * we've reached the end. - */ - if (mip->nmap == mip->map_size - mip->map_valid) { - i = mip->map_valid + mip->nmap - 1; - mip->map_off = map[i].br_startoff + map[i].br_blockcount; - } else - mip->map_off = xfs_dir2_byte_to_da(geo, - XFS_DIR2_LEAF_OFFSET); - - /* - * Look for holes in the mapping, and eliminate them. Count up - * the valid blocks. - */ - for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { - if (map[i].br_startblock == HOLESTARTBLOCK) { - mip->nmap--; - length = mip->map_valid + mip->nmap - i; - if (length) - memmove(&map[i], &map[i + 1], - sizeof(map[i]) * length); - } else { - mip->map_blocks += map[i].br_blockcount; - i++; - } - } - mip->map_valid += mip->nmap; + goto out; } /* - * No valid mappings, so no more data blocks. + * Look for mapped directory blocks at or above the current offset. + * Truncate down to the nearest directory block to start the scanning + * operation. 
*/ - if (!mip->map_valid) { - *curoff = xfs_dir2_da_to_byte(geo, mip->map_off); + last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); + map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off)); + if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map)) goto out; - } + if (map.br_startoff >= last_da) + goto out; + xfs_trim_extent(&map, map_off, last_da - map_off); - /* - * Read the directory block starting at the first mapping. - */ - mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff); - error = xfs_dir3_data_read(NULL, dp, map->br_startoff, - map->br_blockcount >= geo->fsbcount ? - XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) : - -1, &bp); - /* - * Should just skip over the data block instead of giving up. - */ + /* Read the directory block of that first mapping. */ + new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); + if (new_off > *cur_off) + *cur_off = new_off; + error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp); if (error) - goto out; /* XXX */ - - /* - * Adjust the current amount of read-ahead: we just read a block that - * was previously ra. - */ - if (mip->ra_current) - mip->ra_current -= geo->fsbcount; + goto out; /* - * Do we need more readahead? - * Each loop tries to process 1 full dir blk; last may be partial. + * Start readahead for the next bufsize's worth of dir data blocks. + * We may have already issued readahead for some of that range; + * ra_blk tracks the last block we tried to read(ahead). 
*/ + ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); + if (*ra_blk >= last_da) + goto out; + else if (*ra_blk == 0) + *ra_blk = map.br_startoff; + next_ra = map.br_startoff + geo->fsbcount; + if (next_ra >= last_da) + goto out_no_ra; + if (map.br_blockcount < geo->fsbcount && + !xfs_iext_get_extent(ifp, ++idx, &map)) + goto out_no_ra; + if (map.br_startoff >= last_da) + goto out_no_ra; + xfs_trim_extent(&map, next_ra, last_da - next_ra); + + /* Start ra for each dir (not fs) block that has a mapping. */ blk_start_plug(&plug); - for (mip->ra_index = mip->ra_offset = i = 0; - mip->ra_want > mip->ra_current && i < mip->map_blocks; - i += geo->fsbcount) { - ASSERT(mip->ra_index < mip->map_valid); - /* - * Read-ahead a contiguous directory block. - */ - if (i > mip->ra_current && - (map[mip->ra_index].br_blockcount - mip->ra_offset) >= - geo->fsbcount) { - xfs_dir3_data_readahead(dp, - map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_FSB_TO_DADDR(dp->i_mount, - map[mip->ra_index].br_startblock + - mip->ra_offset)); - mip->ra_current = i; - } - - /* - * Read-ahead a non-contiguous directory block. This doesn't - * use our mapping, but this is a very rare case. - */ - else if (i > mip->ra_current) { - xfs_dir3_data_readahead(dp, - map[mip->ra_index].br_startoff + - mip->ra_offset, -1); - mip->ra_current = i; - } - - /* - * Advance offset through the mapping table, processing a full - * dir block even if it is fragmented into several extents. - * But stop if we have consumed all valid mappings, even if - * it's not yet a full directory block. - */ - for (j = 0; - j < geo->fsbcount && mip->ra_index < mip->map_valid; - j += length ) { - /* - * The rest of this extent but not more than a dir - * block. - */ - length = min_t(int, geo->fsbcount - j, - map[mip->ra_index].br_blockcount - - mip->ra_offset); - mip->ra_offset += length; - - /* - * Advance to the next mapping if this one is used up. 
- */ - if (mip->ra_offset == map[mip->ra_index].br_blockcount) { - mip->ra_offset = 0; - mip->ra_index++; + while (ra_want > 0) { + next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount); + while (ra_want > 0 && + next_ra < map.br_startoff + map.br_blockcount) { + if (next_ra >= last_da) { + *ra_blk = last_da; + break; } + if (next_ra > *ra_blk) { + xfs_dir3_data_readahead(dp, next_ra, -2); + *ra_blk = next_ra; + } + ra_want -= geo->fsbcount; + next_ra += geo->fsbcount; + } + if (!xfs_iext_get_extent(ifp, ++idx, &map)) { + *ra_blk = last_da; + break; } } blk_finish_plug(&plug); @@ -458,6 +344,9 @@ xfs_dir2_leaf_readbuf( out: *bpp = bp; return error; +out_no_ra: + *ra_blk = last_da; + goto out; } /* @@ -475,14 +364,14 @@ xfs_dir2_leaf_getdents( xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ - int error = 0; /* error return value */ - int length; /* temporary length value */ - int byteoff; /* offset in current block */ - xfs_dir2_off_t curoff; /* current overall offset */ - xfs_dir2_off_t newoff; /* new curoff after new blk */ char *ptr = NULL; /* pointer to current data */ - struct xfs_dir2_leaf_map_info *map_info; struct xfs_da_geometry *geo = args->geo; + xfs_dablk_t rablk = 0; /* current readahead block */ + xfs_dir2_off_t curoff; /* current overall offset */ + int length; /* temporary length value */ + int byteoff; /* offset in current block */ + int lock_mode; + int error = 0; /* error return value */ /* * If the offset is at or past the largest allowed value, @@ -492,73 +381,35 @@ xfs_dir2_leaf_getdents( return 0; /* - * Set up to bmap a number of blocks based on the caller's - * buffer size, the directory block size, and the filesystem - * block size. 
- */ - length = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); - map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + - (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP | KM_NOFS); - map_info->map_size = length; - - /* * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ curoff = xfs_dir2_dataptr_to_byte(ctx->pos); /* - * Force this conversion through db so we truncate the offset - * down to get the start of the data block. - */ - map_info->map_off = xfs_dir2_db_to_da(geo, - xfs_dir2_byte_to_db(geo, curoff)); - - /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. */ while (curoff < XFS_DIR2_LEAF_OFFSET) { - __uint8_t filetype; + uint8_t filetype; /* * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { - int lock_mode; - bool trim_map = false; - if (bp) { - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(args->trans, bp); bp = NULL; - trim_map = true; } lock_mode = xfs_ilock_data_map_shared(dp); - error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, - &curoff, &bp, trim_map); + error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff, + &rablk, &bp); xfs_iunlock(dp, lock_mode); - if (error || !map_info->map_valid) + if (error || !bp) break; - /* - * Having done a read, we need to set a new offset. - */ - newoff = xfs_dir2_db_off_to_byte(geo, - map_info->curdb, 0); - /* - * Start of the current block. - */ - if (curoff < newoff) - curoff = newoff; - /* - * Make sure we're in the right block. 
- */ - else if (curoff > newoff) - ASSERT(xfs_dir2_byte_to_db(geo, curoff) == - map_info->curdb); hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* @@ -643,17 +494,22 @@ xfs_dir2_leaf_getdents( ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; - kmem_free(map_info); if (bp) - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(args->trans, bp); return error; } /* * Read a directory. + * + * If supplied, the transaction collects locked dir buffers to avoid + * nested buffer deadlocks. This function does not dirty the + * transaction. The caller should ensure that the inode is locked + * before calling this function. */ int xfs_readdir( + struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize) @@ -672,6 +528,7 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; + args.trans = tp; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 6a05d278da64..b2cde5426182 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -39,7 +39,7 @@ xfs_trim_extents( xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, - __uint64_t *blocks_trimmed) + uint64_t *blocks_trimmed) { struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; @@ -166,7 +166,7 @@ xfs_ioc_trim( struct fstrim_range range; xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; - __uint64_t blocks_trimmed = 0; + uint64_t blocks_trimmed = 0; int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9d06cc30e875..f89f7b5241e6 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -276,7 +276,7 @@ xfs_qm_init_dquot_blk( void xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { - __uint64_t space; + uint64_t space; dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); dqp->q_prealloc_lo_wmark = 
be64_to_cpu(dqp->q_core.d_blk_softlimit); @@ -472,18 +472,23 @@ xfs_qm_dqtobp( struct xfs_mount *mp = dqp->q_mount; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); struct xfs_trans *tp = (tpp ? *tpp : NULL); - uint lock_mode; + uint lock_mode = 0; quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags); dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; - lock_mode = xfs_ilock_data_map_shared(quotip); + ASSERT(!(flags & XFS_QMOPT_NOLOCK) || + xfs_isilocked(quotip, XFS_ILOCK_SHARED) || + xfs_isilocked(quotip, XFS_ILOCK_EXCL)); + if (!(flags & XFS_QMOPT_NOLOCK)) + lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. */ - xfs_iunlock(quotip, lock_mode); + if (lock_mode) + xfs_iunlock(quotip, lock_mode); return -ESRCH; } @@ -493,7 +498,8 @@ xfs_qm_dqtobp( error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - xfs_iunlock(quotip, lock_mode); + if (lock_mode) + xfs_iunlock(quotip, lock_mode); if (error) return error; @@ -695,21 +701,18 @@ error0: */ static int xfs_dq_get_next_id( - xfs_mount_t *mp, + struct xfs_mount *mp, uint type, - xfs_dqid_t *id, - loff_t eof) + xfs_dqid_t *id) { - struct xfs_inode *quotip; + struct xfs_inode *quotip = xfs_quota_inode(mp, type); + xfs_dqid_t next_id = *id + 1; /* simple advance */ + uint lock_flags; + struct xfs_bmbt_irec got; + xfs_extnum_t idx; xfs_fsblock_t start; - loff_t offset; - uint lock; - xfs_dqid_t next_id; int error = 0; - /* Simple advance */ - next_id = *id + 1; - /* If we'd wrap past the max ID, stop */ if (next_id < *id) return -ENOENT; @@ -723,23 +726,25 @@ xfs_dq_get_next_id( /* Nope, next_id is now past the current chunk, so find the next one */ start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk; - quotip = xfs_quota_inode(mp, type); - lock = xfs_ilock_data_map_shared(quotip); - - offset = 
__xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start), - eof, SEEK_DATA); - if (offset < 0) - error = offset; + lock_flags = xfs_ilock_data_map_shared(quotip); + if (!(quotip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK); + if (error) + return error; + } - xfs_iunlock(quotip, lock); + if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &idx, &got)) { + /* contiguous chunk, bump startoff for the id calculation */ + if (got.br_startoff < start) + got.br_startoff = start; + *id = got.br_startoff * mp->m_quotainfo->qi_dqperchunk; + } else { + error = -ENOENT; + } - /* -ENXIO is essentially "no more data" */ - if (error) - return (error == -ENXIO ? -ENOENT: error); + xfs_iunlock(quotip, lock_flags); - /* Convert next data offset back to a quota id */ - *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk; - return 0; + return error; } /* @@ -762,7 +767,6 @@ xfs_qm_dqget( struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; - loff_t eof = 0; int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -790,21 +794,6 @@ xfs_qm_dqget( } #endif - /* Get the end of the quota file if we need it */ - if (flags & XFS_QMOPT_DQNEXT) { - struct xfs_inode *quotip; - xfs_fileoff_t last; - uint lock_mode; - - quotip = xfs_quota_inode(mp, type); - lock_mode = xfs_ilock_data_map_shared(quotip); - error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK); - xfs_iunlock(quotip, lock_mode); - if (error) - return error; - eof = XFS_FSB_TO_B(mp, last); - } - restart: mutex_lock(&qi->qi_tree_lock); dqp = radix_tree_lookup(tree, id); @@ -823,7 +812,7 @@ restart: if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { xfs_dqunlock(dqp); mutex_unlock(&qi->qi_tree_lock); - error = xfs_dq_get_next_id(mp, type, &id, eof); + error = xfs_dq_get_next_id(mp, type, &id); if (error) return error; goto restart; @@ -858,7 +847,7 @@ restart: /* If we are asked to find next active id, keep looking */ if 
(error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) { - error = xfs_dq_get_next_id(mp, type, &id, eof); + error = xfs_dq_get_next_id(mp, type, &id); if (!error) goto restart; } @@ -917,7 +906,7 @@ restart: if (flags & XFS_QMOPT_DQNEXT) { if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { xfs_qm_dqput(dqp); - error = xfs_dq_get_next_id(mp, type, &id, eof); + error = xfs_dq_get_next_id(mp, type, &id); if (error) return error; goto restart; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed7ee4e8af73..2f4feb959bfb 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -22,103 +22,280 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_error.h" +#include "xfs_sysfs.h" #ifdef DEBUG -int xfs_etest[XFS_NUM_INJECT_ERROR]; -int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; -char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; -int xfs_error_test_active; +static unsigned int xfs_errortag_random_default[] = { + XFS_RANDOM_DEFAULT, + XFS_RANDOM_IFLUSH_1, + XFS_RANDOM_IFLUSH_2, + XFS_RANDOM_IFLUSH_3, + XFS_RANDOM_IFLUSH_4, + XFS_RANDOM_IFLUSH_5, + XFS_RANDOM_IFLUSH_6, + XFS_RANDOM_DA_READ_BUF, + XFS_RANDOM_BTREE_CHECK_LBLOCK, + XFS_RANDOM_BTREE_CHECK_SBLOCK, + XFS_RANDOM_ALLOC_READ_AGF, + XFS_RANDOM_IALLOC_READ_AGI, + XFS_RANDOM_ITOBP_INOTOBP, + XFS_RANDOM_IUNLINK, + XFS_RANDOM_IUNLINK_REMOVE, + XFS_RANDOM_DIR_INO_VALIDATE, + XFS_RANDOM_BULKSTAT_READ_CHUNK, + XFS_RANDOM_IODONE_IOERR, + XFS_RANDOM_STRATREAD_IOERR, + XFS_RANDOM_STRATCMPL_IOERR, + XFS_RANDOM_DIOWRITE_IOERR, + XFS_RANDOM_BMAPIFORMAT, + XFS_RANDOM_FREE_EXTENT, + XFS_RANDOM_RMAP_FINISH_ONE, + XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, + XFS_RANDOM_REFCOUNT_FINISH_ONE, + XFS_RANDOM_BMAP_FINISH_ONE, + XFS_RANDOM_AG_RESV_CRITICAL, + XFS_RANDOM_DROP_WRITES, + XFS_RANDOM_LOG_BAD_CRC, +}; -int -xfs_error_test(int error_tag, int *fsidp, char *expression, - int line, char *file, unsigned long randfactor) +struct xfs_errortag_attr { + struct attribute attr; + unsigned int tag; +}; + +static inline struct xfs_errortag_attr * 
+to_attr(struct attribute *attr) { - int i; - int64_t fsid; + return container_of(attr, struct xfs_errortag_attr, attr); +} - if (prandom_u32() % randfactor) - return 0; +static inline struct xfs_mount * +to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); - memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); + return container_of(kobj, struct xfs_mount, m_errortag_kobj); +} + +STATIC ssize_t +xfs_errortag_attr_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = to_mp(kobject); + struct xfs_errortag_attr *xfs_attr = to_attr(attr); + int ret; + unsigned int val; - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { - xfs_warn(NULL, - "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, xfs_etest_fsname[i]); - return 1; - } + if (strcmp(buf, "default") == 0) { + val = xfs_errortag_random_default[xfs_attr->tag]; + } else { + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; } - return 0; + ret = xfs_errortag_set(mp, xfs_attr->tag, val); + if (ret) + return ret; + return count; } +STATIC ssize_t +xfs_errortag_attr_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_mount *mp = to_mp(kobject); + struct xfs_errortag_attr *xfs_attr = to_attr(attr); + + return snprintf(buf, PAGE_SIZE, "%u\n", + xfs_errortag_get(mp, xfs_attr->tag)); +} + +static const struct sysfs_ops xfs_errortag_sysfs_ops = { + .show = xfs_errortag_attr_show, + .store = xfs_errortag_attr_store, +}; + +#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ +static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ + .attr = {.name = __stringify(_name), \ + .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ + .tag = (_tag), \ +} + +#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr + +XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); +XFS_ERRORTAG_ATTR_RW(iflush1, 
XFS_ERRTAG_IFLUSH_1); +XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2); +XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); +XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); +XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); +XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); +XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); +XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK); +XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK); +XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF); +XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI); +XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP); +XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK); +XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE); +XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE); +XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK); +XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR); +XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR); +XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR); +XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR); +XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT); +XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT); +XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE); +XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); +XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); +XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); + +static struct attribute *xfs_errortag_attrs[] = { + XFS_ERRORTAG_ATTR_LIST(noerror), + XFS_ERRORTAG_ATTR_LIST(iflush1), + XFS_ERRORTAG_ATTR_LIST(iflush2), + XFS_ERRORTAG_ATTR_LIST(iflush3), + XFS_ERRORTAG_ATTR_LIST(iflush4), + 
XFS_ERRORTAG_ATTR_LIST(iflush5), + XFS_ERRORTAG_ATTR_LIST(iflush6), + XFS_ERRORTAG_ATTR_LIST(dareadbuf), + XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk), + XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk), + XFS_ERRORTAG_ATTR_LIST(readagf), + XFS_ERRORTAG_ATTR_LIST(readagi), + XFS_ERRORTAG_ATTR_LIST(itobp), + XFS_ERRORTAG_ATTR_LIST(iunlink), + XFS_ERRORTAG_ATTR_LIST(iunlinkrm), + XFS_ERRORTAG_ATTR_LIST(dirinovalid), + XFS_ERRORTAG_ATTR_LIST(bulkstat), + XFS_ERRORTAG_ATTR_LIST(logiodone), + XFS_ERRORTAG_ATTR_LIST(stratread), + XFS_ERRORTAG_ATTR_LIST(stratcmpl), + XFS_ERRORTAG_ATTR_LIST(diowrite), + XFS_ERRORTAG_ATTR_LIST(bmapifmt), + XFS_ERRORTAG_ATTR_LIST(free_extent), + XFS_ERRORTAG_ATTR_LIST(rmap_finish_one), + XFS_ERRORTAG_ATTR_LIST(refcount_continue_update), + XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), + XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), + XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), + XFS_ERRORTAG_ATTR_LIST(drop_writes), + XFS_ERRORTAG_ATTR_LIST(log_bad_crc), + NULL, +}; + +struct kobj_type xfs_errortag_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_errortag_sysfs_ops, + .default_attrs = xfs_errortag_attrs, +}; + int -xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp) +xfs_errortag_init( + struct xfs_mount *mp) { - int i; - int len; - int64_t fsid; + mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, + KM_SLEEP | KM_MAYFAIL); + if (!mp->m_errortag) + return -ENOMEM; - if (error_tag >= XFS_ERRTAG_MAX) - return -EINVAL; + return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, + &mp->m_kobj, "errortag"); +} - memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); +void +xfs_errortag_del( + struct xfs_mount *mp) +{ + xfs_sysfs_del(&mp->m_errortag_kobj); + kmem_free(mp->m_errortag); +} - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { - xfs_warn(mp, "error tag #%d on", error_tag); - return 0; - } - } +bool +xfs_errortag_test( + struct xfs_mount *mp, + const char 
*expression, + const char *file, + int line, + unsigned int error_tag) +{ + unsigned int randfactor; - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest[i] == 0) { - xfs_warn(mp, "Turned on XFS error tag #%d", - error_tag); - xfs_etest[i] = error_tag; - xfs_etest_fsid[i] = fsid; - len = strlen(mp->m_fsname); - xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); - strcpy(xfs_etest_fsname[i], mp->m_fsname); - xfs_error_test_active++; - return 0; - } - } + /* + * To be able to use error injection anywhere, we need to ensure error + * injection mechanism is already initialized. + * + * Code paths like I/O completion can be called before the + * initialization is complete, but be able to inject errors in such + * places is still useful. + */ + if (!mp->m_errortag) + return false; - xfs_warn(mp, "error tag overflow, too many turned on"); + ASSERT(error_tag < XFS_ERRTAG_MAX); + randfactor = mp->m_errortag[error_tag]; + if (!randfactor || prandom_u32() % randfactor) + return false; - return 1; + xfs_warn_ratelimited(mp, +"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", + expression, file, line, mp->m_fsname); + return true; } int -xfs_errortag_clearall(xfs_mount_t *mp, int loud) +xfs_errortag_get( + struct xfs_mount *mp, + unsigned int error_tag) { - int64_t fsid; - int cleared = 0; - int i; - - memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); - - - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && - xfs_etest[i] != 0) { - cleared = 1; - xfs_warn(mp, "Clearing XFS error tag #%d", - xfs_etest[i]); - xfs_etest[i] = 0; - xfs_etest_fsid[i] = 0LL; - kmem_free(xfs_etest_fsname[i]); - xfs_etest_fsname[i] = NULL; - xfs_error_test_active--; - } - } + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + + return mp->m_errortag[error_tag]; +} + +int +xfs_errortag_set( + struct xfs_mount *mp, + unsigned int error_tag, + unsigned int tag_value) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; - if 
(loud || cleared) - xfs_warn(mp, "Cleared all XFS error tags for filesystem"); + mp->m_errortag[error_tag] = tag_value; + return 0; +} +int +xfs_errortag_add( + struct xfs_mount *mp, + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + + return xfs_errortag_set(mp, error_tag, + xfs_errortag_random_default[error_tag]); +} + +int +xfs_errortag_clearall( + struct xfs_mount *mp) +{ + memset(mp->m_errortag, 0, sizeof(unsigned int) * XFS_ERRTAG_MAX); return 0; } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 05f8666733a0..7577be5f09bc 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -96,7 +96,17 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 -#define XFS_ERRTAG_MAX 28 +/* + * DEBUG mode instrumentation to test and/or trigger delayed allocation + * block killing in the event of failed writes. When enabled, all + * buffered writes are silenty dropped and handled as if they failed. + * All delalloc blocks in the range of the write (including pre-existing + * delalloc blocks!) are tossed as part of the write failure error + * handling sequence. + */ +#define XFS_ERRTAG_DROP_WRITES 28 +#define XFS_ERRTAG_LOG_BAD_CRC 29 +#define XFS_ERRTAG_MAX 30 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
@@ -129,23 +139,29 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 #define XFS_RANDOM_BMAP_FINISH_ONE 1 #define XFS_RANDOM_AG_RESV_CRITICAL 4 +#define XFS_RANDOM_DROP_WRITES 1 +#define XFS_RANDOM_LOG_BAD_CRC 1 #ifdef DEBUG -extern int xfs_error_test_active; -extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); - -#define XFS_NUM_INJECT_ERROR 10 -#define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || (xfs_error_test_active && \ - xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ - (rf)))) +extern int xfs_errortag_init(struct xfs_mount *mp); +extern void xfs_errortag_del(struct xfs_mount *mp); +extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, + const char *file, int line, unsigned int error_tag); +#define XFS_TEST_ERROR(expr, mp, tag) \ + ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) -extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp); -extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); +extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); +extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, + unsigned int tag_value); +extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); +extern int xfs_errortag_clearall(struct xfs_mount *mp); #else -#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) -#define xfs_errortag_add(tag, mp) (ENOSYS) -#define xfs_errortag_clearall(mp, loud) (ENOSYS) +#define xfs_errortag_init(mp) (0) +#define xfs_errortag_del(mp) +#define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define xfs_errortag_set(mp, tag, val) (ENOSYS) +#define xfs_errortag_add(mp, tag) (ENOSYS) +#define xfs_errortag_clearall(mp) (ENOSYS) #endif /* DEBUG */ /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5fb5a0958a14..c4893e226fd8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -140,7 +140,7 @@ xfs_file_fsync( 
trace_xfs_file_fsync(ip); - error = filemap_write_and_wait_range(inode->i_mapping, start, end); + error = file_write_and_wait_range(file, start, end); if (error) return error; @@ -237,7 +237,11 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -541,7 +545,11 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -553,9 +561,15 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to take the exclusive lock * for other reasons in xfs_file_aio_write_checks. */ - if (unaligned_io) - inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { + if (unaligned_io) { + /* If we are going to wait for other DIO to finish, bail */ + if (iocb->ki_flags & IOCB_NOWAIT) { + if (atomic_read(&inode->i_dio_count)) + return -EAGAIN; + } else { + inode_dio_wait(inode); + } + } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -585,7 +599,12 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -660,6 +679,7 @@ write_retry: xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); + xfs_icache_free_cowblocks(ip->i_mount, &eofb); goto write_retry; } @@ -892,6 +912,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; + 
file->f_mode |= FMODE_AIO_NOWAIT; return 0; } @@ -950,362 +971,7 @@ xfs_file_readdir( */ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); - return xfs_readdir(ip, ctx, bufsize); -} - -/* - * This type is designed to indicate the type of offset we would like - * to search from page cache for xfs_seek_hole_data(). - */ -enum { - HOLE_OFF = 0, - DATA_OFF, -}; - -/* - * Lookup the desired type of offset from the given page. - * - * On success, return true and the offset argument will point to the - * start of the region that was found. Otherwise this function will - * return false and keep the offset argument unchanged. - */ -STATIC bool -xfs_lookup_buffer_offset( - struct page *page, - loff_t *offset, - unsigned int type) -{ - loff_t lastoff = page_offset(page); - bool found = false; - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - /* - * Unwritten extents that have data in the page - * cache covering them can be identified by the - * BH_Unwritten state flag. Pages with multiple - * buffers might have a mix of holes, data and - * unwritten extents - any buffer with valid - * data in it should have BH_Uptodate flag set - * on it. - */ - if (buffer_unwritten(bh) || - buffer_uptodate(bh)) { - if (type == DATA_OFF) - found = true; - } else { - if (type == HOLE_OFF) - found = true; - } - - if (found) { - *offset = lastoff; - break; - } - lastoff += bh->b_size; - } while ((bh = bh->b_this_page) != head); - - return found; -} - -/* - * This routine is called to find out and return a data or hole offset - * from the page cache for unwritten extents according to the desired - * type for xfs_seek_hole_data(). - * - * The argument offset is used to tell where we start to search from the - * page cache. Map is used to figure out the end points of the range to - * lookup pages. - * - * Return true if the desired type of offset was found, and the argument - * offset is filled with that address. 
Otherwise, return false and keep - * offset unchanged. - */ -STATIC bool -xfs_find_get_desired_pgoff( - struct inode *inode, - struct xfs_bmbt_irec *map, - unsigned int type, - loff_t *offset) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct pagevec pvec; - pgoff_t index; - pgoff_t end; - loff_t endoff; - loff_t startoff = *offset; - loff_t lastoff = startoff; - bool found = false; - - pagevec_init(&pvec, 0); - - index = startoff >> PAGE_SHIFT; - endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = (endoff - 1) >> PAGE_SHIFT; - do { - int want; - unsigned nr_pages; - unsigned int i; - - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, - want); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - loff_t b_offset; - - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to tmpfs - * file mapping. However, page->index will not change - * because we have a reference on the page. - * - * If current page offset is beyond where we've ended, - * we've found a hole. - */ - if (type == HOLE_OFF && lastoff < endoff && - lastoff < page_offset(pvec.pages[i])) { - found = true; - *offset = lastoff; - goto out; - } - /* Searching done if the page index is out of range. */ - if (page->index > end) - goto out; - - lock_page(page); - /* - * Page truncated or invalidated(page->mapping == NULL). - * We can freely skip it and proceed to check the next - * page. - */ - if (unlikely(page->mapping != inode->i_mapping)) { - unlock_page(page); - continue; - } - - if (!page_has_buffers(page)) { - unlock_page(page); - continue; - } - - found = xfs_lookup_buffer_offset(page, &b_offset, type); - if (found) { - /* - * The found offset may be less than the start - * point to search if this is the first time to - * come here. 
- */ - *offset = max_t(loff_t, startoff, b_offset); - unlock_page(page); - goto out; - } - - /* - * We either searching data but nothing was found, or - * searching hole but found a data buffer. In either - * case, probably the next page contains the desired - * things, update the last offset to it so. - */ - lastoff = page_offset(page) + PAGE_SIZE; - unlock_page(page); - } - - /* - * The number of returned pages less than our desired, search - * done. - */ - if (nr_pages < want) - break; - - index = pvec.pages[i - 1]->index + 1; - pagevec_release(&pvec); - } while (index <= end); - - /* No page at lastoff and we are not done - we found a hole. */ - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } -out: - pagevec_release(&pvec); - return found; -} - -/* - * caller must lock inode with xfs_ilock_data_map_shared, - * can we craft an appropriate ASSERT? - * - * end is because the VFS-level lseek interface is defined such that any - * offset past i_size shall return -ENXIO, but we use this for quota code - * which does not maintain i_size, and we want to SEEK_DATA past i_size. - */ -loff_t -__xfs_seek_hole_data( - struct inode *inode, - loff_t start, - loff_t end, - int whence) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - loff_t uninitialized_var(offset); - xfs_fileoff_t fsbno; - xfs_filblks_t lastbno; - int error; - - if (start >= end) { - error = -ENXIO; - goto out_error; - } - - /* - * Try to read extents from the first block indicated - * by fsbno to the end block of the file. 
- */ - fsbno = XFS_B_TO_FSBT(mp, start); - lastbno = XFS_B_TO_FSB(mp, end); - - for (;;) { - struct xfs_bmbt_irec map[2]; - int nmap = 2; - unsigned int i; - - error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap, - XFS_BMAPI_ENTIRE); - if (error) - goto out_error; - - /* No extents at given offset, must be beyond EOF */ - if (nmap == 0) { - error = -ENXIO; - goto out_error; - } - - for (i = 0; i < nmap; i++) { - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[i].br_startoff)); - - /* Landed in the hole we wanted? */ - if (whence == SEEK_HOLE && - map[i].br_startblock == HOLESTARTBLOCK) - goto out; - - /* Landed in the data extent we wanted? */ - if (whence == SEEK_DATA && - (map[i].br_startblock == DELAYSTARTBLOCK || - (map[i].br_state == XFS_EXT_NORM && - !isnullstartblock(map[i].br_startblock)))) - goto out; - - /* - * Landed in an unwritten extent, try to search - * for hole or data from page cache. - */ - if (map[i].br_state == XFS_EXT_UNWRITTEN) { - if (xfs_find_get_desired_pgoff(inode, &map[i], - whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF, - &offset)) - goto out; - } - } - - /* - * We only received one extent out of the two requested. This - * means we've hit EOF and didn't find what we are looking for. - */ - if (nmap == 1) { - /* - * If we were looking for a hole, set offset to - * the end of the file (i.e., there is an implicit - * hole at the end of any file). - */ - if (whence == SEEK_HOLE) { - offset = end; - break; - } - /* - * If we were looking for data, it's nowhere to be found - */ - ASSERT(whence == SEEK_DATA); - error = -ENXIO; - goto out_error; - } - - ASSERT(i > 1); - - /* - * Nothing was found, proceed to the next round of search - * if the next reading offset is not at or beyond EOF. 
- */ - fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; - start = XFS_FSB_TO_B(mp, fsbno); - if (start >= end) { - if (whence == SEEK_HOLE) { - offset = end; - break; - } - ASSERT(whence == SEEK_DATA); - error = -ENXIO; - goto out_error; - } - } - -out: - /* - * If at this point we have found the hole we wanted, the returned - * offset may be bigger than the file size as it may be aligned to - * page boundary for unwritten extents. We need to deal with this - * situation in particular. - */ - if (whence == SEEK_HOLE) - offset = min_t(loff_t, offset, end); - - return offset; - -out_error: - return error; -} - -STATIC loff_t -xfs_seek_hole_data( - struct file *file, - loff_t start, - int whence) -{ - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - uint lock; - loff_t offset, end; - int error = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - lock = xfs_ilock_data_map_shared(ip); - - end = i_size_read(inode); - offset = __xfs_seek_hole_data(inode, start, end, whence); - if (offset < 0) { - error = offset; - goto out_unlock; - } - - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); - -out_unlock: - xfs_iunlock(ip, lock); - - if (error) - return error; - return offset; + return xfs_readdir(NULL, ip, ctx, bufsize); } STATIC loff_t @@ -1314,17 +980,25 @@ xfs_file_llseek( loff_t offset, int whence) { + struct inode *inode = file->f_mapping->host; + + if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount)) + return -EIO; + switch (whence) { - case SEEK_END: - case SEEK_CUR: - case SEEK_SET: + default: return generic_file_llseek(file, offset, whence); case SEEK_HOLE: + offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); + break; case SEEK_DATA: - return xfs_seek_hole_data(file, offset, whence); - default: - return -EINVAL; + offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); + break; } + + if (offset < 0) + return offset; + return vfs_setpos(file, offset, 
inode->i_sb->s_maxbytes); } /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 6ccaae9eb0ee..8f22fc579dbb 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -602,7 +602,7 @@ xfs_growfs_data_private( if (nagimax) mp->m_maxagi = nagimax; if (mp->m_sb.sb_imax_pct) { - __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; do_div(icount, 100); mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else @@ -793,17 +793,17 @@ xfs_fs_counts( int xfs_reserve_blocks( xfs_mount_t *mp, - __uint64_t *inval, + uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta; - __int64_t fdblks_delta = 0; - __uint64_t request; - __int64_t free; + int64_t lcounter, delta; + int64_t fdblks_delta = 0; + uint64_t request; + int64_t free; int error = 0; /* If inval is null, report current values and return */ - if (inval == (__uint64_t *)NULL) { + if (inval == (uint64_t *)NULL) { if (!outval) return -EINVAL; outval->resblks = mp->m_resblks; @@ -904,7 +904,7 @@ out: int xfs_fs_goingdown( xfs_mount_t *mp, - __uint32_t inflags) + uint32_t inflags) { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f34915898fea..2954c13a3acd 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -22,9 +22,9 @@ extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion); extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); -extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, +extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, xfs_fsop_resblks_t *outval); -extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); 
extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 687a4b01fc53..3e1cc3001bcb 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -47,4 +47,9 @@ xfs_param_t xfs_params = { struct xfs_globals xfs_globals = { .log_recovery_delay = 0, /* no delay by default */ +#ifdef XFS_ASSERT_FATAL + .bug_on_assert = true, /* assert failures BUG() */ +#else + .bug_on_assert = false, /* assert failures WARN() */ +#endif }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 990210fcb9c3..0a9e6985a0d0 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -269,12 +269,12 @@ xfs_inew_wait( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (!xfs_iflags_test(ip, XFS_INEW)) break; schedule(); } while (true); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } /* @@ -368,6 +368,11 @@ xfs_iget_cache_hit( if (ip->i_flags & XFS_IRECLAIMABLE) { trace_xfs_iget_reclaim(ip); + if (flags & XFS_IGET_INCORE) { + error = -EAGAIN; + goto out_error; + } + /* * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode * from stomping over us while we recycle the inode. 
We can't @@ -432,7 +437,8 @@ xfs_iget_cache_hit( if (lock_flags != 0) xfs_ilock(ip, lock_flags); - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + if (!(flags & XFS_IGET_INCORE)) + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); XFS_STATS_INC(mp, xs_ig_found); return 0; @@ -603,6 +609,10 @@ again: goto out_error_or_again; } else { rcu_read_unlock(); + if (flags & XFS_IGET_INCORE) { + error = -ENOENT; + goto out_error_or_again; + } XFS_STATS_INC(mp, xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, @@ -623,7 +633,7 @@ again: return 0; out_error_or_again: - if (error == -EAGAIN) { + if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { delay(1); goto again; } @@ -632,6 +642,44 @@ out_error_or_again: } /* + * "Is this a cached inode that's also allocated?" + * + * Look up an inode by number in the given file system. If the inode is + * in cache and isn't in purgatory, return 1 if the inode is allocated + * and 0 if it is not. For all other cases (not in cache, being torn + * down, etc.), return a negative error code. + * + * The caller has to prevent inode allocation and freeing activity, + * presumably by locking the AGI buffer. This is to ensure that an + * inode cannot transition from allocated to freed until the caller is + * ready to allow that. If the inode is in an intermediate state (new, + * reclaimable, or being reclaimed), -EAGAIN will be returned; if the + * inode is not in the cache, -ENOENT will be returned. The caller must + * deal with these scenarios appropriately. + * + * This is a specialized use case for the online scrubber; if you're + * reading this, you probably want xfs_iget. 
+ */ +int +xfs_icache_inode_is_allocated( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t ino, + bool *inuse) +{ + struct xfs_inode *ip; + int error; + + error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip); + if (error) + return error; + + *inuse = !!(VFS_I(ip)->i_mode); + IRELE(ip); + return 0; +} + +/* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between * lookup reduction and stack usage. This is in the reclaim path, so we can't diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 9183f77958ef..bff4d85e5498 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -47,6 +47,7 @@ struct xfs_eofblocks { #define XFS_IGET_CREATE 0x1 #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 +#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */ /* * flags for AG inode iterator @@ -126,4 +127,7 @@ xfs_fs_eofblocks_from_user( return 0; } +int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_ino_t ino, bool *inuse); + #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ec9826c56500..ceef77c0416a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -622,17 +622,17 @@ __xfs_iflock( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_isiflocked(ip)) io_schedule(); } while (!xfs_iflock_nowait(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } STATIC uint _xfs_dic2xflags( - __uint16_t di_flags, + uint16_t di_flags, uint64_t di_flags2, bool has_attr) { @@ -855,8 +855,8 @@ xfs_ialloc( inode->i_version = 1; ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; - ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; + 
ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; } @@ -2486,11 +2486,11 @@ __xfs_iunpin_wait( xfs_iunpin(ip); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_ipincount(ip)) io_schedule(); } while (xfs_ipincount(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } void @@ -3489,7 +3489,7 @@ xfs_iflush_int( dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), - mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { + mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); @@ -3499,7 +3499,7 @@ xfs_iflush_int( if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { + mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr 0x%p", __func__, ip->i_ino, ip); @@ -3510,7 +3510,7 @@ xfs_iflush_int( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), - mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { + mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr 0x%p", __func__, ip->i_ino, ip); @@ -3518,8 +3518,7 @@ xfs_iflush_int( } } if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > - ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, - XFS_RANDOM_IFLUSH_5)) { + ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr 0x%p", @@ -3529,7 +3528,7 @@ xfs_iflush_int( goto corrupt_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, - mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { + mp, XFS_ERRTAG_IFLUSH_6)) { 
xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", __func__, ip->i_ino, ip->i_d.di_forkoff, ip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 10e89fcb49d7..0ee453de239a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -192,8 +192,8 @@ static inline void xfs_set_projid(struct xfs_inode *ip, prid_t projid) { - ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); - ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); + ip->i_d.di_projid_hi = (uint16_t) (projid >> 16); + ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff); } static inline prid_t @@ -445,9 +445,6 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, bool *did_zero); -loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, - loff_t eof, int whence); - /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 08cb7d1a4a3a..013cc78d7daf 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -834,9 +834,7 @@ xfs_inode_item_format_convert( in_f->ilf_dsize = in_f32->ilf_dsize; in_f->ilf_ino = in_f32->ilf_ino; /* copy biggest field of ilf_u */ - memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, - in_f32->ilf_u.ilfu_uuid.__u_bits, - sizeof(uuid_t)); + uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); in_f->ilf_blkno = in_f32->ilf_blkno; in_f->ilf_len = in_f32->ilf_len; in_f->ilf_boffset = in_f32->ilf_boffset; @@ -851,9 +849,7 @@ xfs_inode_item_format_convert( in_f->ilf_dsize = in_f64->ilf_dsize; in_f->ilf_ino = in_f64->ilf_ino; /* copy biggest field of ilf_u */ - memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, - in_f64->ilf_u.ilfu_uuid.__u_bits, - sizeof(uuid_t)); + uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid); in_f->ilf_blkno = in_f64->ilf_blkno; in_f->ilf_len = in_f64->ilf_len; in_f->ilf_boffset = in_f64->ilf_boffset; diff --git 
a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6190697603c9..9c0c7a920304 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -120,8 +120,7 @@ xfs_find_handle( handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = inode->i_generation; handle.ha_fid.fid_ino = ip->i_ino; - - hsize = XFS_HSIZE(handle); + hsize = sizeof(xfs_handle_t); } error = -EFAULT; @@ -444,8 +443,8 @@ xfs_attrmulti_attr_get( struct inode *inode, unsigned char *name, unsigned char __user *ubuf, - __uint32_t *len, - __uint32_t flags) + uint32_t *len, + uint32_t flags) { unsigned char *kbuf; int error = -EFAULT; @@ -473,8 +472,8 @@ xfs_attrmulti_attr_set( struct inode *inode, unsigned char *name, const unsigned char __user *ubuf, - __uint32_t len, - __uint32_t flags) + uint32_t len, + uint32_t flags) { unsigned char *kbuf; int error; @@ -499,7 +498,7 @@ int xfs_attrmulti_attr_remove( struct inode *inode, unsigned char *name, - __uint32_t flags) + uint32_t flags) { int error; @@ -877,7 +876,7 @@ xfs_merge_ioc_xflags( STATIC unsigned int xfs_di2lxflags( - __uint16_t di_flags) + uint16_t di_flags) { unsigned int flags = 0; @@ -1288,7 +1287,7 @@ xfs_ioctl_setattr_check_projid( struct fsxattr *fa) { /* Disallow 32bit project ids if projid32bit feature is not enabled. 
*/ - if (fa->fsx_projid > (__uint16_t)-1 && + if (fa->fsx_projid > (uint16_t)-1 && !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) return -EINVAL; @@ -1932,7 +1931,7 @@ xfs_file_ioctl( case XFS_IOC_SET_RESBLKS: { xfs_fsop_resblks_t inout; - __uint64_t in; + uint64_t in; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2018,12 +2017,12 @@ xfs_file_ioctl( } case XFS_IOC_GOINGDOWN: { - __uint32_t in; + uint32_t in; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (get_user(in, (__uint32_t __user *)arg)) + if (get_user(in, (uint32_t __user *)arg)) return -EFAULT; return xfs_fs_goingdown(mp, in); @@ -2038,14 +2037,14 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; - return xfs_errortag_add(in.errtag, mp); + return xfs_errortag_add(mp, in.errtag); } case XFS_IOC_ERROR_CLEARALL: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return xfs_errortag_clearall(mp, 1); + return xfs_errortag_clearall(mp); case XFS_IOC_FREE_EOFBLOCKS: { struct xfs_fs_eofblocks eofb; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 8b52881bfd90..e86c3ea137d2 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -48,22 +48,22 @@ xfs_attrmulti_attr_get( struct inode *inode, unsigned char *name, unsigned char __user *ubuf, - __uint32_t *len, - __uint32_t flags); + uint32_t *len, + uint32_t flags); extern int xfs_attrmulti_attr_set( struct inode *inode, unsigned char *name, const unsigned char __user *ubuf, - __uint32_t len, - __uint32_t flags); + uint32_t len, + uint32_t flags); extern int xfs_attrmulti_attr_remove( struct inode *inode, unsigned char *name, - __uint32_t flags); + uint32_t flags); extern struct dentry * xfs_handle_to_dentry( diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index b1bb45444df8..5492bcf6f442 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -112,9 +112,9 @@ typedef struct compat_xfs_fsop_handlereq { /* The bstat field in the swapext struct needs translation */ typedef struct compat_xfs_swapext { - 
__int64_t sx_version; /* version */ - __int64_t sx_fdtarget; /* fd of target file */ - __int64_t sx_fdtmp; /* fd of tmp file */ + int64_t sx_version; /* version */ + int64_t sx_fdtarget; /* fd of target file */ + int64_t sx_fdtmp; /* fd of tmp file */ xfs_off_t sx_offset; /* offset into file */ xfs_off_t sx_length; /* leng from offset */ char sx_pad[16]; /* pad space, unused */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 94e5bdf7304c..813394c62849 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -543,7 +543,7 @@ xfs_file_iomap_begin_delay( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_unlock; @@ -995,6 +995,11 @@ xfs_file_iomap_begin( lockmode = xfs_ilock_data_map_shared(ip); } + if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = -EAGAIN; + goto out_unlock; + } + ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) length = mp->m_super->s_maxbytes - offset; @@ -1016,6 +1021,15 @@ xfs_file_iomap_begin( if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { if (flags & IOMAP_DIRECT) { + /* + * A reflinked inode will result in CoW alloc. + * FIXME: It could still overwrite on unshared extents + * and not need allocation. + */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode); @@ -1033,6 +1047,14 @@ xfs_file_iomap_begin( if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { /* + * If nowait is set bail since we are going to make + * allocations. 
+ */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } + /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat symmetric * with the work writeback does. This is a completely arbitrary @@ -1097,7 +1119,7 @@ xfs_file_iomap_end_delalloc( * Behave as if the write failed if drop writes is enabled. Set the NEW * flag to force delalloc cleanup. */ - if (xfs_mp_drop_writes(mp)) { + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { iomap->flags |= IOMAP_F_NEW; written = 0; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ebfc13350f9a..469c9fa4c178 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -190,12 +190,12 @@ xfs_generic_create( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } @@ -460,7 +460,7 @@ xfs_vn_get_link( if (!dentry) return ERR_PTR(-ECHILD); - link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + link = kmalloc(XFS_SYMLINK_MAXLEN+1, GFP_KERNEL); if (!link) goto out_err; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 26d67ce3c18d..c393a2f6d8c3 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -31,7 +31,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" -STATIC int +int xfs_internal_inum( xfs_mount_t *mp, xfs_ino_t ino) diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 6ea8b3912fa4..17e86e0541af 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -96,4 +96,6 @@ xfs_inumbers( void __user *buffer, /* buffer with inode info */ inumbers_fmt_pf formatter); +int xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); + #endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 
044fb0e15390..9301c5a6060b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -19,18 +19,11 @@ #define __XFS_LINUX__ #include <linux/types.h> +#include <linux/uuid.h> /* * Kernel specific type declarations for XFS */ -typedef signed char __int8_t; -typedef unsigned char __uint8_t; -typedef signed short int __int16_t; -typedef unsigned short int __uint16_t; -typedef signed int __int32_t; -typedef unsigned int __uint32_t; -typedef signed long long int __int64_t; -typedef unsigned long long int __uint64_t; typedef __s64 xfs_off_t; /* <file offset> type */ typedef unsigned long long xfs_ino_t; /* <inode> type */ @@ -42,7 +35,6 @@ typedef __u32 xfs_nlink_t; #include "kmem.h" #include "mrlock.h" -#include "uuid.h" #include <linux/semaphore.h> #include <linux/mm.h> @@ -151,7 +143,6 @@ typedef __u32 xfs_nlink_t; #define __return_address __builtin_return_address(0) #define XFS_PROJID_DEFAULT 0 -#define MAXPATHLEN 1024 #define MIN(a,b) (min(a,b)) #define MAX(a,b) (max(a,b)) @@ -186,22 +177,22 @@ extern struct xstats xfsstats; * are converting to the init_user_ns. The uid is later mapped to a particular * user namespace value when crossing the kernel/user boundary. 
*/ -static inline __uint32_t xfs_kuid_to_uid(kuid_t uid) +static inline uint32_t xfs_kuid_to_uid(kuid_t uid) { return from_kuid(&init_user_ns, uid); } -static inline kuid_t xfs_uid_to_kuid(__uint32_t uid) +static inline kuid_t xfs_uid_to_kuid(uint32_t uid) { return make_kuid(&init_user_ns, uid); } -static inline __uint32_t xfs_kgid_to_gid(kgid_t gid) +static inline uint32_t xfs_kgid_to_gid(kgid_t gid) { return from_kgid(&init_user_ns, gid); } -static inline kgid_t xfs_gid_to_kgid(__uint32_t gid) +static inline kgid_t xfs_gid_to_kgid(uint32_t gid) { return make_kgid(&init_user_ns, gid); } @@ -231,14 +222,14 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n) #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) -static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) +static inline uint64_t roundup_64(uint64_t x, uint32_t y) { x += y - 1; do_div(x, y); return x * y; } -static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +static inline uint64_t howmany_64(uint64_t x, uint32_t y) { x += y - 1; do_div(x, y); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3731f13f63e9..0053bcf2b10a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -434,7 +434,7 @@ xfs_log_reserve( int unit_bytes, int cnt, struct xlog_ticket **ticp, - __uint8_t client, + uint8_t client, bool permanent) { struct xlog *log = mp->m_log; @@ -825,9 +825,9 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (!error) { /* the data section must be 32 bit size aligned */ struct { - __uint16_t magic; - __uint16_t pad1; - __uint32_t pad2; /* may as well make it 64 bits */ + uint16_t magic; + uint16_t pad1; + uint32_t pad2; /* may as well make it 64 bits */ } magic = { .magic = XLOG_UNMOUNT_TYPE, }; @@ -1189,8 +1189,7 @@ xlog_iodone(xfs_buf_t *bp) * IOABORT state. The IOABORT state is only set in DEBUG mode to inject * CRC errors into log recovery. 
*/ - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, - XFS_RANDOM_IODONE_IOERR) || + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) || iclog->ic_state & XLOG_STATE_IOABORT) { if (iclog->ic_state & XLOG_STATE_IOABORT) iclog->ic_state &= ~XLOG_STATE_IOABORT; @@ -1665,7 +1664,7 @@ xlog_cksum( char *dp, int size) { - __uint32_t crc; + uint32_t crc; /* first generate the crc for the record header ... */ crc = xfs_start_cksum_update((char *)rhead, @@ -1828,7 +1827,7 @@ xlog_sync( */ dptr = (char *)&iclog->ic_header + count; for (i = 0; i < split; i += BBSIZE) { - __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); if (++cycle == XLOG_HEADER_MAGIC_NUM) cycle++; *(__be32 *)dptr = cpu_to_be32(cycle); @@ -1842,7 +1841,6 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, iclog->ic_datap, size); -#ifdef DEBUG /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. This facilitates testing log recovery in the @@ -1850,15 +1848,13 @@ xlog_sync( * write on I/O completion and shutdown the fs. The subsequent mount * detects the bad CRC and attempts to recover. */ - if (log->l_badcrc_factor && - (prandom_u32() % log->l_badcrc_factor == 0)) { + if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_state |= XLOG_STATE_IOABORT; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. 
Shutdown imminent.", be64_to_cpu(iclog->ic_header.h_lsn)); } -#endif bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; @@ -2024,7 +2020,7 @@ xlog_print_tic_res( }; #undef REG_TYPE_STR - xfs_warn(mp, "xlog_write: reservation summary:"); + xfs_warn(mp, "ticket reservation summary:"); xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); xfs_warn(mp, " current res = %d bytes", @@ -2045,10 +2041,55 @@ xlog_print_tic_res( "bad-rtype" : res_type_str[r_type]), ticket->t_res_arr[i].r_len); } +} + +/* + * Print a summary of the transaction. + */ +void +xlog_print_trans( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item_desc *lidp; + + /* dump core transaction and ticket info */ + xfs_warn(mp, "transaction summary:"); + xfs_warn(mp, " flags = 0x%x", tp->t_flags); + + xlog_print_tic_res(mp, tp->t_ticket); - xfs_alert_tag(mp, XFS_PTAG_LOGRES, - "xlog_write: reservation ran out. Need to up reservation"); - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + /* dump each log item */ + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv = lip->li_lv; + struct xfs_log_iovec *vec; + int i; + + xfs_warn(mp, "log item: "); + xfs_warn(mp, " type = 0x%x", lip->li_type); + xfs_warn(mp, " flags = 0x%x", lip->li_flags); + if (!lv) + continue; + xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); + xfs_warn(mp, " size = %d", lv->lv_size); + xfs_warn(mp, " bytes = %d", lv->lv_bytes); + xfs_warn(mp, " buf len = %d", lv->lv_buf_len); + + /* dump each iovec for the log item */ + vec = lv->lv_iovecp; + for (i = 0; i < lv->lv_niovecs; i++) { + int dumplen = min(vec->i_len, 32); + + xfs_warn(mp, " iovec[%d]", i); + xfs_warn(mp, " type = 0x%x", vec->i_type); + xfs_warn(mp, " len = %d", vec->i_len); + xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i); + xfs_hex_dump(vec->i_addr, dumplen); + + vec++; + } + } } /* @@ -2321,8 +2362,12 @@ xlog_write( if (flags & (XLOG_COMMIT_TRANS | 
XLOG_UNMOUNT_TRANS)) ticket->t_curr_res -= sizeof(xlog_op_header_t); - if (ticket->t_curr_res < 0) + if (ticket->t_curr_res < 0) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "ctx ticket reservation ran out. Need to up reservation"); xlog_print_tic_res(log->l_mp, ticket); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + } index = 0; lv = log_vector; @@ -2363,8 +2408,8 @@ xlog_write( } reg = &vecp[index]; - ASSERT(reg->i_len % sizeof(__int32_t) == 0); - ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + ASSERT(reg->i_len % sizeof(int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); start_rec_copy = xlog_write_start_rec(ptr, ticket); if (start_rec_copy) { @@ -3143,7 +3188,7 @@ xlog_state_switch_iclogs( /* Round up to next log-sunit */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { - __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); log->l_curr_block = roundup(log->l_curr_block, sunit_bb); } @@ -3771,7 +3816,7 @@ xlog_verify_iclog( xlog_in_core_2_t *xhdr; void *base_ptr, *ptr, *p; ptrdiff_t field_offset; - __uint8_t clientid; + uint8_t clientid; int len, i, j, k, op_len; int idx; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index cc5a9f1574e7..bf212772595c 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -159,7 +159,7 @@ int xfs_log_reserve(struct xfs_mount *mp, int length, int count, struct xlog_ticket **ticket, - __uint8_t clientid, + uint8_t clientid, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 82f1cbcc4de1..fbe72b134bef 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -410,6 +410,7 @@ xlog_cil_insert_items( int len = 0; int diff_iovecs = 0; int iclog_space; + int iovhdr_res = 0, split_res = 0, ctx_res = 0; ASSERT(tp); @@ -419,30 +420,11 @@ xlog_cil_insert_items( 
*/ xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); - /* - * Now (re-)position everything modified at the tail of the CIL. - * We do this here so we only need to take the CIL lock once during - * the transaction commit. - */ spin_lock(&cil->xc_cil_lock); - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; - - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* - * Only move the item if it isn't already at the tail. This is - * to prevent a transient list_empty() state when reinserting - * an item that is already the only item in the CIL. - */ - if (!list_is_last(&lip->li_cil, &cil->xc_cil)) - list_move_tail(&lip->li_cil, &cil->xc_cil); - } /* account for space used by new iovec headers */ - len += diff_iovecs * sizeof(xlog_op_header_t); + iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t); + len += iovhdr_res; ctx->nvecs += diff_iovecs; /* attach the transaction to the CIL if it has any busy extents */ @@ -457,28 +439,66 @@ xlog_cil_insert_items( * during the transaction commit. */ if (ctx->ticket->t_curr_res == 0) { - ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; - tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res; + ctx_res = ctx->ticket->t_unit_res; + ctx->ticket->t_curr_res = ctx_res; + tp->t_ticket->t_curr_res -= ctx_res; } /* do we need space for more log record headers? 
*/ iclog_space = log->l_iclog_size - log->l_iclog_hsize; if (len > 0 && (ctx->space_used / iclog_space != (ctx->space_used + len) / iclog_space)) { - int hdrs; - - hdrs = (len + iclog_space - 1) / iclog_space; + split_res = (len + iclog_space - 1) / iclog_space; /* need to take into account split region headers, too */ - hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); - ctx->ticket->t_unit_res += hdrs; - ctx->ticket->t_curr_res += hdrs; - tp->t_ticket->t_curr_res -= hdrs; + split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += split_res; + ctx->ticket->t_curr_res += split_res; + tp->t_ticket->t_curr_res -= split_res; ASSERT(tp->t_ticket->t_curr_res >= len); } tp->t_ticket->t_curr_res -= len; ctx->space_used += len; + /* + * If we've overrun the reservation, dump the tx details before we move + * the log items. Shutdown is imminent... + */ + if (WARN_ON(tp->t_ticket->t_curr_res < 0)) { + xfs_warn(log->l_mp, "Transaction log reservation overrun:"); + xfs_warn(log->l_mp, + " log items: %d bytes (iov hdrs: %d bytes)", + len, iovhdr_res); + xfs_warn(log->l_mp, " split region headers: %d bytes", + split_res); + xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res); + xlog_print_trans(tp); + } + + /* + * Now (re-)position everything modified at the tail of the CIL. + * We do this here so we only need to take the CIL lock once during + * the transaction commit. + */ + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* + * Only move the item if it isn't already at the tail. This is + * to prevent a transient list_empty() state when reinserting + * an item that is already the only item in the CIL. 
+ */ + if (!list_is_last(&lip->li_cil, &cil->xc_cil)) + list_move_tail(&lip->li_cil, &cil->xc_cil); + } + spin_unlock(&cil->xc_cil_lock); + + if (tp->t_ticket->t_curr_res < 0) + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } static void @@ -973,6 +993,7 @@ xfs_log_commit_cil( { struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + xfs_lsn_t xc_commit_lsn; /* * Do all necessary memory allocation before we lock the CIL. @@ -986,13 +1007,9 @@ xfs_log_commit_cil( xlog_cil_insert_items(log, tp); - /* check we didn't blow the reservation */ - if (tp->t_ticket->t_curr_res < 0) - xlog_print_tic_res(mp, tp->t_ticket); - - tp->t_commit_lsn = cil->xc_ctx->sequence; + xc_commit_lsn = cil->xc_ctx->sequence; if (commit_lsn) - *commit_lsn = tp->t_commit_lsn; + *commit_lsn = xc_commit_lsn; xfs_log_done(mp, tp->t_ticket, NULL, regrant); xfs_trans_unreserve_and_mod_sb(tp); @@ -1008,7 +1025,7 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. 
*/ - xfs_trans_free_items(tp, tp->t_commit_lsn, false); + xfs_trans_free_items(tp, xc_commit_lsn, false); xlog_cil_push_background(log); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c2604a5366f2..51bf7b827387 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -419,7 +419,7 @@ struct xlog { }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) @@ -456,6 +456,7 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) } void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +void xlog_print_trans(struct xfs_trans *); int xlog_write( struct xlog *log, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index cd0b077deb35..9549188f5a36 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -352,13 +352,13 @@ xlog_header_check_mount( { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); - if (uuid_is_nil(&head->h_fs_uuid)) { + if (uuid_is_null(&head->h_fs_uuid)) { /* * IRIX doesn't write the h_fs_uuid or h_fmt fields. If - * h_fs_uuid is nil, we assume this log was last mounted + * h_fs_uuid is null, we assume this log was last mounted * by IRIX and continue. 
*/ - xfs_warn(mp, "nil uuid in log - IRIX style log"); + xfs_warn(mp, "null uuid in log - IRIX style log"); } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { xfs_warn(mp, "log has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); @@ -2230,9 +2230,9 @@ xlog_recover_get_buf_lsn( struct xfs_mount *mp, struct xfs_buf *bp) { - __uint32_t magic32; - __uint16_t magic16; - __uint16_t magicda; + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; void *blk = bp->b_addr; uuid_t *uuid; xfs_lsn_t lsn = -1; @@ -2381,9 +2381,9 @@ xlog_recover_validate_buf_type( xfs_lsn_t current_lsn) { struct xfs_da_blkinfo *info = bp->b_addr; - __uint32_t magic32; - __uint16_t magic16; - __uint16_t magicda; + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; char *warnmsg = NULL; /* @@ -2852,7 +2852,7 @@ xlog_recover_buffer_pass2( if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)log->l_mp->m_inode_cluster_size))) { + (uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { @@ -3423,7 +3423,7 @@ xlog_recover_efd_pass2( xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; xfs_log_item_t *lip; - __uint64_t efi_id; + uint64_t efi_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -3519,7 +3519,7 @@ xlog_recover_rud_pass2( struct xfs_rud_log_format *rud_formatp; struct xfs_rui_log_item *ruip = NULL; struct xfs_log_item *lip; - __uint64_t rui_id; + uint64_t rui_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -3635,7 +3635,7 @@ xlog_recover_cud_pass2( struct xfs_cud_log_format *cud_formatp; struct xfs_cui_log_item *cuip = NULL; struct xfs_log_item *lip; - __uint64_t cui_id; + uint64_t cui_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -3754,7 +3754,7 @@ xlog_recover_bud_pass2( struct xfs_bud_log_format *bud_formatp; struct 
xfs_bui_log_item *buip = NULL; struct xfs_log_item *lip; - __uint64_t bui_id; + uint64_t bui_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -4152,7 +4152,7 @@ xlog_recover_commit_trans( #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 - hlist_del(&trans->r_list); + hlist_del_init(&trans->r_list); error = xlog_recover_reorder_trans(log, trans, pass); if (error) @@ -4354,6 +4354,8 @@ xlog_recover_free_trans( xlog_recover_item_t *item, *n; int i; + hlist_del_init(&trans->r_list); + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { /* Free the regions in the item. */ list_del(&item->ri_list); @@ -5224,12 +5226,16 @@ xlog_do_recovery_pass( int error2 = 0; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; + int i; struct hlist_head rhash[XLOG_RHASH_SIZE]; LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); rhead_blk = 0; + for (i = 0; i < XLOG_RHASH_SIZE; i++) + INIT_HLIST_HEAD(&rhash[i]); + /* * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. @@ -5466,6 +5472,19 @@ xlog_do_recovery_pass( if (error && first_bad) *first_bad = rhead_blk; + /* + * Transactions are freed at commit time but transactions without commit + * records on disk are never committed. Free any that may be left in the + * hash table. + */ + for (i = 0; i < XLOG_RHASH_SIZE; i++) { + struct hlist_node *tmp; + struct xlog_recover *trans; + + hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) + xlog_recover_free_trans(trans); + } + return error ? 
error : error2; } @@ -5772,9 +5791,9 @@ xlog_recover_check_summary( xfs_buf_t *agfbp; xfs_buf_t *agibp; xfs_agnumber_t agno; - __uint64_t freeblks; - __uint64_t itotal; - __uint64_t ifree; + uint64_t freeblks; + uint64_t itotal; + uint64_t ifree; int error; mp = log->l_mp; diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 11792d888e4e..e68bd1050eab 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -110,7 +110,10 @@ assfail(char *expr, char *file, int line) { xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", expr, file, line); - BUG(); + if (xfs_globals.bug_on_assert) + BUG(); + else + WARN_ON(1); } void diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2eaf81859166..40d4e8b4e193 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -74,20 +74,19 @@ xfs_uuid_mount( int hole, i; /* Publish UUID in struct super_block */ - BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t)); - memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t)); + uuid_copy(&mp->m_super->s_uuid, uuid); if (mp->m_flags & XFS_MOUNT_NOUUID) return 0; - if (uuid_is_nil(uuid)) { - xfs_warn(mp, "Filesystem has nil UUID - can't mount"); + if (uuid_is_null(uuid)) { + xfs_warn(mp, "Filesystem has null UUID - can't mount"); return -EINVAL; } mutex_lock(&xfs_uuid_table_mutex); for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { - if (uuid_is_nil(&xfs_uuid_table[i])) { + if (uuid_is_null(&xfs_uuid_table[i])) { hole = i; continue; } @@ -124,7 +123,7 @@ xfs_uuid_unmount( mutex_lock(&xfs_uuid_table_mutex); for (i = 0; i < xfs_uuid_table_size; i++) { - if (uuid_is_nil(&xfs_uuid_table[i])) + if (uuid_is_null(&xfs_uuid_table[i])) continue; if (!uuid_equal(uuid, &xfs_uuid_table[i])) continue; @@ -174,7 +173,7 @@ xfs_free_perag( int xfs_sb_validate_fsb_count( xfs_sb_t *sbp, - __uint64_t nblocks) + uint64_t nblocks) { ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); ASSERT(sbp->sb_blocklog >= BBSHIFT); @@ -436,7 +435,7 @@ STATIC void xfs_set_maxicount(xfs_mount_t *mp) { 
xfs_sb_t *sbp = &(mp->m_sb); - __uint64_t icount; + uint64_t icount; if (sbp->sb_imax_pct) { /* @@ -502,7 +501,7 @@ xfs_set_low_space_thresholds( int i; for (i = 0; i < XFS_LOWSP_MAX; i++) { - __uint64_t space = mp->m_sb.sb_dblocks; + uint64_t space = mp->m_sb.sb_dblocks; do_div(space, 100); mp->m_low_space[i] = space * (i + 1); @@ -598,10 +597,10 @@ xfs_mount_reset_sbqflags( return xfs_sync_sb(mp, false); } -__uint64_t +uint64_t xfs_default_resblks(xfs_mount_t *mp) { - __uint64_t resblks; + uint64_t resblks; /* * We default to 5% or 8192 fsbs of space reserved, whichever is @@ -612,7 +611,7 @@ xfs_default_resblks(xfs_mount_t *mp) */ resblks = mp->m_sb.sb_dblocks; do_div(resblks, 20); - resblks = min_t(__uint64_t, resblks, 8192); + resblks = min_t(uint64_t, resblks, 8192); return resblks; } @@ -632,7 +631,7 @@ xfs_mountfs( { struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; - __uint64_t resblks; + uint64_t resblks; uint quotamount = 0; uint quotaflags = 0; int error = 0; @@ -720,10 +719,13 @@ xfs_mountfs( if (error) goto out_del_stats; + error = xfs_errortag_init(mp); + if (error) + goto out_remove_error_sysfs; error = xfs_uuid_mount(mp); if (error) - goto out_remove_error_sysfs; + goto out_remove_errortag; /* * Set the minimum read and write sizes @@ -793,7 +795,10 @@ xfs_mountfs( * Copies the low order bits of the timestamp and the randomly * set "sequence" number out of a UUID. 
*/ - uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + mp->m_fixedfsid[0] = + (get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) | + get_unaligned_be16(&sbp->sb_uuid.b[4]); + mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]); mp->m_dmevmask = 0; /* not persistent; set after each mount */ @@ -1042,6 +1047,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_remove_errortag: + xfs_errortag_del(mp); out_remove_error_sysfs: xfs_error_sysfs_del(mp); out_del_stats: @@ -1060,7 +1067,7 @@ void xfs_unmountfs( struct xfs_mount *mp) { - __uint64_t resblks; + uint64_t resblks; int error; cancel_delayed_work_sync(&mp->m_eofblocks_work); @@ -1145,10 +1152,11 @@ xfs_unmountfs( xfs_uuid_unmount(mp); #if defined(DEBUG) - xfs_errortag_clearall(mp, 0); + xfs_errortag_clearall(mp); #endif xfs_free_perag(mp); + xfs_errortag_del(mp); xfs_error_sysfs_del(mp); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); @@ -1209,7 +1217,7 @@ xfs_mod_icount( struct xfs_mount *mp, int64_t delta) { - __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH); + percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH); if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { ASSERT(0); percpu_counter_add(&mp->m_icount, -delta); @@ -1288,7 +1296,7 @@ xfs_mod_fdblocks( else batch = XFS_FDBLOCKS_BATCH; - __percpu_counter_add(&mp->m_fdblocks, delta, batch); + percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! 
*/ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9fa312a41c93..e0792d036be2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -108,10 +108,10 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ - __uint8_t m_blkbit_log; /* blocklog + NBBY */ - __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ - __uint8_t m_agno_log; /* log #ag's */ - __uint8_t m_agino_log; /* #bits for agino in inum */ + uint8_t m_blkbit_log; /* blocklog + NBBY */ + uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ + uint8_t m_agno_log; /* log #ag's */ + uint8_t m_agino_log; /* #bits for agino in inum */ uint m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ @@ -139,7 +139,7 @@ typedef struct xfs_mount { struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ - __uint64_t m_flags; /* global mount flags */ + uint64_t m_flags; /* global mount flags */ bool m_inotbt_nores; /* no per-AG finobt resv. 
*/ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ @@ -148,14 +148,14 @@ typedef struct xfs_mount { int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ struct xfs_trans_resv m_resv; /* precomputed res values */ - __uint64_t m_maxicount; /* maximum inode count */ - __uint64_t m_resblks; /* total reserved blocks */ - __uint64_t m_resblks_avail;/* available reserved blocks */ - __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ + uint64_t m_maxicount; /* maximum inode count */ + uint64_t m_resblks; /* total reserved blocks */ + uint64_t m_resblks_avail;/* available reserved blocks */ + uint64_t m_resblks_save; /* reserved blks @ remount,ro */ int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ - __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ @@ -194,19 +194,17 @@ typedef struct xfs_mount { * ever support shrinks it would have to be persisted in addition * to various other kinds of pain inflicted on the pNFS server. */ - __uint32_t m_generation; + uint32_t m_generation; bool m_fail_unmount; #ifdef DEBUG /* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. + * Frequency with which errors are injected. Replaces xfs_etest; the + * value stored in here is the inverse of the frequency with which the + * error triggers. 
1 = always, 2 = half the time, etc. */ - bool m_drop_writes; + unsigned int *m_errortag; + struct xfs_kobj m_errortag_kobj; #endif } xfs_mount_t; @@ -325,20 +323,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } -#ifdef DEBUG -static inline bool -xfs_mp_drop_writes(struct xfs_mount *mp) -{ - return mp->m_drop_writes; -} -#else -static inline bool -xfs_mp_drop_writes(struct xfs_mount *mp) -{ - return 0; -} -#endif - /* per-AG block reservation data structures*/ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, @@ -367,12 +351,12 @@ typedef struct xfs_perag { char pagi_init; /* this agi's entry is initialized */ char pagf_metadata; /* the agf is preferred to be metadata */ char pagi_inodeok; /* The agi is ok for inodes */ - __uint8_t pagf_levels[XFS_BTNUM_AGF]; + uint8_t pagf_levels[XFS_BTNUM_AGF]; /* # of levels in bno & cnt btree */ - __uint32_t pagf_flcount; /* count of blocks in freelist */ + uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ - __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ xfs_agino_t pagi_freecount; /* number of free inodes */ xfs_agino_t pagi_count; /* number of allocated inodes */ @@ -411,7 +395,7 @@ typedef struct xfs_perag { struct xfs_ag_resv pag_agfl_resv; /* reference count */ - __uint8_t pagf_refcount_level; + uint8_t pagf_refcount_level; } xfs_perag_t; static inline struct xfs_ag_resv * @@ -434,7 +418,7 @@ void xfs_buf_hash_destroy(xfs_perag_t *pag); extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); -extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); +extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); @@ 
-450,7 +434,7 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); -extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5fe6e70b88ef..6ce948c436d5 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1247,6 +1247,7 @@ xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { + struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; @@ -1257,7 +1258,32 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - xfs_dqflock(dqp); + /* + * The only way the dquot is already flush locked by the time quotacheck + * gets here is if reclaim flushed it before the dqadjust walk dirtied + * it for the final time. Quotacheck collects all dquot bufs in the + * local delwri queue before dquots are dirtied, so reclaim can't have + * possibly queued it for I/O. The only way out is to push the buffer to + * cycle the flush lock. 
+ */ + if (!xfs_dqflock_nowait(dqp)) { + /* buf is pinned in-core by delwri list */ + DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen); + bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); + if (!bp) { + error = -EINVAL; + goto out_unlock; + } + xfs_buf_unlock(bp); + + xfs_buf_delwri_pushbuf(bp, buffer_list); + xfs_buf_rele(bp); + + error = -EAGAIN; + goto out_unlock; + } + error = xfs_qm_dqflush(dqp, &bp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 3e52d5de7ae1..2be6d2735ca9 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -33,7 +33,7 @@ xfs_fill_statvfs_from_dquot( struct kstatfs *statp, struct xfs_dquot *dqp) { - __uint64_t limit; + uint64_t limit; limit = dqp->q_core.d_blk_softlimit ? be64_to_cpu(dqp->q_core.d_blk_softlimit) : diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index f82d79a8c694..de9493253edf 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -269,7 +269,6 @@ xfs_fs_get_nextdqblk( /* ID may be different, so convert back what we got */ *qid = make_kqid(current_user_ns(), qid->type, id); return 0; - } STATIC int diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ffe6fe7a7eb5..ab2270a87196 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -155,6 +155,7 @@ int xfs_reflink_find_shared( struct xfs_mount *mp, + struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, @@ -166,18 +167,18 @@ xfs_reflink_find_shared( struct xfs_btree_cur *cur; int error; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); xfs_btree_del_cursor(cur, error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); return error; } @@ -217,7 +218,7 @@ xfs_reflink_trim_around_shared( agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); aglen = irec->br_blockcount; - error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, + error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno, aglen, &fbno, &flen, true); if (error) return error; @@ -1373,8 +1374,8 @@ xfs_reflink_dirty_extents( agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); aglen = map[1].br_blockcount; - error = xfs_reflink_find_shared(mp, agno, agbno, aglen, - &rbno, &rlen, true); + error = xfs_reflink_find_shared(mp, NULL, agno, agbno, + aglen, &rbno, &rlen, true); if (error) goto out; if (rbno == NULLAGBLOCK) @@ -1405,57 +1406,73 @@ out: return error; } -/* Clear the inode reflink flag if there are no shared extents. */ +/* Does this inode need the reflink flag? */ int -xfs_reflink_clear_inode_flag( - struct xfs_inode *ip, - struct xfs_trans **tpp) +xfs_reflink_inode_has_shared_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + bool *has_shared) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t fbno; - xfs_filblks_t end; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; - struct xfs_bmbt_irec map; - int nmaps; - int error = 0; - - ASSERT(xfs_is_reflink_inode(ip)); + struct xfs_bmbt_irec got; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + xfs_extnum_t idx; + bool found; + int error; - fbno = 0; - end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); - while (end - fbno > 0) { - nmaps = 1; - /* - * Look for extents in the file. Skip holes, delalloc, or - * unwritten extents; they can't be reflinked. 
- */ - error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) return error; - if (nmaps == 0) - break; - if (!xfs_bmap_is_real_extent(&map)) - goto next; + } - agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); - agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); - aglen = map.br_blockcount; + *has_shared = false; + found = xfs_iext_lookup_extent(ip, ifp, 0, &idx, &got); + while (found) { + if (isnullstartblock(got.br_startblock) || + got.br_state != XFS_EXT_NORM) + goto next; + agno = XFS_FSB_TO_AGNO(mp, got.br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); + aglen = got.br_blockcount; - error = xfs_reflink_find_shared(mp, agno, agbno, aglen, + error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen, &rbno, &rlen, false); if (error) return error; /* Is there still a shared block here? */ - if (rbno != NULLAGBLOCK) + if (rbno != NULLAGBLOCK) { + *has_shared = true; return 0; + } next: - fbno = map.br_startoff + map.br_blockcount; + found = xfs_iext_get_extent(ifp, ++idx, &got); } + return 0; +} + +/* Clear the inode reflink flag if there are no shared extents. */ +int +xfs_reflink_clear_inode_flag( + struct xfs_inode *ip, + struct xfs_trans **tpp) +{ + bool needs_flag; + int error = 0; + + ASSERT(xfs_is_reflink_inode(ip)); + + error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag); + if (error || needs_flag) + return error; + /* * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. 
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index d29a7967f029..701487bab468 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -20,9 +20,9 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 -extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, - xfs_extlen_t *flen, bool find_maximal); +extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, + xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); @@ -47,6 +47,8 @@ extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); +extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, + struct xfs_inode *ip, bool *has_shared); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, struct xfs_trans **tpp); extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index c57aa7f18087..91472193643b 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1256,13 +1256,13 @@ xfs_rtpick_extent( { xfs_rtblock_t b; /* result block */ int log2; /* log of sequence number */ - __uint64_t resid; /* residual after log removed */ - __uint64_t seq; /* sequence number of file creation */ - __uint64_t *seqp; /* pointer to seqno in inode */ + uint64_t resid; /* residual after log removed */ + uint64_t seq; /* sequence number of file creation */ + uint64_t *seqp; /* pointer to seqno in inode */ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; + seqp = 
(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; *seqp = 0; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index f13133e6f19f..79defa722bf1 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -107,6 +107,8 @@ xfs_growfs_rt( /* * From xfs_rtbitmap.c */ +int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t block, int issum, struct xfs_buf **bpp); int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val, xfs_rtblock_t *new, int *stat); @@ -143,6 +145,7 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp, # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) # define xfs_rtalloc_query_all(t,f,p) (ENOSYS) +# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f11282c96887..056e12b421eb 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -33,9 +33,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { int i, j; int len = 0; - __uint64_t xs_xstrat_bytes = 0; - __uint64_t xs_write_bytes = 0; - __uint64_t xs_read_bytes = 0; + uint64_t xs_xstrat_bytes = 0; + uint64_t xs_write_bytes = 0; + uint64_t xs_read_bytes = 0; static const struct xstats_entry { char *desc; @@ -100,7 +100,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) void xfs_stats_clearall(struct xfsstats __percpu *stats) { int c; - __uint32_t vn_active; + uint32_t vn_active; xfs_notice(NULL, "Clearing xfsstats"); for_each_possible_cpu(c) { diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 375840f5a99a..f64d0ae345c4 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -54,125 +54,125 @@ enum { */ struct __xfsstats { # define XFSSTAT_END_EXTENT_ALLOC 4 - __uint32_t xs_allocx; - 
__uint32_t xs_allocb; - __uint32_t xs_freex; - __uint32_t xs_freeb; + uint32_t xs_allocx; + uint32_t xs_allocb; + uint32_t xs_freex; + uint32_t xs_freeb; # define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) - __uint32_t xs_abt_lookup; - __uint32_t xs_abt_compare; - __uint32_t xs_abt_insrec; - __uint32_t xs_abt_delrec; + uint32_t xs_abt_lookup; + uint32_t xs_abt_compare; + uint32_t xs_abt_insrec; + uint32_t xs_abt_delrec; # define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) - __uint32_t xs_blk_mapr; - __uint32_t xs_blk_mapw; - __uint32_t xs_blk_unmap; - __uint32_t xs_add_exlist; - __uint32_t xs_del_exlist; - __uint32_t xs_look_exlist; - __uint32_t xs_cmp_exlist; + uint32_t xs_blk_mapr; + uint32_t xs_blk_mapw; + uint32_t xs_blk_unmap; + uint32_t xs_add_exlist; + uint32_t xs_del_exlist; + uint32_t xs_look_exlist; + uint32_t xs_cmp_exlist; # define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) - __uint32_t xs_bmbt_lookup; - __uint32_t xs_bmbt_compare; - __uint32_t xs_bmbt_insrec; - __uint32_t xs_bmbt_delrec; + uint32_t xs_bmbt_lookup; + uint32_t xs_bmbt_compare; + uint32_t xs_bmbt_insrec; + uint32_t xs_bmbt_delrec; # define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) - __uint32_t xs_dir_lookup; - __uint32_t xs_dir_create; - __uint32_t xs_dir_remove; - __uint32_t xs_dir_getdents; + uint32_t xs_dir_lookup; + uint32_t xs_dir_create; + uint32_t xs_dir_remove; + uint32_t xs_dir_getdents; # define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) - __uint32_t xs_trans_sync; - __uint32_t xs_trans_async; - __uint32_t xs_trans_empty; + uint32_t xs_trans_sync; + uint32_t xs_trans_async; + uint32_t xs_trans_empty; # define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) - __uint32_t xs_ig_attempts; - __uint32_t xs_ig_found; - __uint32_t xs_ig_frecycle; - __uint32_t xs_ig_missed; - __uint32_t xs_ig_dup; - __uint32_t xs_ig_reclaims; - __uint32_t xs_ig_attrchg; + uint32_t xs_ig_attempts; + uint32_t xs_ig_found; + uint32_t 
xs_ig_frecycle; + uint32_t xs_ig_missed; + uint32_t xs_ig_dup; + uint32_t xs_ig_reclaims; + uint32_t xs_ig_attrchg; # define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) - __uint32_t xs_log_writes; - __uint32_t xs_log_blocks; - __uint32_t xs_log_noiclogs; - __uint32_t xs_log_force; - __uint32_t xs_log_force_sleep; + uint32_t xs_log_writes; + uint32_t xs_log_blocks; + uint32_t xs_log_noiclogs; + uint32_t xs_log_force; + uint32_t xs_log_force_sleep; # define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) - __uint32_t xs_try_logspace; - __uint32_t xs_sleep_logspace; - __uint32_t xs_push_ail; - __uint32_t xs_push_ail_success; - __uint32_t xs_push_ail_pushbuf; - __uint32_t xs_push_ail_pinned; - __uint32_t xs_push_ail_locked; - __uint32_t xs_push_ail_flushing; - __uint32_t xs_push_ail_restarts; - __uint32_t xs_push_ail_flush; + uint32_t xs_try_logspace; + uint32_t xs_sleep_logspace; + uint32_t xs_push_ail; + uint32_t xs_push_ail_success; + uint32_t xs_push_ail_pushbuf; + uint32_t xs_push_ail_pinned; + uint32_t xs_push_ail_locked; + uint32_t xs_push_ail_flushing; + uint32_t xs_push_ail_restarts; + uint32_t xs_push_ail_flush; # define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) - __uint32_t xs_xstrat_quick; - __uint32_t xs_xstrat_split; + uint32_t xs_xstrat_quick; + uint32_t xs_xstrat_split; # define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) - __uint32_t xs_write_calls; - __uint32_t xs_read_calls; + uint32_t xs_write_calls; + uint32_t xs_read_calls; # define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) - __uint32_t xs_attr_get; - __uint32_t xs_attr_set; - __uint32_t xs_attr_remove; - __uint32_t xs_attr_list; + uint32_t xs_attr_get; + uint32_t xs_attr_set; + uint32_t xs_attr_remove; + uint32_t xs_attr_list; # define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) - __uint32_t xs_iflush_count; - __uint32_t xs_icluster_flushcnt; - __uint32_t xs_icluster_flushinode; + uint32_t xs_iflush_count; + uint32_t xs_icluster_flushcnt; + 
uint32_t xs_icluster_flushinode; # define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) - __uint32_t vn_active; /* # vnodes not on free lists */ - __uint32_t vn_alloc; /* # times vn_alloc called */ - __uint32_t vn_get; /* # times vn_get called */ - __uint32_t vn_hold; /* # times vn_hold called */ - __uint32_t vn_rele; /* # times vn_rele called */ - __uint32_t vn_reclaim; /* # times vn_reclaim called */ - __uint32_t vn_remove; /* # times vn_remove called */ - __uint32_t vn_free; /* # times vn_free called */ + uint32_t vn_active; /* # vnodes not on free lists */ + uint32_t vn_alloc; /* # times vn_alloc called */ + uint32_t vn_get; /* # times vn_get called */ + uint32_t vn_hold; /* # times vn_hold called */ + uint32_t vn_rele; /* # times vn_rele called */ + uint32_t vn_reclaim; /* # times vn_reclaim called */ + uint32_t vn_remove; /* # times vn_remove called */ + uint32_t vn_free; /* # times vn_free called */ #define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) - __uint32_t xb_get; - __uint32_t xb_create; - __uint32_t xb_get_locked; - __uint32_t xb_get_locked_waited; - __uint32_t xb_busy_locked; - __uint32_t xb_miss_locked; - __uint32_t xb_page_retries; - __uint32_t xb_page_found; - __uint32_t xb_get_read; + uint32_t xb_get; + uint32_t xb_create; + uint32_t xb_get_locked; + uint32_t xb_get_locked_waited; + uint32_t xb_busy_locked; + uint32_t xb_miss_locked; + uint32_t xb_page_retries; + uint32_t xb_page_found; + uint32_t xb_get_read; /* Version 2 btree counters */ #define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) - __uint32_t xs_abtb_2[__XBTS_MAX]; + uint32_t xs_abtb_2[__XBTS_MAX]; #define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) - __uint32_t xs_abtc_2[__XBTS_MAX]; + uint32_t xs_abtc_2[__XBTS_MAX]; #define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) - __uint32_t xs_bmbt_2[__XBTS_MAX]; + uint32_t xs_bmbt_2[__XBTS_MAX]; #define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) - __uint32_t xs_ibt_2[__XBTS_MAX]; + uint32_t 
xs_ibt_2[__XBTS_MAX]; #define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) - __uint32_t xs_fibt_2[__XBTS_MAX]; + uint32_t xs_fibt_2[__XBTS_MAX]; #define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) - __uint32_t xs_rmap_2[__XBTS_MAX]; + uint32_t xs_rmap_2[__XBTS_MAX]; #define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) - __uint32_t xs_refcbt_2[__XBTS_MAX]; + uint32_t xs_refcbt_2[__XBTS_MAX]; #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) - __uint32_t xs_qm_dqreclaims; - __uint32_t xs_qm_dqreclaim_misses; - __uint32_t xs_qm_dquot_dups; - __uint32_t xs_qm_dqcachemisses; - __uint32_t xs_qm_dqcachehits; - __uint32_t xs_qm_dqwants; + uint32_t xs_qm_dqreclaims; + uint32_t xs_qm_dqreclaim_misses; + uint32_t xs_qm_dquot_dups; + uint32_t xs_qm_dqcachemisses; + uint32_t xs_qm_dqcachehits; + uint32_t xs_qm_dqwants; #define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) - __uint32_t xs_qm_dquot; - __uint32_t xs_qm_dquot_unused; + uint32_t xs_qm_dquot; + uint32_t xs_qm_dquot_unused; /* Extra precision counters */ - __uint64_t xs_xstrat_bytes; - __uint64_t xs_write_bytes; - __uint64_t xs_read_bytes; + uint64_t xs_xstrat_bytes; + uint64_t xs_write_bytes; + uint64_t xs_read_bytes; }; struct xfsstats { @@ -186,7 +186,7 @@ struct xfsstats { * simple wrapper for getting the array index of s struct member offset */ #define XFS_STATS_CALC_INDEX(member) \ - (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t)) + (offsetof(struct __xfsstats, member) / (int)sizeof(uint32_t)) int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 455a575f101d..38aaacdbb8b3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -196,7 +196,7 @@ xfs_parseargs( int dsunit = 0; int dswidth = 0; int iosize = 0; - __uint8_t iosizelog = 0; + uint8_t iosizelog = 0; /* * set up the mount name first so all the errors will refer to the @@ -556,7 +556,7 @@ xfs_showargs( return 0; } -static __uint64_t 
+static uint64_t xfs_max_file_offset( unsigned int blockshift) { @@ -587,7 +587,7 @@ xfs_max_file_offset( # endif #endif - return (((__uint64_t)pagefactor) << bitshift) - 1; + return (((uint64_t)pagefactor) << bitshift) - 1; } /* @@ -622,7 +622,7 @@ xfs_set_inode_alloc( * the max inode percentage. Used only for inode32. */ if (mp->m_maxicount) { - __uint64_t icount; + uint64_t icount; icount = sbp->sb_dblocks * sbp->sb_imax_pct; do_div(icount, 100); @@ -1088,12 +1088,12 @@ xfs_fs_statfs( struct xfs_mount *mp = XFS_M(dentry->d_sb); xfs_sb_t *sbp = &mp->m_sb; struct xfs_inode *ip = XFS_I(d_inode(dentry)); - __uint64_t fakeinos, id; - __uint64_t icount; - __uint64_t ifree; - __uint64_t fdblocks; + uint64_t fakeinos, id; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; xfs_extlen_t lsize; - __int64_t ffree; + int64_t ffree; statp->f_type = XFS_SB_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1116,7 +1116,7 @@ xfs_fs_statfs( statp->f_bavail = statp->f_bfree; fakeinos = statp->f_bfree << sbp->sb_inopblog; - statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + statp->f_files = MIN(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), statp->f_files, @@ -1129,7 +1129,7 @@ xfs_fs_statfs( /* make sure statp->f_ffree does not underflow */ ffree = statp->f_files - (icount - ifree); - statp->f_ffree = max_t(__int64_t, ffree, 0); + statp->f_ffree = max_t(int64_t, ffree, 0); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && @@ -1142,7 +1142,7 @@ xfs_fs_statfs( STATIC void xfs_save_resvblks(struct xfs_mount *mp) { - __uint64_t resblks = 0; + uint64_t resblks = 0; mp->m_resblks_save = mp->m_resblks; xfs_reserve_blocks(mp, &resblks, NULL); @@ -1151,7 +1151,7 @@ xfs_save_resvblks(struct xfs_mount *mp) STATIC void xfs_restore_resvblks(struct xfs_mount *mp) { - __uint64_t resblks; + uint64_t resblks; if (mp->m_resblks_save) { resblks = mp->m_resblks_save; @@ -1766,7 +1766,8 @@ STATIC int __init 
xfs_init_zones(void) { xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, - offsetof(struct xfs_ioend, io_inline_bio)); + offsetof(struct xfs_ioend, io_inline_bio), + BIOSET_NEED_BVECS); if (!xfs_ioend_bioset) goto out; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index f2cb45ed1d54..12cd9cf7de41 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -43,8 +43,8 @@ #include "xfs_log.h" /* ----- Kernel only functions below ----- */ -STATIC int -xfs_readlink_bmap( +int +xfs_readlink_bmap_ilocked( struct xfs_inode *ip, char *link) { @@ -143,7 +143,7 @@ xfs_readlink( if (!pathlen) goto out; - if (pathlen < 0 || pathlen > MAXPATHLEN) { + if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", __func__, (unsigned long long) ip->i_ino, (long long) pathlen); @@ -153,7 +153,7 @@ xfs_readlink( } - error = xfs_readlink_bmap(ip, link); + error = xfs_readlink_bmap_ilocked(ip, link); out: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -202,7 +202,7 @@ xfs_symlink( * Check component lengths of the target path name. 
*/ pathlen = strlen(target_path); - if (pathlen >= MAXPATHLEN) /* total string too long */ + if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */ return -ENAMETOOLONG; udqp = gdqp = NULL; @@ -559,7 +559,7 @@ xfs_inactive_symlink( return 0; } - if (pathlen < 0 || pathlen > MAXPATHLEN) { + if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", __func__, (unsigned long long)ip->i_ino, pathlen); xfs_iunlock(ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index e75245d09116..aeaee8923617 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -21,6 +21,7 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); int xfs_readlink(struct xfs_inode *ip, char *link); int xfs_inactive_symlink(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 984a3499cfe3..82afee005140 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -95,6 +95,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ + bool bug_on_assert; /* BUG() the kernel on assert failure */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 80ac15fb9638..8b2ccc234f36 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -90,15 +90,25 @@ to_mp(struct kobject *kobject) return container_of(kobj, struct xfs_mount, m_kobj); } +static struct attribute *xfs_mp_attrs[] = { + NULL, +}; + +struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_mp_attrs, +}; + #ifdef DEBUG +/* debug */ STATIC ssize_t -drop_writes_store( +bug_on_assert_store( struct kobject *kobject, const char *buf, size_t count) { - struct xfs_mount *mp = to_mp(kobject); int ret; int val; @@ -107,9 +117,9 
@@ drop_writes_store( return ret; if (val == 1) - mp->m_drop_writes = true; + xfs_globals.bug_on_assert = true; else if (val == 0) - mp->m_drop_writes = false; + xfs_globals.bug_on_assert = false; else return -EINVAL; @@ -117,33 +127,13 @@ drop_writes_store( } STATIC ssize_t -drop_writes_show( +bug_on_assert_show( struct kobject *kobject, char *buf) { - struct xfs_mount *mp = to_mp(kobject); - - return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_drop_writes ? 1 : 0); + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0); } -XFS_SYSFS_ATTR_RW(drop_writes); - -#endif /* DEBUG */ - -static struct attribute *xfs_mp_attrs[] = { -#ifdef DEBUG - ATTR_LIST(drop_writes), -#endif - NULL, -}; - -struct kobj_type xfs_mp_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_mp_attrs, -}; - -#ifdef DEBUG -/* debug */ +XFS_SYSFS_ATTR_RW(bug_on_assert); STATIC ssize_t log_recovery_delay_store( @@ -176,6 +166,7 @@ log_recovery_delay_show( XFS_SYSFS_ATTR_RW(log_recovery_delay); static struct attribute *xfs_dbg_attrs[] = { + ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), NULL, }; @@ -314,47 +305,11 @@ write_grant_head_show( } XFS_SYSFS_ATTR_RO(write_grant_head); -#ifdef DEBUG -STATIC ssize_t -log_badcrc_factor_store( - struct kobject *kobject, - const char *buf, - size_t count) -{ - struct xlog *log = to_xlog(kobject); - int ret; - uint32_t val; - - ret = kstrtouint(buf, 0, &val); - if (ret) - return ret; - - log->l_badcrc_factor = val; - - return count; -} - -STATIC ssize_t -log_badcrc_factor_show( - struct kobject *kobject, - char *buf) -{ - struct xlog *log = to_xlog(kobject); - - return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor); -} - -XFS_SYSFS_ATTR_RW(log_badcrc_factor); -#endif /* DEBUG */ - static struct attribute *xfs_log_attrs[] = { ATTR_LIST(log_head_lsn), ATTR_LIST(log_tail_lsn), ATTR_LIST(reserve_grant_head), ATTR_LIST(write_grant_head), -#ifdef DEBUG - ATTR_LIST(log_badcrc_factor), -#endif 
NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7c5a16528d8b..bcc3cdf8e1c5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -251,7 +251,7 @@ TRACE_EVENT(xfs_iext_insert, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - (__int64_t)__entry->startblock, + (int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -295,7 +295,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - (__int64_t)__entry->startblock, + (int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -367,6 +367,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); @@ -1280,7 +1281,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->count, __print_symbolic(__entry->type, XFS_IO_TYPES), __entry->startoff, - (__int64_t)__entry->startblock, + (int64_t)__entry->startblock, __entry->blockcount) ) @@ -1490,25 +1491,6 @@ TRACE_EVENT(xfs_extent_busy_trim, __entry->tlen) ); -TRACE_EVENT(xfs_trans_commit_lsn, - TP_PROTO(struct xfs_trans *trans), - TP_ARGS(trans), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(struct xfs_trans *, tp) - __field(xfs_lsn_t, lsn) - ), - TP_fast_assign( - __entry->dev = trans->t_mountp->m_super->s_dev; - __entry->tp = trans; - __entry->lsn = trans->t_commit_lsn; - ), - TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->tp, - __entry->lsn) -); - TRACE_EVENT(xfs_agf, TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, unsigned long caller_ip), @@ -2057,7 +2039,7 @@ 
DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, TP_ARGS(log, buf_f), TP_STRUCT__entry( __field(dev_t, dev) - __field(__int64_t, blkno) + __field(int64_t, blkno) __field(unsigned short, len) __field(unsigned short, flags) __field(unsigned short, size) @@ -2106,7 +2088,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, __field(int, fields) __field(unsigned short, asize) __field(unsigned short, dsize) - __field(__int64_t, blkno) + __field(int64_t, blkno) __field(int, len) __field(int, boffset) ), @@ -3256,8 +3238,8 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, __field(xfs_agnumber_t, agno) __field(xfs_fsblock_t, bno) __field(xfs_filblks_t, len) - __field(__uint64_t, owner) - __field(__uint64_t, offset) + __field(uint64_t, owner) + __field(uint64_t, offset) __field(unsigned int, flags) ), TP_fast_assign( @@ -3297,9 +3279,9 @@ DECLARE_EVENT_CLASS(xfs_getfsmap_class, __field(dev_t, keydev) __field(xfs_daddr_t, block) __field(xfs_daddr_t, len) - __field(__uint64_t, owner) - __field(__uint64_t, offset) - __field(__uint64_t, flags) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(uint64_t, flags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a07acbf0bd8a..6bdad6f58934 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -105,10 +105,6 @@ typedef struct xfs_trans { unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ struct xlog_ticket *t_ticket; /* log mgr ticket */ - xfs_lsn_t t_lsn; /* log seq num of start of - * transaction. */ - xfs_lsn_t t_commit_lsn; /* log seq num of end of - * transaction. 
*/ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ unsigned int t_flags; /* misc flags */ @@ -249,7 +245,7 @@ struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp, struct xfs_rui_log_item *ruip); int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type, - __uint64_t owner, int whichfork, xfs_fileoff_t startoff, + uint64_t owner, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); @@ -275,6 +271,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, enum xfs_bmap_intent_type type, struct xfs_inode *ip, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, xfs_exntst_t state); + xfs_filblks_t *blockcount, xfs_exntst_t state); #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 6408e7d7c08c..14543d93cd4b 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, + xfs_filblks_t *blockcount, xfs_exntst_t state) { int error; @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( void **state) { struct xfs_bmap_intent *bmap; + xfs_filblks_t count; int error; bmap = container_of(item, struct xfs_bmap_intent, bi_list); + count = bmap->bi_bmap.br_blockcount; error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, bmap->bi_type, bmap->bi_owner, bmap->bi_whichfork, bmap->bi_bmap.br_startoff, bmap->bi_bmap.br_startblock, - bmap->bi_bmap.br_blockcount, + &count, bmap->bi_bmap.br_state); + if (!error && count > 0) { + ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); + bmap->bi_bmap.br_blockcount = count; + return -EAGAIN; + } kmem_free(bmap); return 
error; } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8ee29ca132dc..86987d823d76 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, xfs_buf_t *bp) { xfs_buf_log_item_t *bip; + int freed; /* * Default to a normal brelse() call if the tp is NULL. @@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, /* * Drop our reference to the buf log item. */ - atomic_dec(&bip->bli_refcount); + freed = atomic_dec_and_test(&bip->bli_refcount); /* - * If the buf item is not tracking data in the log, then - * we must free it before releasing the buffer back to the - * free pool. Before releasing the buffer to the free pool, - * clear the transaction pointer in b_fsprivate2 to dissolve - * its relation to this transaction. + * If the buf item is not tracking data in the log, then we must free it + * before releasing the buffer back to the free pool. + * + * If the fs has shutdown and we dropped the last reference, it may fall + * on us to release a (possibly dirty) bli if it never made it to the + * AIL (e.g., the aborted unpin already happened and didn't release it + * due to our reference). Since we're already shutdown and need xa_lock, + * just force remove from the AIL and release the bli here. */ - if (!xfs_buf_item_dirty(bip)) { + if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); + } else if (!xfs_buf_item_dirty(bip)) { /*** ASSERT(bp->b_pincount == 0); ***/ diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 9ead064b5e90..9b577beb43d7 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -96,7 +96,7 @@ xfs_trans_log_finish_rmap_update( struct xfs_trans *tp, struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type, - __uint64_t owner, + uint64_t owner, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, |