diff options
Diffstat (limited to 'fs')
98 files changed, 1389 insertions, 3895 deletions
diff --git a/fs/Makefile b/fs/Makefile index f9cb9876e466..4030cbfbc9af 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +obj-y += buffer.o block_dev.o direct-io.o mpage.o else obj-y += no-block.o endif obj-$(CONFIG_PROC_FS) += proc_namespace.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o diff --git a/fs/affs/super.c b/fs/affs/super.c index 6d589f28bf9b..895ac7dc9dbf 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -340,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); - kfree(sbi->s_prefix); - kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 1c8c6cc6de30..4b0eff6da674 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call) { _enter(""); + /* Break the callbacks here so that we do it after the final ACK is + * received. The step number here must match the final number in + * afs_deliver_cb_callback(). + */ + if (call->unmarshall == 6) { + ASSERT(call->server && call->count && call->request); + afs_break_callbacks(call->server, call->count, call->request); + } + afs_put_server(call->server); call->server = NULL; kfree(call->buffer); @@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("trailer"); if (skb->len != 0) return -EBADMSG; + + /* Record that the message was unmarshalled successfully so + * that the call destructor can know do the callback breaking + * work, even if the final ACK isn't received. + * + * If the step number changes, then afs_cm_destructor() must be + * updated also. + */ + call->unmarshall++; + case 6: break; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index be75b500005d..590b55f46d61 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -75,7 +75,7 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - work_func_t async_workfn; + void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ struct sk_buff_head rx_queue; /* received packets */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index ef943df73b8c..03a3beb17004 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *); static int afs_wait_for_call_to_complete(struct afs_call *); static void afs_wake_up_async_call(struct afs_call *); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct work_struct *); +static void afs_process_async_call(struct afs_call *); static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); @@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *); static struct sk_buff_head afs_incoming_calls; static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); +static void afs_async_workfn(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + call->async_workfn(call); +} + /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT @@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call) } /* + * End a call but do not free it + */ +static void afs_end_call_nofree(struct afs_call *call) +{ + if (call->rxcall) { + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); +} + +/* + * End a call and free it + */ +static void afs_end_call(struct afs_call *call) +{ + afs_end_call_nofree(call); + afs_free_call(call); +} + +/* * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, @@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - INIT_WORK(&call->async_work, afs_process_async_call); + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -383,11 +413,8 @@ error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - rxrpc_kernel_end_call(rxcall); - call->rxcall = NULL; error_kill_call: - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state >= AFS_CALL_COMPLETE) { while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - if (call->incoming) { - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); - } + if (call->incoming) + afs_end_call(call); } _leave(""); @@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) } _debug("call complete"); - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct work_struct *work) +static void afs_delete_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); afs_free_call(call); @@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work) * - on a multiple-thread workqueue this work item may try to run on several * CPUs at the same time */ -static void afs_process_async_call(struct work_struct *work) +static void afs_process_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); if (!skb_queue_empty(&call->rx_queue)) @@ -637,10 +651,7 @@ static void afs_process_async_call(struct work_struct *work) call->reply = NULL; /* kill the call */ - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - if (call->type->destructor) - call->type->destructor(call); + afs_end_call_nofree(call); /* we can't just delete the call because the work item may be * queued */ @@ -663,13 +674,6 @@ void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) call->reply_size += len; } -static void afs_async_workfn(struct work_struct *work) -{ - struct afs_call *call = container_of(work, struct afs_call, async_work); - - call->async_workfn(work); -} - /* * accept the backlog of incoming calls */ @@ -790,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call) _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); default: - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); return; } @@ -823,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) call->state = AFS_CALL_AWAIT_ACK; n = rxrpc_kernel_send_data(call->rxcall, &msg, len); if (n >= 0) { + /* Success */ _leave(" [replied]"); return; } + if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); } - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); } @@ -112,6 +112,11 @@ struct kioctx { struct work_struct free_work; + /* + * signals when all in-flight requests are done + */ + struct completion *requests_done; + struct { /* * This counts the number of available slots in the ringbuffer, @@ -508,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + /* At this point we know that there are no any in-flight requests */ + if (ctx->requests_done) + complete(ctx->requests_done); + INIT_WORK(&ctx->free_work, free_ioctx); schedule_work(&ctx->free_work); } @@ -718,7 +727,8 @@ err: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct completion *requests_done) { if (!atomic_xchg(&ctx->dead, 1)) { struct kioctx_table *table; @@ -747,7 +757,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); + ctx->requests_done = requests_done; percpu_ref_kill(&ctx->users); + } else { + if (requests_done) + complete(requests_done); } } @@ -809,7 +823,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx); + kill_ioctx(mm, ctx, NULL); } } @@ -1185,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(current->mm, ioctx); + kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } @@ -1203,8 +1217,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(current->mm, ioctx); + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); + + /* Pass requests_done to kill_ioctx() where it can be set + * in a thread-safe way. If we try to set it here then we have + * a race condition if two io_destroy() called simultaneously. + */ + kill_ioctx(current->mm, ioctx, &requests_done); percpu_ref_put(&ioctx->users); + + /* Wait until all IO for the context are done. Otherwise kernel + * keep using user-space buffers even if user thinks the context + * is destroyed. + */ + wait_for_completion(&requests_done); + return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); @@ -1299,10 +1327,8 @@ rw_common: &iovec, compat) : aio_setup_single_vector(req, rw, buf, &nr_segs, iovec); - if (ret) - return ret; - - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (!ret) + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { if (iovec != &inline_vec) kfree(iovec); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2caf36ac3e93..cc87c1abac97 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c deleted file mode 100644 index 1c2ce0c87711..000000000000 --- a/fs/bio-integrity.c +++ /dev/null @@ -1,657 +0,0 @@ -/* - * bio-integrity.c - bio data integrity extensions - * - * Copyright (C) 2007, 2008, 2009 Oracle Corporation - * Written by: Martin K. Petersen <martin.petersen@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * - */ - -#include <linux/blkdev.h> -#include <linux/mempool.h> -#include <linux/export.h> -#include <linux/bio.h> -#include <linux/workqueue.h> -#include <linux/slab.h> - -#define BIP_INLINE_VECS 4 - -static struct kmem_cache *bip_slab; -static struct workqueue_struct *kintegrityd_wq; - -/** - * bio_integrity_alloc - Allocate integrity payload and attach it to bio - * @bio: bio to attach integrity metadata to - * @gfp_mask: Memory allocation mask - * @nr_vecs: Number of integrity metadata scatter-gather elements - * - * Description: This function prepares a bio for attaching integrity - * metadata. nr_vecs specifies the maximum number of pages containing - * integrity metadata that can be attached. - */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs) -{ - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned long idx = BIO_POOL_NONE; - unsigned inline_vecs; - - if (!bs) { - bip = kmalloc(sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * nr_vecs, gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIP_INLINE_VECS; - } - - if (unlikely(!bip)) - return NULL; - - memset(bip, 0, sizeof(*bip)); - - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - bs->bvec_integrity_pool); - if (!bip->bip_vec) - goto err; - } else { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_slab = idx; - bip->bip_bio = bio; - bio->bi_integrity = bip; - - return bip; -err: - mempool_free(bip, bs->bio_integrity_pool); - return NULL; -} -EXPORT_SYMBOL(bio_integrity_alloc); - -/** - * bio_integrity_free - Free bio integrity payload - * @bio: bio containing bip to be freed - * - * Description: Used to free the integrity portion of a bio. Usually - * called from bio_free(). - */ -void bio_integrity_free(struct bio *bio) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct bio_set *bs = bio->bi_pool; - - if (bip->bip_owns_buf) - kfree(bip->bip_buf); - - if (bs) { - if (bip->bip_slab != BIO_POOL_NONE) - bvec_free(bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); - - mempool_free(bip, bs->bio_integrity_pool); - } else { - kfree(bip); - } - - bio->bi_integrity = NULL; -} -EXPORT_SYMBOL(bio_integrity_free); - -static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) -{ - if (bip->bip_slab == BIO_POOL_NONE) - return BIP_INLINE_VECS; - - return bvec_nr_vecs(bip->bip_slab); -} - -/** - * bio_integrity_add_page - Attach integrity metadata - * @bio: bio to update - * @page: page containing integrity metadata - * @len: number of bytes of integrity metadata in page - * @offset: start offset within page - * - * Description: Attach a page containing integrity metadata to bio. - */ -int bio_integrity_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct bio_vec *iv; - - if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); - return 0; - } - - iv = bip->bip_vec + bip->bip_vcnt; - - iv->bv_page = page; - iv->bv_len = len; - iv->bv_offset = offset; - bip->bip_vcnt++; - - return len; -} -EXPORT_SYMBOL(bio_integrity_add_page); - -static int bdev_integrity_enabled(struct block_device *bdev, int rw) -{ - struct blk_integrity *bi = bdev_get_integrity(bdev); - - if (bi == NULL) - return 0; - - if (rw == READ && bi->verify_fn != NULL && - (bi->flags & INTEGRITY_FLAG_READ)) - return 1; - - if (rw == WRITE && bi->generate_fn != NULL && - (bi->flags & INTEGRITY_FLAG_WRITE)) - return 1; - - return 0; -} - -/** - * bio_integrity_enabled - Check whether integrity can be passed - * @bio: bio to check - * - * Description: Determines whether bio_integrity_prep() can be called - * on this bio or not. bio data direction and target device must be - * set prior to calling. The functions honors the write_generate and - * read_verify flags in sysfs. - */ -int bio_integrity_enabled(struct bio *bio) -{ - if (!bio_is_rw(bio)) - return 0; - - /* Already protected? */ - if (bio_integrity(bio)) - return 0; - - return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); -} -EXPORT_SYMBOL(bio_integrity_enabled); - -/** - * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto - * @bi: blk_integrity profile for device - * @sectors: Number of 512 sectors to convert - * - * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the hardware - * sector size of the storage device. Convert the block layer sectors - * to physical sectors. - */ -static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, - unsigned int sectors) -{ - /* At this point there are only 512b or 4096b DIF/EPP devices */ - if (bi->sector_size == 4096) - return sectors >>= 3; - - return sectors; -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size; -} - -/** - * bio_integrity_tag_size - Retrieve integrity tag space - * @bio: bio to inspect - * - * Description: Returns the maximum number of tag bytes that can be - * attached to this bio. Filesystems can use this to determine how - * much metadata to attach to an I/O. - */ -unsigned int bio_integrity_tag_size(struct bio *bio) -{ - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - - BUG_ON(bio->bi_iter.bi_size == 0); - - return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size); -} -EXPORT_SYMBOL(bio_integrity_tag_size); - -static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, - int set) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip->bip_buf == NULL); - - if (bi->tag_size == 0) - return -1; - - nr_sectors = bio_integrity_hw_sectors(bi, - DIV_ROUND_UP(len, bi->tag_size)); - - if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { - printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, - nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); - return -1; - } - - if (set) - bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); - else - bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); - - return 0; -} - -/** - * bio_integrity_set_tag - Attach a tag buffer to a bio - * @bio: bio to attach buffer to - * @tag_buf: Pointer to a buffer containing tag data - * @len: Length of the included buffer - * - * Description: Use this function to tag a bio by leveraging the extra - * space provided by devices formatted with integrity protection. The - * size of the integrity buffer must be <= to the size reported by - * bio_integrity_tag_size(). - */ -int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) -{ - BUG_ON(bio_data_dir(bio) != WRITE); - - return bio_integrity_tag(bio, tag_buf, len, 1); -} -EXPORT_SYMBOL(bio_integrity_set_tag); - -/** - * bio_integrity_get_tag - Retrieve a tag buffer from a bio - * @bio: bio to retrieve buffer from - * @tag_buf: Pointer to a buffer for the tag data - * @len: Length of the target buffer - * - * Description: Use this function to retrieve the tag buffer from a - * completed I/O. The size of the integrity buffer must be <= to the - * size reported by bio_integrity_tag_size(). - */ -int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) -{ - BUG_ON(bio_data_dir(bio) != READ); - - return bio_integrity_tag(bio, tag_buf, len, 0); -} -EXPORT_SYMBOL(bio_integrity_get_tag); - -/** - * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio - * @bio: bio to generate/verify integrity metadata for - * @operate: operate number, 1 for generate, 0 for verify - */ -static int bio_integrity_generate_verify(struct bio *bio, int operate) -{ - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - struct blk_integrity_exchg bix; - struct bio_vec *bv; - sector_t sector; - unsigned int sectors, ret = 0, i; - void *prot_buf = bio->bi_integrity->bip_buf; - - if (operate) - sector = bio->bi_iter.bi_sector; - else - sector = bio->bi_integrity->bip_iter.bi_sector; - - bix.disk_name = bio->bi_bdev->bd_disk->disk_name; - bix.sector_size = bi->sector_size; - - bio_for_each_segment_all(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; - bix.prot_buf = prot_buf; - bix.sector = sector; - - if (operate) - bi->generate_fn(&bix); - else { - ret = bi->verify_fn(&bix); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } - } - - sectors = bv->bv_len / bi->sector_size; - sector += sectors; - prot_buf += sectors * bi->tuple_size; - - kunmap_atomic(kaddr); - } - return ret; -} - -/** - * bio_integrity_generate - Generate integrity metadata for a bio - * @bio: bio to generate integrity metadata for - * - * Description: Generates integrity metadata for a bio by calling the - * block device's generation callback function. The bio must have a - * bip attached with enough room to accommodate the generated - * integrity metadata. - */ -static void bio_integrity_generate(struct bio *bio) -{ - bio_integrity_generate_verify(bio, 1); -} - -static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) -{ - if (bi) - return bi->tuple_size; - - return 0; -} - -/** - * bio_integrity_prep - Prepare bio for integrity I/O - * @bio: bio to prepare - * - * Description: Allocates a buffer for integrity metadata, maps the - * pages and attaches them to a bio. The bio must have data - * direction, target device and start sector set priot to calling. In - * the WRITE case, integrity metadata will be generated using the - * block device's integrity function. In the READ case, the buffer - * will be prepared for DMA and a suitable end_io handler set up. - */ -int bio_integrity_prep(struct bio *bio) -{ - struct bio_integrity_payload *bip; - struct blk_integrity *bi; - struct request_queue *q; - void *buf; - unsigned long start, end; - unsigned int len, nr_pages; - unsigned int bytes, offset, i; - unsigned int sectors; - - bi = bdev_get_integrity(bio->bi_bdev); - q = bdev_get_queue(bio->bi_bdev); - BUG_ON(bi == NULL); - BUG_ON(bio_integrity(bio)); - - sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); - - /* Allocate kernel buffer for protection data */ - len = sectors * blk_integrity_tuple_size(bi); - buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); - if (unlikely(buf == NULL)) { - printk(KERN_ERR "could not allocate integrity buffer\n"); - return -ENOMEM; - } - - end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ((unsigned long) buf) >> PAGE_SHIFT; - nr_pages = end - start; - - /* Allocate bio integrity payload and integrity vectors */ - bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { - printk(KERN_ERR "could not allocate data integrity bioset\n"); - kfree(buf); - return -EIO; - } - - bip->bip_owns_buf = 1; - bip->bip_buf = buf; - bip->bip_iter.bi_size = len; - bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; - - /* Map it */ - offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; - bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) - return 0; - - if (ret < bytes) - break; - - buf += bytes; - len -= bytes; - offset = 0; - } - - /* Install custom I/O completion handler if read verify is enabled */ - if (bio_data_dir(bio) == READ) { - bip->bip_end_io = bio->bi_end_io; - bio->bi_end_io = bio_integrity_endio; - } - - /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) - bio_integrity_generate(bio); - - return 0; -} -EXPORT_SYMBOL(bio_integrity_prep); - -/** - * bio_integrity_verify - Verify integrity metadata for a bio - * @bio: bio to verify - * - * Description: This function is called to verify the integrity of a - * bio. The data in the bio io_vec is compared to the integrity - * metadata returned by the HBA. - */ -static int bio_integrity_verify(struct bio *bio) -{ - return bio_integrity_generate_verify(bio, 0); -} - -/** - * bio_integrity_verify_fn - Integrity I/O completion worker - * @work: Work struct stored in bio to be verified - * - * Description: This workqueue function is called to complete a READ - * request. The function verifies the transferred integrity metadata - * and then calls the original bio end_io function. - */ -static void bio_integrity_verify_fn(struct work_struct *work) -{ - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; - int error; - - error = bio_integrity_verify(bio); - - /* Restore original bio completion handler */ - bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); -} - -/** - * bio_integrity_endio - Integrity I/O completion function - * @bio: Protected bio - * @error: Pointer to errno - * - * Description: Completion for integrity I/O - * - * Normally I/O completion is done in interrupt context. However, - * verifying I/O integrity is a time-consuming task which must be run - * in process context. This function postpones completion - * accordingly. - */ -void bio_integrity_endio(struct bio *bio, int error) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - - BUG_ON(bip->bip_bio != bio); - - /* In case of an I/O error there is no point in verifying the - * integrity metadata. Restore original bio end_io handler - * and run it. - */ - if (error) { - bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); - - return; - } - - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); -} -EXPORT_SYMBOL(bio_integrity_endio); - -/** - * bio_integrity_advance - Advance integrity vector - * @bio: bio whose integrity vector to update - * @bytes_done: number of data bytes that have been completed - * - * Description: This function calculates how many integrity bytes the - * number of completed data bytes correspond to and advances the - * integrity vector accordingly. - */ -void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - - bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); -} -EXPORT_SYMBOL(bio_integrity_advance); - -/** - * bio_integrity_trim - Trim integrity vector - * @bio: bio whose integrity vector to update - * @offset: offset to first data sector - * @sectors: number of data sectors - * - * Description: Used to trim the integrity vector in a cloned bio. - * The ivec will be advanced corresponding to 'offset' data sectors - * and the length will be truncated corresponding to 'len' data - * sectors. - */ -void bio_integrity_trim(struct bio *bio, unsigned int offset, - unsigned int sectors) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - - bio_integrity_advance(bio, offset << 9); - bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); -} -EXPORT_SYMBOL(bio_integrity_trim); - -/** - * bio_integrity_clone - Callback for cloning bios with integrity metadata - * @bio: New bio - * @bio_src: Original bio - * @gfp_mask: Memory allocation mask - * - * Description: Called to allocate a bip when cloning a bio - */ -int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask) -{ - struct bio_integrity_payload *bip_src = bio_src->bi_integrity; - struct bio_integrity_payload *bip; - - BUG_ON(bip_src == NULL); - - bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; - - memcpy(bip->bip_vec, bip_src->bip_vec, - bip_src->bip_vcnt * sizeof(struct bio_vec)); - - bip->bip_vcnt = bip_src->bip_vcnt; - bip->bip_iter = bip_src->bip_iter; - - return 0; -} -EXPORT_SYMBOL(bio_integrity_clone); - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (bs->bio_integrity_pool) - return 0; - - bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); - if (!bs->bio_integrity_pool) - return -1; - - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); - if (!bs->bvec_integrity_pool) { - mempool_destroy(bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); - - if (bs->bvec_integrity_pool) - mempool_destroy(bs->bvec_integrity_pool); -} -EXPORT_SYMBOL(bioset_integrity_free); - -void __init bio_integrity_init(void) -{ - /* - * kintegrityd won't block much but may burn a lot of CPU cycles. - * Make it highpri CPU intensive wq with max concurrency of 1. - */ - kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); - - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIP_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - if (!bip_slab) - panic("Failed to create slab\n"); -} diff --git a/fs/bio.c b/fs/bio.c deleted file mode 100644 index 6f0362b77806..000000000000 --- a/fs/bio.c +++ /dev/null @@ -1,2037 +0,0 @@ -/* - * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public Licens - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - * - */ -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/uio.h> -#include <linux/iocontext.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/mempool.h> -#include <linux/workqueue.h> -#include <linux/cgroup.h> -#include <scsi/sg.h> /* for struct sg_iovec */ - -#include <trace/events/block.h> - -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 - -/* - * if you change this list, also change bvec_alloc or things will - * break badly! cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { - BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), -}; -#undef BV - -/* - * fs_bio_set is the bio_set containing bio and iovec memory pools used by - * IO code that does not need private memory pools. - */ -struct bio_set *fs_bio_set; -EXPORT_SYMBOL(fs_bio_set); - -/* - * Our slab pool management - */ -struct bio_slab { - struct kmem_cache *slab; - unsigned int slab_ref; - unsigned int slab_size; - char name[8]; -}; -static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; - -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) -{ - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; - - mutex_lock(&bio_slab_lock); - - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; - - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } - - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; - - bslab = &bio_slabs[entry]; - - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; - - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: - mutex_unlock(&bio_slab_lock); - return slab; -} - -static void bio_put_slab(struct bio_set *bs) -{ - struct bio_slab *bslab = NULL; - unsigned int i; - - mutex_lock(&bio_slab_lock); - - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab = &bio_slabs[i]; - break; - } - } - - if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) - goto out; - - WARN_ON(!bslab->slab_ref); - - if (--bslab->slab_ref) - goto out; - - kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; - -out: - mutex_unlock(&bio_slab_lock); -} - -unsigned int bvec_nr_vecs(unsigned short idx) -{ - return bvec_slabs[idx].nr_vecs; -} - -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); - - if (idx == BIOVEC_MAX_IDX) - mempool_free(bv, pool); - else { - struct biovec_slab *bvs = bvec_slabs + idx; - - kmem_cache_free(bvs->slab, bv); - } -} - -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) -{ - struct bio_vec *bvl; - - /* - * see comment near bvec_array define! - */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: - return NULL; - } - - /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. - */ - if (*idx == BIOVEC_MAX_IDX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { - struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); - - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - - /* - * Try a slab allocation. If this fails and __GFP_WAIT - * is set, retry with the 1-entry mempool - */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { - *idx = BIOVEC_MAX_IDX; - goto fallback; - } - } - - return bvl; -} - -static void __bio_free(struct bio *bio) -{ - bio_disassociate_task(bio); - - if (bio_integrity(bio)) - bio_integrity_free(bio); -} - -static void bio_free(struct bio *bio) -{ - struct bio_set *bs = bio->bi_pool; - void *p; - - __bio_free(bio); - - if (bs) { - if (bio_flagged(bio, BIO_OWNS_VEC)) - bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); - - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - p -= bs->front_pad; - - mempool_free(p, bs->bio_pool); - } else { - /* Bio was allocated by bio_kmalloc() */ - kfree(bio); - } -} - -void bio_init(struct bio *bio) -{ - memset(bio, 0, sizeof(*bio)); - bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_remaining, 1); - atomic_set(&bio->bi_cnt, 1); -} -EXPORT_SYMBOL(bio_init); - -/** - * bio_reset - reinitialize a bio - * @bio: bio to reset - * - * Description: - * After calling bio_reset(), @bio will be in the same state as a freshly - * allocated bio returned bio bio_alloc_bioset() - the only fields that are - * preserved are the ones that are initialized by bio_alloc_bioset(). See - * comment in struct bio. - */ -void bio_reset(struct bio *bio) -{ - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - - __bio_free(bio); - - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); - atomic_set(&bio->bi_remaining, 1); -} -EXPORT_SYMBOL(bio_reset); - -static void bio_chain_endio(struct bio *bio, int error) -{ - bio_endio(bio->bi_private, error); - bio_put(bio); -} - -/** - * bio_chain - chain bio completions - * - * The caller won't have a bi_end_io called when @bio completes - instead, - * @parent's bi_end_io won't be called until both @parent and @bio have - * completed; the chained bio will also be freed when it completes. - * - * The caller must not set bi_private or bi_end_io in @bio. - */ -void bio_chain(struct bio *bio, struct bio *parent) -{ - BUG_ON(bio->bi_private || bio->bi_end_io); - - bio->bi_private = parent; - bio->bi_end_io = bio_chain_endio; - atomic_inc(&parent->bi_remaining); -} -EXPORT_SYMBOL(bio_chain); - -static void bio_alloc_rescue(struct work_struct *work) -{ - struct bio_set *bs = container_of(work, struct bio_set, rescue_work); - struct bio *bio; - - while (1) { - spin_lock(&bs->rescue_lock); - bio = bio_list_pop(&bs->rescue_list); - spin_unlock(&bs->rescue_lock); - - if (!bio) - break; - - generic_make_request(bio); - } -} - -static void punt_bios_to_rescuer(struct bio_set *bs) -{ - struct bio_list punt, nopunt; - struct bio *bio; - - /* - * In order to guarantee forward progress we must punt only bios that - * were allocated from this bio_set; otherwise, if there was a bio on - * there for a stacking driver higher up in the stack, processing it - * could require allocating bios from this bio_set, and doing that from - * our own rescuer would be bad. - * - * Since bio lists are singly linked, pop them all instead of trying to - * remove from the middle of the list: - */ - - bio_list_init(&punt); - bio_list_init(&nopunt); - - while ((bio = bio_list_pop(current->bio_list))) - bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); - - *current->bio_list = nopunt; - - spin_lock(&bs->rescue_lock); - bio_list_merge(&bs->rescue_list, &punt); - spin_unlock(&bs->rescue_lock); - - queue_work(bs->rescue_workqueue, &bs->rescue_work); -} - -/** - * bio_alloc_bioset - allocate a bio for I/O - * @gfp_mask: the GFP_ mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate - * @bs: the bio_set to allocate from. - * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. - * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. - * - * Note that when running under generic_make_request() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * generic_make_request() that converts recursion into iteration, to prevent - * stack overflows. - * - * This would normally mean allocating multiple bios under - * generic_make_request() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. - * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * generic_make_request() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. - * - * RETURNS: - * Pointer to new bio on success, NULL on failure. - */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) -{ - gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - unsigned long idx = BIO_POOL_NONE; - struct bio_vec *bvl = NULL; - struct bio *bio; - void *p; - - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* - * generic_make_request() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. - */ - - if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; - - p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!p)) - return NULL; - - bio = p + front_pad; - bio_init(bio); - - if (nr_iovecs > inline_vecs) { - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); - if (!bvl && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); - } - - if (unlikely(!bvl)) - goto err_free; - - bio->bi_flags |= 1 << BIO_OWNS_VEC; - } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; - } - - bio->bi_pool = bs; - bio->bi_flags |= idx << BIO_POOL_OFFSET; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bvl; - return bio; - -err_free: - mempool_free(p, bs->bio_pool); - return NULL; -} -EXPORT_SYMBOL(bio_alloc_bioset); - -void zero_fill_bio(struct bio *bio) -{ - unsigned long flags; - struct bio_vec bv; - struct bvec_iter iter; - - bio_for_each_segment(bv, bio, iter) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(data, &flags); - } -} -EXPORT_SYMBOL(zero_fill_bio); - -/** - * bio_put - release a reference to a bio - * @bio: bio to release reference to - * - * Description: - * Put a reference to a &struct bio, either one you have gotten with - * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. - **/ -void bio_put(struct bio *bio) -{ - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) - bio_free(bio); -} -EXPORT_SYMBOL(bio_put); - -inline int bio_phys_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_phys_segments; -} -EXPORT_SYMBOL(bio_phys_segments); - -/** - * __bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: destination bio - * @bio_src: bio to clone - * - * Clone a &bio. Caller will own the returned bio, but not - * the actual data it points to. Reference count of returned - * bio will be one. - * - * Caller must ensure that @bio_src is not freed before @bio. - */ -void __bio_clone_fast(struct bio *bio, struct bio *bio_src) -{ - BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); - - /* - * most users will be overriding ->bi_bdev with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_flags |= 1 << BIO_CLONED; - bio->bi_rw = bio_src->bi_rw; - bio->bi_iter = bio_src->bi_iter; - bio->bi_io_vec = bio_src->bi_io_vec; -} -EXPORT_SYMBOL(__bio_clone_fast); - -/** - * bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Like __bio_clone_fast, only also allocates the returned bio - */ -struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) -{ - struct bio *b; - - b = bio_alloc_bioset(gfp_mask, 0, bs); - if (!b) - return NULL; - - __bio_clone_fast(b, bio); - - if (bio_integrity(bio)) { - int ret; - - ret = bio_integrity_clone(b, bio, gfp_mask); - - if (ret < 0) { - bio_put(b); - return NULL; - } - } - - return b; -} -EXPORT_SYMBOL(bio_clone_fast); - -/** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) -{ - struct bvec_iter iter; - struct bio_vec bv; - struct bio *bio; - - /* - * Pre immutable biovecs, __bio_clone() used to just do a memcpy from - * bio_src->bi_io_vec to bio->bi_io_vec. - * - * We can't do that anymore, because: - * - * - The point of cloning the biovec is to produce a bio with a biovec - * the caller can modify: bi_idx and bi_bvec_done should be 0. - * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if - * we tried to clone the whole thing bio_alloc_bioset() would fail. - * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. - * - * - Lastly, bi_vcnt should not be looked at or relied upon by code - * that does not own the bio - reason being drivers don't use it for - * iterating over the biovec anymore, so expecting it to be kept up - * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work on - * __bio_clone_fast() anyways. - */ - - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); - if (!bio) - return NULL; - - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_rw = bio_src->bi_rw; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - if (bio->bi_rw & REQ_DISCARD) - goto integrity_clone; - - if (bio->bi_rw & REQ_WRITE_SAME) { - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - goto integrity_clone; - } - - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - -integrity_clone: - if (bio_integrity(bio_src)) { - int ret; - - ret = bio_integrity_clone(bio, bio_src, gfp_mask); - if (ret < 0) { - bio_put(bio); - return NULL; - } - } - - return bio; -} -EXPORT_SYMBOL(bio_clone_bioset); - -/** - * bio_get_nr_vecs - return approx number of vecs - * @bdev: I/O target - * - * Return the approximate number of pages we can send to this target. - * There's no guarantee that you will be able to fit this number of pages - * into a bio, it does not account for dynamic restrictions that vary - * on offset. - */ -int bio_get_nr_vecs(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = min_t(unsigned, - queue_max_segments(q), - queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); - - return min_t(unsigned, nr_pages, BIO_MAX_PAGES); - -} -EXPORT_SYMBOL(bio_get_nr_vecs); - -static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset, - unsigned int max_sectors) -{ - int retried_segments = 0; - struct bio_vec *bvec; - - /* - * cloned bio must not modify vec list - */ - if (unlikely(bio_flagged(bio, BIO_CLONED))) - return 0; - - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) - return 0; - - /* - * For filesystems with a blocksize smaller than the pagesize - * we will often be called with the same page as last time and - * a consecutive offset. Optimize this special case. - */ - if (bio->bi_vcnt > 0) { - struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page == prev->bv_page && - offset == prev->bv_offset + prev->bv_len) { - unsigned int prev_bv_len = prev->bv_len; - prev->bv_len += len; - - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - /* prev_bvec is already charged in - bi_size, discharge it in order to - simulate merging updated prev_bvec - as new bvec. */ - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size - - prev_bv_len, - .bi_rw = bio->bi_rw, - }; - - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { - prev->bv_len -= len; - return 0; - } - } - - goto done; - } - } - - if (bio->bi_vcnt >= bio->bi_max_vecs) - return 0; - - /* - * we might lose a segment or two here, but rather that than - * make this too complex. - */ - - while (bio->bi_phys_segments >= queue_max_segments(q)) { - - if (retried_segments) - return 0; - - retried_segments = 1; - blk_recount_segments(q, bio); - } - - /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - - /* - * if queue has other restrictions (eg varying max sector size - * depending on offset), it can specify a merge_bvec_fn in the - * queue to get further control - */ - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size, - .bi_rw = bio->bi_rw, - }; - - /* - * merge_bvec_fn() returns number of bytes it can accept - * at this offset - */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } - } - - /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) - bio->bi_flags &= ~(1 << BIO_SEG_VALID); - - bio->bi_vcnt++; - bio->bi_phys_segments++; - done: - bio->bi_iter.bi_size += len; - return len; -} - -/** - * bio_add_pc_page - attempt to add page to bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by REQ_PC bios. - */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return __bio_add_page(q, bio, page, len, offset, - queue_max_hw_sectors(q)); -} -EXPORT_SYMBOL(bio_add_pc_page); - -/** - * bio_add_page - attempt to add page to bio - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - */ -int bio_add_page(struct bio *bio, struct page *page, unsigned int len, - unsigned int offset) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); -} -EXPORT_SYMBOL(bio_add_page); - -struct submit_bio_ret { - struct completion event; - int error; -}; - -static void submit_bio_wait_endio(struct bio *bio, int error) -{ - struct submit_bio_ret *ret = bio->bi_private; - - ret->error = error; - complete(&ret->event); -} - -/** - * submit_bio_wait - submit a bio, and wait until it completes - * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) - * @bio: The &struct bio which describes the I/O - * - * Simple wrapper around submit_bio(). Returns 0 on success, or the error from - * bio_endio() on failure. - */ -int submit_bio_wait(int rw, struct bio *bio) -{ - struct submit_bio_ret ret; - - rw |= REQ_SYNC; - init_completion(&ret.event); - bio->bi_private = &ret; - bio->bi_end_io = submit_bio_wait_endio; - submit_bio(rw, bio); - wait_for_completion(&ret.event); - - return ret.error; -} -EXPORT_SYMBOL(submit_bio_wait); - -/** - * bio_advance - increment/complete a bio by some number of bytes - * @bio: bio to advance - * @bytes: number of bytes to complete - * - * This updates bi_sector, bi_size and bi_idx; if the number of bytes to - * complete doesn't align with a bvec boundary, then bv_len and bv_offset will - * be updated on the last bvec as well. - * - * @bio will then represent the remaining, uncompleted portion of the io. - */ -void bio_advance(struct bio *bio, unsigned bytes) -{ - if (bio_integrity(bio)) - bio_integrity_advance(bio, bytes); - - bio_advance_iter(bio, &bio->bi_iter, bytes); -} -EXPORT_SYMBOL(bio_advance); - -/** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed. - */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, i) { - bv->bv_page = alloc_page(gfp_mask); - if (!bv->bv_page) { - while (--bv >= bio->bi_io_vec) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - -/** - * bio_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. - * - * Stops when it reaches the end of either @src or @dst - that is, copies - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). - */ -void bio_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter, dst_iter; - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - - src_iter = src->bi_iter; - dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - src_bv = bio_iter_iovec(src, src_iter); - dst_bv = bio_iter_iovec(dst, dst_iter); - - bytes = min(src_bv.bv_len, dst_bv.bv_len); - - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - bio_advance_iter(src, &src_iter, bytes); - bio_advance_iter(dst, &dst_iter, bytes); - } -} -EXPORT_SYMBOL(bio_copy_data); - -struct bio_map_data { - int nr_sgvecs; - int is_our_pages; - struct sg_iovec sgvecs[]; -}; - -static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - const struct sg_iovec *iov, int iov_count, - int is_our_pages) -{ - memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); - bmd->nr_sgvecs = iov_count; - bmd->is_our_pages = is_our_pages; - bio->bi_private = bmd; -} - -static struct bio_map_data *bio_alloc_map_data(int nr_segs, - unsigned int iov_count, - gfp_t gfp_mask) -{ - if (iov_count > UIO_MAXIOV) - return NULL; - - return kmalloc(sizeof(struct bio_map_data) + - sizeof(struct sg_iovec) * iov_count, gfp_mask); -} - -static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, - int to_user, int from_user, int do_free_page) -{ - int ret = 0, i; - struct bio_vec *bvec; - int iov_idx = 0; - unsigned int iov_off = 0; - - bio_for_each_segment_all(bvec, bio, i) { - char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = bvec->bv_len; - - while (bv_len && iov_idx < iov_count) { - unsigned int bytes; - char __user *iov_addr; - - bytes = min_t(unsigned int, - iov[iov_idx].iov_len - iov_off, bv_len); - iov_addr = iov[iov_idx].iov_base + iov_off; - - if (!ret) { - if (to_user) - ret = copy_to_user(iov_addr, bv_addr, - bytes); - - if (from_user) - ret = copy_from_user(bv_addr, iov_addr, - bytes); - - if (ret) - ret = -EFAULT; - } - - bv_len -= bytes; - bv_addr += bytes; - iov_addr += bytes; - iov_off += bytes; - - if (iov[iov_idx].iov_len == iov_off) { - iov_idx++; - iov_off = 0; - } - } - - if (do_free_page) - __free_page(bvec->bv_page); - } - - return ret; -} - -/** - * bio_uncopy_user - finish previously mapped bio - * @bio: bio being terminated - * - * Free pages allocated from bio_copy_user() and write back data - * to user space in case of a read. - */ -int bio_uncopy_user(struct bio *bio) -{ - struct bio_map_data *bmd = bio->bi_private; - struct bio_vec *bvec; - int ret = 0, i; - - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { - /* - * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free. - */ - if (current->mm) - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, - bio_data_dir(bio) == READ, - 0, bmd->is_our_pages); - else if (bmd->is_our_pages) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - } - kfree(bmd); - bio_put(bio); - return ret; -} -EXPORT_SYMBOL(bio_uncopy_user); - -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - struct bio_vec *bvec; - struct page *page; - struct bio *bio; - int i, ret; - int nr_pages = 0; - unsigned int len = 0; - unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr; - unsigned long end; - unsigned long start; - - uaddr = (unsigned long)iov[i].iov_base; - end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = uaddr >> PAGE_SHIFT; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages += end - start; - len += iov[i].iov_len; - } - - if (offset) - nr_pages++; - - bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - if (!write_to_vm) - bio->bi_rw |= REQ_WRITE; - - ret = 0; - - if (map_data) { - nr_pages = 1 << map_data->page_order; - i = map_data->offset / PAGE_SIZE; - } - while (len) { - unsigned int bytes = PAGE_SIZE; - - bytes -= offset; - - if (bytes > len) - bytes = len; - - if (map_data) { - if (i == map_data->nr_entries * nr_pages) { - ret = -ENOMEM; - break; - } - - page = map_data->pages[i / nr_pages]; - page += (i % nr_pages); - - i++; - } else { - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; - } - } - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) - break; - - len -= bytes; - offset = 0; - } - - if (ret) - goto cleanup; - - /* - * success - */ - if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); - if (ret) - goto cleanup; - } - - bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); - return bio; -cleanup: - if (!map_data) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - - bio_put(bio); -out_bmd: - kfree(bmd); - return ERR_PTR(ret); -} - -/** - * bio_copy_user - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, - unsigned long uaddr, unsigned int len, - int write_to_vm, gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_copy_user); - -static struct bio *__bio_map_user_iov(struct request_queue *q, - struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - int i, j; - int nr_pages = 0; - struct page **pages; - struct bio *bio; - int cur_page = 0; - int ret, offset; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages += end - start; - /* - * buffer must be aligned to at least hardsector size for now - */ - if (uaddr & queue_dma_alignment(q)) - return ERR_PTR(-EINVAL); - } - - if (!nr_pages) - return ERR_PTR(-EINVAL); - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - ret = -ENOMEM; - pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); - if (!pages) - goto out; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; - const int local_nr_pages = end - start; - const int page_limit = cur_page + local_nr_pages; - - ret = get_user_pages_fast(uaddr, local_nr_pages, - write_to_vm, &pages[cur_page]); - if (ret < local_nr_pages) { - ret = -EFAULT; - goto out_unmap; - } - - offset = uaddr & ~PAGE_MASK; - for (j = cur_page; j < page_limit; j++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - /* - * sorry... - */ - if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < - bytes) - break; - - len -= bytes; - offset = 0; - } - - cur_page = j; - /* - * release the pages we didn't map into the bio, if any - */ - while (j < page_limit) - page_cache_release(pages[j++]); - } - - kfree(pages); - - /* - * set data direction, and check if mapped pages need bouncing - */ - if (!write_to_vm) - bio->bi_rw |= REQ_WRITE; - - bio->bi_bdev = bdev; - bio->bi_flags |= (1 << BIO_USER_MAPPED); - return bio; - - out_unmap: - for (i = 0; i < nr_pages; i++) { - if(!pages[i]) - break; - page_cache_release(pages[i]); - } - out: - kfree(pages); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_map_user - map user address into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm, - gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_map_user); - -/** - * bio_map_user_iov - map user sg_iovec table into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, - gfp_mask); - if (IS_ERR(bio)) - return bio; - - /* - * subtle -- if __bio_map_user() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - - return bio; -} - -static void __bio_unmap_user(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - /* - * make sure we dirty pages we wrote to - */ - bio_for_each_segment_all(bvec, bio, i) { - if (bio_data_dir(bio) == READ) - set_page_dirty_lock(bvec->bv_page); - - page_cache_release(bvec->bv_page); - } - - bio_put(bio); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user(). Must be called with - * a process context. - * - * bio_unmap_user() may sleep. - */ -void bio_unmap_user(struct bio *bio) -{ - __bio_unmap_user(bio); - bio_put(bio); -} -EXPORT_SYMBOL(bio_unmap_user); - -static void bio_map_kern_endio(struct bio *bio, int err) -{ - bio_put(bio); -} - -static struct bio *__bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - int offset, i; - struct bio *bio; - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, - offset) < bytes) - break; - - data += bytes; - len -= bytes; - offset = 0; - } - - bio->bi_end_io = bio_map_kern_endio; - return bio; -} - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_kern(q, data, len, gfp_mask); - if (IS_ERR(bio)) - return bio; - - if (bio->bi_iter.bi_size == len) - return bio; - - /* - * Don't support partial mappings. - */ - bio_put(bio); - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL(bio_map_kern); - -static void bio_copy_kern_endio(struct bio *bio, int err) -{ - struct bio_vec *bvec; - const int read = bio_data_dir(bio) == READ; - struct bio_map_data *bmd = bio->bi_private; - int i; - char *p = bmd->sgvecs[0].iov_base; - - bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - if (read) - memcpy(p, addr, bvec->bv_len); - - __free_page(bvec->bv_page); - p += bvec->bv_len; - } - - kfree(bmd); - bio_put(bio); -} - -/** - * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to copy - * @len: length in bytes - * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ - * - * copy the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask, int reading) -{ - struct bio *bio; - struct bio_vec *bvec; - int i; - - bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); - if (IS_ERR(bio)) - return bio; - - if (!reading) { - void *p = data; - - bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - memcpy(addr, p, bvec->bv_len); - p += bvec->bv_len; - } - } - - bio->bi_end_io = bio_copy_kern_endio; - - return bio; -} -EXPORT_SYMBOL(bio_copy_kern); - -/* - * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions - * for performing direct-IO in BIOs. - * - * The problem is that we cannot run set_page_dirty() from interrupt context - * because the required locks are not interrupt-safe. So what we can do is to - * mark the pages dirty _before_ performing IO. And in interrupt context, - * check that the pages are still dirty. If so, fine. If not, redirty them - * in process context. - * - * We special-case compound pages here: normally this means reads into hugetlb - * pages. The logic in here doesn't really work right for compound pages - * because the VM does not uniformly chase down the head page in all cases. - * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't - * handle them at all. So we skip compound pages here at an early stage. - * - * Note that this code is very hard to test under normal circumstances because - * direct-io pins the pages with get_user_pages(). This makes - * is_page_cache_freeable return false, and the VM will not clean the pages. - * But other code (eg, flusher threads) could clean the pages if they are mapped - * pagecache. - * - * Simply disabling the call to bio_set_pages_dirty() is a good way to test the - * deferred bio dirtying paths. - */ - -/* - * bio_set_pages_dirty() will mark all the bio's pages as dirty. - */ -void bio_set_pages_dirty(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page && !PageCompound(page)) - set_page_dirty_lock(page); - } -} - -static void bio_release_pages(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page) - put_page(page); - } -} - -/* - * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. - * If they are, then fine. If, however, some pages are clean then they must - * have been written out during the direct-IO read. So we take another ref on - * the BIO and the offending pages and re-dirty the pages in process context. - * - * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one page_cache_release() against each page and will - * run one bio_put() against the BIO. - */ - -static void bio_dirty_fn(struct work_struct *work); - -static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); -static DEFINE_SPINLOCK(bio_dirty_lock); -static struct bio *bio_dirty_list; - -/* - * This runs in process context - */ -static void bio_dirty_fn(struct work_struct *work) -{ - unsigned long flags; - struct bio *bio; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio = bio_dirty_list; - bio_dirty_list = NULL; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - - while (bio) { - struct bio *next = bio->bi_private; - - bio_set_pages_dirty(bio); - bio_release_pages(bio); - bio_put(bio); - bio = next; - } -} - -void bio_check_pages_dirty(struct bio *bio) -{ - struct bio_vec *bvec; - int nr_clean_pages = 0; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (PageDirty(page) || PageCompound(page)) { - page_cache_release(page); - bvec->bv_page = NULL; - } else { - nr_clean_pages++; - } - } - - if (nr_clean_pages) { - unsigned long flags; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio->bi_private = bio_dirty_list; - bio_dirty_list = bio; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } else { - bio_put(bio); - } -} - -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -void bio_flush_dcache_pages(struct bio *bi) -{ - struct bio_vec bvec; - struct bvec_iter iter; - - bio_for_each_segment(bvec, bi, iter) - flush_dcache_page(bvec.bv_page); -} -EXPORT_SYMBOL(bio_flush_dcache_pages); -#endif - -/** - * bio_endio - end I/O on a bio - * @bio: bio - * @error: error, if any - * - * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. - **/ -void bio_endio(struct bio *bio, int error) -{ - while (bio) { - BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (!atomic_dec_and_test(&bio->bi_remaining)) - return; - - /* - * Need to have a real endio function for chained bios, - * otherwise various corner cases will break (like stacking - * block devices that save/restore bi_end_io) - however, we want - * to avoid unbounded recursion and blowing the stack. Tail call - * optimization would handle this, but compiling with frame - * pointers also disables gcc's sibling call optimization. - */ - if (bio->bi_end_io == bio_chain_endio) { - struct bio *parent = bio->bi_private; - bio_put(bio); - bio = parent; - } else { - if (bio->bi_end_io) - bio->bi_end_io(bio, error); - bio = NULL; - } - } -} -EXPORT_SYMBOL(bio_endio); - -/** - * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining - * @bio: bio - * @error: error, if any - * - * For code that has saved and restored bi_end_io; thing hard before using this - * function, probably you should've cloned the entire bio. - **/ -void bio_endio_nodec(struct bio *bio, int error) -{ - atomic_inc(&bio->bi_remaining); - bio_endio(bio, error); -} -EXPORT_SYMBOL(bio_endio_nodec); - -/** - * bio_split - split a bio - * @bio: bio to split - * @sectors: number of sectors to split from the front of @bio - * @gfp: gfp mask - * @bs: bio set to allocate from - * - * Allocates and returns a new bio which represents @sectors from the start of - * @bio, and updates @bio to represent the remaining sectors. - * - * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's - * responsibility to ensure that @bio is not freed before the split. - */ -struct bio *bio_split(struct bio *bio, int sectors, - gfp_t gfp, struct bio_set *bs) -{ - struct bio *split = NULL; - - BUG_ON(sectors <= 0); - BUG_ON(sectors >= bio_sectors(bio)); - - split = bio_clone_fast(bio, gfp, bs); - if (!split) - return NULL; - - split->bi_iter.bi_size = sectors << 9; - - if (bio_integrity(split)) - bio_integrity_trim(split, 0, sectors); - - bio_advance(bio, split->bi_iter.bi_size); - - return split; -} -EXPORT_SYMBOL(bio_split); - -/** - * bio_trim - trim a bio - * @bio: bio to trim - * @offset: number of sectors to trim from the front of @bio - * @size: size we want to trim @bio to, in sectors - */ -void bio_trim(struct bio *bio, int offset, int size) -{ - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. - */ - - size <<= 9; - if (offset == 0 && size == bio->bi_iter.bi_size) - return; - - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - - bio_advance(bio, offset << 9); - - bio->bi_iter.bi_size = size; -} -EXPORT_SYMBOL_GPL(bio_trim); - -/* - * create memory pools for biovec's in a bio_set. - * use the global biovec slabs created for general use. - */ -mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) -{ - struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - - return mempool_create_slab_pool(pool_entries, bp->slab); -} - -void bioset_free(struct bio_set *bs) -{ - if (bs->rescue_workqueue) - destroy_workqueue(bs->rescue_workqueue); - - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bvec_pool) - mempool_destroy(bs->bvec_pool); - - bioset_integrity_free(bs); - bio_put_slab(bs); - - kfree(bs); -} -EXPORT_SYMBOL(bioset_free); - -/** - * bioset_create - Create a bio_set - * @pool_size: Number of bio and bio_vecs to cache in the mempool - * @front_pad: Number of bytes to allocate in front of the returned bio - * - * Description: - * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller - * to ask for a number of bytes to be allocated in front of the bio. - * Front pad allocation is useful for embedding the bio inside - * another structure, to avoid allocating extra data to go with the bio. - * Note that the bio must be embedded at the END of that structure always, - * or things will break badly. - */ -struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) -{ - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - struct bio_set *bs; - - bs = kzalloc(sizeof(*bs), GFP_KERNEL); - if (!bs) - return NULL; - - bs->front_pad = front_pad; - - spin_lock_init(&bs->rescue_lock); - bio_list_init(&bs->rescue_list); - INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); - if (!bs->bio_slab) { - kfree(bs); - return NULL; - } - - bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); - if (!bs->bio_pool) - goto bad; - - bs->bvec_pool = biovec_create_pool(bs, pool_size); - if (!bs->bvec_pool) - goto bad; - - bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); - if (!bs->rescue_workqueue) - goto bad; - - return bs; -bad: - bioset_free(bs); - return NULL; -} -EXPORT_SYMBOL(bioset_create); - -#ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. - */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - struct cgroup_subsys_state *css; - - if (bio->bi_ioc) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - /* acquire active ref on @ioc and associate */ - get_io_context_active(ioc); - bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_css(current, blkio_cgrp_id); - if (css && css_tryget(css)) - bio->bi_css = css; - rcu_read_unlock(); - - return 0; -} - -/** - * bio_disassociate_task - undo bio_associate_current() - * @bio: target bio - */ -void bio_disassociate_task(struct bio *bio) -{ - if (bio->bi_ioc) { - put_io_context(bio->bi_ioc); - bio->bi_ioc = NULL; - } - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } -} - -#endif /* CONFIG_BLK_CGROUP */ - -static void __init biovec_init_slabs(void) -{ - int i; - - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - int size; - struct biovec_slab *bvs = bvec_slabs + i; - - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } - - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } -} - -static int __init init_bio(void) -{ - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); - if (!bio_slabs) - panic("bio: can't allocate bios\n"); - - bio_integrity_init(); - biovec_init_slabs(); - - fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); - if (!fs_bio_set) - panic("bio: can't allocate bios\n"); - - if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - - return 0; -} -subsys_initcall(init_bio); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4c48df572bd6..ba6b88528dc7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2058,6 +2058,20 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Inode flags */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 029d46c2e170..983314932af3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2861,7 +2861,7 @@ retry_root_backup: printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); @@ -2870,24 +2870,24 @@ retry_root_backup: "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) { ret = btrfs_commit_super(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; } } ret = btrfs_find_orphan_roots(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; ret = btrfs_recover_relocation(tree_root); if (ret < 0) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1306487c82cf..5590af92094b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1542,6 +1542,7 @@ again: ret = 0; } if (ret) { + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -3542,11 +3543,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return extended_to_chunk(flags | tmp); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { unsigned seq; + u64 flags; do { + flags = orig_flags; seq = read_seqbegin(&root->fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -5719,6 +5722,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret > 0 && skinny_metadata) { skinny_metadata = false; + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eb742c07e7a4..ae6af072b635 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -800,7 +800,7 @@ next_slot: if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -846,7 +846,7 @@ next_slot: */ if (start <= key.offset && end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -872,7 +872,7 @@ next_slot: if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -1777,7 +1777,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { /* Expand hole size to cover write data, preventing empty gap */ - end_pos = round_up(pos + iov->iov_len, root->sectorsize); + end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc8ca193d830..86935f5ae291 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -176,7 +176,11 @@ static void start_caching(struct btrfs_root *root) tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ + if (IS_ERR(tsk)) { + btrfs_warn(root->fs_info, "failed to start inode caching task"); + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); + } } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -205,24 +209,14 @@ again: void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - again: if (root->cached == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(ctl, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); } else { - /* - * If we are in the process of caching free ino chunks, - * to avoid adding the same inode number to the free_ino - * tree twice due to cross transaction, we'll leave it - * in the pinned tree until a transaction is committed - * or the caching work is done. - */ - down_write(&root->fs_info->commit_root_sem); spin_lock(&root->cache_lock); if (root->cached == BTRFS_CACHE_FINISHED) { @@ -234,11 +228,7 @@ again: start_caching(root); - if (objectid <= root->cache_progress || - objectid >= root->highest_objectid) - __btrfs_add_free_space(ctl, objectid, 1); - else - __btrfs_add_free_space(pinned, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); up_write(&root->fs_info->commit_root_sem); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f45040a4bb76..3f52bb7a58d2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3066,7 +3066,7 @@ process_slot: new_key.offset + datal, 1); if (ret) { - if (ret != -EINVAL) + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); @@ -3120,6 +3120,8 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; + u64 aligned_end = 0; + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3136,12 +3138,14 @@ process_slot: size -= skip + trim; datal -= skip + trim; + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = btrfs_drop_extents(trans, root, inode, new_key.offset, - new_key.offset + datal, + aligned_end, 1); if (ret) { - if (ret != -EINVAL) + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1ac3ca98c429..fd38b5053479 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -349,6 +349,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + path_len = p->end - p->start; old_buf_len = p->buf_len; @@ -1663,7 +1668,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5011aadacab8..9601d25a4607 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -385,20 +385,6 @@ static match_table_t tokens = { {Opt_err, NULL}, }; -#define btrfs_set_and_info(root, opt, fmt, args...) \ -{ \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ -} - -#define btrfs_clear_and_info(root, opt, fmt, args...) \ -{ \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ -} - /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. @@ -1186,7 +1172,6 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return ERR_PTR(-ENOMEM); mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); - kfree(newargs); if (PTR_RET(mnt) == -EBUSY) { if (flags & MS_RDONLY) { @@ -1196,17 +1181,22 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, int r; mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, newargs); - if (IS_ERR(mnt)) + if (IS_ERR(mnt)) { + kfree(newargs); return ERR_CAST(mnt); + } r = btrfs_remount(mnt->mnt_sb, &flags, NULL); if (r < 0) { /* FIXME: release vfsmount mnt ??*/ + kfree(newargs); return ERR_PTR(r); } } } + kfree(newargs); + if (IS_ERR(mnt)) return ERR_CAST(mnt); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2e5e648eb5c3..c561b628ebce 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3261,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 766410a12c2c..c29d6ae68874 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,7 +141,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, /* start at beginning? */ if (ctx->pos == 2 || last == NULL || - ctx->pos < ceph_dentry(last)->offset) { + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -182,9 +182,16 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), @@ -198,19 +205,12 @@ more: return 0; } + ctx->pos = di->offset + 1; + if (last) dput(last); last = dentry; - ctx->pos++; - - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; - } - spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; @@ -296,6 +296,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -446,7 +448,6 @@ more: if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); @@ -935,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 39da1c2efa50..88a6df4cbe6d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1221,9 +1221,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0b0728e5be2d..233c6f96910a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -744,7 +744,6 @@ static int fill_inode(struct inode *inode, !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - ci->i_max_offset = 2; } no_change: /* only update max_size on auth cap */ @@ -890,41 +889,6 @@ out_unlock: } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - ci = ceph_inode(inode); - di = ceph_dentry(dn); - - spin_lock(&ci->i_ceph_lock); - if (!__ceph_dir_is_complete(ci)) { - spin_unlock(&ci->i_ceph_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); -} - -/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) + bool *prehash) { struct dentry *realdn; @@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); out: return dn; } @@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; @@ -1161,6 +1122,9 @@ retry_lookup: /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1180,13 +1144,10 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when dir is - * complete. - */ - ceph_set_dentry_offset(req->r_old_dentry); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1213,8 +1174,9 @@ retry_lookup: /* attach proper inode */ if (!dn->d_inode) { + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, &have_lease, true); + dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1235,17 +1197,16 @@ retry_lookup: (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, NULL, true); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1407,7 +1368,7 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL, false); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); dn = NULL; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index fdf941b44ff1..a822a6e58290 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -109,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -153,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d94ba0df9f4d..191398852a2e 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -45,6 +45,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7866cd05a6bb..ead05cc1f447 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -266,7 +266,6 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index df9c9141c099..5be1f997ecde 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags); + clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags); + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags); + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -732,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); ssize_t written; int rc; + written = cifs_get_writer(cinode); + if (written) + return written; + written = generic_file_aio_write(iocb, iov, nr_segs, pos); if (CIFS_CACHE_WRITE(CIFS_I(inode))) - return written; + goto out; rc = filemap_fdatawrite(inode->i_mapping); if (rc) cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", rc, inode); +out: + cifs_put_writer(cinode); return written; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0f3718b77a8..30f6e9251a4a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -228,6 +228,8 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -1113,6 +1115,12 @@ struct cifsInodeInfo { unsigned int epoch; /* used to track lease state changes */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ + unsigned long flags; +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index acc4ee8ed075..ca7980a1e303 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f3264bd7a83d..6ce4e0954b98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6197,6 +6197,9 @@ QAllEAsRetry: cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; goto QAllEAsOut; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8add25538a3b..5ed03e0b8b40 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2599,7 +2599,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, ssize_t err; err = generic_write_sync(file, iocb->ki_pos - rc, rc); - if (rc < 0) + if (err < 0) rc = err; } } else { @@ -2621,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; + written = cifs_get_writer(cinode); + if (written) + return written; + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_writev(iocb, iov, nr_segs, pos); + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_aio_write( + iocb, iov, nr_segs, pos); + goto out; + } + written = cifs_writev(iocb, iov, nr_segs, pos); + goto out; } /* * For non-oplocked files in strict cache mode we need to write the data @@ -2646,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, inode); cinode->oplock = 0; } +out: + cifs_put_writer(cinode); return written; } @@ -2872,7 +2882,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, cifs_uncached_readv_complete); if (!rdata) { rc = -ENOMEM; - goto error; + break; } rc = cifs_read_allocate_pages(rdata, npages); @@ -3621,6 +3631,13 @@ static int cifs_launder_page(struct page *page) return rc; } +static int +cifs_pending_writers_wait(void *unused) +{ + schedule(); + return 0; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3628,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work) struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", @@ -3666,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + cifs_done_oplock_break(cinode); } /* diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index aadc2b68678b..a22d667f1069 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1737,6 +1737,9 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; + if (!cifs_sb->actimeo) + return true; + if (!time_in_range(jiffies, cifs_i->time, cifs_i->time + cifs_sb->actimeo)) return true; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2f9f3790679d..3b0c62e622da 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel ? OPLOCK_READ : 0); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } +static int +cifs_oplock_break_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + cifs_oplock_break_wait, TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + bool backup_cred(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 526fb89f9230..d1fdfa848703 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) return 0; } +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + static bool cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = { .clear_stats = cifs_clear_stats, .print_stats = cifs_print_stats, .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index fb3966265b6e..b8021fde987d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) else cfile->oplock_break_cancelled = false; - server->ops->set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, - 0, NULL); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 192f51a12cf1..35ddc3ed119d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, } static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 860344701067..3802f8c94acc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { int rc; - char *res_key = NULL; struct compress_ioctl fsctl_input; char *ret_data = NULL; @@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, 2 /* in data len */, &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); - kfree(res_key); return rc; } diff --git a/fs/compat.c b/fs/compat.c index ca926ad0430c..66d3d3c6b4b2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -457,9 +457,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: ret = get_compat_flock64(&f, compat_ptr(arg)); if (ret != 0) break; @@ -468,7 +468,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, conv_cmd = convert_fcntl_cmd(cmd); ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_GETLKP) && ret == 0) { + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { /* need to return lock information - see above for commentary */ if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; @@ -493,9 +493,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: return -EINVAL; } return compat_sys_fcntl64(fd, cmd, arg); diff --git a/fs/coredump.c b/fs/coredump.c index e3ad709a4232..0b2528fb640e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size) static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { int free, need; + va_list arg_copy; again: free = cn->size - cn->used; - need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + + va_copy(arg_copy, arg); + need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); + va_end(arg_copy); + if (need < free) { cn->used += need; return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 40707d88a945..be2bea834bf4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -/* - * no locks, please. - */ -static void d_free(struct dentry *dentry) +static void dentry_free(struct dentry *dentry) { - BUG_ON((int)dentry->d_lockref.count > 0); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - /* if dentry was never visible to RCU, immediate free is OK */ if (!(dentry->d_flags & DCACHE_RCUACCESS)) __d_free(&dentry->d_u.d_rcu); @@ -403,56 +395,6 @@ static void dentry_lru_add(struct dentry *dentry) d_lru_add(dentry); } -/* - * Remove a dentry with references from the LRU. - * - * If we are on the shrink list, then we can get to try_prune_one_dentry() and - * lose our last reference through the parent walk. In this case, we need to - * remove ourselves from the shrink list, not the LRU. - */ -static void dentry_lru_del(struct dentry *dentry) -{ - if (dentry->d_flags & DCACHE_LRU_LIST) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) - return d_shrink_del(dentry); - d_lru_del(dentry); - } -} - -/** - * d_kill - kill dentry and return parent - * @dentry: dentry to kill - * @parent: parent dentry - * - * The dentry must already be unhashed and removed from the LRU. - * - * If this is the root of the dentry tree, return NULL. - * - * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by - * d_kill. - */ -static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) - __releases(dentry->d_lock) - __releases(parent->d_lock) - __releases(dentry->d_inode->i_lock) -{ - list_del(&dentry->d_u.d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (parent) - spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - d_free(dentry); - return parent; -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -499,37 +441,12 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -/* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * If ref is non-zero, then decrement the refcount too. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static struct dentry * -dentry_kill(struct dentry *dentry, int unlock_on_failure) - __releases(dentry->d_lock) +static void __dentry_kill(struct dentry *dentry) { - struct inode *inode; - struct dentry *parent; - - inode = dentry->d_inode; - if (inode && !spin_trylock(&inode->i_lock)) { -relock: - if (unlock_on_failure) { - spin_unlock(&dentry->d_lock); - cpu_relax(); - } - return dentry; /* try again with same dentry */ - } - if (IS_ROOT(dentry)) - parent = NULL; - else + struct dentry *parent = NULL; + bool can_free = true; + if (!IS_ROOT(dentry)) parent = dentry->d_parent; - if (parent && !spin_trylock(&parent->d_lock)) { - if (inode) - spin_unlock(&inode->i_lock); - goto relock; - } /* * The dentry is now unrecoverably dead to the world. @@ -543,10 +460,103 @@ relock: if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); - dentry_lru_del(dentry); + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } /* if it was on the hash then remove it */ __d_drop(dentry); - return d_kill(dentry, parent); + list_del(&dentry->d_u.d_child); + /* + * Inform d_walk() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + BUG_ON((int)dentry->d_lockref.count > 0); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); +} + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static struct dentry *dentry_kill(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto failed; + + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + if (unlikely(!spin_trylock(&parent->d_lock))) { + if (inode) + spin_unlock(&inode->i_lock); + goto failed; + } + } + + __dentry_kill(dentry); + return parent; + +failed: + spin_unlock(&dentry->d_lock); + cpu_relax(); + return dentry; /* try again with same dentry */ +} + +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + if (IS_ROOT(dentry)) + return NULL; + if (likely(spin_trylock(&parent->d_lock))) + return parent; + spin_unlock(&dentry->d_lock); + rcu_read_lock(); +again: + parent = ACCESS_ONCE(dentry->d_parent); + spin_lock(&parent->d_lock); + /* + * We can't blindly lock dentry until we are sure + * that we won't violate the locking order. + * Any changes of dentry->d_parent must have + * been done with parent->d_lock held, so + * spin_lock() above is enough of a barrier + * for checking if it's still our child. + */ + if (unlikely(parent != dentry->d_parent)) { + spin_unlock(&parent->d_lock); + goto again; + } + rcu_read_unlock(); + if (parent != dentry) + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + else + parent = NULL; + return parent; } /* @@ -602,7 +612,7 @@ repeat: return; kill_it: - dentry = dentry_kill(dentry, 1); + dentry = dentry_kill(dentry); if (dentry) goto repeat; } @@ -815,64 +825,15 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -/* - * Try to throw away a dentry - free the inode, dput the parent. - * Requires dentry->d_lock is held, and dentry->d_count == 0. - * Releases dentry->d_lock. - * - * This may fail if locks cannot be acquired no problem, just try again. - */ -static struct dentry * try_prune_one_dentry(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct dentry *parent; - - parent = dentry_kill(dentry, 0); - /* - * If dentry_kill returns NULL, we have nothing more to do. - * if it returns the same dentry, trylocks failed. In either - * case, just loop again. - * - * Otherwise, we need to prune ancestors too. This is necessary - * to prevent quadratic behavior of shrink_dcache_parent(), but - * is also expected to be beneficial in reducing dentry cache - * fragmentation. - */ - if (!parent) - return NULL; - if (parent == dentry) - return dentry; - - /* Prune ancestors. */ - dentry = parent; - while (dentry) { - if (lockref_put_or_lock(&dentry->d_lockref)) - return NULL; - dentry = dentry_kill(dentry, 1); - } - return NULL; -} - static void shrink_dentry_list(struct list_head *list) { - struct dentry *dentry; - - rcu_read_lock(); - for (;;) { - dentry = list_entry_rcu(list->prev, struct dentry, d_lru); - if (&dentry->d_lru == list) - break; /* empty */ + struct dentry *dentry, *parent; - /* - * Get the dentry lock, and re-verify that the dentry is - * this on the shrinking list. If it is, we know that - * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. - */ + while (!list_empty(list)) { + struct inode *inode; + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); - if (dentry != list_entry(list->prev, struct dentry, d_lru)) { - spin_unlock(&dentry->d_lock); - continue; - } + parent = lock_parent(dentry); /* * The dispose list is isolated and dentries are not accounted @@ -885,30 +846,63 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if (dentry->d_lockref.count) { + if ((int)dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); continue; } - rcu_read_unlock(); - /* - * If 'try_to_prune()' returns a dentry, it will - * be the same one we passed in, and d_lock will - * have been held the whole time, so it will not - * have been added to any other lists. We failed - * to get the inode lock. - * - * We just add it back to the shrink list. - */ - dentry = try_prune_one_dentry(dentry); - rcu_read_lock(); - if (dentry) { + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + bool can_free = dentry->d_flags & DCACHE_MAY_FREE; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + if (can_free) + dentry_free(dentry); + continue; + } + + inode = dentry->d_inode; + if (inode && unlikely(!spin_trylock(&inode->i_lock))) { d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + continue; + } + + __dentry_kill(dentry); + + /* + * We need to prune ancestors too. This is necessary to prevent + * quadratic behavior of shrink_dcache_parent(), but is also + * expected to be beneficial in reducing dentry cache + * fragmentation. + */ + dentry = parent; + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) { + parent = lock_parent(dentry); + if (dentry->d_lockref.count != 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + break; + } + inode = dentry->d_inode; /* can't be NULL */ + if (unlikely(!spin_trylock(&inode->i_lock))) { + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + cpu_relax(); + continue; + } + __dentry_kill(dentry); + dentry = parent; } } - rcu_read_unlock(); } static enum lru_status @@ -1261,34 +1255,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - ret = D_WALK_NORETRY; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) { + d_shrink_add(dentry, &data->dispose); + data->found++; + } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ - if (data->found && need_resched()) - ret = D_WALK_QUIT; + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } @@ -1318,45 +1301,35 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); -static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { - struct select_data *data = _data; - enum d_walk_ret ret = D_WALK_CONTINUE; + /* it has busy descendents; complain about those instead */ + if (!list_empty(&dentry->d_subdirs)) + return D_WALK_CONTINUE; - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - if (likely(!list_empty(&dentry->d_subdirs))) - goto out; - if (dentry == data->start && dentry->d_lockref.count == 1) - goto out; - printk(KERN_ERR - "BUG: Dentry %p{i=%lx,n=%s}" - " still in use (%d)" - " [unmount of %s %s]\n", + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, - dentry->d_name.name, + dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - BUG(); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); - data->found++; - ret = D_WALK_NORETRY; - } -out: - if (data->found && need_resched()) - ret = D_WALK_QUIT; - return ret; + WARN_ON(1); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + d_walk(dentry, dentry, umount_check, NULL); + d_drop(dentry); + dput(dentry); } /* @@ -1366,40 +1339,15 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); + WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root; sb->s_root = NULL; - for (;;) { - struct select_data data; - - INIT_LIST_HEAD(&data.dispose); - data.start = dentry; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (!data.found) - break; - - shrink_dentry_list(&data.dispose); - cond_resched(); - } - d_drop(dentry); - dput(dentry); + do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { - struct select_data data; - dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); - - INIT_LIST_HEAD(&data.dispose); - data.start = NULL; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (data.found) - shrink_dentry_list(&data.dispose); - cond_resched(); + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + do_one_tree(dentry); } } @@ -1647,8 +1595,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_ENTRY_TYPE; - dentry->d_flags |= add_flags; + __d_set_type(dentry, add_flags); if (inode) hlist_add_head(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; diff --git a/fs/exec.c b/fs/exec.c index 476f3ebf437e..238b7aa26f68 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -657,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP - /* Limit stack size to 1GB */ + /* Limit stack size */ stack_base = rlimit_max(RLIMIT_STACK); - if (stack_base > (1 << 30)) - stack_base = 1 << 30; + if (stack_base > STACK_SIZE_MAX) + stack_base = STACK_SIZE_MAX; /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6ea7b1436bbc..5c56785007e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh->b_data, - EXT4_BLOCKS_PER_GROUP(sb) / 8); + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f1c65dc7cc0a..66946aa62127 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2466,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ - loff_t i_size; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = i_size_read(inode); - if (newsize > i_size) - newsize = i_size; - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); -} - struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 82df3ce9874a..01b0c208f625 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3313,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } uninitialized = ext4_ext_is_uninitialized(ex); split_flag1 = 0; @@ -3694,6 +3699,12 @@ static int ext4_convert_initialized_extents(handle_t *handle, } depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } } err = ext4_ext_get_access(handle, inode, path + depth); @@ -4730,6 +4741,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + /* * Write out all dirty pages to avoid race conditions * Then release them. @@ -4878,9 +4892,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); - if (mode & FALLOC_FL_COLLAPSE_RANGE) - return ext4_collapse_range(inode, offset, len); - ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4892,6 +4903,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); @@ -5229,18 +5243,19 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = ex_last->ee_block + + *start = le32_to_cpu(ex_last->ee_block) + ext4_ext_get_actual_len(ex_last); while (ex_start <= ex_last) { - ex_start->ee_block -= shift; - if (ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) { - if (ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) - ex_last--; - } - ex_start++; + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -5255,7 +5270,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - path[depth].p_idx->ei_block -= shift; + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5300,7 +5315,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return ret; } - stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5313,10 +5329,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * enough to accomodate the shift. */ path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = extent->ee_block; - ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } ext4_ext_drop_refs(path); kfree(path); @@ -5331,7 +5355,13 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - current_block = extent->ee_block; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ ret = mext_next_extent(inode, path, &extent); @@ -5365,17 +5395,18 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; - loff_t new_size; + loff_t new_size, ioffset; int ret; - BUG_ON(offset + len > i_size_read(inode)); - /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || len & (EXT4_BLOCK_SIZE(sb) - 1)) return -EINVAL; if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) return -EOPNOTSUPP; trace_ext4_collapse_range(inode, offset, len); @@ -5383,22 +5414,34 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); if (ret) return ret; /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; goto out_mutex; } @@ -5408,7 +5451,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache_range(inode, offset, -1); + truncate_pagecache(inode, ioffset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); @@ -5425,7 +5468,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_es_remove_extent(inode, punch_start, - EXT_MAX_BLOCKS - punch_start - 1); + EXT_MAX_BLOCKS - punch_start); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; @@ -5436,6 +5479,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start); @@ -5445,10 +5489,9 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } new_size = i_size_read(inode) - len; - truncate_setsize(inode, new_size); + i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0a014a7194b2..0ebc21204b51 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -810,7 +810,7 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; - block = 0x7FDEADBEEF; + block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ca7502d89fde..063fc1538355 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; if ((pos & blockmask) || (final_size & blockmask)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b0d2c7d5408..d7b7462a0e13 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -522,6 +522,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); @@ -2243,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; - - ext4_wb_update_i_disksize(inode, disksize); + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", @@ -3527,15 +3541,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3616,7 +3621,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4423,21 +4427,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4449,15 +4452,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a888cac76e9c..c8238a26818c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -5008,6 +5016,8 @@ error_return: */ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ab95508e3d40..c18d95b50540 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f3c667091618..6f9e6fadac04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3869,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } + + /* + * set up enough so that it can read an inode, + * and create new inode for buddy allocator + */ + sbi->s_gdb_count = db_count; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount2; + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2; + goto failed_mount2a; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2; + goto failed_mount2a; } - sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3916,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; - /* - * set up enough so that it can read an inode - */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4113,21 +4124,13 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount4a; + goto failed_mount5; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4a; - } - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); goto failed_mount5; } @@ -4204,11 +4207,8 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_mb_release(sb); -failed_mount5: - ext4_ext_release(sb); ext4_release_system_zone(sb); -failed_mount4a: +failed_mount5: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4232,11 +4232,14 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); +failed_mount2a: + ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1f5cf5880718..4eec399ec807 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -520,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -542,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); unlock_buffer(bh); - error = ext4_handle_dirty_xattr_block(handle, inode, bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); diff --git a/fs/fcntl.c b/fs/fcntl.c index 9ead1596399a..72c82f69b01b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -274,15 +274,15 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ - case F_GETLKP: + case F_OFD_GETLK: #endif case F_GETLK: err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ - case F_SETLKP: - case F_SETLKPW: + case F_OFD_SETLK: + case F_OFD_SETLKW: #endif /* Fallthrough */ case F_SETLK: @@ -399,13 +399,13 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, switch (cmd) { case F_GETLK64: - case F_GETLKP: + case F_OFD_GETLK: err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); break; case F_SETLK64: case F_SETLKW64: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_SETLK: + case F_OFD_SETLKW: err = fcntl_setlk64(fd, f.file, cmd, (struct flock64 __user *) arg); break; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a0b0855d00a9..205e0d5d5307 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -348,7 +348,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void fuse_ctl_cleanup(void) +void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5b4e035b364c..42198359fa1b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, req, dir, entry, S_IFLNK); } +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -743,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) { int err; - struct fuse_rename_in inarg; + struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req_nopages(fc); + struct fuse_req *req; + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); + memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); - req->in.h.opcode = FUSE_RENAME; + inarg.flags = flags; + req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(olddir); req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = argsize; req->in.args[0].value = &inarg; req->in.args[1].size = oldent->d_name.len + 1; req->in.args[1].value = oldent->d_name.name; @@ -771,15 +783,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); + fuse_update_ctime(oldent->d_inode); + + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(newent->d_inode); + fuse_update_ctime(newent->d_inode); + } fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) { + if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + fuse_update_ctime(newent->d_inode); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -795,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, return err; } +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, sizeof(struct fuse_rename_in)); +} + +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + return err; + +} + static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -829,6 +878,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } @@ -846,6 +896,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); attr->mtime = inode->i_mtime.tv_sec; attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; } stat->dev = inode->i_sb->s_dev; @@ -1504,7 +1556,7 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) } static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, - bool trust_local_mtime) + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; @@ -1523,13 +1575,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } } /* @@ -1597,39 +1654,38 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, /* * Flush inode->i_mtime to the server */ -int fuse_flush_mtime(struct file *file, bool nofail) +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { - struct inode *inode = file->f_mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = NULL; + struct fuse_req *req; struct fuse_setattr_in inarg; struct fuse_attr_out outarg; int err; - if (nofail) { - req = fuse_get_req_nofail_nopages(fc, file); - } else { - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - } + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - inarg.valid |= FATTR_MTIME; + inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - + if (fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); - return err; } @@ -1653,7 +1709,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, bool is_wb = fc->writeback_cache; loff_t oldsize; int err; - bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode); + bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1678,11 +1734,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg, trust_local_mtime); + iattr_to_fattr(attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1711,9 +1769,12 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, spin_lock(&fc->lock); /* the kernel maintains i_mtime locally */ - if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { - inode->i_mtime = attr->ia_mtime; - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + /* FIXME: clear I_DIRTY_SYNC? */ } fuse_change_attributes_common(inode, &outarg.attr, @@ -1810,8 +1871,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1941,20 +2004,11 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); - return err; -} - -static int fuse_update_time(struct inode *inode, struct timespec *now, - int flags) -{ - if (flags & S_MTIME) { - inode->i_mtime = *now; - set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state); - BUG_ON(!S_ISREG(inode->i_mode)); + fuse_update_ctime(inode); } - return 0; + return err; } static const struct inode_operations fuse_dir_inode_operations = { @@ -1964,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename, + .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, @@ -1996,7 +2051,6 @@ static const struct inode_operations fuse_common_inode_operations = { .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, .removexattr = fuse_removexattr, - .update_time = fuse_update_time, }; static const struct inode_operations fuse_symlink_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 13f8bdec5110..96d513e01a5d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -223,6 +223,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) i_size_write(inode, 0); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); @@ -232,18 +234,26 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; err = generic_file_open(inode, file); if (err) return err; + if (lock_inode) + mutex_lock(&inode->i_mutex); + err = fuse_do_open(fc, get_node_id(inode), file, isdir); - if (err) - return err; - fuse_finish_open(inode, file); + if (!err) + fuse_finish_open(inode, file); - return 0; + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; } static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) @@ -314,10 +324,7 @@ static int fuse_release(struct inode *inode, struct file *file) /* see fuse_vma_close() for !writeback_cache case */ if (fc->writeback_cache) - filemap_write_and_wait(file->f_mapping); - - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) - fuse_flush_mtime(file, true); + write_inode_now(inode, 1); fuse_release_common(file, FUSE_RELEASE); @@ -439,7 +446,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - err = filemap_write_and_wait(file->f_mapping); + err = write_inode_now(inode, 1); if (err) return err; @@ -480,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - - if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) - return 0; - mutex_lock(&inode->i_mutex); /* @@ -494,17 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = write_inode_now(inode, 0); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) goto out; fuse_sync_writes(inode); + err = sync_inode_metadata(inode, 1); + if (err) + goto out; - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { - int err = fuse_flush_mtime(file, false); - if (err) - goto out; - } + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + goto out; req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { @@ -1659,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_writepage_free(fc, req); } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) { struct fuse_file *ff = NULL; spin_lock(&fc->lock); - if (!WARN_ON(list_empty(&fi->write_files))) { + if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); @@ -1675,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, return ff; } +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fc, fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + ff = __fuse_write_file_get(fc, fi); + err = fuse_flush_times(inode, ff); + if (ff) + fuse_file_put(ff, 0); + + return err; +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -2972,6 +2995,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (fc->no_fallocate) return -EOPNOTSUPP; @@ -3017,12 +3043,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) { bool changed = fuse_write_update_size(inode, offset + length); - if (changed && fc->writeback_cache) { - struct fuse_inode *fi = get_fuse_inode(inode); - - inode->i_mtime = current_fs_time(inode->i_sb); - set_bit(FUSE_I_MTIME_DIRTY, &fi->state); - } + if (changed && fc->writeback_cache) + file_update_time(file); } if (mode & FALLOC_FL_PUNCH_HOLE) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a257ed8ebee6..7aa5c75e0de1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -119,8 +119,6 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, - /** i_mtime has been updated locally; a flush to userspace needed */ - FUSE_I_MTIME_DIRTY, }; struct fuse_conn; @@ -544,6 +542,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; @@ -725,7 +726,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void fuse_ctl_cleanup(void); +void __exit fuse_ctl_cleanup(void); /** * Allocate a request @@ -891,7 +892,8 @@ int fuse_dev_release(struct inode *inode, struct file *file); bool fuse_write_update_size(struct inode *inode, loff_t pos); -int fuse_flush_mtime(struct file *file, bool nofail); +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8d611696fcad..754dcf23de8a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,9 +175,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; } - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -256,6 +256,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_size = attr->size; inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); @@ -303,7 +305,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if ((inode->i_state & I_NEW)) { inode->i_flags |= S_NOATIME; - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; @@ -788,6 +790,7 @@ static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, @@ -890,6 +893,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_dio = 1; if (arg->flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fc->sb->s_time_gran = arg->time_gran; + else + fc->sb->s_time_gran = 1000000000; + } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -996,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; - sb->s_flags &= ~MS_NOSEC; + sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 204027520937..e19d4c0cacae 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1030,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void) int error; int i; + if (!hugepages_supported()) { + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; diff --git a/fs/ioprio.c b/fs/ioprio.c deleted file mode 100644 index e50170ca7c33..000000000000 --- a/fs/ioprio.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * fs/ioprio.c - * - * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> - * - * Helper functions for setting/querying io priorities of processes. The - * system calls closely mimmick getpriority/setpriority, see the man page for - * those. The prio argument is a composite of prio class and prio data, where - * the data argument has meaning within that class. The standard scheduling - * classes have 8 distinct prio levels, with 0 being the highest prio and 7 - * being the lowest. - * - * IOW, setting BE scheduling class with prio 2 is done ala: - * - * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; - * - * ioprio_set(PRIO_PROCESS, pid, prio); - * - * See also Documentation/block/ioprio.txt - * - */ -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/ioprio.h> -#include <linux/blkdev.h> -#include <linux/capability.h> -#include <linux/syscalls.h> -#include <linux/security.h> -#include <linux/pid_namespace.h> - -int set_task_ioprio(struct task_struct *task, int ioprio) -{ - int err; - struct io_context *ioc; - const struct cred *cred = current_cred(), *tcred; - - rcu_read_lock(); - tcred = __task_cred(task); - if (!uid_eq(tcred->uid, cred->euid) && - !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); - - err = security_task_setioprio(task, ioprio); - if (err) - return err; - - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } - - return err; -} -EXPORT_SYMBOL_GPL(set_task_ioprio); - -SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) -{ - int class = IOPRIO_PRIO_CLASS(ioprio); - int data = IOPRIO_PRIO_DATA(ioprio); - struct task_struct *p, *g; - struct user_struct *user; - struct pid *pgrp; - kuid_t uid; - int ret; - - switch (class) { - case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* fall through, rt has prio field too */ - case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) - return -EINVAL; - - break; - case IOPRIO_CLASS_IDLE: - break; - case IOPRIO_CLASS_NONE: - if (data) - return -EINVAL; - break; - default: - return -EINVAL; - } - - ret = -ESRCH; - rcu_read_lock(); - switch (which) { - case IOPRIO_WHO_PROCESS: - if (!who) - p = current; - else - p = find_task_by_vpid(who); - if (p) - ret = set_task_ioprio(p, ioprio); - break; - case IOPRIO_WHO_PGRP: - if (!who) - pgrp = task_pgrp(current); - else - pgrp = find_vpid(who); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - ret = set_task_ioprio(p, ioprio); - if (ret) - break; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case IOPRIO_WHO_USER: - uid = make_kuid(current_user_ns(), who); - if (!uid_valid(uid)) - break; - if (!who) - user = current_user(); - else - user = find_user(uid); - - if (!user) - break; - - do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) - continue; - ret = set_task_ioprio(p, ioprio); - if (ret) - goto free_uid; - } while_each_thread(g, p); -free_uid: - if (who) - free_uid(user); - break; - default: - ret = -EINVAL; - } - - rcu_read_unlock(); - return ret; -} - -static int get_task_ioprio(struct task_struct *p) -{ - int ret; - - ret = security_task_getioprio(p); - if (ret) - goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); - if (p->io_context) - ret = p->io_context->ioprio; -out: - return ret; -} - -int ioprio_best(unsigned short aprio, unsigned short bprio) -{ - unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); - unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); - - if (aclass == IOPRIO_CLASS_NONE) - aclass = IOPRIO_CLASS_BE; - if (bclass == IOPRIO_CLASS_NONE) - bclass = IOPRIO_CLASS_BE; - - if (aclass == bclass) - return min(aprio, bprio); - if (aclass > bclass) - return bprio; - else - return aprio; -} - -SYSCALL_DEFINE2(ioprio_get, int, which, int, who) -{ - struct task_struct *g, *p; - struct user_struct *user; - struct pid *pgrp; - kuid_t uid; - int ret = -ESRCH; - int tmpio; - - rcu_read_lock(); - switch (which) { - case IOPRIO_WHO_PROCESS: - if (!who) - p = current; - else - p = find_task_by_vpid(who); - if (p) - ret = get_task_ioprio(p); - break; - case IOPRIO_WHO_PGRP: - if (!who) - pgrp = task_pgrp(current); - else - pgrp = find_vpid(who); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - tmpio = get_task_ioprio(p); - if (tmpio < 0) - continue; - if (ret == -ESRCH) - ret = tmpio; - else - ret = ioprio_best(ret, tmpio); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case IOPRIO_WHO_USER: - uid = make_kuid(current_user_ns(), who); - if (!who) - user = current_user(); - else - user = find_user(uid); - - if (!user) - break; - - do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) - continue; - tmpio = get_task_ioprio(p); - if (tmpio < 0) - continue; - if (ret == -ESRCH) - ret = tmpio; - else - ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); - - if (who) - free_uid(user); - break; - default: - ret = -EINVAL; - } - - rcu_read_unlock(); - return ret; -} diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 78f3403300af..a693f5b01ae6 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -232,9 +232,6 @@ static int kernfs_link_sibling(struct kernfs_node *kn) struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; - if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs++; - while (*node) { struct kernfs_node *pos; int result; @@ -249,9 +246,15 @@ static int kernfs_link_sibling(struct kernfs_node *kn) else return -EEXIST; } + /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); + + /* successfully added, account subdir number */ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + return 0; } @@ -711,6 +714,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, return ERR_PTR(-ENOMEM); ida_init(&root->ino_ida); + INIT_LIST_HEAD(&root->supers); kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 8034706a7af8..e3d37f607f97 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -14,6 +14,7 @@ #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/fsnotify.h> #include "kernfs-internal.h" @@ -484,6 +485,8 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); + if (rc) + goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() @@ -608,6 +611,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; @@ -622,14 +626,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) has_write = ops->write || ops->mmap; has_mmap = ops->mmap; - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; @@ -785,20 +791,48 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) */ void kernfs_notify(struct kernfs_node *kn) { + struct kernfs_root *root = kernfs_root(kn); struct kernfs_open_node *on; + struct kernfs_super_info *info; unsigned long flags; + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + /* kick poll */ spin_lock_irqsave(&kernfs_open_node_lock, flags); - if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) { - on = kn->attr.open; - if (on) { - atomic_inc(&on->event); - wake_up_interruptible(&on->poll); - } + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); } spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + + /* kick fsnotify */ + mutex_lock(&kernfs_mutex); + + list_for_each_entry(info, &root->supers, node) { + struct inode *inode; + struct dentry *dentry; + + inode = ilookup(info->sb, kn->ino); + if (!inode) + continue; + + dentry = d_find_any_alias(inode); + if (dentry) { + fsnotify_parent(NULL, dentry, FS_MODIFY); + fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, + NULL, 0); + dput(dentry); + } + + iput(inode); + } + + mutex_unlock(&kernfs_mutex); } EXPORT_SYMBOL_GPL(kernfs_notify); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index abb0f1f53d93..985217626e66 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -48,14 +48,18 @@ void __init kernfs_inode_init(void) static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; struct iattr *iattrs; + mutex_lock(&iattr_mutex); + if (kn->iattr) - return kn->iattr; + goto out_unlock; kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); if (!kn->iattr) - return NULL; + goto out_unlock; iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ @@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; simple_xattrs_init(&kn->iattr->xattrs); - - return kn->iattr; +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; } static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 8be13b2a079b..dc84a3ef9ca2 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -49,6 +49,8 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) * mount.c */ struct kernfs_super_info { + struct super_block *sb; + /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. @@ -62,6 +64,9 @@ struct kernfs_super_info { * an array and compare kernfs_node tag against every entry. */ const void *ns; + + /* anchored at kernfs_root->supers, protected by kernfs_mutex */ + struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 6a5f04ac8704..d171b98a6cdd 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -62,15 +62,16 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) return NULL; } -static int kernfs_fill_super(struct super_block *sb) +static int kernfs_fill_super(struct super_block *sb, unsigned long magic) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; struct dentry *root; + info->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; + sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_time_gran = 1; @@ -131,6 +132,7 @@ const void *kernfs_super_ns(struct super_block *sb) * @fs_type: file_system_type of the fs being mounted * @flags: mount flags specified for the mount * @root: kernfs_root of the hierarchy being mounted + * @magic: file system specific magic number * @new_sb_created: tell the caller if we allocated a new superblock * @ns: optional namespace tag of the mount * @@ -142,8 +144,8 @@ const void *kernfs_super_ns(struct super_block *sb) * The return value can be passed to the vfs layer verbatim. */ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) { struct super_block *sb; struct kernfs_super_info *info; @@ -166,12 +168,18 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, *new_sb_created = !sb->s_root; if (!sb->s_root) { - error = kernfs_fill_super(sb); + struct kernfs_super_info *info = kernfs_info(sb); + + error = kernfs_fill_super(sb, magic); if (error) { deactivate_locked_super(sb); return ERR_PTR(error); } sb->s_flags |= MS_ACTIVE; + + mutex_lock(&kernfs_mutex); + list_add(&info->node, &root->supers); + mutex_unlock(&kernfs_mutex); } return dget(sb->s_root); @@ -190,6 +198,10 @@ void kernfs_kill_sb(struct super_block *sb) struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_node *root_kn = sb->s_root->d_fsdata; + mutex_lock(&kernfs_mutex); + list_del(&info->node); + mutex_unlock(&kernfs_mutex); + /* * Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing kernfs_super_info. diff --git a/fs/locks.c b/fs/locks.c index 13fc7a6d380a..e390bd9ae068 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -135,7 +135,7 @@ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) -#define IS_FILE_PVT(fl) (fl->fl_flags & FL_FILE_PVT) +#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) { @@ -389,18 +389,6 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, fl->fl_ops = NULL; fl->fl_lmops = NULL; - /* Ensure that fl->fl_filp has compatible f_mode */ - switch (l->l_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - break; - } - return assign_type(fl, l->l_type); } @@ -564,7 +552,7 @@ static void __locks_insert_block(struct file_lock *blocker, BUG_ON(!list_empty(&waiter->fl_block)); waiter->fl_next = blocker; list_add_tail(&waiter->fl_block, &blocker->fl_block); - if (IS_POSIX(blocker) && !IS_FILE_PVT(blocker)) + if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) locks_insert_global_blocked(waiter); } @@ -759,12 +747,12 @@ EXPORT_SYMBOL(posix_test_lock); * of tasks (such as posix threads) sharing the same open file table. * To handle those cases, we just bail out after a few iterations. * - * For FL_FILE_PVT locks, the owner is the filp, not the files_struct. + * For FL_OFDLCK locks, the owner is the filp, not the files_struct. * Because the owner is not even nominally tied to a thread of * execution, the deadlock detection below can't reasonably work well. Just * skip it for those. * - * In principle, we could do a more limited deadlock detection on FL_FILE_PVT + * In principle, we could do a more limited deadlock detection on FL_OFDLCK * locks that just checks for the case where two tasks are attempting to * upgrade from read to write locks on the same inode. */ @@ -791,9 +779,9 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, /* * This deadlock detector can't reasonably detect deadlocks with - * FL_FILE_PVT locks, since they aren't owned by a process, per-se. + * FL_OFDLCK locks, since they aren't owned by a process, per-se. */ - if (IS_FILE_PVT(caller_fl)) + if (IS_OFDLCK(caller_fl)) return 0; while ((block_fl = what_owner_is_waiting_for(block_fl))) { @@ -1391,11 +1379,10 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) restart: break_time = flock->fl_break_time; - if (break_time != 0) { + if (break_time != 0) break_time -= jiffies; - if (break_time == 0) - break_time++; - } + if (break_time == 0) + break_time++; locks_insert_block(flock, new_fl); spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, @@ -1891,7 +1878,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -1913,7 +1900,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -1942,13 +1929,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - if (cmd == F_GETLKP) { + if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_GETLK; - file_lock.fl_flags |= FL_FILE_PVT; + file_lock.fl_flags |= FL_OFDLCK; file_lock.fl_owner = (fl_owner_t)filp; } @@ -2035,6 +2022,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, return error; } +/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +static int +check_fmode_for_setlk(struct file_lock *fl) +{ + switch (fl->fl_type) { + case F_RDLCK: + if (!(fl->fl_file->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(fl->fl_file->f_mode & FMODE_WRITE)) + return -EBADF; + } + return 0; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -2072,27 +2075,31 @@ again: if (error) goto out; + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + /* * If the cmd is requesting file-private locks, then set the - * FL_FILE_PVT flag and override the owner. + * FL_OFDLCK flag and override the owner. */ switch (cmd) { - case F_SETLKP: + case F_OFD_SETLK: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLK; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; break; - case F_SETLKPW: + case F_OFD_SETLKW: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLKW; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; /* Fallthrough */ case F_SETLKW: @@ -2144,13 +2151,13 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - if (cmd == F_GETLKP) { + if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_GETLK64; - file_lock.fl_flags |= FL_FILE_PVT; + file_lock.fl_flags |= FL_OFDLCK; file_lock.fl_owner = (fl_owner_t)filp; } @@ -2207,27 +2214,31 @@ again: if (error) goto out; + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + /* * If the cmd is requesting file-private locks, then set the - * FL_FILE_PVT flag and override the owner. + * FL_OFDLCK flag and override the owner. */ switch (cmd) { - case F_SETLKP: + case F_OFD_SETLK: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLK64; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; break; - case F_SETLKPW: + case F_OFD_SETLKW: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLKW64; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; /* Fallthrough */ case F_SETLKW64: @@ -2413,8 +2424,8 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (IS_POSIX(fl)) { if (fl->fl_flags & FL_ACCESS) seq_printf(f, "ACCESS"); - else if (IS_FILE_PVT(fl)) - seq_printf(f, "FLPVT "); + else if (IS_OFDLCK(fl)) + seq_printf(f, "OFDLCK"); else seq_printf(f, "POSIX "); diff --git a/fs/namei.c b/fs/namei.c index c6157c894fce..80168273396b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1542,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, inode = path->dentry->d_inode; } err = -ENOENT; - if (!inode) + if (!inode || d_is_negative(path->dentry)) goto out_path_put; if (should_follow_link(path->dentry, follow)) { @@ -2249,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) mutex_unlock(&dir->d_inode->i_mutex); done: - if (!dentry->d_inode) { + if (!dentry->d_inode || d_is_negative(dentry)) { error = -ENOENT; dput(dentry); goto out; @@ -2994,7 +2994,7 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (d_is_negative(path->dentry)) { + if (!inode || d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 6f3f392d48af..f66c66b9f182 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -402,8 +402,10 @@ sort_pacl(struct posix_acl *pacl) * by uid/gid. */ int i, j; - if (pacl->a_count <= 4) - return; /* no users or groups */ + /* no users or groups */ + if (!pacl || pacl->a_count <= 4) + return; + i = 1; while (pacl->a_entries[i].e_tag == ACL_USER) i++; @@ -530,13 +532,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) /* * ACLs with no ACEs are treated differently in the inheritable - * and effective cases: when there are no inheritable ACEs, we - * set a zero-length default posix acl: + * and effective cases: when there are no inheritable ACEs, + * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { - pacl = posix_acl_alloc(0, GFP_KERNEL); - return pacl ? pacl : ERR_PTR(-ENOMEM); - } + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + return NULL; + /* * When there are no effective ACEs, the following will end * up setting a 3-element effective posix ACL with all @@ -589,7 +590,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) add_to_mask(state, &state->groups->aces[i].perms); } - if (!state->users->n && !state->groups->n) { + if (state->users->n || state->groups->n) { pace++; pace->e_tag = ACL_MASK; low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 39c8ef875f91..2c73cae9899d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -654,9 +654,11 @@ static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { + int maxtime = max_cb_time(clp->net); struct rpc_timeout timeparms = { - .to_initval = max_cb_time(clp->net), + .to_initval = maxtime, .to_retries = 0, + .to_maxval = maxtime, }; struct rpc_create_args args = { .net = clp->net, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3ba65979a3cd..9a77a5a21557 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1078,6 +1078,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) return NULL; } clp->cl_name.len = name.len; + INIT_LIST_HEAD(&clp->cl_sessions); + idr_init(&clp->cl_stateids); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + INIT_LIST_HEAD(&clp->cl_revoked); + spin_lock_init(&clp->cl_lock); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; } @@ -1095,6 +1107,7 @@ free_client(struct nfs4_client *clp) WARN_ON_ONCE(atomic_read(&ses->se_ref)); free_session(ses); } + rpc_destroy_wait_queue(&clp->cl_cb_waitq); free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); @@ -1347,7 +1360,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, if (clp == NULL) return NULL; - INIT_LIST_HEAD(&clp->cl_sessions); ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { spin_lock(&nn->client_lock); @@ -1355,20 +1367,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, spin_unlock(&nn->client_lock); return NULL; } - idr_init(&clp->cl_stateids); - atomic_set(&clp->cl_refcount, 0); - clp->cl_cb_state = NFSD4_CB_UNKNOWN; - INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); - INIT_LIST_HEAD(&clp->cl_revoked); - spin_lock_init(&clp->cl_lock); nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); - rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); copy_verf(clp, verf); rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); gen_confirm(clp); @@ -3716,9 +3717,16 @@ out: static __be32 nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { - if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) + struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); + + if (check_for_locks(stp->st_file, lo)) return nfserr_locks_held; - release_lock_stateid(stp); + /* + * Currently there's a 1-1 lock stateid<->lockowner + * correspondance, and we have to delete the lockowner when we + * delete the lock stateid: + */ + unhash_lockowner(lo); return nfs_ok; } @@ -4158,6 +4166,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c if (!same_owner_str(&lo->lo_owner, owner, clid)) return false; + if (list_empty(&lo->lo_owner.so_stateids)) { + WARN_ON_ONCE(1); + return false; + } lst = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); return lst->st_file->fi_inode == inode; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2723c1badd01..18881f34737a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3627,14 +3627,6 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) /* nfsd4_check_resp_size guarantees enough room for error status */ if (!op->status) op->status = nfsd4_check_resp_size(resp, 0); - if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { - struct nfsd4_slot *slot = resp->cstate.slot; - - if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) - op->status = nfserr_rep_too_big_to_cache; - else - op->status = nfserr_rep_too_big; - } if (so) { so->so_replay.rp_status = op->status; so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4e565c814309..732648b270dc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -698,6 +698,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->overflow_event = &oevent->fse; + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS spin_lock_init(&group->fanotify_data.access_lock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index af3f7aa73e13..ee1f88419cb0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -472,11 +472,15 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) + if (dlm_lockname_cache) { kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; + } - if (dlm_lockres_cache) + if (dlm_lockres_cache) { kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; + } } static void dlm_lockres_release(struct kref *kref) diff --git a/fs/open.c b/fs/open.c index 3d30eb1fc95e..9d64679cec73 100644 --- a/fs/open.c +++ b/fs/open.c @@ -254,17 +254,22 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EBADF; /* - * It's not possible to punch hole or perform collapse range - * on append only file + * We can only allow pure fallocate on append only files */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE) - && IS_APPEND(inode)) + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* + * We can not allow to do any fallocate operation on an active + * swapfile + */ + if (IS_SWAPFILE(inode)) + ret = -ETXTBSY; + + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ @@ -286,14 +291,6 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - /* - * There is no need to overlap collapse range with EOF, in which case - * it is effectively a truncate operation - */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (offset + len >= i_size_read(inode))) - return -EINVAL; - if (!file->f_op->fallocate) return -EOPNOTSUPP; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 9e363e41dacc..0855f772cd41 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) umode_t mode = 0; int not_equiv = 0; + /* + * A null ACL can always be presented as mode bits. + */ + if (!acl) + return 0; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: diff --git a/fs/splice.c b/fs/splice.c index 9bc07d2b53cf..e246954ea48c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1537,7 +1537,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t count = 0; + ssize_t count; pipe = get_pipe_info(file); if (!pipe) @@ -1546,8 +1546,9 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, ret = rw_copy_check_uvector(READ, uiov, nr_segs, ARRAY_SIZE(iovstack), iovstack, &iov); if (ret <= 0) - return ret; + goto out; + count = ret; iov_iter_init(&iter, iov, nr_segs, count, 0); sd.len = 0; @@ -1560,6 +1561,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); +out: if (iov != iovstack) kfree(iov); diff --git a/fs/super.c b/fs/super.c index e9dc3c3fe159..48377f7463c0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -800,7 +800,10 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; int get_anon_bdev(dev_t *p) { diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b8b91b67fdb..e9ef59b3abb1 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) ssize_t count; char *buf; - /* acquire buffer and ensure that it's >= PAGE_SIZE */ + /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } + memset(buf, 0, PAGE_SIZE); /* * Invoke show(). Control may reach here via seq file lseek even @@ -453,95 +454,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. - */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index aa0406895b53..7d2a860ba788 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -18,7 +18,7 @@ #include "sysfs.h" -static void remove_files(struct kernfs_node *parent, struct kobject *kobj, +static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; @@ -29,7 +29,7 @@ static void remove_files(struct kernfs_node *parent, struct kobject *kobj, kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) - sysfs_remove_bin_file(kobj, *bin_attr); + kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } static int create_files(struct kernfs_node *parent, struct kobject *kobj, @@ -62,7 +62,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, break; } if (error) { - remove_files(parent, kobj, grp); + remove_files(parent, grp); goto exit; } } @@ -79,7 +79,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, break; } if (error) - remove_files(parent, kobj, grp); + remove_files(parent, grp); } exit: return error; @@ -224,7 +224,7 @@ void sysfs_remove_group(struct kobject *kobj, kernfs_get(kn); } - remove_files(kn, kobj, grp); + remove_files(kn, grp); if (grp->name) kernfs_remove(kn); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index a66ad6196f59..8a49486bf30c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,6 +13,7 @@ #define DEBUG #include <linux/fs.h> +#include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/user_namespace.h> @@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, + SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); return root; @@ -63,7 +65,8 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL, 0, NULL); + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a1266089eca1..a81c7b556896 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - return err; + goto out; } err = check_free_space(c); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 75df77d09f75..0479c32c5eb1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1344,6 +1344,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1354,6 +1362,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1566,6 +1580,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } @@ -1599,12 +1623,21 @@ xfs_vm_write_begin( status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; @@ -1615,9 +1648,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1640,8 +1676,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 01b6a0102fbd..abda1124a70f 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -213,7 +213,7 @@ xfs_attr_calc_size( * Out of line attribute, cannot double split, but * make room for the attribute value itself. */ - uint dblocks = XFS_B_TO_FSB(mp, valuelen); + uint dblocks = xfs_attr3_rmt_blocks(mp, valuelen); nblks += dblocks; nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); } @@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) trace_xfs_attr_leaf_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* @@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -999,13 +1011,22 @@ restart: trace_xfs_attr_node_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } retval = xfs_attr3_leaf_add(blk->bp, state->args); @@ -1133,6 +1154,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index fe9587fab17a..511c283459b1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1229,6 +1229,7 @@ xfs_attr3_leaf_add_work( name_rmt->valueblk = 0; args->rmtblkno = 1; args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -2167,11 +2168,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - args->valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, - args->valuelen); + args->rmtvaluelen); return XFS_ERROR(EEXIST); } } @@ -2220,19 +2221,19 @@ xfs_attr3_leaf_getvalue( name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - valuelen); + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } return 0; } @@ -2519,7 +2520,7 @@ xfs_attr3_leaf_clearflag( ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } @@ -2677,7 +2678,7 @@ xfs_attr3_leaf_flipflags( ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 01db96f60cf0..833fe5d98d80 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int( args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; + args.rmtvaluelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = xfs_attr3_rmt_blocks( diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 6e37823e2932..d2e6e948cec7 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -337,7 +337,7 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; __uint8_t *dst = args->value; - int valuelen = args->valuelen; + int valuelen; int nmap; int error; int blkcnt = args->rmtblkcnt; @@ -347,7 +347,9 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + valuelen = args->rmtvaluelen; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, @@ -415,7 +417,7 @@ xfs_attr_rmtval_set( * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) @@ -480,7 +482,7 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; - valuelen = args->valuelen; + valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5b6092ef51ef..f0efc7e970ef 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents( int whichfork = XFS_DATA_FORK; int logflags; xfs_filblks_t blockcount = 0; + int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents( ASSERT(current_ext != NULL); ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents( /* We are going to change core inode */ logflags = XFS_ILOG_CORE; - if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - while (nexts++ < num_exts && - *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { gotp = xfs_iext_get_ext(ifp, *current_ext); xfs_bmbt_get_all(gotp, &got); @@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents( } (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } /* Check if we are done */ - if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork)) + if (*current_ext == total_extents) *done = 1; del_cursor: @@ -5568,6 +5574,5 @@ del_cursor: error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_trans_log_inode(tp, ip, logflags); - return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 01f6a646caa1..296160b8e78c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1418,6 +1418,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1432,9 +1434,18 @@ xfs_zero_file_space( ASSERT(end_boundary <= offset + len); if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 107f2fdfe41f..cb10a0aaab3a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1372,21 +1372,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 6e95ea79f5d7..201c6091d26a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -60,10 +60,12 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ int op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1399e187d425..753e467aa1a5 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 79e96ce98733..830c1c937b88 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -155,7 +155,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -295,7 +295,7 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -filemap_write_and_wait_range( + ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, -1); if (ret) { @@ -679,7 +679,7 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; @@ -837,11 +837,19 @@ xfs_file_fallocate( unsigned blksize_mask = (1 << inode->i_blkbits) - 1; if (offset & blksize_mask || len & blksize_mask) { - error = -EINVAL; + error = EINVAL; + goto out_unlock; + } + + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = EINVAL; goto out_unlock; } - ASSERT(offset + len < i_size_read(inode)); new_size = i_size_read(inode) - len; error = xfs_collapse_file_space(ip, offset, len); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5e7a38fa6ee6..768087bedbac 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1334,7 +1334,8 @@ int xfs_create_tmpfile( struct xfs_inode *dp, struct dentry *dentry, - umode_t mode) + umode_t mode, + struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; @@ -1402,7 +1403,6 @@ xfs_create_tmpfile( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); ip->i_d.di_nlink--; - d_tmpfile(dentry, VFS_I(ip)); error = xfs_iunlink(tp, ip); if (error) goto out_trans_abort; @@ -1415,6 +1415,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); + *ipp = ip; return 0; out_trans_abort: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 396cc1fafd0d..f2fcde52b66d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -334,7 +334,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode); + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 89b07e43ca28..36d630319a27 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -72,8 +72,8 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = -xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); if (error < 0) break; } @@ -93,8 +93,8 @@ xfs_init_security( struct inode *dir, const struct qstr *qstr) { - return security_inode_init_security(inode, dir, qstr, - &xfs_initxattrs, NULL); + return -security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void @@ -124,15 +124,15 @@ xfs_cleanup_inode( xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } STATIC int -xfs_vn_mknod( +xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) + dev_t rdev, + bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -156,8 +156,12 @@ xfs_vn_mknod( if (error) return error; - xfs_dentry_to_name(&name, dentry, mode); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } if (unlikely(error)) goto out_free_acl; @@ -169,18 +173,22 @@ xfs_vn_mknod( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } #endif - d_instantiate(dentry, inode); + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -189,11 +197,23 @@ xfs_vn_mknod( return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); goto out_free_acl; } STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + +STATIC int xfs_vn_create( struct inode *dir, struct dentry *dentry, @@ -353,6 +373,7 @@ xfs_vn_symlink( out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); + iput(inode); out: return -error; } @@ -1053,11 +1074,7 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; - - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode); - - return -error; + return xfs_generic_create(dir, dentry, mode, 0, true); } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00e399d..a5f8bd9899d3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -616,11 +616,13 @@ xfs_log_mount( int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -1181,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1368,8 +1373,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1398,6 +1411,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1422,7 +1438,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1631,6 +1646,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1638,6 +1659,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1645,7 +1667,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1847,14 +1870,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 993cb19e7d39..944f3d9456a8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -743,8 +743,6 @@ xfs_mountfs( new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) mp->m_inode_cluster_size = new_size; - xfs_info(mp, "Using inode cluster size of %d bytes", - mp->m_inode_cluster_size); } /* diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 348e4d2ed6e6..dc977b6e6a36 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -843,22 +843,17 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); - if ((error = list_lru_init(&qinf->qi_lru))) { - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = -list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - if ((error = xfs_qm_init_quotainos(mp))) { - list_lru_destroy(&qinf->qi_lru); - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); @@ -918,7 +913,7 @@ xfs_qm_init_quotainfo( qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - + xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -935,6 +930,13 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; } diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index 0c0e41bbe4e3..8baf61afae1d 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -201,10 +201,6 @@ xfs_mount_validate_sb( * write validation, we don't need to check feature masks. */ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { - xfs_alert(mp, -"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" -"Use of these features in this kernel is at your own risk!"); - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 205376776377..3494eff8e4eb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1433,11 +1433,11 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_init_mount_workqueues(mp); + error = -xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = -xfs_icsb_init_counters(mp); if (error) goto out_destroy_workqueues; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4ae41c179a8..65d8c793a25c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL |