diff options
Diffstat (limited to 'fs')
90 files changed, 3373 insertions, 2192 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index b8fcb416be72..4524916fa200 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS if BLOCK +config FS_IOMAP + bool + source "fs/ext2/Kconfig" source "fs/ext4/Kconfig" source "fs/jbd2/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 85b6e13b62d3..ed2b63257ba9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o +obj-$(CONFIG_FS_IOMAP) += iomap.o obj-y += quota/ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 63cd9f939f19..4832de84d52c 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -85,18 +85,14 @@ int afs_open_socket(void) skb_queue_head_init(&afs_incoming_calls); + ret = -ENOMEM; afs_async_calls = create_singlethread_workqueue("kafsd"); - if (!afs_async_calls) { - _leave(" = -ENOMEM [wq]"); - return -ENOMEM; - } + if (!afs_async_calls) + goto error_0; ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); - if (ret < 0) { - destroy_workqueue(afs_async_calls); - _leave(" = %d [socket]", ret); - return ret; - } + if (ret < 0) + goto error_1; socket->sk->sk_allocation = GFP_NOFS; @@ -111,18 +107,26 @@ int afs_open_socket(void) sizeof(srx.transport.sin.sin_addr)); ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); - if (ret < 0) { - sock_release(socket); - destroy_workqueue(afs_async_calls); - _leave(" = %d [bind]", ret); - return ret; - } + if (ret < 0) + goto error_2; + + ret = kernel_listen(socket, INT_MAX); + if (ret < 0) + goto error_2; rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); afs_socket = socket; _leave(" = 0"); return 0; + +error_2: + sock_release(socket); +error_1: + destroy_workqueue(afs_async_calls); +error_0: + _leave(" = %d", ret); + return ret; } /* diff --git a/fs/buffer.c b/fs/buffer.c index b9fa1be75e69..9c8eb9b6db6a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/fs.h> +#include <linux/iomap.h> #include <linux/mm.h> #include <linux/percpu.h> #include <linux/slab.h> @@ -1892,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) } EXPORT_SYMBOL(page_zero_new_buffers); -int __block_write_begin(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block) +static void +iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, + struct iomap *iomap) +{ + loff_t offset = block << inode->i_blkbits; + + bh->b_bdev = iomap->bdev; + + /* + * Block points to offset in file we need to map, iomap contains + * the offset at which the map starts. If the map ends before the + * current block, then do not map the buffer and let the caller + * handle it. + */ + BUG_ON(offset >= iomap->offset + iomap->length); + + switch (iomap->type) { + case IOMAP_HOLE: + /* + * If the buffer is not up to date or beyond the current EOF, + * we need to mark it as new to ensure sub-block zeroing is + * executed if necessary. + */ + if (!buffer_uptodate(bh) || + (offset >= i_size_read(inode))) + set_buffer_new(bh); + break; + case IOMAP_DELALLOC: + if (!buffer_uptodate(bh) || + (offset >= i_size_read(inode))) + set_buffer_new(bh); + set_buffer_uptodate(bh); + set_buffer_mapped(bh); + set_buffer_delay(bh); + break; + case IOMAP_UNWRITTEN: + /* + * For unwritten regions, we always need to ensure that + * sub-block writes cause the regions in the block we are not + * writing to are zeroed. Set the buffer as new to ensure this. + */ + set_buffer_new(bh); + set_buffer_unwritten(bh); + /* FALLTHRU */ + case IOMAP_MAPPED: + if (offset >= i_size_read(inode)) + set_buffer_new(bh); + bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) + + ((offset - iomap->offset) >> inode->i_blkbits); + set_buffer_mapped(bh); + break; + } +} + +int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -1929,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); - if (err) - break; + if (get_block) { + err = get_block(inode, block, bh, 1); + if (err) + break; + } else { + iomap_to_bh(inode, block, bh, iomap); + } + if (buffer_new(bh)) { unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); @@ -1972,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, page_zero_new_buffers(page, from, to); return err; } + +int __block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + return __block_write_begin_int(page, pos, len, get_block, NULL); +} EXPORT_SYMBOL(__block_write_begin); static int __block_commit_write(struct inode *inode, struct page *page, diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 1669f6291c95..df955d2209ce 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -73,6 +73,7 @@ struct dlm_cluster { unsigned int cl_toss_secs; unsigned int cl_scan_secs; unsigned int cl_log_debug; + unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_timewarn_cs; unsigned int cl_waitwarn_us; @@ -95,6 +96,7 @@ enum { CLUSTER_ATTR_TOSS_SECS, CLUSTER_ATTR_SCAN_SECS, CLUSTER_ATTR_LOG_DEBUG, + CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_TIMEWARN_CS, CLUSTER_ATTR_WAITWARN_US, @@ -166,6 +168,7 @@ CLUSTER_ATTR(recover_timer, 1); CLUSTER_ATTR(toss_secs, 1); CLUSTER_ATTR(scan_secs, 1); CLUSTER_ATTR(log_debug, 0); +CLUSTER_ATTR(log_info, 0); CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); CLUSTER_ATTR(waitwarn_us, 0); @@ -180,6 +183,7 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs, [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs, [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug, + [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, @@ -365,6 +369,7 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_toss_secs = dlm_config.ci_toss_secs; cl->cl_scan_secs = dlm_config.ci_scan_secs; cl->cl_log_debug = dlm_config.ci_log_debug; + cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; @@ -850,6 +855,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_TOSS_SECS 10 #define DEFAULT_SCAN_SECS 5 #define DEFAULT_LOG_DEBUG 0 +#define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 @@ -865,6 +871,7 @@ struct dlm_config_info dlm_config = { .ci_toss_secs = DEFAULT_TOSS_SECS, .ci_scan_secs = DEFAULT_SCAN_SECS, .ci_log_debug = DEFAULT_LOG_DEBUG, + .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, .ci_waitwarn_us = DEFAULT_WAITWARN_US, diff --git a/fs/dlm/config.h b/fs/dlm/config.h index f30697bc2780..6041eec886ab 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -31,6 +31,7 @@ struct dlm_config_info { int ci_toss_secs; int ci_scan_secs; int ci_log_debug; + int ci_log_info; int ci_protocol; int ci_timewarn_cs; int ci_waitwarn_us; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 5eff6ea3e27f..216b61604ef9 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -65,8 +65,16 @@ struct dlm_mhandle; printk(KERN_ERR "dlm: "fmt"\n" , ##args) #define log_error(ls, fmt, args...) \ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) + #define log_rinfo(ls, fmt, args...) \ - printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args); +do { \ + if (dlm_config.ci_log_info) \ + printk(KERN_INFO "dlm: %s: " fmt "\n", \ + (ls)->ls_name, ##args); \ + else if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) #define log_debug(ls, fmt, args...) \ do { \ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 1ab012a27d9f..963016c8f3d1 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1279,10 +1279,9 @@ static void init_local(void) if (dlm_our_addr(&sas, i)) break; - addr = kmalloc(sizeof(*addr), GFP_NOFS); + addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS); if (!addr) break; - memcpy(addr, &sas, sizeof(*addr)); dlm_local_addr[dlm_local_count++] = addr; } } diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a31c7e859af6..4dcc9e28dc5c 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -201,7 +201,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; @@ -214,7 +213,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; - set_acl_inode(fi, inode->i_mode); + set_acl_inode(inode, inode->i_mode); if (error == 0) acl = NULL; } @@ -233,7 +232,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (acl) { value = f2fs_acl_to_disk(acl, &size); if (IS_ERR(value)) { - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); } } @@ -244,7 +243,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (!error) set_cached_acl(inode, type, acl); - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return error; } @@ -385,6 +384,8 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; + f2fs_mark_inode_dirty_sync(inode); + if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 997ca8edb6cb..b2334d11dae8 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,7 +37,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 124b4a3017b5..f94d01e7d001 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -48,7 +48,8 @@ repeat: goto repeat; } f2fs_wait_on_page_writeback(page, META, true); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); return page; } @@ -266,6 +267,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff, written; /* collect a number of dirty meta pages and write together */ @@ -278,7 +280,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); + blk_start_plug(&plug); written = sync_meta_pages(sbi, META, wbc->nr_to_write); + blk_finish_plug(&plug); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -366,9 +370,10 @@ static int f2fs_set_meta_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, META); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); @@ -510,10 +515,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + update_inode_page(inode); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -761,28 +767,25 @@ fail_no_cp: static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(fi, flag)) + if (is_inode_flag_set(inode, flag)) return; - set_inode_flag(fi, flag); - list_add_tail(&fi->dirty_list, &sbi->inode_list[type]); + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } static void __remove_dirty_inode(struct inode *inode, enum inode_type type) { - struct f2fs_inode_info *fi = F2FS_I(inode); int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), flag)) + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) return; - list_del_init(&fi->dirty_list); - clear_inode_flag(fi, flag); + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); stat_dec_dirty_inode(F2FS_I_SB(inode), type); } @@ -795,13 +798,12 @@ void update_dirty_page(struct inode *inode, struct page *page) !S_ISLNK(inode->i_mode)) return; - if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) { - spin_lock(&sbi->inode_lock[type]); + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) __add_dirty_inode(inode, type); - spin_unlock(&sbi->inode_lock[type]); - } - inode_inc_dirty_pages(inode); + spin_unlock(&sbi->inode_lock[type]); + SetPagePrivate(page); f2fs_trace_pid(page); } @@ -864,6 +866,34 @@ retry: goto retry; } +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_entry(head->next, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + update_inode_page(inode); + iput(inode); + } + }; + return 0; +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -890,6 +920,14 @@ retry_flush_dents: goto retry_flush_dents; } + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + goto out; + goto retry_flush_dents; + } + /* * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. @@ -914,6 +952,8 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); + + build_free_nids(sbi); f2fs_unlock_all(sbi); } @@ -954,7 +994,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. */ - if (discard_next_dnode(sbi, discard_blk)) + if (!test_opt(sbi, LFS) && discard_next_dnode(sbi, discard_blk)) invalidate = true; /* Flush all the NAT/SIT pages */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ded224518978..e2624275d828 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,8 @@ #include <linux/bio.h> #include <linux/prefetch.h> #include <linux/uio.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> #include <linux/cleancache.h> #include "f2fs.h" @@ -45,7 +47,8 @@ static void f2fs_read_end_io(struct bio *bio) struct page *page = bvec->bv_page; if (!bio->bi_error) { - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); @@ -97,10 +100,15 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } -static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio) +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) { - if (!is_read_io(bio_op(bio))) + if (!is_read_io(bio_op(bio))) { atomic_inc(&sbi->nr_wb_bios); + if (f2fs_sb_mounted_hmsmr(sbi->sb) && + current->plug && (type == DATA || type == NODE)) + blk_finish_plug(current->plug); + } submit_bio(bio); } @@ -118,7 +126,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) bio_set_op_attrs(io->bio, fio->op, fio->op_flags); - __submit_bio(io->sbi, io->bio); + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } @@ -240,7 +248,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio->bi_rw = fio->op_flags; bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, bio); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -326,7 +334,7 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) if (!count) return 0; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; @@ -348,9 +356,6 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) if (set_page_dirty(dn->node_page)) dn->node_changed = true; - - mark_inode_dirty(dn->inode); - sync_inode_page(dn); return 0; } @@ -446,7 +451,8 @@ got_it: */ if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); return page; } @@ -505,14 +511,14 @@ repeat: /* wait for read completion */ lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -557,7 +563,8 @@ struct page *get_new_data_page(struct inode *inode, if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { f2fs_put_page(page, 1); @@ -569,11 +576,8 @@ struct page *get_new_data_page(struct inode *inode, } got_it: if (new_i_size && i_size_read(inode) < - ((loff_t)(index + 1) << PAGE_SHIFT)) { - i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); - /* Only the directory inode sets new_i_size */ - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } @@ -586,7 +590,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) pgoff_t fofs; blkcnt_t count = 1; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); @@ -611,7 +615,7 @@ alloc: fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) - i_size_write(dn->inode, + f2fs_i_size_write(dn->inode, ((loff_t)(fofs + 1) << PAGE_SHIFT)); return 0; } @@ -660,7 +664,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; + int mode = create ? ALLOC_NODE : LOOKUP_NODE; pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; @@ -723,8 +727,7 @@ next_block: } else { err = __allocate_data_block(&dn); if (!err) { - set_inode_flag(F2FS_I(inode), - FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); allocated = true; } } @@ -795,8 +798,6 @@ skip: else if (dn.ofs_in_node < end_offset) goto next_block; - if (allocated) - sync_inode_page(&dn); f2fs_put_dnode(&dn); if (create) { @@ -807,8 +808,6 @@ skip: goto next_dnode; sync_out: - if (allocated) - sync_inode_page(&dn); f2fs_put_dnode(&dn); unlock_out: if (create) { @@ -968,6 +967,37 @@ out: return ret; } +struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio; + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + + return bio; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -986,7 +1016,6 @@ static int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; - struct block_device *bdev = inode->i_sb->s_bdev; struct f2fs_map_blocks map; map.m_pblk = 0; @@ -1047,7 +1076,8 @@ got_it: } } else { zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); goto next_page; } @@ -1058,35 +1088,15 @@ got_it: */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), bio); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { - struct fscrypt_ctx *ctx = NULL; - - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - goto set_error_page; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback( - F2FS_I_SB(inode), block_nr); - } - - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); + bio = f2fs_grab_bio(inode, block_nr, nr_pages); + if (IS_ERR(bio)) { + bio = NULL; goto set_error_page; } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; bio_set_op_attrs(bio, REQ_OP_READ, 0); } @@ -1102,7 +1112,7 @@ set_error_page: goto next_page; confused: if (bio) { - __submit_bio(F2FS_I_SB(inode), bio); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -1112,7 +1122,7 @@ next_page: } BUG_ON(pages && !list_empty(pages)); if (bio) - __submit_bio(F2FS_I_SB(inode), bio); + __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1201,14 +1211,14 @@ retry_encrypt: !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); - set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -1223,6 +1233,7 @@ static int f2fs_write_data_page(struct page *page, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_SHIFT; + loff_t psize = (page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -1260,20 +1271,18 @@ write: available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; err = do_write_data_page(&fio); goto done; } - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - SetPageError(page); - goto out; - } - if (!wbc->for_reclaim) need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0)) @@ -1285,6 +1294,8 @@ write: err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) err = do_write_data_page(&fio); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -1311,16 +1322,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - -static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + unlock_page(page); + return err; } /* @@ -1329,8 +1332,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) + struct writeback_control *wbc) { int ret = 0; int done = 0; @@ -1343,10 +1345,9 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; pagevec_init(&pvec, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1401,9 +1402,6 @@ continue_unlock: goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) f2fs_wait_on_page_writeback(page, @@ -1416,16 +1414,11 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = (*writepage)(page, wbc, data); + ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done_index = page->index + 1; - done = 1; - break; - } + done_index = page->index + 1; + done = 1; + break; } if (--wbc->nr_to_write <= 0 && @@ -1438,11 +1431,6 @@ continue_unlock: cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; @@ -1460,9 +1448,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool locked = false; + struct blk_plug plug; int ret; - long diff; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1478,7 +1465,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, goto skip_write; /* skip writing during file defragment */ - if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG)) + if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; /* during POR, we don't need to trigger writepage at all. */ @@ -1487,20 +1474,16 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); - diff = nr_pages_to_write(sbi, DATA, wbc); - - if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) { - mutex_lock(&sbi->writepages); - locked = true; - } - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); - if (locked) - mutex_unlock(&sbi->writepages); + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc); + blk_finish_plug(&plug); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. + */ + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; skip_write: @@ -1558,7 +1541,7 @@ restart: if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA) { read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_inline_node(ipage); } else { @@ -1668,39 +1651,35 @@ repeat: if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); } else { - struct f2fs_io_info fio = { - .sbi = sbi, - .type = DATA, - .op = REQ_OP_READ, - .op_flags = READ_SYNC, - .old_blkaddr = blkaddr, - .new_blkaddr = blkaddr, - .page = page, - .encrypted_page = NULL, - }; - err = f2fs_submit_page_bio(&fio); - if (err) - goto fail; + struct bio *bio; - lock_page(page); - if (unlikely(!PageUptodate(page))) { - err = -EIO; + bio = f2fs_grab_bio(inode, blkaddr, 1); + if (IS_ERR(bio)) { + err = PTR_ERR(bio); goto fail; } + bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + err = -EFAULT; + goto fail; + } + + __submit_bio(sbi, bio, DATA); + + lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } - - /* avoid symlink page */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - err = fscrypt_decrypt_page(page); - if (err) - goto fail; + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; } } out_update: - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); out_clear: clear_cold_data(page); return 0; @@ -1721,13 +1700,11 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); set_page_dirty(page); + f2fs_put_page(page, 1); - if (pos + copied > i_size_read(inode)) { - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); - } + if (pos + copied > i_size_read(inode)) + f2fs_i_size_write(inode, pos + copied); - f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1752,6 +1729,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; + int rw = iov_iter_rw(iter); int err; err = check_direct_IO(inode, iter, offset); @@ -1760,18 +1738,23 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; + if (test_opt(F2FS_I_SB(inode), LFS)) + return 0; - trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + down_read(&F2FS_I(inode)->dio_rwsem[rw]); err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); - if (iov_iter_rw(iter) == WRITE) { + up_read(&F2FS_I(inode)->dio_rwsem[rw]); + + if (rw == WRITE) { if (err > 0) - set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + set_inode_flag(inode, FI_UPDATE_WRITE); else if (err < 0) f2fs_write_failed(mapping, offset + count); } - trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; } @@ -1818,6 +1801,35 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 1; } +/* + * This was copied from __set_page_dirty_buffers which gives higher performance + * in very high speed storages. (e.g., pmem) + */ +void f2fs_set_page_dirty_nobuffers(struct page *page) +{ + struct address_space *mapping = page->mapping; + unsigned long flags; + + if (unlikely(!mapping)) + return; + + spin_lock(&mapping->private_lock); + lock_page_memcg(page); + SetPageDirty(page); + spin_unlock(&mapping->private_lock); + + spin_lock_irqsave(&mapping->tree_lock, flags); + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + unlock_page_memcg(page); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return; +} + static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -1825,7 +1837,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { @@ -1840,7 +1853,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d89a425055d0..badd407bb622 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -47,6 +47,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -304,8 +305,8 @@ static int stat_show(struct seq_file *s, void *v) si->inmem_pages, si->wb_bios); seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4lld in dirs:%4d\n", - si->ndirty_dent, si->ndirty_dirs); + seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); seq_printf(s, " - datas: %4lld in files:%4d\n", si->ndirty_data, si->ndirty_files); seq_printf(s, " - meta: %4lld in %4d\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f9313f684540..a485f68a76b1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -185,8 +185,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, &max_slots, @@ -223,19 +228,22 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct fscrypt_name fname; int err; - *res_page = NULL; - err = fscrypt_setup_filename(dir, child, 1, &fname); - if (err) + if (err) { + *res_page = ERR_PTR(err); return NULL; + } if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; de = find_in_inline_dir(dir, &fname, res_page); goto out; } - if (npages == 0) + if (npages == 0) { + *res_page = NULL; goto out; + } max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { @@ -243,13 +251,13 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, "Corrupted max_depth of %lu: %u", dir->i_ino, max_depth); max_depth = MAX_DIR_HASH_DEPTH; - F2FS_I(dir)->i_current_depth = max_depth; - mark_inode_dirty(dir); + f2fs_i_depth_write(dir, max_depth); } for (level = 0; level < max_depth; level++) { + *res_page = NULL; de = find_in_level(dir, level, &fname, res_page); - if (de) + if (de || IS_ERR(*res_page)) break; } out: @@ -259,35 +267,22 @@ out: struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page; - struct f2fs_dir_entry *de; - struct f2fs_dentry_block *dentry_blk; - - if (f2fs_has_inline_dentry(dir)) - return f2fs_parent_inline_dir(dir, p); - - page = get_lock_data_page(dir, 0, false); - if (IS_ERR(page)) - return NULL; + struct qstr dotdot = QSTR_INIT("..", 2); - dentry_blk = kmap(page); - de = &dentry_blk->dentry[1]; - *p = page; - unlock_page(page); - return de; + return f2fs_find_entry(dir, &dotdot, p); } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr, + struct page **page) { ino_t res = 0; struct f2fs_dir_entry *de; - struct page *page; - de = f2fs_find_entry(dir, qstr, &page); + de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + f2fs_dentry_kunmap(dir, *page); + f2fs_put_page(*page, 0); } return res; @@ -303,9 +298,9 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -385,7 +380,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) { page = new_inode_page(inode); if (IS_ERR(page)) return page; @@ -429,7 +424,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, * This file should be checkpointed during fsync. * We lost i_pino from now on. */ - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + if (is_inode_flag_set(inode, FI_INC_LINK)) { file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, @@ -437,14 +432,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, */ if (inode->i_nlink == 0) remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); - inc_nlink(inode); + f2fs_i_links_write(inode, true); } return page; put_error: - /* truncate empty dir pages */ - truncate_inode_pages(&inode->i_data, 0); - clear_nlink(inode); update_inode(inode, page); f2fs_put_page(page, 1); @@ -454,23 +446,19 @@ put_error: void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - if (S_ISDIR(inode->i_mode)) { - inc_nlink(dir); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + f2fs_mark_inode_dirty_sync(dir); - if (F2FS_I(dir)->i_current_depth != current_depth) { - F2FS_I(dir)->i_current_depth = current_depth; - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); - if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); } int room_for_filename(const void *bitmap, int slots, int max_slots) @@ -596,9 +584,7 @@ add_dentry: set_page_dirty(dentry_page); if (inode) { - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -607,10 +593,6 @@ fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode_page(dir); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); @@ -657,42 +639,34 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) err = PTR_ERR(page); goto fail; } - /* we don't need to mark_inode_dirty now */ - update_inode(inode, page); f2fs_put_page(page, 1); - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + clear_inode_flag(inode, FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } -void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - if (page) - update_inode(dir, page); - else - update_inode_page(dir); - } + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); + f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); } up_write(&F2FS_I(inode)->i_sem); - update_inode_page(inode); if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); + add_orphan_inode(inode); else release_orphan_inode(sbi); } @@ -730,9 +704,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, NULL); + f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 5bfcdb9b69f2..2b06d4fcd954 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -170,8 +170,10 @@ static void __drop_largest_extent(struct inode *inode, { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; + f2fs_mark_inode_dirty_sync(inode); + } } /* return true, if inode page is changed */ @@ -335,11 +337,12 @@ lookup_neighbors: return en; } -static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, +static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -360,7 +363,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); spin_lock(&sbi->extent_lock); if (!list_empty(&en->list)) { @@ -371,11 +374,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, return en; } -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, +static struct extent_node *__insert_extent_tree(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, struct rb_node *insert_parent) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p = &et->root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -402,7 +406,7 @@ do_insert: if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); /* update in global extent list */ spin_lock(&sbi->extent_lock); @@ -431,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, write_lock(&et->lock); - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); return false; } @@ -473,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(sbi, et, &ei, + en1 = __insert_extent_tree(inode, et, &ei, NULL, NULL); next_en = en1; } else { @@ -494,7 +498,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); else __release_extent_node(sbi, et, en); @@ -514,20 +518,20 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (blkaddr) { set_extent_info(&ei, fofs, blkaddr, len); - if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) - __insert_extent_tree(sbi, et, &ei, + if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) + __insert_extent_tree(inode, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - et->largest.len = 0; - set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + __drop_largest_extent(inode, 0, UINT_MAX); + set_inode_flag(inode, FI_NO_EXTENT); } } - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + if (is_inode_flag_set(inode, FI_NO_EXTENT)) __free_extent_tree(sbi, et); write_unlock(&et->lock); @@ -627,6 +631,19 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } +void f2fs_drop_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + + set_inode_flag(inode, FI_NO_EXTENT); + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + __drop_largest_extent(inode, 0, UINT_MAX); + write_unlock(&et->lock); +} + void f2fs_destroy_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -685,9 +702,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; - - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } void f2fs_update_extent_cache_range(struct dnode_of_data *dn, @@ -697,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, if (!f2fs_may_extent_tree(dn->inode)) return; - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } void init_extent_cache_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 23ae6a81ccd6..7890e9071499 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -45,6 +45,7 @@ enum { FAULT_ORPHAN, FAULT_BLOCK, FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, FAULT_MAX, }; @@ -74,6 +75,8 @@ static inline bool time_to_inject(int type) return false; else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type)) return false; + else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type)) + return false; atomic_inc(&f2fs_fault.inject_ops); if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) { @@ -108,6 +111,8 @@ static inline bool time_to_inject(int type) #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_ADAPTIVE 0x00020000 +#define F2FS_MOUNT_LFS 0x00040000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -128,6 +133,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_HMSMR 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -158,7 +164,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ -#define DEF_IDLE_INTERVAL 120 /* 2 mins */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@ -262,6 +268,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) #define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -291,6 +299,13 @@ struct f2fs_defragment { u64 len; }; +struct f2fs_move_range { + u32 dst_fd; /* destination fd */ + u64 pos_in; /* start position in src_fd */ + u64 pos_out; /* start position in dst_fd */ + u64 len; /* size to move */ +}; + /* * For INODE and NODE manager */ @@ -441,11 +456,14 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ + loff_t last_disk_size; /* lastly written file size */ - struct list_head dirty_list; /* linked in global dirty list */ + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ }; static inline void get_extent_info(struct extent_info *ext, @@ -498,11 +516,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) +extern void f2fs_mark_inode_dirty_sync(struct inode *); +static inline void __try_update_largest_extent(struct inode *inode, + struct extent_tree *et, struct extent_node *en) { - if (en->ei.len > et->largest.len) + if (en->ei.len > et->largest.len) { et->largest = en->ei; + f2fs_mark_inode_dirty_sync(inode); + } } struct f2fs_nm_info { @@ -517,7 +538,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ @@ -599,6 +620,7 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t submit_flush; /* # of issued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -655,6 +677,7 @@ enum count_type { F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, + F2FS_DIRTY_IMETA, NR_COUNT_TYPE, }; @@ -706,6 +729,7 @@ struct f2fs_bio_info { enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ NR_INODE_TYPE, }; @@ -757,14 +781,14 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ - struct rw_semaphore cp_rwsem; /* blocking FS operations */ + struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ - struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -1050,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - down_read(&sbi->cp_rwsem); + percpu_down_read(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - up_read(&sbi->cp_rwsem); + percpu_up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write(&sbi->cp_rwsem); + percpu_down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - up_write(&sbi->cp_rwsem); + percpu_up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) @@ -1120,34 +1144,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - block_t valid_block_count; + blkcnt_t diff; - spin_lock(&sbi->stat_lock); #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_BLOCK)) { - spin_unlock(&sbi->stat_lock); + if (time_to_inject(FAULT_BLOCK)) return false; - } #endif - valid_block_count = - sbi->total_valid_block_count + (block_t)(*count); - if (unlikely(valid_block_count > sbi->user_block_count)) { - *count = sbi->user_block_count - sbi->total_valid_block_count; + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. + */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); + + spin_lock(&sbi->stat_lock); + sbi->total_valid_block_count += (block_t)(*count); + if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { + diff = sbi->total_valid_block_count - sbi->user_block_count; + *count -= diff; + sbi->total_valid_block_count = sbi->user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); + percpu_counter_sub(&sbi->alloc_valid_block_count, diff); return false; } } - /* *count can be recalculated */ - inode->i_blocks += *count; - sbi->total_valid_block_count = - sbi->total_valid_block_count + (block_t)(*count); spin_unlock(&sbi->stat_lock); - percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); + f2fs_i_blocks_write(inode, *count, true); return true; } @@ -1158,9 +1185,9 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < count); - inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); + f2fs_i_blocks_write(inode, count, false); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1295,7 +1322,7 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } if (inode) - inode->i_blocks++; + f2fs_i_blocks_write(inode, 1, true); sbi->total_valid_node_count++; sbi->total_valid_block_count++; @@ -1314,7 +1341,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !inode->i_blocks); - inode->i_blocks--; + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; @@ -1511,12 +1538,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ - FI_UPDATE_DIR, /* should update inode block for consistency */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1534,64 +1561,143 @@ enum { FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; -static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) +{ + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + if (set) + return; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + f2fs_mark_inode_dirty_sync(inode); + } +} + +static inline void set_inode_flag(struct inode *inode, int flag) +{ + if (!test_bit(flag, &F2FS_I(inode)->flags)) + set_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); +} + +static inline int is_inode_flag_set(struct inode *inode, int flag) +{ + return test_bit(flag, &F2FS_I(inode)->flags); +} + +static inline void clear_inode_flag(struct inode *inode, int flag) +{ + if (test_bit(flag, &F2FS_I(inode)->flags)) + clear_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); +} + +static inline void set_acl_inode(struct inode *inode, umode_t mode) { - if (!test_bit(flag, &fi->flags)) - set_bit(flag, &fi->flags); + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode); } -static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +static inline void f2fs_i_links_write(struct inode *inode, bool inc) { - return test_bit(flag, &fi->flags); + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + blkcnt_t diff, bool add) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + inode->i_blocks = add ? inode->i_blocks + diff : + inode->i_blocks - diff; + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline bool f2fs_skip_inode_update(struct inode *inode) { - if (test_bit(flag, &fi->flags)) - clear_bit(flag, &fi->flags); + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); } -static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { - fi->i_acl_mode = mode; - set_inode_flag(fi, FI_ACL_MODE); + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode); } -static inline void get_inline_info(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + if (ri->i_inline & F2FS_INLINE_XATTR) - set_inode_flag(fi, FI_INLINE_XATTR); + set_bit(FI_INLINE_XATTR, &fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_inode_flag(fi, FI_INLINE_DATA); + set_bit(FI_INLINE_DATA, &fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_inode_flag(fi, FI_INLINE_DENTRY); + set_bit(FI_INLINE_DENTRY, &fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_inode_flag(fi, FI_DATA_EXIST); + set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_inode_flag(fi, FI_INLINE_DOTS); + set_bit(FI_INLINE_DOTS, &fi->flags); } -static inline void set_raw_inline(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; - if (is_inode_flag_set(fi, FI_INLINE_DATA)) + if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; - if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; - if (is_inode_flag_set(fi, FI_DATA_EXIST)) + if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; - if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); + return is_inode_flag_set(inode, FI_INLINE_XATTR); } static inline unsigned int addrs_per_inode(struct inode *inode) @@ -1618,43 +1724,43 @@ static inline int inline_xattr_size(struct inode *inode) static inline int f2fs_has_inline_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); + return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline void f2fs_clear_inline_inode(struct inode *inode) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + clear_inode_flag(inode, FI_INLINE_DATA); + clear_inode_flag(inode, FI_DATA_EXIST); } static inline int f2fs_exist_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); + return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_has_inline_dots(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); + return is_inode_flag_set(inode, FI_INLINE_DOTS); } static inline bool f2fs_is_atomic_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); + return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_volatile_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_VOLATILE_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); } static inline bool f2fs_is_drop_cache(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); + return is_inode_flag_set(inode, FI_DROP_CACHE); } static inline void *inline_data_addr(struct page *page) @@ -1665,7 +1771,7 @@ static inline void *inline_data_addr(struct page *page) static inline int f2fs_has_inline_dentry(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); + return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) @@ -1682,11 +1788,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1713,7 +1821,7 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || - is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT)) return false; return S_ISREG(inode->i_mode); @@ -1749,7 +1857,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) } #define get_inode_mode(i) \ - ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) /* get offset of first page in next direct node */ @@ -1764,7 +1872,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *, bool); +int f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1805,11 +1913,11 @@ struct page *init_inode_metadata(struct inode *, struct inode *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); +void f2fs_drop_nlink(struct inode *, struct inode *); struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +ino_t f2fs_inode_by_name(struct inode *, struct qstr *, struct page **); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const struct qstr *); @@ -1833,6 +1941,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ +int f2fs_inode_dirtied(struct inode *); +void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) @@ -1866,11 +1976,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); -void sync_inode_page(struct dnode_of_data *); void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *, - bool); +int fsync_node_pages(struct f2fs_sb_info *, struct inode *, + struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); +void build_free_nids(struct f2fs_sb_info *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -1944,9 +2054,10 @@ void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); +int f2fs_sync_inode_meta(struct f2fs_sb_info *); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void add_orphan_inode(struct inode *); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); @@ -1981,6 +2092,7 @@ struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -2012,7 +2124,7 @@ struct f2fs_stat_info { unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages; - unsigned int ndirty_dirs, ndirty_files; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; int bg_gc, wb_bios; @@ -2181,7 +2293,6 @@ int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct fscrypt_name *, struct page **); -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); @@ -2206,6 +2317,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +void f2fs_drop_extent_tree(struct inode *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2241,6 +2353,26 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } +static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); +} + +static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +{ + clear_opt(sbi, ADAPTIVE); + clear_opt(sbi, LFS); + + switch (mt) { + case F2FS_MOUNT_ADAPTIVE: + set_opt(sbi, ADAPTIVE); + break; + case F2FS_MOUNT_LFS: + set_opt(sbi, LFS); + break; + } +} + static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f4c0086655c4..0e493f63ea41 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,7 @@ #include <linux/mount.h> #include <linux/pagevec.h> #include <linux/uuid.h> +#include <linux/file.h> #include "f2fs.h" #include "node.h" @@ -81,7 +82,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); trace_f2fs_vm_page_mkwrite(page, DATA); mapped: @@ -171,22 +173,16 @@ static void try_to_fix_pino(struct inode *inode) fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { - fi->i_pino = pino; + f2fs_i_pino_write(inode, pino); file_got_pino(inode); - up_write(&fi->i_sem); - - mark_inode_dirty_sync(inode); - f2fs_write_inode(inode, NULL); - } else { - up_write(&fi->i_sem); } + up_write(&fi->i_sem); } static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; @@ -204,9 +200,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) - set_inode_flag(fi, FI_NEED_IPU); + set_inode_flag(inode, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - clear_inode_flag(fi, FI_NEED_IPU); + clear_inode_flag(inode, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -214,7 +210,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync) { + if (!datasync && !f2fs_skip_inode_update(inode)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -222,14 +218,14 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, /* * if there is no written data, don't waste time to write recovery info. */ - if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; - if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; @@ -239,9 +235,9 @@ go_write: * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - down_read(&fi->i_sem); + down_read(&F2FS_I(inode)->i_sem); need_cp = need_do_checkpoint(inode); - up_read(&fi->i_sem); + up_read(&F2FS_I(inode)->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ @@ -252,12 +248,12 @@ go_write: * will be used only for fsynced inodes after checkpoint. */ try_to_fix_pino(inode); - clear_inode_flag(fi, FI_APPEND_WRITE); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); goto out; } sync_nodes: - ret = fsync_node_pages(sbi, ino, &wbc, atomic); + ret = fsync_node_pages(sbi, inode, &wbc, atomic); if (ret) goto out; @@ -268,7 +264,7 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -279,10 +275,10 @@ sync_nodes: /* once recovery info is written, don't need to tack this */ remove_ino_entry(sbi, ino, APPEND_INO); - clear_inode_flag(fi, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: remove_ino_entry(sbi, ino, UPDATE_INO); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); f2fs_update_time(sbi, REQ_TIME); out: @@ -360,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { @@ -487,8 +483,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) set_data_blkaddr(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(F2FS_I(dn->inode), - FI_FIRST_BLOCK_WRITTEN); + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; } @@ -502,7 +497,6 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - sync_inode_page(dn); } dn->ofs_in_node = ofs; @@ -616,7 +610,7 @@ free_partial: return err; } -int f2fs_truncate(struct inode *inode, bool lock) +int f2fs_truncate(struct inode *inode) { int err; @@ -633,12 +627,12 @@ int f2fs_truncate(struct inode *inode, bool lock) return err; } - err = truncate_blocks(inode, i_size_read(inode), lock); + err = truncate_blocks(inode, i_size_read(inode), true); if (err) return err; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -654,7 +648,6 @@ int f2fs_getattr(struct vfsmount *mnt, #ifdef CONFIG_F2FS_FS_POSIX_ACL static void __setattr_copy(struct inode *inode, const struct iattr *attr) { - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) @@ -675,7 +668,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - set_acl_inode(fi, mode); + set_acl_inode(inode, mode); } } #else @@ -685,7 +678,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = inode_change_ok(inode, attr); @@ -699,7 +691,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (err) return err; f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -724,13 +716,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, get_inode_mode(inode)); - if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - clear_inode_flag(fi, FI_ACL_MODE); + if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); } } - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return err; } @@ -859,79 +851,199 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int __exchange_data_block(struct inode *inode, pgoff_t src, - pgoff_t dst, bool full) +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - block_t new_addr; - bool do_replace = false; - int ret; + int ret, done, i; +next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - if (!is_checkpointed_data(sbi, new_addr)) { + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (!is_checkpointed_data(sbi, *blkaddr)) { + + if (test_opt(sbi, LFS)) { + f2fs_put_dnode(&dn); + return -ENOTSUPP; + } + /* do not invalidate this block address */ f2fs_update_data_blkaddr(&dn, NULL_ADDR); - do_replace = true; + *do_replace = 1; } - f2fs_put_dnode(&dn); } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} - if (new_addr == NULL_ADDR) - return full ? truncate_hole(inode, dst, dst + 1) : 0; +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; - if (do_replace) { - struct page *ipage = get_node_page(sbi, inode->i_ino); - struct node_info ni; + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto err_out; + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); } + f2fs_put_dnode(&dn); + } + return 0; +} - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, dst); - if (ret) - goto err_out; +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; + + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; + } - truncate_data_blocks_range(&dn, 1); + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, - ni.version, true, false); - f2fs_put_dnode(&dn); - } else { - struct page *psrc, *pdst; + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; + + get_node_info(sbi, dn.nid, &ni); + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = datablock_addr(dn.node_page, + dn.ofs_in_node); + truncate_data_blocks_range(&dn, 1); + + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false); + f2fs_i_blocks_write(dst_inode, + 1, true); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); - psrc = get_lock_data_page(inode, src, true); - if (IS_ERR(psrc)) - return PTR_ERR(psrc); - pdst = get_new_data_page(inode, NULL, dst, true); - if (IS_ERR(pdst)) { + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(src_inode, src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - return PTR_ERR(pdst); - } - f2fs_copy_page(psrc, pdst); - set_page_dirty(pdst); - f2fs_put_page(pdst, 1); - f2fs_put_page(psrc, 1); - return truncate_hole(inode, src, src + 1); + ret = truncate_hole(src_inode, src + i, src + i + 1); + if (ret) + return ret; + i++; + } } return 0; +} -err_out: - if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { - f2fs_update_data_blkaddr(&dn, new_addr); - f2fs_put_dnode(&dn); +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); + + src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); } + return 0; + +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + kvfree(src_blkaddr); + kvfree(do_replace); return ret; } @@ -939,16 +1051,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; + int ret; - for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, end, start, true); - f2fs_unlock_op(sbi); - if (ret) - break; - } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); return ret; } @@ -992,7 +1103,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); return ret; } @@ -1128,11 +1239,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1140,7 +1248,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t pg_start, pg_end, delta, nrpages, idx; + pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1175,14 +1283,20 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; - nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; - for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, idx, idx + delta, false); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); f2fs_unlock_op(sbi); - if (ret) - break; } /* write out all moved pages, if possible */ @@ -1190,7 +1304,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); return ret; } @@ -1238,11 +1352,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1285,7 +1396,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1310,10 +1421,10 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_DROP_CACHE); } return 0; } @@ -1376,9 +1487,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) fi->i_flags = flags; inode_unlock(inode); - f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); return ret; @@ -1412,7 +1522,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + set_inode_flag(inode, FI_ATOMIC_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) @@ -1423,7 +1533,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_ATOMIC_FILE); out: inode_unlock(inode); mnt_drop_write_file(filp); @@ -1448,10 +1558,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); if (ret) { - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; } } @@ -1484,7 +1594,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: inode_unlock(inode); @@ -1538,7 +1648,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + clear_inode_flag(inode, FI_VOLATILE_FILE); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1871,7 +1981,7 @@ do_map: continue; } - set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + set_inode_flag(inode, FI_DO_DEFRAG); idx = map.m_lblk; while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { @@ -1896,14 +2006,14 @@ do_map: if (idx < pg_end && cnt < blk_per_seg) goto do_map; - clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + clear_inode_flag(inode, FI_DO_DEFRAG); err = filemap_fdatawrite(inode->i_mapping); if (err) goto out; } clear_out: - clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + clear_inode_flag(inode, FI_DO_DEFRAG); out: inode_unlock(inode); if (!err) @@ -1959,6 +2069,133 @@ out: return err; } +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode)) + return -EISDIR; + + if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + return -EOPNOTSUPP; + + inode_lock(src); + if (src != dst) + inode_lock(dst); + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in, + pos_out, len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); +out_unlock: + if (src != dst) + inode_unlock(dst); + inode_unlock(src); + return ret; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + + dst = fdget(range.dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto err_out; + + err = f2fs_move_file_range(filp, range.pos_in, dst.file, + range.pos_out, range.len); + + mnt_drop_write_file(filp); + + if (copy_to_user((struct f2fs_move_range __user *)arg, + &range, sizeof(range))) + err = -EFAULT; +err_out: + fdput(dst); + return err; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1994,6 +2231,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); default: return -ENOTTY; } @@ -2003,6 +2242,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct blk_plug plug; ssize_t ret; if (f2fs_encrypted_inode(inode) && @@ -2014,8 +2254,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = generic_write_checks(iocb, from); if (ret > 0) { ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) + if (!ret) { + blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); + } } inode_unlock(inode); @@ -2050,6 +2293,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: break; + case F2FS_IOC_MOVE_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f06ed73adf99..8f7fa326ce95 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -594,11 +594,11 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* write page */ lock_page(fio.encrypted_page); - if (unlikely(!PageUptodate(fio.encrypted_page))) { + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { err = -EIO; goto put_page_out; } - if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { + if (unlikely(!PageUptodate(fio.encrypted_page))) { err = -EIO; goto put_page_out; } @@ -619,9 +619,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_submit_page_mbio(&fio); f2fs_update_data_blkaddr(&dn, newaddr); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); recover_block: @@ -656,12 +656,23 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + bool is_dirty = PageDirty(page); + int err; + +retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); + set_cold_data(page); - do_write_data_page(&fio); + + err = do_write_data_page(&fio); + if (err == -ENOMEM && is_dirty) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + clear_cold_data(page); } out: @@ -748,12 +759,32 @@ next_step: /* phase 3 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + + if (S_ISREG(inode->i_mode)) { + if (!down_write_trylock(&fi->dio_rwsem[READ])) + continue; + if (!down_write_trylock( + &fi->dio_rwsem[WRITE])) { + up_write(&fi->dio_rwsem[READ]); + continue; + } + locked = true; + } + start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) move_encrypted_block(inode, start_bidx); else move_data_page(inode, start_bidx, gc_type); + + if (locked) { + up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->dio_rwsem[READ]); + } + stat_inc_data_blk_count(sbi, 1, gc_type); } } @@ -802,6 +833,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, blk_start_plug(&plug); for (segno = start_segno; segno < end_segno; segno++) { + + if (get_valid_blocks(sbi, segno, 1) == 0) + continue; + /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); @@ -877,10 +912,13 @@ gc_more: * enough free sections, we should flush dent/node blocks and do * garbage collections. */ - if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) + if (__get_victim(sbi, &segno, gc_type) || + prefree_segments(sbi)) { write_checkpoint(sbi, &cpc); - else if (has_not_enough_free_secs(sbi, 0)) + segno = NULL_SEGNO; + } else if (has_not_enough_free_secs(sbi, 0)) { write_checkpoint(sbi, &cpc); + } } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index c15e53c1d794..ccea8735de59 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -59,7 +59,8 @@ void read_inline_data(struct page *page, struct page *ipage) memcpy(dst_addr, src_addr, MAX_INLINE_DATA); flush_dcache_page(page); kunmap_atomic(dst_addr); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } bool truncate_inline_inode(struct page *ipage, u64 from) @@ -73,7 +74,7 @@ bool truncate_inline_inode(struct page *ipage, u64 from) f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); - + set_page_dirty(ipage); return true; } @@ -97,7 +98,8 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) else read_inline_data(page, ipage); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); f2fs_put_page(ipage, 1); unlock_page(page); return 0; @@ -139,7 +141,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) inode_dec_dirty_pages(dn->inode); /* this converted inline_data should be recovered. */ - set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); + set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ truncate_inline_inode(dn->inode_page, 0); @@ -147,7 +149,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); - sync_inode_page(dn); f2fs_put_dnode(dn); return 0; } @@ -213,11 +214,11 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) dst_addr = inline_data_addr(dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); kunmap_atomic(src_addr); + set_page_dirty(dn.inode_page); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); - sync_inode_page(&dn); clear_inline_node(dn.inode_page); f2fs_put_dnode(&dn); return 0; @@ -253,10 +254,10 @@ process_inline: dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); - update_inode(inode, ipage); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return true; } @@ -267,7 +268,6 @@ process_inline: if (!truncate_inline_inode(ipage, 0)) return false; f2fs_clear_inline_inode(inode); - update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -289,8 +289,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } namehash = f2fs_dentry_hash(&name); @@ -307,25 +309,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return de; } -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, - struct page **p) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; - struct f2fs_dir_entry *de; - struct f2fs_inline_dentry *dentry_blk; - - ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) - return NULL; - - dentry_blk = inline_data_addr(ipage); - de = &dentry_blk->dentry[1]; - *p = ipage; - unlock_page(ipage); - return de; -} - int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { @@ -340,10 +323,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) { - i_size_write(inode, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + if (i_size_read(inode) < MAX_INLINE_DATA) + f2fs_i_size_write(inode, MAX_INLINE_DATA); return 0; } @@ -392,22 +373,19 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, NR_INLINE_DENTRY * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); set_page_dirty(page); /* clear inline dir and flag after data writeback */ truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); - clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + clear_inode_flag(dir, FI_INLINE_DENTRY); - F2FS_I(dir)->i_current_depth = 1; - if (i_size_read(dir) < PAGE_SIZE) { - i_size_write(dir, PAGE_SIZE); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - - sync_inode_page(&dn); + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); out: f2fs_put_page(page, 1); return err; @@ -465,7 +443,6 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct f2fs_inline_dentry *backup_dentry; - struct f2fs_inode_info *fi = F2FS_I(dir); int err; backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry), @@ -487,16 +464,15 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, lock_page(ipage); stat_dec_inline_dir(dir); - clear_inode_flag(fi, FI_INLINE_DENTRY); - update_inode(dir, ipage); + clear_inode_flag(dir, FI_INLINE_DENTRY); kfree(backup_dentry); return 0; recover: lock_page(ipage); memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); - fi->i_current_depth = 0; - i_size_write(dir, MAX_INLINE_DATA); - update_inode(dir, ipage); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); kfree(backup_dentry); @@ -560,8 +536,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, /* we don't need to mark_inode_dirty now */ if (inode) { - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -569,11 +544,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode(dir, ipage); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } out: f2fs_put_page(ipage, 1); return err; @@ -597,13 +567,13 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, &inline_dentry->dentry_bitmap); set_page_dirty(page); + f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, page); - - f2fs_put_page(page, 1); + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_inline_dir(struct inode *dir) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2e68adab0d64..9ac5efc15347 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -18,6 +18,13 @@ #include <trace/events/f2fs.h> +void f2fs_mark_inode_dirty_sync(struct inode *inode) +{ + if (f2fs_inode_dirtied(inode)) + return; + mark_inode_dirty_sync(inode); +} + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -35,6 +42,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + f2fs_mark_inode_dirty_sync(inode); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -85,8 +93,8 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) if (*start++) { f2fs_wait_on_page_writeback(ipage, NODE, true); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); set_page_dirty(ipage); return; } @@ -141,7 +149,7 @@ static int do_read_inode(struct inode *inode) if (f2fs_init_extent_tree(inode, &ri->i_ext)) set_page_dirty(node_page); - get_inline_info(fi, ri); + get_inline_info(inode, ri); /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) @@ -151,7 +159,10 @@ static int do_read_inode(struct inode *inode) __get_inode_rdev(inode, ri); if (__written_first_block(ri)) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + if (!need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; f2fs_put_page(node_page, 1); @@ -227,6 +238,8 @@ int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + f2fs_inode_synced(inode); + f2fs_wait_on_page_writeback(node_page, NODE, true); ri = F2FS_INODE(node_page); @@ -244,7 +257,7 @@ int update_inode(struct inode *inode, struct page *node_page) &ri->i_ext); else memset(&ri->i_ext, 0, sizeof(ri->i_ext)); - set_raw_inline(F2FS_I(inode), ri); + set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -261,7 +274,6 @@ int update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); /* deleted inode */ if (inode->i_nlink == 0) @@ -285,6 +297,7 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } + f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -300,7 +313,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; /* @@ -318,8 +331,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - nid_t xnid = fi->i_xattr_nid; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; /* some remained atomic pages should discarded */ @@ -341,12 +353,17 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); - set_inode_flag(fi, FI_NO_ALLOC); + set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); retry: if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (!err) { f2fs_lock_op(sbi); @@ -360,6 +377,8 @@ retry: goto retry; } + if (err) + update_inode_page(inode); sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); @@ -369,13 +388,13 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(fi, FI_APPEND_WRITE)) + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) add_ino_entry(sbi, inode->i_ino, UPDATE_INO); - if (is_inode_flag_set(fi, FI_FREE_NID)) { + if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); - clear_inode_flag(fi, FI_FREE_NID); + clear_inode_flag(inode, FI_FREE_NID); } f2fs_bug_on(sbi, err && !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); @@ -407,11 +426,11 @@ void handle_failed_inode(struct inode *inode) f2fs_msg(sbi->sb, KERN_WARNING, "Too many orphan inodes, run fsck to fix."); } else { - add_orphan_inode(sbi, inode->i_ino); + add_orphan_inode(inode); } alloc_nid_done(sbi, inode->i_ino); } else { - set_inode_flag(F2FS_I(inode), FI_FREE_NID); + set_inode_flag(inode, FI_FREE_NID); } f2fs_unlock_op(sbi); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 324ed3812f30..73fa356f8fbb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); + set_inode_flag(inode, FI_NEW_INODE); + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + set_inode_flag(inode, FI_INLINE_DENTRY); f2fs_init_extent_tree(inode, NULL); @@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_dir(inode); trace_f2fs_new_inode(inode, 0); - mark_inode_dirty(inode); return inode; fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); if (nid_free) - set_inode_flag(F2FS_I(inode), FI_FREE_NID); + set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); } @@ -177,7 +180,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = CURRENT_TIME; ihold(inode); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -190,7 +193,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); return 0; out: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); iput(inode); f2fs_unlock_op(sbi); return err; @@ -199,9 +202,13 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); - if (!ino) + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); return ERR_PTR(-ENOENT); + } return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } @@ -229,6 +236,9 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) @@ -239,14 +249,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); } out: - if (!err) { - clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); - mark_inode_dirty(dir); - } + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); f2fs_unlock_op(sbi); return err; @@ -281,8 +291,11 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + return (struct dentry *)page; return d_splice_alias(inode, dentry); + } ino = le32_to_cpu(de->ino); f2fs_dentry_kunmap(dir, page); @@ -329,8 +342,11 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); goto fail; + } f2fs_balance_fs(sbi, true); @@ -345,9 +361,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ - mark_inode_dirty(inode); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -492,7 +505,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) f2fs_balance_fs(sbi, true); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -509,7 +522,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; out_fail: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); handle_failed_inode(inode); return err; } @@ -592,17 +605,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - + add_orphan_inode(inode); alloc_nid_done(sbi, inode->i_ino); if (whiteout) { - inode_dec_link_count(inode); + f2fs_i_links_write(inode, false); *whiteout = inode; } else { d_tmpfile(dentry, inode); } + /* link_count was changed by d_tmpfile as well. */ + f2fs_unlock_op(sbi); unlock_new_inode(inode); return 0; @@ -652,14 +665,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_old; + } } if (flags & RENAME_WHITEOUT) { @@ -677,8 +695,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_whiteout; + } f2fs_balance_fs(sbi, true); @@ -700,19 +721,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) - drop_nlink(new_inode); - drop_nlink(new_inode); + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); up_write(&F2FS_I(new_inode)->i_sem); - mark_inode_dirty(new_inode); - if (!new_inode->i_nlink) - add_orphan_inode(sbi, new_inode->i_ino); + add_orphan_inode(new_inode); else release_orphan_inode(sbi); - - update_inode_page(old_inode); - update_inode_page(new_inode); } else { f2fs_balance_fs(sbi, true); @@ -724,10 +740,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_whiteout; } - if (old_dir_entry) { - inc_nlink(new_dir); - update_inode_page(new_dir); - } + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); /* * old entry and new entry can locate in the same inline @@ -743,7 +757,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { - err = -EIO; + err = -ENOENT; + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); f2fs_unlock_op(sbi); goto out_whiteout; } @@ -757,13 +773,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(old_inode); + f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); if (whiteout) { whiteout->i_state |= I_LINKABLE; - set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; @@ -775,14 +791,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - update_inode_page(old_inode); } else { f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } - drop_nlink(old_dir); - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_i_links_write(old_dir, false); } f2fs_unlock_op(sbi); @@ -832,29 +845,39 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, return -EPERM; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_old; + } /* prepare for updating ".." directory entry info later */ if (old_dir != new_dir) { if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_new; + } } if (S_ISDIR(new_inode->i_mode)) { - err = -EIO; new_dir_entry = f2fs_parent_dir(new_inode, &new_dir_page); - if (!new_dir_entry) + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); goto out_old_dir; + } } } @@ -904,19 +927,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - update_inode_page(old_inode); - old_dir->i_ctime = CURRENT_TIME; if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); - if (old_nlink < 0) - drop_nlink(old_dir); - else - inc_nlink(old_dir); + f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_mark_inode_dirty_sync(old_dir); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -925,19 +942,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - update_inode_page(new_inode); - new_dir->i_ctime = CURRENT_TIME; if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); - if (new_nlink < 0) - drop_nlink(new_dir); - else - inc_nlink(new_dir); + f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - mark_inode_dirty(new_dir); - update_inode_page(new_dir); + f2fs_mark_inode_dirty_sync(new_dir); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d1867698e601..b2fa4b615925 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -52,6 +52,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; + if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD) + res = false; } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -202,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); return need; } @@ -219,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -233,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); return need_update; } @@ -280,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); + percpu_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = grab_nat_entry(nm_i, ni->nid); @@ -330,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); } int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -338,8 +342,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - if (!down_write_trylock(&nm_i->nat_tree_lock)) - return 0; + percpu_down_write(&nm_i->nat_tree_lock); while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; @@ -348,7 +351,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) __del_from_nat_cache(nm_i, ne); nr_shrink--; } - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -370,13 +373,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->nid = nid; /* Check nat cache */ - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); return; } @@ -400,11 +403,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - down_write(&nm_i->nat_tree_lock); + percpu_down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); } /* @@ -646,6 +649,7 @@ release_out: if (err == -ENOENT) { dn->cur_level = i; dn->max_level = level; + dn->ofs_in_node = offset[level]; } return err; } @@ -670,8 +674,7 @@ static void truncate_node(struct dnode_of_data *dn) if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); - } else { - sync_inode_page(dn); + f2fs_inode_synced(dn->inode); } invalidate: clear_node_page_dirty(dn->node_page); @@ -953,7 +956,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) if (IS_ERR(npage)) return PTR_ERR(npage); - F2FS_I(inode)->i_xattr_nid = 0; + f2fs_i_xnid_write(inode, 0); /* need to do checkpoint during fsync */ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -1019,7 +1022,7 @@ struct page *new_node_page(struct dnode_of_data *dn, struct page *page; int err; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); @@ -1042,21 +1045,16 @@ struct page *new_node_page(struct dnode_of_data *dn, f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (set_page_dirty(page)) dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) - F2FS_I(dn->inode)->i_xattr_nid = dn->nid; + f2fs_i_xnid_write(dn->inode, dn->nid); - dn->node_page = page; - if (ipage) - update_inode(dn->inode, ipage); - else - sync_inode_page(dn); if (ofs == 0) inc_valid_inode_count(sbi); - return page; fail: @@ -1083,6 +1081,9 @@ static int read_node_page(struct page *page, int op_flags) .encrypted_page = NULL, }; + if (PageUptodate(page)) + return LOCKED_PAGE; + get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1090,9 +1091,6 @@ static int read_node_page(struct page *page, int op_flags) return -ENOENT; } - if (PageUptodate(page)) - return LOCKED_PAGE; - fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; return f2fs_submit_page_bio(&fio); } @@ -1150,16 +1148,21 @@ repeat: lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } + + if (unlikely(!PageUptodate(page))) + goto out_err; page_hit: - f2fs_bug_on(sbi, nid != nid_of_node(page)); + if(unlikely(nid != nid_of_node(page))) { + f2fs_bug_on(sbi, 1); + ClearPageUptodate(page); +out_err: + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -1176,24 +1179,6 @@ struct page *get_node_page_ra(struct page *parent, int start) return __get_node_page(sbi, nid, parent, start); } -void sync_inode_page(struct dnode_of_data *dn) -{ - int ret = 0; - - if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - ret = update_inode(dn->inode, dn->node_page); - } else if (dn->inode_page) { - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - ret = update_inode(dn->inode, dn->inode_page); - if (!dn->inode_page_locked) - unlock_page(dn->inode_page); - } else { - ret = update_inode_page(dn->inode); - } - dn->node_changed = ret ? true: false; -} - static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; @@ -1319,7 +1304,7 @@ continue_unlock: return last_page; } -int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; @@ -1327,6 +1312,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, int ret = 0; struct page *last_page = NULL; bool marked = false; + nid_t ino = inode->i_ino; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1380,9 +1366,13 @@ continue_unlock: if (!atomic || page == last_page) { set_fsync_mark(page, 1); - if (IS_INODE(page)) + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + update_inode(inode, page); set_dentry_mark(page, need_dentry_mark(sbi, ino)); + } /* may be written by other thread */ if (!PageDirty(page)) set_page_dirty(page); @@ -1630,6 +1620,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff; /* balancing f2fs's metadata in background */ @@ -1643,7 +1634,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; + blk_start_plug(&plug); sync_node_pages(sbi, wbc); + blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1657,9 +1650,10 @@ static int f2fs_set_node_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, NODE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); @@ -1778,7 +1772,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1787,14 +1781,14 @@ static void build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1826,7 +1820,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } up_read(&curseg->journal_rwsem); - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1925,12 +1919,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; + if (nm_i->fcnt <= MAX_FREE_NIDS) + return 0; + if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) break; if (i->state == NID_ALLOC) continue; @@ -1957,7 +1954,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); if (!(ri->i_inline & F2FS_INLINE_XATTR)) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } @@ -1999,13 +1996,11 @@ recover_xnid: get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - F2FS_I(inode)->i_xattr_nid = new_xnid; + f2fs_i_xnid_write(inode, new_xnid); /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); set_node_addr(sbi, &ni, blkaddr, false); - - update_inode_page(inode); } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2027,7 +2022,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - SetPageUptodate(ipage); + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); src = F2FS_INODE(page); @@ -2213,7 +2209,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; - down_write(&nm_i->nat_tree_lock); + percpu_down_write(&nm_i->nat_tree_lock); /* * if there are no enough space in journal to store dirty nat @@ -2236,7 +2232,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2272,7 +2268,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); - init_rwsem(&nm_i->nat_tree_lock); + if (percpu_init_rwsem(&nm_i->nat_tree_lock)) + return -ENOMEM; nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); @@ -2329,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ - down_write(&nm_i->nat_tree_lock); + percpu_down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; @@ -2354,8 +2351,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); + percpu_free_rwsem(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); sbi->nm_info = NULL; kfree(nm_i); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 1f4f9d4569d9..fc7684554b1a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -15,18 +15,21 @@ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ -#define FREE_NID_PAGES 4 +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) -#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ -#define DEF_RAM_THRESHOLD 10 +#define DEF_RAM_THRESHOLD 1 /* control dirty nats ratio threshold (default: 10% over max nid count) */ #define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -126,6 +129,11 @@ static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) NM_I(sbi)->dirty_nats_ratio / 100; } +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3d7216d7a288..9e652d5a659b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -153,9 +153,12 @@ retry: f2fs_delete_entry(de, page, dir, einode); iput(einode); goto retry; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_add_link(dir, &name, inode, + inode->i_ino, inode->i_mode); } - err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - goto out; out_unmap_put: @@ -175,7 +178,7 @@ static void recover_inode(struct inode *inode, struct page *page) char *name; inode->i_mode = le16_to_cpu(raw->i_mode); - i_size_write(inode, le64_to_cpu(raw->i_size)); + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); @@ -455,6 +458,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } + if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + /* * dest is reserved block, invalidate src block * and then reserve one new block in dnode page. @@ -476,6 +482,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, #endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); + if (err) + goto err; } /* Check the previous node page having this index */ @@ -490,9 +498,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } } - if (IS_INODE(dn.node_page)) - sync_inode_page(&dn); - copy_node_footer(dn.node_page, page); fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); @@ -624,8 +629,12 @@ out: if (err) { bool invalidate = false; - if (discard_next_dnode(sbi, blkaddr)) + if (test_opt(sbi, LFS)) { + update_meta_page(sbi, NULL, blkaddr); + invalidate = true; + } else if (discard_next_dnode(sbi, blkaddr)) { invalidate = true; + } /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4c2d1fa1e0e2..a46296f57b02 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -241,7 +241,7 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_ATOMIC_FILE); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); @@ -346,6 +346,11 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (!need) return; + + /* balance_fs_bg is able to be pending */ + if (excess_cached_nats(sbi)) + f2fs_balance_fs_bg(sbi); + /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. @@ -367,7 +372,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + try_to_free_nids(sbi, MAX_FREE_NIDS); + else + build_free_nids(sbi); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || @@ -435,25 +442,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) { + if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { struct bio *bio = f2fs_bio_alloc(0); int ret; + atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); ret = submit_bio_wait(bio); + atomic_dec(&fcc->submit_flush); bio_put(bio); return ret; } init_completion(&cmd.wait); + atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); return cmd.ret; } @@ -467,6 +478,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; + atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; @@ -668,6 +680,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -705,6 +721,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason == CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -721,17 +739,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (!test_opt(sbi, DISCARD)) + if (force || !test_opt(sbi, DISCARD)) continue; - f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SECNO(sbi, start); + start_segno = secno * sbi->segs_per_sec; + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + if (force && entry->len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); cpc->trimmed += entry->len; @@ -1219,6 +1251,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + if (test_opt(sbi, LFS)) + return; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segments(sbi, i); } @@ -1392,11 +1427,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); + if (fio->type == NODE || fio->type == DATA) + mutex_lock(&fio->sbi->wio_mutex[fio->type]); + allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ f2fs_submit_page_mbio(fio); + + if (fio->type == NODE || fio->type == DATA) + mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2377,7 +2418,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!test_opt(sbi, LFS)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7a756ff5a36d..b33f73ec60a4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,6 +16,7 @@ #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -470,6 +471,10 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + + if (test_opt(sbi, LFS)) + return false; + return free_sections(sbi) <= (node_secs + 2 * dent_secs + reserved_sections(sbi) + 1); } @@ -479,6 +484,8 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; @@ -531,6 +538,9 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; + if (test_opt(sbi, LFS)) + return false; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -544,7 +554,7 @@ static inline bool need_inplace_update(struct inode *inode) /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + is_inode_flag_set(inode, FI_NEED_IPU)) return true; return false; @@ -706,9 +716,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) - return 3 * sbi->blocks_per_seg; + return 8 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(sbi); + return 8 * MAX_BIO_BLOCKS(sbi); else return 0; } @@ -726,10 +736,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - desired = 4096; - else if (type == NODE) - desired = 3 * max_hw_blocks(sbi); + if (type == NODE) + desired = 2 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 93606f281bf9..46c915425923 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -13,6 +13,7 @@ #include <linux/f2fs_fs.h> #include "f2fs.h" +#include "node.h" static LIST_HEAD(f2fs_list); static DEFINE_SPINLOCK(f2fs_list_lock); @@ -25,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) - return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) + return NM_I(sbi)->fcnt - MAX_FREE_NIDS; return 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 74cc8520b8b1..b97c065cbe74 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", }; static void f2fs_build_fault_attr(unsigned int rate) @@ -75,6 +76,7 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, + Opt_nodiscard, Opt_noheap, Opt_user_xattr, Opt_nouser_xattr, @@ -86,13 +88,17 @@ enum { Opt_inline_data, Opt_inline_dentry, Opt_flush_merge, + Opt_noflush_merge, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, Opt_data_flush, + Opt_mode, Opt_fault_injection, + Opt_lazytime, + Opt_nolazytime, Opt_err, }; @@ -101,6 +107,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, @@ -112,13 +119,17 @@ static match_table_t f2fs_tokens = { {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, + {Opt_mode, "mode=%s"}, {Opt_fault_injection, "fault_injection=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_err, NULL}, }; @@ -417,6 +428,8 @@ static int parse_options(struct super_block *sb, char *options) "the device does not support discard"); } break; + case Opt_nodiscard: + clear_opt(sbi, DISCARD); case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -478,6 +491,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; @@ -496,6 +512,23 @@ static int parse_options(struct super_block *sb, char *options) case Opt_data_flush: set_opt(sbi, DATA_FLUSH); break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 8 && + !strncmp(name, "adaptive", 8)) { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } else if (strlen(name) == 3 && + !strncmp(name, "lfs", 3)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; @@ -506,6 +539,12 @@ static int parse_options(struct super_block *sb, char *options) "FAULT_INJECTION was not selected"); #endif break; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -537,13 +576,11 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - - set_inode_flag(fi, FI_NEW_INODE); - - if (test_opt(F2FS_SB(sb), INLINE_XATTR)) - set_inode_flag(fi, FI_INLINE_XATTR); + init_rwsem(&fi->dio_rwsem[READ]); + init_rwsem(&fi->dio_rwsem[WRITE]); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; @@ -559,7 +596,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ atomic_inc(&inode->i_count); @@ -573,10 +610,10 @@ static int f2fs_drop_inode(struct inode *inode) f2fs_destroy_extent_node(inode); sb_start_intwrite(inode->i_sb); - i_size_write(inode, 0); + f2fs_i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, true); + f2fs_truncate(inode); sb_end_intwrite(inode->i_sb); @@ -586,9 +623,47 @@ static int f2fs_drop_inode(struct inode *inode) } return 0; } + return generic_drop_inode(inode); } +int f2fs_inode_dirtied(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 1; + } + + set_inode_flag(inode, FI_DIRTY_INODE); + list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + stat_inc_dirty_inode(sbi, DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + return 0; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + list_del_init(&F2FS_I(inode)->gdirty_list); + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + /* * f2fs_dirty_inode() is called from __mark_inode_dirty() * @@ -596,7 +671,19 @@ static int f2fs_drop_inode(struct inode *inode) */ static void f2fs_dirty_inode(struct inode *inode, int flags) { - set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (flags == I_DIRTY_TIME) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode); } static void f2fs_i_callback(struct rcu_head *head) @@ -619,6 +706,8 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); + + percpu_free_rwsem(&sbi->cp_rwsem); } static void f2fs_put_super(struct super_block *sb) @@ -738,7 +827,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -803,6 +892,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noextent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (test_opt(sbi, ADAPTIVE)) + seq_puts(seq, "adaptive"); + else if (test_opt(sbi, LFS)) + seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -884,6 +979,14 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); set_opt(sbi, EXTENT_CACHE); + sbi->sb->s_flags |= MS_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + set_opt(sbi, DISCARD); + } else { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -1367,6 +1470,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + mutex_init(&sbi->wio_mutex[NODE]); + mutex_init(&sbi->wio_mutex[DATA]); #ifdef CONFIG_F2FS_FS_ENCRYPTION memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, @@ -1379,6 +1484,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) { int i, err; + if (percpu_init_rwsem(&sbi->cp_rwsem)) + return -ENOMEM; + for (i = 0; i < NR_COUNT_TYPE; i++) { err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); if (err) @@ -1530,6 +1638,8 @@ try_onemore: goto free_sbi; sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1559,10 +1669,8 @@ try_onemore: memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->raw_super = raw_super; sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); @@ -1579,7 +1687,6 @@ try_onemore: sbi->write_io[i].bio = NULL; } - init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); @@ -1762,6 +1869,7 @@ try_onemore: return 0; free_kobj: + f2fs_sync_inode_meta(sbi); kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index e3decae3acfb..c8898b5148eb 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -299,6 +299,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); + set_page_dirty(ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -441,13 +442,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; size_t len; __u32 new_hsize; - int error = -ENOMEM; + int error = 0; if (name == NULL) return -EINVAL; @@ -465,7 +465,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, base_addr = read_all_xattrs(inode, ipage); if (!base_addr) - goto exit; + return -ENOMEM; /* find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -539,19 +539,15 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; inode->i_ctime = CURRENT_TIME; - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - - if (ipage) - update_inode(inode, ipage); - else - update_inode_page(inode); + f2fs_mark_inode_dirty_sync(inode); exit: kzfree(base_addr); return error; diff --git a/fs/internal.h b/fs/internal.h index f57ced528cde..cef0913e5d41 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct file_system_type; +struct iomap; struct linux_binprm; struct path; struct mount; @@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) * buffer.c */ extern void guard_bio_eod(int rw, struct bio *bio); +extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, struct iomap *iomap); /* * char_dev.c diff --git a/fs/iomap.c b/fs/iomap.c new file mode 100644 index 000000000000..48141b8eff5f --- /dev/null +++ b/fs/iomap.c @@ -0,0 +1,497 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/uaccess.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/backing-dev.h> +#include <linux/buffer_head.h> +#include <linux/dax.h> +#include "internal.h" + +typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, + void *data, struct iomap *iomap); + +/* + * Execute a iomap write on a segment of the mapping that spans a + * contiguous range of pages that have identical block mapping state. + * + * This avoids the need to map pages individually, do individual allocations + * for each page and most importantly avoid the need for filesystem specific + * locking per page. Instead, all the operations are amortised over the entire + * range of pages. It is assumed that the filesystems will lock whatever + * resources they require in the iomap_begin call, and release them in the + * iomap_end call. + */ +static loff_t +iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, + struct iomap_ops *ops, void *data, iomap_actor_t actor) +{ + struct iomap iomap = { 0 }; + loff_t written = 0, ret; + + /* + * Need to map a range from start position for length bytes. This can + * span multiple pages - it is only guaranteed to return a range of a + * single type of pages (e.g. all into a hole, all mapped or all + * unwritten). Failure at this point has nothing to undo. + * + * If allocation is required for this range, reserve the space now so + * that the allocation is guaranteed to succeed later on. Once we copy + * the data into the page cache pages, then we cannot fail otherwise we + * expose transient stale data. If the reserve fails, we can safely + * back out at this point as there is nothing to undo. + */ + ret = ops->iomap_begin(inode, pos, length, flags, &iomap); + if (ret) + return ret; + if (WARN_ON(iomap.offset > pos)) + return -EIO; + + /* + * Cut down the length to the one actually provided by the filesystem, + * as it might not be able to give us the whole size that we requested. + */ + if (iomap.offset + iomap.length < pos + length) + length = iomap.offset + iomap.length - pos; + + /* + * Now that we have guaranteed that the space allocation will succeed. + * we can do the copy-in page by page without having to worry about + * failures exposing transient data. + */ + written = actor(inode, pos, length, data, &iomap); + + /* + * Now the data has been copied, commit the range we've copied. This + * should not fail unless the filesystem has had a fatal error. + */ + ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0, + flags, &iomap); + + return written ? written : ret; +} + +static void +iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) +{ + loff_t i_size = i_size_read(inode); + + /* + * Only truncate newly allocated pages beyoned EOF, even if the + * write started inside the existing inode size. + */ + if (pos + len > i_size) + truncate_pagecache_range(inode, max(pos, i_size), pos + len); +} + +static int +iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, + struct page **pagep, struct iomap *iomap) +{ + pgoff_t index = pos >> PAGE_SHIFT; + struct page *page; + int status = 0; + + BUG_ON(pos + len > iomap->offset + iomap->length); + + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin_int(page, pos, len, NULL, iomap); + if (unlikely(status)) { + unlock_page(page); + put_page(page); + page = NULL; + + iomap_write_failed(inode, pos, len); + } + + *pagep = page; + return status; +} + +static int +iomap_write_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + + ret = generic_write_end(NULL, inode->i_mapping, pos, len, + copied, page, NULL); + if (ret < len) + iomap_write_failed(inode, pos, len); + return ret; +} + +static loff_t +iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *i = data; + long status = 0; + ssize_t written = 0; + unsigned int flags = AOP_FLAG_NOFS; + + /* + * Copies from kernel address space cannot fail (NFSD is a big user). + */ + if (!iter_is_iovec(i)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_count(i)); +again: + if (bytes > length) + bytes = length; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = iomap_write_begin(inode, pos, bytes, flags, &page, + iomap); + if (unlikely(status)) + break; + + if (mapping_writably_mapped(inode->i_mapping)) + flush_dcache_page(page); + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + + flush_dcache_page(page); + mark_page_accessed(page); + + status = iomap_write_end(inode, pos, bytes, copied, page); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + length -= copied; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (iov_iter_count(i) && length); + + return written ? written : status; +} + +ssize_t +iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, written = 0; + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), + IOMAP_WRITE, ops, iter, iomap_write_actor); + if (ret <= 0) + break; + pos += ret; + written += ret; + } + + return written ? written : ret; +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write); + +static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, + unsigned bytes, struct iomap *iomap) +{ + struct page *page; + int status; + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); + if (status) + return status; + + zero_user(page, offset, bytes); + mark_page_accessed(page); + + return iomap_write_end(inode, pos, bytes, bytes, page); +} + +static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, + struct iomap *iomap) +{ + sector_t sector = iomap->blkno + + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); + + return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); +} + +static loff_t +iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, + void *data, struct iomap *iomap) +{ + bool *did_zero = data; + loff_t written = 0; + int status; + + /* already zeroed? we're done. */ + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return count; + + do { + unsigned offset, bytes; + + offset = pos & (PAGE_SIZE - 1); /* Within page */ + bytes = min_t(unsigned, PAGE_SIZE - offset, count); + + if (IS_DAX(inode)) + status = iomap_dax_zero(pos, offset, bytes, iomap); + else + status = iomap_zero(inode, pos, offset, bytes, iomap); + if (status < 0) + return status; + + pos += bytes; + count -= bytes; + written += bytes; + if (did_zero) + *did_zero = true; + } while (count > 0); + + return written; +} + +int +iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_ZERO, + ops, did_zero, iomap_zero_range_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_zero_range); + +int +iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + struct iomap_ops *ops) +{ + unsigned blocksize = (1 << inode->i_blkbits); + unsigned off = pos & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!off) + return 0; + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); +} +EXPORT_SYMBOL_GPL(iomap_truncate_page); + +static loff_t +iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct page *page = data; + int ret; + + ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length, + NULL, iomap); + if (ret) + return ret; + + block_commit_write(page, 0, length); + return length; +} + +int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + unsigned long length; + loff_t offset, size; + ssize_t ret; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_SHIFT) > size) + length = size & ~PAGE_MASK; + else + length = PAGE_SIZE; + + offset = page_offset(page); + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_WRITE, + ops, page, iomap_page_mkwrite_actor); + if (unlikely(ret <= 0)) + goto out_unlock; + offset += ret; + length -= ret; + } + + set_page_dirty(page); + wait_for_stable_page(page); + return 0; +out_unlock: + unlock_page(page); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_page_mkwrite); + +struct fiemap_ctx { + struct fiemap_extent_info *fi; + struct iomap prev; +}; + +static int iomap_to_fiemap(struct fiemap_extent_info *fi, + struct iomap *iomap, u32 flags) +{ + switch (iomap->type) { + case IOMAP_HOLE: + /* skip holes */ + return 0; + case IOMAP_DELALLOC: + flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; + break; + case IOMAP_UNWRITTEN: + flags |= FIEMAP_EXTENT_UNWRITTEN; + break; + case IOMAP_MAPPED: + break; + } + + return fiemap_fill_next_extent(fi, iomap->offset, + iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, + iomap->length, flags | FIEMAP_EXTENT_MERGED); + +} + +static loff_t +iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct fiemap_ctx *ctx = data; + loff_t ret = length; + + if (iomap->type == IOMAP_HOLE) + return length; + + ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); + ctx->prev = *iomap; + switch (ret) { + case 0: /* success */ + return length; + case 1: /* extent array full */ + return 0; + default: + return ret; + } +} + +int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, + loff_t start, loff_t len, struct iomap_ops *ops) +{ + struct fiemap_ctx ctx; + loff_t ret; + + memset(&ctx, 0, sizeof(ctx)); + ctx.fi = fi; + ctx.prev.type = IOMAP_HOLE; + + ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + return ret; + + while (len > 0) { + ret = iomap_apply(inode, start, len, 0, ops, &ctx, + iomap_fiemap_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + start += ret; + len -= ret; + } + + if (ctx.prev.type != IOMAP_HOLE) { + ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); + if (ret < 0) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_fiemap); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 31f3df193bdb..ad2c05e80a83 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -2,6 +2,7 @@ * Copyright (c) 2014-2016 Christoph Hellwig. */ #include <linux/exportfs.h> +#include <linux/iomap.h> #include <linux/genhd.h> #include <linux/slab.h> #include <linux/pr.h> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 6c3b316f932e..4ebaaf4b8d8a 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -3,6 +3,7 @@ */ #include <linux/sunrpc/svc.h> #include <linux/exportfs.h> +#include <linux/iomap.h> #include <linux/nfs4.h> #include "nfsd.h" diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5d47b4df61ea..35faf128f36d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -4,6 +4,7 @@ config XFS_FS depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C + select FS_IOMAP help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a708e38b494c..88c26b827a2d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -84,7 +84,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -int /* error */ +static int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -1839,19 +1839,8 @@ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; - minleafrecs = mp->m_alloc_mnr[0]; - minnoderecs = mp->m_alloc_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_ag_maxlevels = level; + mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr, + (mp->m_sb.sb_agblocks + 1) / 2); } /* @@ -2658,55 +2647,79 @@ error0: return error; } -/* - * Free an extent. - * Just break up the extent address and hand off to xfs_free_ag_extent - * after fixing up the freelist. - */ -int /* error */ -xfs_free_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ +/* Ensure that the freelist is at full capacity. */ +int +xfs_free_extent_fix_freelist( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **agbp) { - xfs_alloc_arg_t args; - int error; + struct xfs_alloc_arg args; + int error; - ASSERT(len != 0); - memset(&args, 0, sizeof(xfs_alloc_arg_t)); + memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; + args.agno = agno; /* * validate that the block number is legal - the enables us to detect * and handle a silent filesystem corruption rather than crashing. */ - args.agno = XFS_FSB_TO_AGNO(args.mp, bno); if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - if (args.agbno >= args.mp->m_sb.sb_agblocks) - return -EFSCORRUPTED; - args.pag = xfs_perag_get(args.mp, args.agno); ASSERT(args.pag); error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); if (error) - goto error0; + goto out; + + *agbp = args.agbp; +out: + xfs_perag_put(args.pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + int error; + + ASSERT(len != 0); + + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + if (error) + return error; + + XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err); /* validate the extent size is legal now we have the agf locked */ - if (args.agbno + len > - be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { - error = -EFSCORRUPTED; - goto error0; - } + XFS_WANT_CORRUPTED_GOTO(mp, + agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), + err); - error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); - if (!error) - xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); -error0: - xfs_perag_put(args.pag); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0); + if (error) + goto err; + + xfs_extent_busy_insert(tp, agno, agbno, len, 0); + return 0; + +err: + xfs_trans_brelse(tp, agbp); return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 135eb3d24db7..cf268b2d0b6c 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -212,13 +212,6 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ -int /* error */ -xfs_alloc_lookup_le( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat); /* success/failure */ - int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ @@ -236,5 +229,7 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); +int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **agbp); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 882c8d338891..4f2aed04f827 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); -int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); @@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); - /* * Utility routines. */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 932381caef1b..2f2c85cc8117 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -570,14 +570,12 @@ xfs_bmap_validate_ret( */ void xfs_bmap_add_free( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_bmap_free *flist, /* list of extents */ xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ + xfs_filblks_t len) /* length of extent */ { - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ + struct xfs_bmap_free_item *new; /* new element */ #ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -597,17 +595,7 @@ xfs_bmap_add_free( new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); new->xbfi_startblock = bno; new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; + list_add(&new->xbfi_list, &flist->xbf_flist); flist->xbf_count++; } @@ -617,14 +605,10 @@ xfs_bmap_add_free( */ void xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ + struct xfs_bmap_free *flist, /* free item list header */ + struct xfs_bmap_free_item *free) /* list item to be freed */ { - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; + list_del(&free->xbfi_list); flist->xbf_count--; kmem_zone_free(xfs_bmap_free_item_zone, free); } @@ -634,17 +618,16 @@ xfs_bmap_del_free( */ void xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ + struct xfs_bmap_free *flist) /* list of bmap_free_items */ { - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; + struct xfs_bmap_free_item *free; /* free list item */ if (flist->xbf_count == 0) return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); + while (!list_empty(&flist->xbf_flist)) { + free = list_first_entry(&flist->xbf_flist, + struct xfs_bmap_free_item, xbfi_list); + xfs_bmap_del_free(flist, free); } ASSERT(flist->xbf_count == 0); } @@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1); ip->i_d.di_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -5073,8 +5056,8 @@ xfs_bmap_del_extent( * If we need to, add to list of extents to delete. */ if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); + xfs_bmap_add_free(mp, flist, del->br_startblock, + del->br_blockcount); /* * Adjust inode # blocks in the file. */ diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 423a34e832bd..f1f3ae6c0a3f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -62,12 +62,12 @@ struct xfs_bmalloca { * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ -typedef struct xfs_bmap_free_item +struct xfs_bmap_free_item { xfs_fsblock_t xbfi_startblock;/* starting fs block number */ xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ - struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ -} xfs_bmap_free_item_t; + struct list_head xbfi_list; +}; /* * Header for free extent list. @@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item */ typedef struct xfs_bmap_free { - xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + struct list_head xbf_flist; /* list of to-be-free extents */ int xbf_count; /* count of items on list */ int xbf_low; /* alloc in low mode */ } xfs_bmap_free_t; @@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w) static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { - ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ - (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); + INIT_LIST_HEAD(&flp->xbf_flist); + flp->xbf_count = 0; + flp->xbf_low = 0; + *fbp = NULLFSBLOCK; } /* @@ -191,8 +193,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist, + xfs_fsblock_t bno, xfs_filblks_t len); void xfs_bmap_cancel(struct xfs_bmap_free *flist); int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6282f6e708af..db0c71e470c9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -526,7 +526,7 @@ xfs_bmbt_free_block( struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); - xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1); ip->i_d.di_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1f88e1ce770f..07eeb0b4ca74 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -543,12 +543,12 @@ xfs_btree_ptr_addr( */ STATIC struct xfs_btree_block * xfs_btree_get_iroot( - struct xfs_btree_cur *cur) + struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); - return (struct xfs_btree_block *)ifp->if_broot; + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; } /* @@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify( return true; } + +/* + * Calculate the number of btree levels needed to store a given number of + * records in a short-format btree. + */ +uint +xfs_btree_compute_maxlevels( + struct xfs_mount *mp, + uint *limits, + unsigned long len) +{ + uint level; + unsigned long maxblocks; + + maxblocks = (len + limits[0] - 1) / limits[0]; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + limits[1] - 1) / limits[1]; + return level; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 2e874be70209..785a99682159 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); +uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, + unsigned long len); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 097bf7717d80..0f1f165f4048 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -356,7 +356,6 @@ xfs_da3_split( struct xfs_da_state_blk *newblk; struct xfs_da_state_blk *addblk; struct xfs_da_intnode *node; - struct xfs_buf *bp; int max; int action = 0; int error; @@ -397,7 +396,9 @@ xfs_da3_split( break; } /* - * Entry wouldn't fit, split the leaf again. + * Entry wouldn't fit, split the leaf again. The new + * extrablk will be consumed by xfs_da3_node_split if + * the node is split. */ state->extravalid = 1; if (state->inleaf) { @@ -446,6 +447,14 @@ xfs_da3_split( return 0; /* + * xfs_da3_node_split() should have consumed any extra blocks we added + * during a double leaf split in the attr fork. This is guaranteed as + * we can't be here if the attr fork only has a single leaf block. + */ + ASSERT(state->extravalid == 0 || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + + /* * Split the root node. */ ASSERT(state->path.active == 0); @@ -457,43 +466,33 @@ xfs_da3_split( } /* - * Update pointers to the node which used to be block 0 and - * just got bumped because of the addition of a new root node. - * There might be three blocks involved if a double split occurred, - * and the original block 0 could be at any position in the list. + * Update pointers to the node which used to be block 0 and just got + * bumped because of the addition of a new root node. Note that the + * original block 0 could be at any position in the list of blocks in + * the tree. * - * Note: the magic numbers and sibling pointers are in the same - * physical place for both v2 and v3 headers (by design). Hence it - * doesn't matter which version of the xfs_da_intnode structure we use - * here as the result will be the same using either structure. + * Note: the magic numbers and sibling pointers are in the same physical + * place for both v2 and v3 headers (by design). Hence it doesn't matter + * which version of the xfs_da_intnode structure we use here as the + * result will be the same using either structure. */ node = oldblk->bp->b_addr; if (node->hdr.info.forw) { - if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } node = oldblk->bp->b_addr; if (node->hdr.info.back) { - if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } addblk->bp = NULL; return 0; diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index 9d624a622946..f1e8d4dbb600 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -40,8 +40,7 @@ xfs_dir2_sf_entsize( int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ count += len; /* name */ - count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : - sizeof(xfs_dir2_ino4_t); /* ino # */ + count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ return count; } @@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype( static xfs_ino_t xfs_dir2_sf_get_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *from) + __uint8_t *from) { if (hdr->i8count) - return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + return get_unaligned_be64(from) & 0x00ffffffffffffffULL; else - return get_unaligned_be32(&from->i4.i); + return get_unaligned_be32(from); } static void xfs_dir2_sf_put_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *to, + __uint8_t *to, xfs_ino_t ino) { ASSERT((ino & 0xff00000000000000ULL) == 0); if (hdr->i8count) - put_unaligned_be64(ino, &to->i8.i); + put_unaligned_be64(ino, to); else - put_unaligned_be32(ino, &to->i4.i); + put_unaligned_be32(ino, to); } static xfs_ino_t xfs_dir2_sf_get_parent_ino( struct xfs_dir2_sf_hdr *hdr) { - return xfs_dir2_sf_get_ino(hdr, &hdr->parent); + return xfs_dir2_sf_get_ino(hdr, hdr->parent); } static void @@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino( struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); + xfs_dir2_sf_put_ino(hdr, hdr->parent, ino); } /* @@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]); } static void @@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino); } static xfs_ino_t @@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]); } static void @@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino); } diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 8d4d8bce41bf..685f23b67056 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t; typedef uint xfs_dir2_data_aoff_t; /* argument form */ /* - * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. - * Only need 16 bits, this is the byte offset into the single block form. - */ -typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; - -/* * Offset in data space of a data entry. */ typedef __uint32_t xfs_dir2_dataptr_t; @@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t; */ typedef __uint32_t xfs_dir2_db_t; -/* - * Inode number stored as 8 8-bit values. - */ -typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; - -/* - * Inode number stored as 4 8-bit values. - * Works a lot of the time, when all the inode numbers in a directory - * fit in 32 bits. - */ -typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; +#define XFS_INO32_SIZE 4 +#define XFS_INO64_SIZE 8 +#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE) -typedef union { - xfs_dir2_ino8_t i8; - xfs_dir2_ino4_t i4; -} xfs_dir2_inou_t; #define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) /* @@ -246,39 +228,38 @@ typedef union { typedef struct xfs_dir2_sf_hdr { __uint8_t count; /* count of entries */ __uint8_t i8count; /* count of 8-byte inode #s */ - xfs_dir2_inou_t parent; /* parent dir inode number */ -} __arch_pack xfs_dir2_sf_hdr_t; + __uint8_t parent[8]; /* parent dir inode number */ +} __packed xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { __u8 namelen; /* actual name length */ - xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 offset[2]; /* saved offset */ __u8 name[]; /* name, variable size */ /* * A single byte containing the file type field follows the inode * number for version 3 directory entries. * - * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a - * variable offset after the name. + * A 64-bit or 32-bit inode number follows here, at a variable offset + * after the name. */ -} __arch_pack xfs_dir2_sf_entry_t; +} xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { return sizeof(struct xfs_dir2_sf_hdr) - - (i8count == 0) * - (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); + (i8count == 0) * XFS_INO64_DIFF; } static inline xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) { - return get_unaligned_be16(&sfep->offset.i); + return get_unaligned_be16(sfep->offset); } static inline void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) { - put_unaligned_be16(off, &sfep->offset.i); + put_unaligned_be16(off, sfep->offset); } static inline struct xfs_dir2_sf_entry * diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index e5bb9cc3b243..c6809ff41197 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -126,13 +126,12 @@ xfs_dir2_block_sfsize( /* * Calculate the new size, see if we should give up yet. */ - size = xfs_dir2_sf_hdr_size(i8count) + /* header */ - count + /* namelen */ - count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ - namelen + /* name */ - (i8count ? /* inumber */ - (uint)sizeof(xfs_dir2_ino8_t) * count : - (uint)sizeof(xfs_dir2_ino4_t) * count); + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ + count * 3 * sizeof(u8) + /* namelen + offset */ + namelen + /* name */ + (i8count ? /* inumber */ + count * XFS_INO64_SIZE : + count * XFS_INO32_SIZE); if (size > XFS_IFORK_DSIZE(dp)) return size; /* size value is a failure */ } @@ -319,10 +318,7 @@ xfs_dir2_sf_addname( /* * Yes, adjust the inode size. old count + (parent + new) */ - incr_isize += - (sfp->count + 2) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + incr_isize += (sfp->count + 2) * XFS_INO64_DIFF; objchange = 1; } @@ -897,11 +893,7 @@ xfs_dir2_sf_replace( int error; /* error return value */ int newsize; /* new inode size */ - newsize = - dp->i_df.if_bytes + - (sfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; /* * Won't fit as shortform, convert to block then do replace. */ @@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4( /* * Compute the new inode size. */ - newsize = - oldsize - - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); @@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8( /* * Compute the new inode size (nb: entry count + 1 for parent) */ - newsize = - oldsize + - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index dc97eb21af07..adb204d40f22 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; * with the crc feature bit, and all accesses to them must be conditional on * that flag. */ +/* short form block header */ +struct xfs_btree_block_shdr { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; +}; + +/* long form block header */ +struct xfs_btree_block_lhdr { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ +}; + struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ union { - struct { - __be32 bb_leftsib; - __be32 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be32 bb_owner; - __le32 bb_crc; - } s; /* short form pointers */ - struct { - __be64 bb_leftsib; - __be64 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be64 bb_owner; - __le32 bb_crc; - __be32 bb_pad; /* padding for alignment */ - } l; /* long form pointers */ + struct xfs_btree_block_shdr s; + struct xfs_btree_block_lhdr l; } bb_u; /* rest */ }; -#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ -#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* size of a short form block */ +#define XFS_BTREE_SBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_shdr, bb_blkno)) +/* size of a long form block */ +#define XFS_BTREE_LBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_lhdr, bb_blkno)) /* sizes of CRC enabled btree blocks */ -#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) -#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) +#define XFS_BTREE_SBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_shdr)) +#define XFS_BTREE_LBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_lhdr)) #define XFS_BTREE_SBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.s.bb_crc) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index fffe3d01bd9f..f5ec9c5ccae6 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -521,12 +521,8 @@ typedef struct xfs_swapext #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ -/* XFS_IOC_FREEZE -- FIFREEZE 119 */ -/* XFS_IOC_THAW -- FITHAW 120 */ -#ifndef FIFREEZE -#define XFS_IOC_FREEZE _IOWR('X', 119, int) -#define XFS_IOC_THAW _IOWR('X', 120, int) -#endif +#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ +#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 22297f9b0fd5..4b1e408169a8 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno), + mp->m_ialloc_blks); return; } @@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - flist, mp); + xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno), + contigblk); /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -2395,20 +2394,11 @@ void xfs_ialloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> - XFS_INODES_PER_CHUNK_LOG; - minleafrecs = mp->m_inobt_mnr[0]; - minnoderecs = mp->m_inobt_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_in_maxlevels = level; + uint inodes; + + inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; + mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr, + inodes); } /* diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 951c044e24e4..e2e1106c9fad 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 87d2b215cbbd..7575cfc3ad15 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -87,6 +87,12 @@ xfs_find_bdev_for_inode( * We're now finished for good with this page. Update the page state via the * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. + * + * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last + * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or + * the page at all, as we may be racing with memory reclaim and it can free both + * the bufferhead chain and the page as it will see the page as clean and + * unused. */ static void xfs_finish_page_writeback( @@ -95,8 +101,9 @@ xfs_finish_page_writeback( int error) { unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh; + struct buffer_head *head, *bh, *next; unsigned int off = 0; + unsigned int bsize; ASSERT(bvec->bv_offset < PAGE_SIZE); ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); @@ -105,15 +112,17 @@ xfs_finish_page_writeback( bh = head = page_buffers(bvec->bv_page); + bsize = bh->b_size; do { + next = bh->b_this_page; if (off < bvec->bv_offset) goto next_bh; if (off > end) break; bh->b_end_io(bh, !error); next_bh: - off += bh->b_size; - } while ((bh = bh->b_this_page) != head); + off += bsize; + } while ((bh = next) != head); } /* @@ -1041,6 +1050,20 @@ xfs_vm_releasepage( trace_xfs_releasepage(page->mapping->host, page, 0, 0); + /* + * mm accommodates an old ext3 case where clean pages might not have had + * the dirty bit cleared. Thus, it can send actual dirty pages to + * ->releasepage() via shrink_active_list(). Conversely, + * block_invalidatepage() can send pages that are still marked dirty + * but otherwise have invalidated buffers. + * + * We've historically freed buffers on the latter. Instead, quietly + * filter out all dirty pages to avoid spurious buffer state warnings. + * This can likely be removed once shrink_active_list() is fixed. + */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); if (WARN_ON_ONCE(delalloc)) @@ -1144,6 +1167,8 @@ __xfs_get_blocks( ssize_t size; int new = 0; + BUG_ON(create && !direct); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1151,22 +1176,14 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && direct && offset >= i_size_read(inode)) + if (!create && offset >= i_size_read(inode)) return 0; /* * Direct I/O is usually done on preallocated files, so try getting - * a block mapping without an exclusive lock first. For buffered - * writes we already have the exclusive iolock anyway, so avoiding - * a lock roundtrip here by taking the ilock exclusive from the - * beginning is a useful micro optimization. + * a block mapping without an exclusive lock first. */ - if (create && !direct) { - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if (offset + size > mp->m_super->s_maxbytes) @@ -1185,37 +1202,19 @@ __xfs_get_blocks( (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK) || (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - if (direct || xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); - } else { - /* - * Delalloc reservations do not require a transaction, - * we can go on without dropping the lock here. If we - * are allocating a new delalloc block, make sure that - * we set the new flag so that we mark the buffer new so - * that we know that it is newly allocated if the write - * fails. - */ - if (nimaps && imap.br_startblock == HOLESTARTBLOCK) - new = 1; - error = xfs_iomap_write_delay(ip, offset, size, &imap); - if (error) - goto out_unlock; + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return error; + new = 1; - xfs_iunlock(ip, lockmode); - } trace_xfs_get_blocks_alloc(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_DELALLOC, &imap); @@ -1236,9 +1235,7 @@ __xfs_get_blocks( } /* trim mapping down to size requested */ - if (direct || size > (1 << inode->i_blkbits)) - xfs_map_trim_size(inode, iblock, bh_result, - &imap, offset, size); + xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); /* * For unwritten extents do not report a disk address in the buffered @@ -1251,7 +1248,7 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) { + if (create) { if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else @@ -1280,14 +1277,7 @@ __xfs_get_blocks( (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (imap.br_startblock == DELAYSTARTBLOCK) { - BUG_ON(direct); - if (create) { - set_buffer_uptodate(bh_result); - set_buffer_mapped(bh_result); - set_buffer_delay(bh_result); - } - } + BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); return 0; @@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault( * whereas if we have flags set we will always be called in task context * (i.e. from a workqueue). */ -STATIC int +int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1408,234 +1398,10 @@ xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - dio_iodone_t *endio = NULL; - int flags = 0; - struct block_device *bdev; - - if (iov_iter_rw(iter) == WRITE) { - endio = xfs_end_io_direct_write; - flags = DIO_ASYNC_EXTEND; - } - - if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, - xfs_get_blocks_direct, endio, 0); - } - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, - xfs_get_blocks_direct, endio, NULL, flags); -} - -/* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have made it to disk yet - * as the page is still locked at this point. - */ -STATIC void -xfs_vm_kill_delalloc_range( - struct inode *inode, - loff_t start, - loff_t end) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; - - start_fsb = XFS_B_TO_FSB(ip->i_mount, start); - end_fsb = XFS_B_TO_FSB(ip->i_mount, end); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - -STATIC void -xfs_vm_write_failed( - struct inode *inode, - struct page *page, - loff_t pos, - unsigned len) -{ - loff_t block_offset; - loff_t block_start; - loff_t block_end; - loff_t from = pos & (PAGE_SIZE - 1); - loff_t to = from + len; - struct buffer_head *bh, *head; - struct xfs_mount *mp = XFS_I(inode)->i_mount; - /* - * The request pos offset might be 32 or 64 bit, this is all fine - * on 64-bit platform. However, for 64-bit pos request on 32-bit - * platform, the high 32-bit will be masked off if we evaluate the - * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is - * 0xfffff000 as an unsigned long, hence the result is incorrect - * which could cause the following ASSERT failed in most cases. - * In order to avoid this, we can evaluate the block_offset of the - * start of the page by using shifts rather than masks the mismatch - * problem. + * We just need the method present so that open/fcntl allow direct I/O. */ - block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; - - ASSERT(block_offset + from == pos); - - head = page_buffers(page); - block_start = 0; - for (bh = head; bh != head || !block_start; - bh = bh->b_this_page, block_start = block_end, - block_offset += bh->b_size) { - block_end = block_start + bh->b_size; - - /* skip buffers before the write */ - if (block_end <= from) - continue; - - /* if the buffer is after the write, we're done */ - if (block_start >= to) - break; - - /* - * Process delalloc and unwritten buffers beyond EOF. We can - * encounter unwritten buffers in the event that a file has - * post-EOF unwritten extents and an extending write happens to - * fail (e.g., an unaligned write that also involves a delalloc - * to the same page). - */ - if (!buffer_delay(bh) && !buffer_unwritten(bh)) - continue; - - if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && - block_offset < i_size_read(inode)) - continue; - - if (buffer_delay(bh)) - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); - - /* - * This buffer does not contain data anymore. make sure anyone - * who finds it knows that for certain. - */ - clear_buffer_delay(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_dirty(bh); - clear_buffer_unwritten(bh); - } - -} - -/* - * This used to call block_write_begin(), but it unlocks and releases the page - * on error, and we need that page to be able to punch stale delalloc blocks out - * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at - * the appropriate point. - */ -STATIC int -xfs_vm_write_begin( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status; - struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - - ASSERT(len <= PAGE_SIZE); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - status = __block_write_begin(page, pos, len, xfs_get_blocks); - if (xfs_mp_fail_writes(mp)) - status = -EIO; - if (unlikely(status)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - - xfs_vm_write_failed(inode, page, pos, len); - unlock_page(page); - - /* - * If the write is beyond EOF, we only want to kill blocks - * allocated in this write, not blocks that were previously - * written successfully. - */ - if (xfs_mp_fail_writes(mp)) - isize = 0; - if (pos + len > isize) { - ssize_t start = max_t(ssize_t, pos, isize); - - truncate_pagecache_range(inode, start, pos + len); - } - - put_page(page); - page = NULL; - } - - *pagep = page; - return status; -} - -/* - * On failure, we only need to kill delalloc blocks beyond EOF in the range of - * this specific write because they will never be written. Previous writes - * beyond EOF where block allocation succeeded do not need to be trashed, so - * only new blocks from this write should be trashed. For blocks within - * EOF, generic_write_end() zeros them so they are safe to leave alone and be - * written with all the other valid data. - */ -STATIC int -xfs_vm_write_end( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned copied, - struct page *page, - void *fsdata) -{ - int ret; - - ASSERT(len <= PAGE_SIZE); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - loff_t to = pos + len; - - if (to > isize) { - /* only kill blocks in this write beyond EOF */ - if (pos > isize) - isize = pos; - xfs_vm_kill_delalloc_range(inode, isize, to); - truncate_pagecache_range(inode, isize, to); - } - } - return ret; + return -EINVAL; } STATIC sector_t @@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = { .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .write_begin = xfs_vm_write_begin, - .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 814aab790713..bf2d9a141a73 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset, int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); +int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private); + extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 55d214981ed2..be0b79d8900f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -322,7 +322,7 @@ xfs_attr3_node_inactive( * Recurse (gasp!) through the attribute nodes until we find leaves. * We're doing a depth-first traversal in order to invalidate everything. */ -int +static int xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index d25f26b22ac9..25e76cd6c053 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b) * we have to calculate each entries' hashvalue and sort them before * we can begin returning them to the user. */ -int +static int xfs_attr_shortform_list(xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 586bb64e674b..cd4a850564f2 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -79,6 +79,23 @@ xfs_zero_extent( GFP_NOFS, true); } +/* Sort bmap items by AG. */ +static int +xfs_bmap_free_list_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_bmap_free_item *ra; + struct xfs_bmap_free_item *rb; + + ra = container_of(a, struct xfs_bmap_free_item, xbfi_list); + rb = container_of(b, struct xfs_bmap_free_item, xbfi_list); + return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) - + XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock); +} + /* * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done @@ -99,14 +116,15 @@ xfs_bmap_finish( int error; /* error return value */ int committed;/* xact committed or not */ struct xfs_bmap_free_item *free; /* free extent item */ - struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) return 0; + list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) + list_for_each_entry(free, &flist->xbf_flist, xbfi_list) xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); @@ -125,9 +143,7 @@ xfs_bmap_finish( if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); + SHUTDOWN_META_IO_ERROR); } return error; } @@ -138,15 +154,15 @@ xfs_bmap_finish( * on error. */ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - + while (!list_empty(&flist->xbf_flist)) { + free = list_first_entry(&flist->xbf_flist, + struct xfs_bmap_free_item, xbfi_list); error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, free->xbfi_blockcount); if (error) return error; - xfs_bmap_del_free(flist, NULL, free); + xfs_bmap_del_free(flist, free); } return 0; @@ -409,7 +425,7 @@ xfs_bmap_count_tree( /* * Count fsblocks of the given fork. */ -int /* error */ +static int /* error */ xfs_bmap_count_blocks( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode */ @@ -799,7 +815,7 @@ xfs_bmap_punch_delalloc_range( if (error) break; - ASSERT(!flist.xbf_count && !flist.xbf_first); + ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist)); next_block: start_fsb++; remaining--; @@ -1089,99 +1105,120 @@ error1: /* Just cancel transaction */ return error; } -/* - * Zero file bytes between startoff and endoff inclusive. - * The iolock is held exclusive and no blocks are buffered. - * - * This function is used by xfs_free_file_space() to zero - * partial blocks when the range to free is not block aligned. - * When unreserving space with boundaries that are not block - * aligned we round up the start and round down the end - * boundaries and then use this function to zero the parts of - * the blocks that got dropped during the rounding. - */ -STATIC int -xfs_zero_remaining_bytes( - xfs_inode_t *ip, - xfs_off_t startoff, - xfs_off_t endoff) +static int +xfs_unmap_extent( + struct xfs_inode *ip, + xfs_fileoff_t startoffset_fsb, + xfs_filblks_t len_fsb, + int *done) { - xfs_bmbt_irec_t imap; - xfs_fileoff_t offset_fsb; - xfs_off_t lastoffset; - xfs_off_t offset; - xfs_buf_t *bp; - xfs_mount_t *mp = ip->i_mount; - int nimap; - int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + int error; - /* - * Avoid doing I/O beyond eof - it's not necessary - * since nothing can read beyond eof. The space will - * be zeroed when the file is extended anyway. - */ - if (startoff >= XFS_ISIZE(ip)) - return 0; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) { + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + return error; + } - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_trans_cancel; - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { - uint lock_mode; + xfs_trans_ijoin(tp, ip, 0); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - nimap = 1; + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb, + &free_list, done); + if (error) + goto out_bmap_cancel; - lock_mode = xfs_ilock_data_map_shared(ip); - error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); - xfs_iunlock(ip, lock_mode); + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) + goto out_bmap_cancel; - if (error || nimap < 1) - break; - ASSERT(imap.br_blockcount >= 1); - ASSERT(imap.br_startoff == offset_fsb); - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = xfs_trans_commit(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; - if (imap.br_startblock == HOLESTARTBLOCK || - imap.br_state == XFS_EXT_UNWRITTEN) { - /* skip the entire extent */ - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + - imap.br_blockcount) - 1; - continue; - } +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp); + goto out_unlock; +} - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; - if (lastoffset > endoff) - lastoffset = endoff; +static int +xfs_adjust_extent_unmap_boundaries( + struct xfs_inode *ip, + xfs_fileoff_t *startoffset_fsb, + xfs_fileoff_t *endoffset_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + int nimap, error; + xfs_extlen_t mod = 0; - /* DAX can just zero the backing device directly */ - if (IS_DAX(VFS_I(ip))) { - error = dax_zero_page_range(VFS_I(ip), offset, - lastoffset - offset + 1, - xfs_get_blocks_direct); - if (error) - return error; - continue; - } + nimap = 1; + error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); + if (error) + return error; - error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - xfs_fsb_to_db(ip, imap.br_startblock), - BTOBB(mp->m_sb.sb_blocksize), - 0, &bp, NULL); - if (error) - return error; + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; - memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + *startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - return error; + nimap = 1; + error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); + if (error) + return error; + + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && mod != mp->m_sb.sb_rextsize) + *endoffset_fsb -= mod; } - return error; + + return 0; +} + +static int +xfs_flush_unmap_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t rounding, start, end; + int error; + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); + start = round_down(offset, rounding); + end = round_up(offset + len, rounding) - 1; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + truncate_pagecache_range(inode, start, end); + return 0; } int @@ -1190,24 +1227,10 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int done; - xfs_fileoff_t endoffset_fsb; - int error; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t imap; - xfs_off_t ioffset; - xfs_off_t iendoffset; - xfs_extlen_t mod=0; - xfs_mount_t *mp; - int nimap; - uint resblks; - xfs_off_t rounding; - int rt; + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; - xfs_trans_t *tp; - - mp = ip->i_mount; + xfs_fileoff_t endoffset_fsb; + int done = 0, error; trace_xfs_free_file_space(ip); @@ -1215,135 +1238,45 @@ xfs_free_file_space( if (error) return error; - error = 0; if (len <= 0) /* if nothing being freed */ - return error; - rt = XFS_IS_REALTIME_INODE(ip); - startoffset_fsb = XFS_B_TO_FSB(mp, offset); - endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - - /* wait for the completion of any pending DIOs */ - inode_dio_wait(VFS_I(ip)); + return 0; - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); - ioffset = round_down(offset, rounding); - iendoffset = round_up(offset + len, rounding) - 1; - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, - iendoffset); + error = xfs_flush_unmap_range(ip, offset, len); if (error) - goto out; - truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); + return error; + + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. - * If it's a realtime file & can't use unwritten extents then we - * actually need to zero the extent edges. Otherwise xfs_bunmapi - * will take care of it for us. + * Need to zero the stuff we're not freeing, on disk. If it's a RT file + * and we can't use unwritten extents then we actually need to ensure + * to zero the whole extent, otherwise we just need to take of block + * boundaries, and xfs_bunmapi will handle the rest. */ - if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - nimap = 1; - error = xfs_bmapi_read(ip, startoffset_fsb, 1, - &imap, &nimap, 0); - if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); - if (mod) - startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - nimap = 1; - error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, - &imap, &nimap, 0); + if (XFS_IS_REALTIME_INODE(ip) && + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, + &endoffset_fsb); if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && (mod != mp->m_sb.sb_rextsize)) - endoffset_fsb -= mod; - } - } - if ((done = (endoffset_fsb <= startoffset_fsb))) - /* - * One contiguous piece to clear - */ - error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); - else { - /* - * Some full blocks, possibly two pieces to clear - */ - if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) - error = xfs_zero_remaining_bytes(ip, offset, - XFS_FSB_TO_B(mp, startoffset_fsb) - 1); - if (!error && - XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) - error = xfs_zero_remaining_bytes(ip, - XFS_FSB_TO_B(mp, endoffset_fsb), - offset + len - 1); + return error; } - /* - * free file space until done or until there is an error - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - while (!error && !done) { - - /* - * allocate and setup the transaction. Allow this - * transaction to dip into the reserve blocks to ensure - * the freeing of the space succeeds at ENOSPC. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) { - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - break; + if (endoffset_fsb > startoffset_fsb) { + while (!done) { + error = xfs_unmap_extent(ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, &done); + if (error) + return error; } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, - ip->i_udquot, ip->i_gdquot, ip->i_pdquot, - resblks, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, 0); - - /* - * issue the bunmapi() call to free the blocks - */ - xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bunmapi(tp, ip, startoffset_fsb, - endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, &done); - if (error) - goto error0; - - /* - * complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, NULL); - if (error) - goto error0; - - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } - out: - return error; - - error0: - xfs_bmap_cancel(&free_list); - error1: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; + /* + * Now that we've unmap all full blocks we'll have to zero out any + * partial block at the beginning and/or end. xfs_zero_range is + * smart enough to skip any holes, including those we just created. + */ + return xfs_zero_range(ip, offset, len, NULL); } /* diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index af97d9a1dfb4..f20071432ca6 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,8 +31,6 @@ struct xfs_bmalloca; int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); -int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); @@ -43,7 +41,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ void xfs_bmap_del_free(struct xfs_bmap_free *flist, - struct xfs_bmap_free_item *prev, struct xfs_bmap_free_item *free); int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a87a0d5477bd..47a318ce82e0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -80,6 +80,47 @@ xfs_buf_vmap_len( } /* + * Bump the I/O in flight count on the buftarg if we haven't yet done so for + * this buffer. The count is incremented once per buffer (per hold cycle) + * because the corresponding decrement is deferred to buffer release. Buffers + * can undergo I/O multiple times in a hold-release cycle and per buffer I/O + * tracking adds unnecessary overhead. This is used for sychronization purposes + * with unmount (see xfs_wait_buftarg()), so all we really need is a count of + * in-flight buffers. + * + * Buffers that are never released (e.g., superblock, iclog buffers) must set + * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count + * never reaches zero and unmount hangs indefinitely. + */ +static inline void +xfs_buf_ioacct_inc( + struct xfs_buf *bp) +{ + if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags |= _XBF_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); +} + +/* + * Clear the in-flight state on a buffer about to be released to the LRU or + * freed and unaccount from the buftarg. + */ +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + if (!(bp->b_flags & _XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags &= ~_XBF_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); +} + +/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need @@ -102,6 +143,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; + /* + * Once the buffer is marked stale and unlocked, a subsequent lookup + * could reset b_flags. There is no guarantee that the buffer is + * unaccounted (released to LRU) before that occurs. Drop in-flight + * status now to preserve accounting consistency. + */ + xfs_buf_ioacct_dec(bp); + spin_lock(&bp->b_lock); atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && @@ -815,7 +864,8 @@ xfs_buf_get_uncached( struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = _xfs_buf_alloc(target, &map, 1, 0); + /* flags might contain irrelevant bits, pass only what we care about */ + bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); if (unlikely(bp == NULL)) goto fail; @@ -866,63 +916,85 @@ xfs_buf_hold( } /* - * Releases a hold on the specified buffer. If the - * the hold count is 1, calls xfs_buf_free. + * Release a hold on the specified buffer. If the hold count is 1, the buffer is + * placed on LRU or freed (depending on b_lru_ref). */ void xfs_buf_rele( xfs_buf_t *bp) { struct xfs_perag *pag = bp->b_pag; + bool release; + bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); - if (atomic_dec_and_test(&bp->b_hold)) + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); + } return; } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - spin_lock(&bp->b_lock); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { - /* - * If the buffer is added to the LRU take a new - * reference to the buffer for the LRU and clear the - * (now stale) dispose list state flag - */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { - bp->b_state &= ~XFS_BSTATE_DISPOSE; - atomic_inc(&bp->b_hold); - } - spin_unlock(&bp->b_lock); - spin_unlock(&pag->pag_buf_lock); - } else { - /* - * most of the time buffers will already be removed from - * the LRU, so optimise that case by checking for the - * XFS_BSTATE_DISPOSE flag indicating the last list the - * buffer was on was the disposal list - */ - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); - } else { - ASSERT(list_empty(&bp->b_lru)); - } - spin_unlock(&bp->b_lock); - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - xfs_buf_free(bp); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + spin_lock(&bp->b_lock); + if (!release) { + /* + * Drop the in-flight state if the buffer is already on the LRU + * and it holds the only reference. This is racy because we + * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT + * ensures the decrement occurs only once per-buf. + */ + if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) + xfs_buf_ioacct_dec(bp); + goto out_unlock; + } + + /* the last reference has been dropped ... */ + xfs_buf_ioacct_dec(bp); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new reference to the + * buffer for the LRU and clear the (now stale) dispose list + * state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from the + * LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the buffer + * was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + freebuf = true; } + +out_unlock: + spin_unlock(&bp->b_lock); + + if (freebuf) + xfs_buf_free(bp); } @@ -944,10 +1016,12 @@ xfs_buf_trylock( int locked; locked = down_trylock(&bp->b_sema) == 0; - if (locked) + if (locked) { XB_SET_OWNER(bp); - - trace_xfs_buf_trylock(bp, _RET_IP_); + trace_xfs_buf_trylock(bp, _RET_IP_); + } else { + trace_xfs_buf_trylock_fail(bp, _RET_IP_); + } return locked; } @@ -1339,6 +1413,7 @@ xfs_buf_submit( * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); + xfs_buf_ioacct_inc(bp); _xfs_buf_ioapply(bp); /* @@ -1524,13 +1599,19 @@ xfs_wait_buftarg( int loop = 0; /* - * We need to flush the buffer workqueue to ensure that all IO - * completion processing is 100% done. Just waiting on buffer locks is - * not sufficient for async IO as the reference count held over IO is - * not released until after the buffer lock is dropped. Hence we need to - * ensure here that all reference counts have been dropped before we - * start walking the LRU list. + * First wait on the buftarg I/O count for all in-flight buffers to be + * released. This is critical as new buffers do not make the LRU until + * they are released. + * + * Next, flush the buffer workqueue to ensure all completion processing + * has finished. Just waiting on buffer locks is not sufficient for + * async IO as the reference count held over IO is not released until + * after the buffer lock is dropped. Hence we need to ensure here that + * all reference counts have been dropped before we start walking the + * LRU list. */ + while (percpu_counter_sum(&btp->bt_io_count)) + delay(100); drain_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ @@ -1627,6 +1708,8 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) @@ -1691,6 +1774,9 @@ xfs_alloc_buftarg( if (list_lru_init(&btp->bt_lru)) goto error; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto error; + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1774,18 +1860,33 @@ xfs_buf_cmp( return 0; } +/* + * submit buffers for write. + * + * When we have a large buffer list, we do not want to hold all the buffers + * locked while we block on the request queue waiting for IO dispatch. To avoid + * this problem, we lock and submit buffers in groups of 50, thereby minimising + * the lock hold times for lists which may contain thousands of objects. + * + * To do this, we sort the buffer list before we walk the list to lock and + * submit buffers, and we plug and unplug around each group of buffers we + * submit. + */ static int -__xfs_buf_delwri_submit( +xfs_buf_delwri_submit_buffers( struct list_head *buffer_list, - struct list_head *io_list, - bool wait) + struct list_head *wait_list) { - struct blk_plug plug; struct xfs_buf *bp, *n; + LIST_HEAD (submit_list); int pinned = 0; + struct blk_plug plug; + list_sort(NULL, buffer_list, xfs_buf_cmp); + + blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { - if (!wait) { + if (!wait_list) { if (xfs_buf_ispinned(bp)) { pinned++; continue; @@ -1808,25 +1909,21 @@ __xfs_buf_delwri_submit( continue; } - list_move_tail(&bp->b_list, io_list); trace_xfs_buf_delwri_split(bp, _RET_IP_); - } - - list_sort(NULL, io_list, xfs_buf_cmp); - - blk_start_plug(&plug); - list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE | XBF_ASYNC; /* - * we do all Io submission async. This means if we need to wait - * for IO completion we need to take an extra reference so the - * buffer is still valid on the other side. + * We do all IO submission async. This means if we need + * to wait for IO completion we need to take an extra + * reference so the buffer is still valid on the other + * side. We need to move the buffer onto the io_list + * at this point so the caller can still access it. */ - if (wait) + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE | XBF_ASYNC; + if (wait_list) { xfs_buf_hold(bp); - else + list_move_tail(&bp->b_list, wait_list); + } else list_del_init(&bp->b_list); xfs_buf_submit(bp); @@ -1849,8 +1946,7 @@ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { - LIST_HEAD (io_list); - return __xfs_buf_delwri_submit(buffer_list, &io_list, false); + return xfs_buf_delwri_submit_buffers(buffer_list, NULL); } /* @@ -1865,15 +1961,15 @@ int xfs_buf_delwri_submit( struct list_head *buffer_list) { - LIST_HEAD (io_list); + LIST_HEAD (wait_list); int error = 0, error2; struct xfs_buf *bp; - __xfs_buf_delwri_submit(buffer_list, &io_list, true); + xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); /* Wait for IO to complete. */ - while (!list_empty(&io_list)) { - bp = list_first_entry(&io_list, struct xfs_buf, b_list); + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8bfb974f0772..1c2e52b2d926 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -43,6 +43,7 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ @@ -62,6 +63,7 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_IN_FLIGHT, "IN_FLIGHT" } /* @@ -115,6 +118,8 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; struct list_lru bt_lru; + + struct percpu_counter bt_io_count; } xfs_buftarg_t; struct xfs_buf; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 34257992934c..e455f9098d49 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -359,7 +359,7 @@ xfs_buf_item_format( for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); - offset += bp->b_maps[i].bm_len; + offset += BBTOB(bp->b_maps[i].bm_len); } /* @@ -915,20 +915,28 @@ xfs_buf_item_log( for (i = 0; i < bip->bli_format_count; i++) { if (start > last) break; - end = start + BBTOB(bp->b_maps[i].bm_len); + end = start + BBTOB(bp->b_maps[i].bm_len) - 1; + + /* skip to the map that includes the first byte to log */ if (first > end) { start += BBTOB(bp->b_maps[i].bm_len); continue; } + + /* + * Trim the range to this segment and mark it in the bitmap. + * Note that we must convert buffer offsets to segment relative + * offsets (e.g., the first byte of each segment is byte 0 of + * that segment). + */ if (first < start) first = start; if (end > last) end = last; - - xfs_buf_item_log_segment(first, end, + xfs_buf_item_log_segment(first - start, end - start, &bip->bli_formats[i].blf_data_map[0]); - start += bp->b_maps[i].bm_len; + start += BBTOB(bp->b_maps[i].bm_len); } } @@ -949,6 +957,7 @@ xfs_buf_item_free( xfs_buf_log_item_t *bip) { xfs_buf_item_free_format(bip); + kmem_free(bip->bli_item.li_lv_shadow); kmem_zone_free(xfs_buf_item_zone, bip); } @@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error( trace_xfs_buf_item_iodone_async(bp, _RET_IP_); ASSERT(bp->b_iodone != NULL); + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + /* * If the write was asynchronous then no one will be looking for the * error. If this is the first failure of this type, clear the error @@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error( * async write failure at least once, but we also need to set the buffer * up to behave correctly now for repeated failures. */ - if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || + if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || bp->b_last_error != bp->b_error) { - bp->b_flags |= (XBF_WRITE | XBF_ASYNC | - XBF_DONE | XBF_WRITE_FAIL); + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; - bp->b_retries = 0; - bp->b_first_retry_time = jiffies; + if (cfg->retry_timeout && !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; xfs_buf_ioerror(bp, 0); xfs_buf_submit(bp); @@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error( * Repeated failure on an async write. Take action according to the * error configuration we have been set up to use. */ - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e0646659ce16..ccb0811963b2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -74,6 +74,7 @@ xfs_qm_dqdestroy( { ASSERT(list_empty(&dqp->q_lru)); + kmem_free(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 814cff94e78f..2c7a1629e064 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed( spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + kmem_free(qfs->qql_item.li_lv_shadow); + kmem_free(lip->li_lv_shadow); kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 88693a98fac5..ed7ee4e8af73 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, } int -xfs_errortag_add(int error_tag, xfs_mount_t *mp) +xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp) { int i; int len; int64_t fsid; + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 4ed3042a0f16..2e4f67f68856 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp); extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 4aa0153214f9..ab779460ecbf 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -40,6 +40,7 @@ void xfs_efi_item_free( struct xfs_efi_log_item *efip) { + kmem_free(efip->efi_item.li_lv_shadow); if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else @@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) STATIC void xfs_efd_item_free(struct xfs_efd_log_item *efdp) { + kmem_free(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1b3dc9dd8861..ed95e5bb04e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_log.h" #include "xfs_icache.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -80,61 +81,17 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero clears the specified range supplied via the page cache (except in - * the DAX case). Writes through the page cache will allocate blocks over holes, - * though the callers usually map the holes first and avoid them. If a block is - * not completely zeroed, then it will be read from disk before being partially - * zeroed. - * - * In the DAX case, we can just directly write to the underlying pages. This - * will not allocate blocks, but will avoid holes and unwritten extents and so - * not do unnecessary work. + * Clear the specified ranges to zero through either the pagecache or DAX. + * Holes and unwritten extents will be left as-is as they already are zeroed. */ int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ +xfs_zero_range( + struct xfs_inode *ip, + xfs_off_t pos, + xfs_off_t count, + bool *did_zero) { - struct page *page; - struct address_space *mapping; - int status = 0; - - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_SIZE -1)); /* Within page */ - bytes = PAGE_SIZE - offset; - if (bytes > count) - bytes = count; - - if (IS_DAX(VFS_I(ip))) { - status = dax_zero_page_range(VFS_I(ip), pos, bytes, - xfs_get_blocks_direct); - if (status) - break; - } else { - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, - bytes, page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - status = 0; - } - pos += bytes; - count -= bytes; - } while (count); - - return status; + return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); } int @@ -282,48 +239,35 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_read_iter( +xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - size_t size = iov_iter_count(to); + loff_t isize = i_size_read(inode); + size_t count = iov_iter_count(to); + struct iov_iter data; + struct xfs_buftarg *target; ssize_t ret = 0; - int ioflags = 0; - xfs_fsize_t n; - loff_t pos = iocb->ki_pos; - XFS_STATS_INC(mp, xs_read_calls); - - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) - ioflags |= XFS_IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - - if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - /* DIO must be aligned to device logical sector size */ - if ((pos | size) & target->bt_logical_sectormask) { - if (pos == i_size_read(inode)) - return 0; - return -EINVAL; - } - } + trace_xfs_file_direct_read(ip, count, iocb->ki_pos); - n = mp->m_super->s_maxbytes - pos; - if (n <= 0 || size == 0) - return 0; + if (!count) + return 0; /* skip atime */ - if (n < size) - size = n; + if (XFS_IS_REALTIME_INODE(ip)) + target = ip->i_mount->m_rtdev_targp; + else + target = ip->i_mount->m_ddev_targp; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; + /* DIO must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == isize) + return 0; + return -EINVAL; + } /* * Locking is a bit tricky here. If we take an exclusive lock for direct @@ -336,7 +280,7 @@ xfs_file_read_iter( * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { + if (mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -351,8 +295,8 @@ xfs_file_read_iter( * flush and reduce the chances of repeated iolock cycles going * forward. */ - if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (mapping->nrpages) { + ret = filemap_write_and_wait(mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -363,20 +307,95 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2(mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, pos, ioflags); + data = *to; + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + file_accessed(iocb->ki_filp); + return ret; +} + +static noinline ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct iov_iter data = *to; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + + if (!count) + return 0; /* skip atime */ + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + ssize_t ret; + + trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) + ret = xfs_file_dio_aio_read(iocb, to); + else + ret = xfs_file_buffered_aio_read(iocb, to); + if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); - - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -389,18 +408,14 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - int ioflags = 0; ssize_t ret; XFS_STATS_INC(ip->i_mount, xs_read_calls); - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + trace_xfs_file_splice_read(ip, count, *ppos); /* * DAX inodes cannot ues the page cache for splice, so we have to push @@ -424,49 +439,6 @@ out: } /* - * This routine is called to handle zeroing any space in the last block of the - * file that is beyond the EOF. We do this since the size is being increased - * without writing anything to that block and we don't want to read the - * garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - struct xfs_inode *ip, - xfs_fsize_t offset, - xfs_fsize_t isize, - bool *did_zeroing) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); - int zero_offset = XFS_B_FSB_OFFSET(mp, isize); - int zero_len; - int nimaps = 1; - int error = 0; - struct xfs_bmbt_irec imap; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) - return 0; - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - *did_zeroing = true; - return xfs_iozero(ip, isize, zero_len); -} - -/* * Zero any on disk space between the current EOF and the new, larger EOF. * * This handles the normal case of zeroing the remainder of the last block in @@ -484,94 +456,11 @@ xfs_zero_eof( xfs_fsize_t isize, /* current inode size */ bool *did_zeroing) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - struct xfs_bmbt_irec imap; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); trace_xfs_zero_eof(ip, isize, offset - isize); - - /* - * First handle zeroing the block on which isize resides. - * - * We only zero a part of that block so it is handled specially. - */ - if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize, did_zeroing); - if (error) - return error; - } - - /* - * Calculate the range between the new size and the old where blocks - * needing to be zeroed may exist. - * - * To get the block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back to a block - * boundary. We subtract 1 in case the size is exactly on a block - * boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, - &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - */ - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_iozero(ip, zero_off, zero_len); - if (error) - return error; - - *did_zeroing = true; - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - } - - return 0; + return xfs_zero_range(ip, isize, offset - isize, did_zeroing); } /* @@ -722,8 +611,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -762,7 +650,7 @@ xfs_file_dio_aio_write( end = iocb->ki_pos + count - 1; /* - * See xfs_file_read_iter() for why we do a full-file flush here. + * See xfs_file_dio_aio_read() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -789,10 +677,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data); + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { @@ -809,10 +699,70 @@ out: xfs_rw_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS. DAX can result in - * partial writes, but direct IO will either complete fully or fail. + * No fallback to buffered IO on errors for XFS, direct IO will either + * complete fully or fail. */ - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); + ASSERT(ret < 0 || ret == count); + return ret; +} + +static noinline ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + struct iov_iter data; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { + unaligned_io = 1; + iolock = XFS_IOLOCK_EXCL; + } else if (mapping->nrpages) { + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; + } + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + */ + if (mapping->nrpages) { + ret = invalidate_inode_pages2(mapping); + WARN_ON_ONCE(ret); + } + + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); + + data = *from; + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(from, ret); + } +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -839,9 +789,8 @@ xfs_file_buffered_aio_write( current->backing_dev_info = inode_to_bdi(inode); write_retry: - trace_xfs_file_buffered_write(ip, iov_iter_count(from), - iocb->ki_pos, 0); - ret = generic_perform_write(file, from, iocb->ki_pos); + trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); + ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -895,7 +844,9 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) + if (IS_DAX(inode)) + ret = xfs_file_dax_write(iocb, from); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite( if (IS_DAX(inode)) { ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { - ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b4d75825ae37..7191c3878b4a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -667,8 +667,11 @@ xfs_reserve_blocks( __uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta, fdblks_delta; + __int64_t lcounter, delta; + __int64_t fdblks_delta = 0; __uint64_t request; + __int64_t free; + int error = 0; /* If inval is null, report current values and return */ if (inval == (__uint64_t *)NULL) { @@ -682,24 +685,23 @@ xfs_reserve_blocks( request = *inval; /* - * With per-cpu counters, this becomes an interesting - * problem. we needto work out if we are freeing or allocation - * blocks first, then we can do the modification as necessary. + * With per-cpu counters, this becomes an interesting problem. we need + * to work out if we are freeing or allocation blocks first, then we can + * do the modification as necessary. * - * We do this under the m_sb_lock so that if we are near - * ENOSPC, we will hold out any changes while we work out - * what to do. This means that the amount of free space can - * change while we do this, so we need to retry if we end up - * trying to reserve more space than is available. + * We do this under the m_sb_lock so that if we are near ENOSPC, we will + * hold out any changes while we work out what to do. This means that + * the amount of free space can change while we do this, so we need to + * retry if we end up trying to reserve more space than is available. */ -retry: spin_lock(&mp->m_sb_lock); /* * If our previous reservation was larger than the current value, - * then move any unused blocks back to the free pool. + * then move any unused blocks back to the free pool. Modify the resblks + * counters directly since we shouldn't have any problems unreserving + * space. */ - fdblks_delta = 0; if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ @@ -707,54 +709,67 @@ retry: mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; - } else { - __int64_t free; + if (fdblks_delta) { + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } + + goto out; + } + /* + * If the request is larger than the current reservation, reserve the + * blocks before we update the reserve counters. Sample m_fdblocks and + * perform a partial reservation if the request exceeds free space. + */ + error = -ENOSPC; + do { free = percpu_counter_sum(&mp->m_fdblocks) - XFS_ALLOC_SET_ASIDE(mp); if (!free) - goto out; /* ENOSPC and fdblks_delta = 0 */ + break; delta = request - mp->m_resblks; lcounter = free - delta; - if (lcounter < 0) { + if (lcounter < 0) /* We can't satisfy the request, just get what we can */ - mp->m_resblks += free; - mp->m_resblks_avail += free; - fdblks_delta = -free; - } else { - fdblks_delta = -delta; - mp->m_resblks = request; - mp->m_resblks_avail += delta; - } - } -out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); + fdblks_delta = free; + else + fdblks_delta = delta; - if (fdblks_delta) { /* - * If we are putting blocks back here, m_resblks_avail is - * already at its max so this will put it in the free pool. - * - * If we need space, we'll either succeed in getting it - * from the free block count or we'll get an enospc. If - * we get a ENOSPC, it means things changed while we were - * calculating fdblks_delta and so we should try again to - * see if there is anything left to reserve. + * We'll either succeed in getting space from the free block + * count or we'll get an ENOSPC. If we get a ENOSPC, it means + * things changed while we were calculating fdblks_delta and so + * we should try again to see if there is anything left to + * reserve. * * Don't set the reserved flag here - we don't want to reserve * the extra reserve blocks from the reserve..... */ - int error; - error = xfs_mod_fdblocks(mp, fdblks_delta, 0); - if (error == -ENOSPC) - goto retry; + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } while (error == -ENOSPC); + + /* + * Update the reserve counters if blocks have been successfully + * allocated. + */ + if (!error && fdblks_delta) { + mp->m_resblks += fdblks_delta; + mp->m_resblks_avail += fdblks_delta; } - return 0; + +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + + spin_unlock(&mp->m_sb_lock); + return error; } int diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 99ee6eee5e0b..fb39a66914dd 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -765,7 +765,7 @@ restart: * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). */ -STATIC void +void xfs_queue_eofblocks( struct xfs_mount *mp) { diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 62f1f91c32cb..05bac99bef75 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); +void xfs_queue_eofblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ee6799e0476f..8825bcfd314c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) * lock more than one at a time, lockdep will report false positives saying we * have violated locking orders. */ -void +static void xfs_lock_inodes( xfs_inode_t **ips, int inodes, @@ -667,14 +667,6 @@ xfs_ip2xflags( return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } -uint -xfs_dic2xflags( - struct xfs_dinode *dip) -{ - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), - be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); -} - /* * Lookups up an inode from "name". If ci_name is not NULL, then a CI match * is allowed, otherwise it has to be an exact match. If a CI match is found, @@ -748,7 +740,7 @@ out_unlock: * are not linked into the directory structure - they are attached * directly to the superblock - and so have no parent. */ -int +static int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, @@ -1085,7 +1077,7 @@ xfs_dir_ialloc( * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ -int /* error */ +static int /* error */ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1104,7 +1096,7 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -int +static int xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e52d7c7aeb5b..8eb78ec4a6e2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, - xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); -uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, @@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush(struct xfs_inode *, struct xfs_buf **); -void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); @@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_inode **, int *); -int xfs_droplink(struct xfs_trans *, struct xfs_inode *); -int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ enum xfs_prealloc_flags { @@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); -int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, + bool *did_zero); loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, loff_t eof, int whence); @@ -479,14 +473,4 @@ do { \ extern struct kmem_zone *xfs_inode_zone; -/* - * Flags for read/write calls - */ -#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ -#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { XFS_IO_ISDIRECT, "DIRECT" }, \ - { XFS_IO_INVIS, "INVIS"} - #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a1b07612224c..892c2aced207 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -651,6 +651,7 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { + kmem_free(ip->i_itemp->ili_item.li_lv_shadow); kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 63a6ff2cfc68..9a7c87809d3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -595,13 +595,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf) { + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; @@ -626,7 +625,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (filp->f_mode & FMODE_NOCMTIME) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) STATIC int xfs_ioc_getbmap( - struct xfs_inode *ip, - int ioflags, + struct file *file, unsigned int cmd, void __user *arg) { @@ -1479,10 +1477,10 @@ xfs_ioc_getbmap( return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (ioflags & XFS_IO_INVIS) + if (file->f_mode & FMODE_NOCMTIME) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, (__force struct getbmap *)arg+1); if (error) return error; @@ -1575,6 +1573,11 @@ xfs_ioc_swapext( goto out_put_tmp_file; } + /* + * We need to ensure that the fds passed in point to XFS inodes + * before we cast and access them as XFS structures as we have no + * control over what the user passes us here. + */ if (f.file->f_op != &xfs_file_operations || tmp.file->f_op != &xfs_file_operations) { error = -EINVAL; @@ -1625,12 +1628,8 @@ xfs_file_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_ioctl(ip); switch (cmd) { @@ -1649,7 +1648,7 @@ xfs_file_ioctl( if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_DIOINFO: { struct dioattr da; @@ -1708,7 +1707,7 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 77c02c7900b6..8b52881bfd90 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -20,10 +20,7 @@ extern int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1a05d8ae327d..321f57721b92 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -532,12 +532,8 @@ xfs_file_compat_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_compat_ioctl(ip); switch (cmd) { @@ -589,7 +585,7 @@ xfs_file_compat_ioctl( if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 58391355a44d..620fc9120444 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -15,6 +15,7 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <linux/iomap.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" @@ -940,3 +941,173 @@ error_on_bmapi_transaction: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } + +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +{ + return !nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK; +} + +static int +xfs_file_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + int nimaps = 1, error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + ASSERT(offset <= mp->m_super->s_maxbytes); + if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + length = mp->m_super->s_maxbytes - offset; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat symmetric + * with the work writeback does. This is a completely arbitrary + * number pulled out of thin air as a best guess for initial + * testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + if (xfs_get_extsz_hint(ip)) { + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, length, &imap); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else if (nimaps) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_not_found(ip, offset, length, 0, &imap); + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + iomap->offset = offset; + iomap->length = length; + } + + return 0; +} + +static int +xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, + ssize_t written) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + start_fsb = XFS_B_TO_FSB(mp, offset + written); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* + * Trim back delalloc blocks if we didn't manage to write the whole + * range reserved. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. + */ + if (start_fsb < end_fsb) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (error && !XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; + } + } + + return 0; +} + +static int +xfs_file_iomap_end( + struct inode *inode, + loff_t offset, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, + length, written); + return 0; +} + +struct iomap_ops xfs_iomap_ops = { + .iomap_begin = xfs_file_iomap_begin, + .iomap_end = xfs_file_iomap_end, +}; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8688e663d744..e066d045e2ff 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,6 +18,8 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ +#include <linux/iomap.h> + struct xfs_inode; struct xfs_bmbt_irec; @@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *); + +extern struct iomap_ops xfs_iomap_ops; + #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c5d4eba6972e..ab820f84ed50 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,12 +38,13 @@ #include "xfs_dir2.h" #include "xfs_trans_space.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/capability.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/slab.h> /* @@ -801,20 +802,30 @@ xfs_setattr_size( return error; /* + * Wait for all direct I/O to complete. + */ + inode_dio_wait(inode); + + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a * part of the transaction. * - * Start with zeroing any data block beyond EOF that we may expose on - * file extension. + * Start with zeroing any data beyond EOF that we may expose on file + * extension, or zeroing out the rest of the block on a downward + * truncate. */ if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); - if (error) - return error; + } else { + error = iomap_truncate_page(inode, newsize, &did_zeroing, + &xfs_iomap_ops); } + if (error) + return error; + /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new @@ -823,17 +834,14 @@ xfs_setattr_size( * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ - if (newsize > ip->i_d.di_size && - (oldsize != ip->i_d.di_size || did_zeroing)) { + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* Now wait for all direct I/O to complete. */ - inode_dio_wait(inode); - /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we @@ -851,13 +859,6 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); - else - error = block_truncate_page(inode->i_mapping, newsize, - xfs_get_blocks); - if (error) - return error; truncate_setsize(inode, newsize); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); @@ -998,51 +999,6 @@ xfs_vn_update_time( return xfs_trans_commit(tp); } -#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) - -/* - * Call fiemap helper to fill in user data. - * Returns positive errors to xfs_getbmap. - */ -STATIC int -xfs_fiemap_format( - void **arg, - struct getbmapx *bmv, - int *full) -{ - int error; - struct fiemap_extent_info *fieinfo = *arg; - u32 fiemap_flags = 0; - u64 logical, physical, length; - - /* Do nothing for a hole */ - if (bmv->bmv_block == -1LL) - return 0; - - logical = BBTOB(bmv->bmv_offset); - physical = BBTOB(bmv->bmv_block); - length = BBTOB(bmv->bmv_length); - - if (bmv->bmv_oflags & BMV_OF_PREALLOC) - fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; - else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - physical = 0; /* no block yet */ - } - if (bmv->bmv_oflags & BMV_OF_LAST) - fiemap_flags |= FIEMAP_EXTENT_LAST; - - error = fiemap_fill_next_extent(fieinfo, logical, physical, - length, fiemap_flags); - if (error > 0) { - error = 0; - *full = 1; /* user array now full */ - } - - return error; -} - STATIC int xfs_vn_fiemap( struct inode *inode, @@ -1050,38 +1006,13 @@ xfs_vn_fiemap( u64 start, u64 length) { - xfs_inode_t *ip = XFS_I(inode); - struct getbmapx bm; int error; - error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); - if (error) - return error; - - /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBBT(start); - /* Special case for whole file */ - if (length == FIEMAP_MAX_OFFSET) - bm.bmv_length = -1LL; - else - bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; - - /* We add one because in getbmap world count includes the header */ - bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : - fieinfo->fi_extents_max + 1; - bm.bmv_count = min_t(__s32, bm.bmv_count, - (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) - bm.bmv_iflags |= BMV_IF_ATTRFORK; - if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) - bm.bmv_iflags |= BMV_IF_DELALLOC; - - error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); - if (error) - return error; + xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); + error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops); + xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); - return 0; + return error; } STATIC int diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index a8192dc797dc..b8d64d520e12 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) return x; } -/* ARM old ABI has some weird alignment/padding */ -#if defined(__arm__) && !defined(__ARM_EABI__) -#define __arch_pack __attribute__((packed)) -#else -#define __arch_pack -#endif - #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bde02f1fba73..3b74fa011bb1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -788,7 +788,7 @@ xfs_log_mount_cancel( * As far as I know, there weren't any dependencies on the old behaviour. */ -int +static int xfs_log_unmount_write(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1036,7 +1036,7 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -int +static int xfs_log_need_covered(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1177,7 +1177,7 @@ xlog_space_left( * The log manager needs its own routine, in order to control what * happens with the buffer after the write completes. */ -void +static void xlog_iodone(xfs_buf_t *bp) { struct xlog_in_core *iclog = bp->b_fspriv; @@ -1302,7 +1302,7 @@ xfs_log_work_queue( * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. */ -void +static void xfs_log_worker( struct work_struct *work) { @@ -1415,7 +1415,7 @@ xlog_alloc_log( */ error = -ENOMEM; bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), XBF_NO_IOACCT); if (!bp) goto out_free_log; @@ -1454,7 +1454,8 @@ xlog_alloc_log( prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), + XBF_NO_IOACCT); if (!bp) goto out_free_iclog; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 80ba0c047090..b5e71072fde5 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -163,12 +163,8 @@ int xfs_log_reserve(struct xfs_mount *mp, __uint8_t clientid, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); -int xfs_log_need_covered(struct xfs_mount *mp); - -void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); @@ -178,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); -void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5e54e7955ea6..a4ab192e1792 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -78,6 +78,157 @@ xlog_cil_init_post_recovery( log->l_cilp->xc_ctx->sequence = 1; } +static inline int +xlog_cil_iovec_space( + uint niovecs) +{ + return round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); +} + +/* + * Allocate or pin log vector buffers for CIL insertion. + * + * The CIL currently uses disposable buffers for copying a snapshot of the + * modified items into the log during a push. The biggest problem with this is + * the requirement to allocate the disposable buffer during the commit if: + * a) does not exist; or + * b) it is too small + * + * If we do this allocation within xlog_cil_insert_format_items(), it is done + * under the xc_ctx_lock, which means that a CIL push cannot occur during + * the memory allocation. This means that we have a potential deadlock situation + * under low memory conditions when we have lots of dirty metadata pinned in + * the CIL and we need a CIL commit to occur to free memory. + * + * To avoid this, we need to move the memory allocation outside the + * xc_ctx_lock, but because the log vector buffers are disposable, that opens + * up a TOCTOU race condition w.r.t. the CIL committing and removing the log + * vector buffers between the check and the formatting of the item into the + * log vector buffer within the xc_ctx_lock. + * + * Because the log vector buffer needs to be unchanged during the CIL push + * process, we cannot share the buffer between the transaction commit (which + * modifies the buffer) and the CIL push context that is writing the changes + * into the log. This means skipping preallocation of buffer space is + * unreliable, but we most definitely do not want to be allocating and freeing + * buffers unnecessarily during commits when overwrites can be done safely. + * + * The simplest solution to this problem is to allocate a shadow buffer when a + * log item is committed for the second time, and then to only use this buffer + * if necessary. The buffer can remain attached to the log item until such time + * it is needed, and this is the buffer that is reallocated to match the size of + * the incoming modification. Then during the formatting of the item we can swap + * the active buffer with the new one if we can't reuse the existing buffer. We + * don't free the old buffer as it may be reused on the next modification if + * it's size is right, otherwise we'll free and reallocate it at that point. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and attaches the vector to the log item in preparation + * for the formatting step which occurs under the xc_ctx_lock. + * + * While this means the memory footprint goes up, it avoids the repeated + * alloc/free pattern that repeated modifications of an item would otherwise + * cause, and hence minimises the CPU overhead of such behaviour. + */ +static void +xlog_cil_alloc_shadow_bufs( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + xlog_cil_iovec_space(niovecs); + + /* + * if we have no shadow buffer, or it is too small, we need to + * reallocate it. + */ + if (!lip->li_lv_shadow || + buf_size > lip->li_lv_shadow->lv_size) { + + /* + * We free and allocate here as a realloc would copy + * unecessary data. We don't use kmem_zalloc() for the + * same reason - we don't need to zero the data area in + * the buffer, only the log vector header and the iovec + * storage. + */ + kmem_free(lip->li_lv_shadow); + + lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS); + memset(lv, 0, xlog_cil_iovec_space(niovecs)); + + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + lip->li_lv_shadow = lv; + } else { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv_shadow; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_next = NULL; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs); + } + +} + /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -100,16 +251,19 @@ xfs_cil_prepare_item( /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the - * old_lv, then remove the space it accounts for and free it. + * old_lv, then remove the space it accounts for and make it the shadow + * buffer for later freeing. In both cases we are now switching to the + * shadow buffer, so update the the pointer to it appropriately. */ - if (!old_lv) + if (!old_lv) { lv->lv_item->li_ops->iop_pin(lv->lv_item); - else if (old_lv != lv) { + lv->lv_item->li_lv_shadow = NULL; + } else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; - kmem_free(old_lv); + lv->lv_item->li_lv_shadow = old_lv; } /* attach new log vector to log item */ @@ -133,11 +287,13 @@ xfs_cil_prepare_item( * write it out asynchronously without needing to relock the object that was * modified at the time it gets written into the iclog. * - * This function builds a vector for the changes in each log item in the - * transaction. It then works out the length of the buffer needed for each log - * item, allocates them and formats the vector for the item into the buffer. - * The buffer is then attached to the log item are then inserted into the - * Committed Item List for tracking until the next checkpoint is written out. + * This function takes the prepared log vectors attached to each log item, and + * formats the changes into the log vector buffer. The buffer it uses is + * dependent on the current state of the vector in the CIL - the shadow lv is + * guaranteed to be large enough for the current modification, but we will only + * use that if we can't reuse the existing lv. If we can't reuse the existing + * lv, then simple swap it out for the shadow lv. We don't free it - that is + * done lazily either by th enext modification or the freeing of the log item. * * We don't set up region headers during this process; we simply copy the * regions into the flat buffer. We can do this because we still have to do a @@ -170,59 +326,29 @@ xlog_cil_insert_format_items( list_for_each_entry(lidp, &tp->t_items, lid_trans) { struct xfs_log_item *lip = lidp->lid_item; struct xfs_log_vec *lv; - struct xfs_log_vec *old_lv; - int niovecs = 0; - int nbytes = 0; - int buf_size; + struct xfs_log_vec *old_lv = NULL; + struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - /* get number of vecs and size of data to be stored */ - lip->li_ops->iop_size(lip, &niovecs, &nbytes); - - /* Skip items that do not have any vectors for writing */ - if (!niovecs) - continue; - /* - * Ordered items need to be tracked but we do not wish to write - * them. We need a logvec to track the object, but we do not - * need an iovec or buffer to be allocated for copying data. + * The formatting size information is already attached to + * the shadow lv on the log item. */ - if (niovecs == XFS_LOG_VEC_ORDERED) { + shadow = lip->li_lv_shadow; + if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) ordered = true; - niovecs = 0; - nbytes = 0; - } - /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. - */ - nbytes += niovecs * sizeof(uint64_t); - nbytes = round_up(nbytes, sizeof(uint64_t)); - - /* grab the old item if it exists for reservation accounting */ - old_lv = lip->li_lv; - - /* - * The data buffer needs to start 64-bit aligned, so round up - * that space to ensure we can align it appropriately and not - * overrun the buffer. - */ - buf_size = nbytes + - round_up((sizeof(struct xfs_log_vec) + - niovecs * sizeof(struct xfs_log_iovec)), - sizeof(uint64_t)); + /* Skip items that do not have any vectors for writing */ + if (!shadow->lv_niovecs && !ordered) + continue; /* compare to existing item size */ - if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + old_lv = lip->li_lv; + if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; lv->lv_next = NULL; @@ -236,32 +362,29 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = shadow->lv_niovecs; + + /* reset the lv buffer information for new formatting */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + + xlog_cil_iovec_space(lv->lv_niovecs); } else { - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + /* switch to shadow buffer! */ + lv = shadow; lv->lv_item = lip; - lv->lv_size = buf_size; if (ordered) { /* track as an ordered logvec */ ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; goto insert; } - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - - /* The allocated data region lies beyond the iovec region */ - lv->lv_buf_len = 0; - lv->lv_bytes = 0; - lv->lv_buf = (char *)lv + buf_size - nbytes; ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); - lip->li_ops->iop_format(lip, lv); insert: - ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); } } @@ -783,6 +906,13 @@ xfs_log_commit_cil( struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + /* + * Do all necessary memory allocation before we lock the CIL. + * This ensures the allocation does not deadlock with a CIL + * push in memory reclaim (e.g. from kswapd). + */ + xlog_cil_alloc_shadow_bufs(log, tp); + /* lock out background commit */ down_read(&cil->xc_ctx_lock); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e39b02351b4a..970c19ba2f56 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -272,13 +272,15 @@ xfs_readsb( buf_ops = NULL; /* - * Allocate a (locked) buffer to hold the superblock. - * This will be kept around at all times to optimize - * access to the superblock. + * Allocate a (locked) buffer to hold the superblock. This will be kept + * around at all times to optimize access to the superblock. Therefore, + * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count + * elevated. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, &bp, buf_ops); + BTOBB(sector_size), XBF_NO_IOACCT, &bp, + buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 184c44effdd5..0cc8d8f74356 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -22,6 +22,11 @@ BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ #structname ") is wrong, expected " #size) +#define XFS_CHECK_OFFSET(structname, member, off) \ + BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + "XFS: offsetof(" #structname ", " #member ") is wrong, " \ + "expected " #off) + static inline void __init xfs_check_ondisk_structs(void) { @@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); @@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); */ + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index d5b756669fb5..0f14b2e4bf6c 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2014 Christoph Hellwig. */ +#include <linux/iomap.h> #include "xfs.h" #include "xfs_format.h" #include "xfs_log_format.h" @@ -79,32 +80,6 @@ xfs_fs_get_uuid( return 0; } -static void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = - XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); -} - /* * Get a layout for the pNFS client. */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 76c0a4a9bb17..355dd9e1cb64 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -98,8 +98,6 @@ xfs_growfs_rt( /* * From xfs_rtbitmap.c */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val, xfs_rtblock_t *new, int *stat); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 11ea5d51db56..0303f1005f88 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -546,7 +546,7 @@ xfs_showargs( return 0; } -__uint64_t +static __uint64_t xfs_max_file_offset( unsigned int blockshift) { @@ -1294,6 +1294,7 @@ xfs_fs_remount( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_queue_eofblocks(mp); } /* rw -> ro */ @@ -1306,6 +1307,13 @@ xfs_fs_remount( * return it to the same size. */ xfs_save_resvblks(mp); + + /* + * Cancel background eofb scanning so it cannot race with the + * final log force+buftarg wait and deadlock the remount. + */ + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1565,10 +1573,6 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) - xfs_alert(mp, - "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1692,8 +1696,9 @@ xfs_init_zones(void) if (!xfs_log_ticket_zone) goto out_free_ioend_bioset; - xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); + xfs_bmap_free_item_zone = kmem_zone_init( + sizeof(struct xfs_bmap_free_item), + "xfs_bmap_free_item"); if (!xfs_bmap_free_item_zone) goto out_destroy_log_ticket_zone; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2dfb1ce4585f..529bce9fc37e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,8 +61,6 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; -extern __uint64_t xfs_max_file_offset(unsigned int); - extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 4c2c55086208..79cfd3fc5324 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -634,6 +634,9 @@ xfs_error_get_cfg( { struct xfs_error_cfg *cfg; + if (error < 0) + error = -error; + switch (error) { case EIO: cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ea94ee0fe5ea..145169093fe0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait); DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_trylock_fail); DEFINE_BUF_EVENT(xfs_buf_trylock); DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); @@ -1134,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, ) DECLARE_EVENT_CLASS(xfs_file_class, - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), - TP_ARGS(ip, count, offset, flags), + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), + TP_ARGS(ip, count, offset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1150,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " - "offset 0x%llx count 0x%zx ioflags %s", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, - __entry->count, - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) + __entry->count) ) #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags)) -DEFINE_RW_EVENT(xfs_file_read); + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \ + TP_ARGS(ip, count, offset)) +DEFINE_RW_EVENT(xfs_file_buffered_read); +DEFINE_RW_EVENT(xfs_file_direct_read); +DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class, @@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); +DEFINE_IOMAP_EVENT(xfs_iomap_found); +DEFINE_IOMAP_EVENT(xfs_iomap_not_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9a462e892e4f..9b2b9fa89331 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -52,6 +52,7 @@ typedef struct xfs_log_item { /* delayed logging */ struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ + struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; |