summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/rxrpc.c34
-rw-r--r--fs/buffer.c76
-rw-r--r--fs/dlm/config.c7
-rw-r--r--fs/dlm/config.h1
-rw-r--r--fs/dlm/dlm_internal.h10
-rw-r--r--fs/dlm/lowcomms.c3
-rw-r--r--fs/f2fs/acl.c9
-rw-r--r--fs/f2fs/acl.h2
-rw-r--r--fs/f2fs/checkpoint.c80
-rw-r--r--fs/f2fs/data.c311
-rw-r--r--fs/f2fs/debug.c5
-rw-r--r--fs/f2fs/dir.c123
-rw-r--r--fs/f2fs/extent_cache.c50
-rw-r--r--fs/f2fs/f2fs.h286
-rw-r--r--fs/f2fs/file.c499
-rw-r--r--fs/f2fs/gc.c52
-rw-r--r--fs/f2fs/inline.c92
-rw-r--r--fs/f2fs/inode.c53
-rw-r--r--fs/f2fs/namei.c147
-rw-r--r--fs/f2fs/node.c144
-rw-r--r--fs/f2fs/node.h14
-rw-r--r--fs/f2fs/recovery.c23
-rw-r--r--fs/f2fs/segment.c59
-rw-r--r--fs/f2fs/segment.h22
-rw-r--r--fs/f2fs/shrinker.c5
-rw-r--r--fs/f2fs/super.c134
-rw-r--r--fs/f2fs/xattr.c20
-rw-r--r--fs/internal.h3
-rw-r--r--fs/iomap.c497
-rw-r--r--fs/nfsd/blocklayout.c1
-rw-r--r--fs/nfsd/blocklayoutxdr.c1
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c101
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h9
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c51
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h18
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_btree.c27
-rw-r--r--fs/xfs/libxfs/xfs_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c59
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c31
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h43
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c38
-rw-r--r--fs/xfs/libxfs/xfs_format.h66
-rw-r--r--fs/xfs/libxfs/xfs_fs.h8
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c28
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c2
-rw-r--r--fs/xfs/xfs_aops.c332
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_util.c381
-rw-r--r--fs/xfs/xfs_bmap_util.h3
-rw-r--r--fs/xfs/xfs_buf.c236
-rw-r--r--fs/xfs/xfs_buf.h7
-rw-r--r--fs/xfs/xfs_buf_item.c31
-rw-r--r--fs/xfs/xfs_dquot.c1
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c5
-rw-r--r--fs/xfs/xfs_error.h2
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_file.c425
-rw-r--r--fs/xfs/xfs_fsops.c105
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c16
-rw-r--r--fs/xfs/xfs_inode.h20
-rw-r--r--fs/xfs/xfs_inode_item.c1
-rw-r--r--fs/xfs/xfs_ioctl.c27
-rw-r--r--fs/xfs/xfs_ioctl.h3
-rw-r--r--fs/xfs/xfs_ioctl32.c6
-rw-r--r--fs/xfs/xfs_iomap.c171
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c113
-rw-r--r--fs/xfs/xfs_linux.h7
-rw-r--r--fs/xfs/xfs_log.c13
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_cil.c258
-rw-r--r--fs/xfs/xfs_mount.c10
-rw-r--r--fs/xfs/xfs_ondisk.h31
-rw-r--r--fs/xfs/xfs_pnfs.c27
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_super.c19
-rw-r--r--fs/xfs/xfs_super.h2
-rw-r--r--fs/xfs/xfs_sysfs.c3
-rw-r--r--fs/xfs/xfs_trace.h25
-rw-r--r--fs/xfs/xfs_trans.h1
90 files changed, 3373 insertions, 2192 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index b8fcb416be72..4524916fa200 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
if BLOCK
+config FS_IOMAP
+ bool
+
source "fs/ext2/Kconfig"
source "fs/ext4/Kconfig"
source "fs/jbd2/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13b62d3..ed2b63257ba9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o
obj-$(CONFIG_SYSCTL) += drop_caches.o
obj-$(CONFIG_FHANDLE) += fhandle.o
+obj-$(CONFIG_FS_IOMAP) += iomap.o
obj-y += quota/
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 63cd9f939f19..4832de84d52c 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -85,18 +85,14 @@ int afs_open_socket(void)
skb_queue_head_init(&afs_incoming_calls);
+ ret = -ENOMEM;
afs_async_calls = create_singlethread_workqueue("kafsd");
- if (!afs_async_calls) {
- _leave(" = -ENOMEM [wq]");
- return -ENOMEM;
- }
+ if (!afs_async_calls)
+ goto error_0;
ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
- if (ret < 0) {
- destroy_workqueue(afs_async_calls);
- _leave(" = %d [socket]", ret);
- return ret;
- }
+ if (ret < 0)
+ goto error_1;
socket->sk->sk_allocation = GFP_NOFS;
@@ -111,18 +107,26 @@ int afs_open_socket(void)
sizeof(srx.transport.sin.sin_addr));
ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
- if (ret < 0) {
- sock_release(socket);
- destroy_workqueue(afs_async_calls);
- _leave(" = %d [bind]", ret);
- return ret;
- }
+ if (ret < 0)
+ goto error_2;
+
+ ret = kernel_listen(socket, INT_MAX);
+ if (ret < 0)
+ goto error_2;
rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
afs_socket = socket;
_leave(" = 0");
return 0;
+
+error_2:
+ sock_release(socket);
+error_1:
+ destroy_workqueue(afs_async_calls);
+error_0:
+ _leave(" = %d", ret);
+ return ret;
}
/*
diff --git a/fs/buffer.c b/fs/buffer.c
index b9fa1be75e69..9c8eb9b6db6a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
@@ -1892,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
}
EXPORT_SYMBOL(page_zero_new_buffers);
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block)
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+ struct iomap *iomap)
+{
+ loff_t offset = block << inode->i_blkbits;
+
+ bh->b_bdev = iomap->bdev;
+
+ /*
+ * Block points to offset in file we need to map, iomap contains
+ * the offset at which the map starts. If the map ends before the
+ * current block, then do not map the buffer and let the caller
+ * handle it.
+ */
+ BUG_ON(offset >= iomap->offset + iomap->length);
+
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ /*
+ * If the buffer is not up to date or beyond the current EOF,
+ * we need to mark it as new to ensure sub-block zeroing is
+ * executed if necessary.
+ */
+ if (!buffer_uptodate(bh) ||
+ (offset >= i_size_read(inode)))
+ set_buffer_new(bh);
+ break;
+ case IOMAP_DELALLOC:
+ if (!buffer_uptodate(bh) ||
+ (offset >= i_size_read(inode)))
+ set_buffer_new(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_mapped(bh);
+ set_buffer_delay(bh);
+ break;
+ case IOMAP_UNWRITTEN:
+ /*
+ * For unwritten regions, we always need to ensure that
+ * sub-block writes cause the regions in the block we are not
+ * writing to are zeroed. Set the buffer as new to ensure this.
+ */
+ set_buffer_new(bh);
+ set_buffer_unwritten(bh);
+ /* FALLTHRU */
+ case IOMAP_MAPPED:
+ if (offset >= i_size_read(inode))
+ set_buffer_new(bh);
+ bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+ ((offset - iomap->offset) >> inode->i_blkbits);
+ set_buffer_mapped(bh);
+ break;
+ }
+}
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, struct iomap *iomap)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
@@ -1929,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
- if (err)
- break;
+ if (get_block) {
+ err = get_block(inode, block, bh, 1);
+ if (err)
+ break;
+ } else {
+ iomap_to_bh(inode, block, bh, iomap);
+ }
+
if (buffer_new(bh)) {
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
@@ -1972,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
page_zero_new_buffers(page, from, to);
return err;
}
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block)
+{
+ return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
EXPORT_SYMBOL(__block_write_begin);
static int __block_commit_write(struct inode *inode, struct page *page,
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 1669f6291c95..df955d2209ce 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -73,6 +73,7 @@ struct dlm_cluster {
unsigned int cl_toss_secs;
unsigned int cl_scan_secs;
unsigned int cl_log_debug;
+ unsigned int cl_log_info;
unsigned int cl_protocol;
unsigned int cl_timewarn_cs;
unsigned int cl_waitwarn_us;
@@ -95,6 +96,7 @@ enum {
CLUSTER_ATTR_TOSS_SECS,
CLUSTER_ATTR_SCAN_SECS,
CLUSTER_ATTR_LOG_DEBUG,
+ CLUSTER_ATTR_LOG_INFO,
CLUSTER_ATTR_PROTOCOL,
CLUSTER_ATTR_TIMEWARN_CS,
CLUSTER_ATTR_WAITWARN_US,
@@ -166,6 +168,7 @@ CLUSTER_ATTR(recover_timer, 1);
CLUSTER_ATTR(toss_secs, 1);
CLUSTER_ATTR(scan_secs, 1);
CLUSTER_ATTR(log_debug, 0);
+CLUSTER_ATTR(log_info, 0);
CLUSTER_ATTR(protocol, 0);
CLUSTER_ATTR(timewarn_cs, 1);
CLUSTER_ATTR(waitwarn_us, 0);
@@ -180,6 +183,7 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs,
[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs,
[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug,
+ [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info,
[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
@@ -365,6 +369,7 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_toss_secs = dlm_config.ci_toss_secs;
cl->cl_scan_secs = dlm_config.ci_scan_secs;
cl->cl_log_debug = dlm_config.ci_log_debug;
+ cl->cl_log_info = dlm_config.ci_log_info;
cl->cl_protocol = dlm_config.ci_protocol;
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
@@ -850,6 +855,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_TOSS_SECS 10
#define DEFAULT_SCAN_SECS 5
#define DEFAULT_LOG_DEBUG 0
+#define DEFAULT_LOG_INFO 1
#define DEFAULT_PROTOCOL 0
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
#define DEFAULT_WAITWARN_US 0
@@ -865,6 +871,7 @@ struct dlm_config_info dlm_config = {
.ci_toss_secs = DEFAULT_TOSS_SECS,
.ci_scan_secs = DEFAULT_SCAN_SECS,
.ci_log_debug = DEFAULT_LOG_DEBUG,
+ .ci_log_info = DEFAULT_LOG_INFO,
.ci_protocol = DEFAULT_PROTOCOL,
.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
.ci_waitwarn_us = DEFAULT_WAITWARN_US,
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index f30697bc2780..6041eec886ab 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -31,6 +31,7 @@ struct dlm_config_info {
int ci_toss_secs;
int ci_scan_secs;
int ci_log_debug;
+ int ci_log_info;
int ci_protocol;
int ci_timewarn_cs;
int ci_waitwarn_us;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5eff6ea3e27f..216b61604ef9 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -65,8 +65,16 @@ struct dlm_mhandle;
printk(KERN_ERR "dlm: "fmt"\n" , ##args)
#define log_error(ls, fmt, args...) \
printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+
#define log_rinfo(ls, fmt, args...) \
- printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args);
+do { \
+ if (dlm_config.ci_log_info) \
+ printk(KERN_INFO "dlm: %s: " fmt "\n", \
+ (ls)->ls_name, ##args); \
+ else if (dlm_config.ci_log_debug) \
+ printk(KERN_DEBUG "dlm: %s: " fmt "\n", \
+ (ls)->ls_name , ##args); \
+} while (0)
#define log_debug(ls, fmt, args...) \
do { \
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 1ab012a27d9f..963016c8f3d1 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1279,10 +1279,9 @@ static void init_local(void)
if (dlm_our_addr(&sas, i))
break;
- addr = kmalloc(sizeof(*addr), GFP_NOFS);
+ addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS);
if (!addr)
break;
- memcpy(addr, &sas, sizeof(*addr));
dlm_local_addr[dlm_local_count++] = addr;
}
}
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index a31c7e859af6..4dcc9e28dc5c 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -201,7 +201,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
static int __f2fs_set_acl(struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
int name_index;
void *value = NULL;
size_t size = 0;
@@ -214,7 +213,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
return error;
- set_acl_inode(fi, inode->i_mode);
+ set_acl_inode(inode, inode->i_mode);
if (error == 0)
acl = NULL;
}
@@ -233,7 +232,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
if (acl) {
value = f2fs_acl_to_disk(acl, &size);
if (IS_ERR(value)) {
- clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(inode, FI_ACL_MODE);
return (int)PTR_ERR(value);
}
}
@@ -244,7 +243,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
if (!error)
set_cached_acl(inode, type, acl);
- clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(inode, FI_ACL_MODE);
return error;
}
@@ -385,6 +384,8 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
if (error)
return error;
+ f2fs_mark_inode_dirty_sync(inode);
+
if (default_acl) {
error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 997ca8edb6cb..b2334d11dae8 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -37,7 +37,7 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
extern struct posix_acl *f2fs_get_acl(struct inode *, int);
-extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int f2fs_set_acl(struct inode *, struct posix_acl *, int);
extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
struct page *);
#else
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 124b4a3017b5..f94d01e7d001 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -48,7 +48,8 @@ repeat:
goto repeat;
}
f2fs_wait_on_page_writeback(page, META, true);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
return page;
}
@@ -266,6 +267,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ struct blk_plug plug;
long diff, written;
/* collect a number of dirty meta pages and write together */
@@ -278,7 +280,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
/* if mounting is failed, skip writing node pages */
mutex_lock(&sbi->cp_mutex);
diff = nr_pages_to_write(sbi, META, wbc);
+ blk_start_plug(&plug);
written = sync_meta_pages(sbi, META, wbc->nr_to_write);
+ blk_finish_plug(&plug);
mutex_unlock(&sbi->cp_mutex);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -366,9 +370,10 @@ static int f2fs_set_meta_page_dirty(struct page *page)
{
trace_f2fs_set_page_dirty(page, META);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
SetPagePrivate(page);
f2fs_trace_pid(page);
@@ -510,10 +515,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
spin_unlock(&im->ino_lock);
}
-void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+void add_orphan_inode(struct inode *inode)
{
/* add new orphan ino entry into list */
- __add_ino_entry(sbi, ino, ORPHAN_INO);
+ __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO);
+ update_inode_page(inode);
}
void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -761,28 +767,25 @@ fail_no_cp:
static void __add_dirty_inode(struct inode *inode, enum inode_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (is_inode_flag_set(fi, flag))
+ if (is_inode_flag_set(inode, flag))
return;
- set_inode_flag(fi, flag);
- list_add_tail(&fi->dirty_list, &sbi->inode_list[type]);
+ set_inode_flag(inode, flag);
+ list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
stat_inc_dirty_inode(sbi, type);
}
static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (get_dirty_pages(inode) ||
- !is_inode_flag_set(F2FS_I(inode), flag))
+ if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag))
return;
- list_del_init(&fi->dirty_list);
- clear_inode_flag(fi, flag);
+ list_del_init(&F2FS_I(inode)->dirty_list);
+ clear_inode_flag(inode, flag);
stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
@@ -795,13 +798,12 @@ void update_dirty_page(struct inode *inode, struct page *page)
!S_ISLNK(inode->i_mode))
return;
- if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) {
- spin_lock(&sbi->inode_lock[type]);
+ spin_lock(&sbi->inode_lock[type]);
+ if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH))
__add_dirty_inode(inode, type);
- spin_unlock(&sbi->inode_lock[type]);
- }
-
inode_inc_dirty_pages(inode);
+ spin_unlock(&sbi->inode_lock[type]);
+
SetPagePrivate(page);
f2fs_trace_pid(page);
}
@@ -864,6 +866,34 @@ retry:
goto retry;
}
+int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
+{
+ struct list_head *head = &sbi->inode_list[DIRTY_META];
+ struct inode *inode;
+ struct f2fs_inode_info *fi;
+ s64 total = get_pages(sbi, F2FS_DIRTY_IMETA);
+
+ while (total--) {
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (list_empty(head)) {
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return 0;
+ }
+ fi = list_entry(head->next, struct f2fs_inode_info,
+ gdirty_list);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ if (inode) {
+ update_inode_page(inode);
+ iput(inode);
+ }
+ };
+ return 0;
+}
+
/*
* Freeze all the FS-operations for checkpoint.
*/
@@ -890,6 +920,14 @@ retry_flush_dents:
goto retry_flush_dents;
}
+ if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
+ f2fs_unlock_all(sbi);
+ err = f2fs_sync_inode_meta(sbi);
+ if (err)
+ goto out;
+ goto retry_flush_dents;
+ }
+
/*
* POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush.
@@ -914,6 +952,8 @@ out:
static void unblock_operations(struct f2fs_sb_info *sbi)
{
up_write(&sbi->node_write);
+
+ build_free_nids(sbi);
f2fs_unlock_all(sbi);
}
@@ -954,7 +994,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* This avoids to conduct wrong roll-forward operations and uses
* metapages, so should be called prior to sync_meta_pages below.
*/
- if (discard_next_dnode(sbi, discard_blk))
+ if (!test_opt(sbi, LFS) && discard_next_dnode(sbi, discard_blk))
invalidate = true;
/* Flush all the NAT/SIT pages */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ded224518978..e2624275d828 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -19,6 +19,8 @@
#include <linux/bio.h>
#include <linux/prefetch.h>
#include <linux/uio.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include "f2fs.h"
@@ -45,7 +47,8 @@ static void f2fs_read_end_io(struct bio *bio)
struct page *page = bvec->bv_page;
if (!bio->bi_error) {
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
} else {
ClearPageUptodate(page);
SetPageError(page);
@@ -97,10 +100,15 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
return bio;
}
-static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio)
+static inline void __submit_bio(struct f2fs_sb_info *sbi,
+ struct bio *bio, enum page_type type)
{
- if (!is_read_io(bio_op(bio)))
+ if (!is_read_io(bio_op(bio))) {
atomic_inc(&sbi->nr_wb_bios);
+ if (f2fs_sb_mounted_hmsmr(sbi->sb) &&
+ current->plug && (type == DATA || type == NODE))
+ blk_finish_plug(current->plug);
+ }
submit_bio(bio);
}
@@ -118,7 +126,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
- __submit_bio(io->sbi, io->bio);
+ __submit_bio(io->sbi, io->bio, fio->type);
io->bio = NULL;
}
@@ -240,7 +248,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
bio->bi_rw = fio->op_flags;
bio_set_op_attrs(bio, fio->op, fio->op_flags);
- __submit_bio(fio->sbi, bio);
+ __submit_bio(fio->sbi, bio, fio->type);
return 0;
}
@@ -326,7 +334,7 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
if (!count)
return 0;
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
return -ENOSPC;
@@ -348,9 +356,6 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
if (set_page_dirty(dn->node_page))
dn->node_changed = true;
-
- mark_inode_dirty(dn->inode);
- sync_inode_page(dn);
return 0;
}
@@ -446,7 +451,8 @@ got_it:
*/
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
unlock_page(page);
return page;
}
@@ -505,14 +511,14 @@ repeat:
/* wait for read completion */
lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
return page;
}
@@ -557,7 +563,8 @@ struct page *get_new_data_page(struct inode *inode,
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
} else {
f2fs_put_page(page, 1);
@@ -569,11 +576,8 @@ struct page *get_new_data_page(struct inode *inode,
}
got_it:
if (new_i_size && i_size_read(inode) <
- ((loff_t)(index + 1) << PAGE_SHIFT)) {
- i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT));
- /* Only the directory inode sets new_i_size */
- set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
- }
+ ((loff_t)(index + 1) << PAGE_SHIFT))
+ f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT));
return page;
}
@@ -586,7 +590,7 @@ static int __allocate_data_block(struct dnode_of_data *dn)
pgoff_t fofs;
blkcnt_t count = 1;
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
@@ -611,7 +615,7 @@ alloc:
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
dn->ofs_in_node;
if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
- i_size_write(dn->inode,
+ f2fs_i_size_write(dn->inode,
((loff_t)(fofs + 1) << PAGE_SHIFT));
return 0;
}
@@ -660,7 +664,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
+ int mode = create ? ALLOC_NODE : LOOKUP_NODE;
pgoff_t pgofs, end_offset, end;
int err = 0, ofs = 1;
unsigned int ofs_in_node, last_ofs_in_node;
@@ -723,8 +727,7 @@ next_block:
} else {
err = __allocate_data_block(&dn);
if (!err) {
- set_inode_flag(F2FS_I(inode),
- FI_APPEND_WRITE);
+ set_inode_flag(inode, FI_APPEND_WRITE);
allocated = true;
}
}
@@ -795,8 +798,6 @@ skip:
else if (dn.ofs_in_node < end_offset)
goto next_block;
- if (allocated)
- sync_inode_page(&dn);
f2fs_put_dnode(&dn);
if (create) {
@@ -807,8 +808,6 @@ skip:
goto next_dnode;
sync_out:
- if (allocated)
- sync_inode_page(&dn);
f2fs_put_dnode(&dn);
unlock_out:
if (create) {
@@ -968,6 +967,37 @@ out:
return ret;
}
+struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
+ unsigned nr_pages)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct fscrypt_ctx *ctx = NULL;
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct bio *bio;
+
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ return ERR_CAST(ctx);
+
+ /* wait the page to be moved by cleaning */
+ f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
+ }
+
+ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES));
+ if (!bio) {
+ if (ctx)
+ fscrypt_release_ctx(ctx);
+ return ERR_PTR(-ENOMEM);
+ }
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
+ bio->bi_end_io = f2fs_read_end_io;
+ bio->bi_private = ctx;
+
+ return bio;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -986,7 +1016,6 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
sector_t last_block;
sector_t last_block_in_file;
sector_t block_nr;
- struct block_device *bdev = inode->i_sb->s_bdev;
struct f2fs_map_blocks map;
map.m_pblk = 0;
@@ -1047,7 +1076,8 @@ got_it:
}
} else {
zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
unlock_page(page);
goto next_page;
}
@@ -1058,35 +1088,15 @@ got_it:
*/
if (bio && (last_block_in_bio != block_nr - 1)) {
submit_and_realloc:
- __submit_bio(F2FS_I_SB(inode), bio);
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
if (bio == NULL) {
- struct fscrypt_ctx *ctx = NULL;
-
- if (f2fs_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode)) {
-
- ctx = fscrypt_get_ctx(inode, GFP_NOFS);
- if (IS_ERR(ctx))
- goto set_error_page;
-
- /* wait the page to be moved by cleaning */
- f2fs_wait_on_encrypted_page_writeback(
- F2FS_I_SB(inode), block_nr);
- }
-
- bio = bio_alloc(GFP_KERNEL,
- min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio) {
- if (ctx)
- fscrypt_release_ctx(ctx);
+ bio = f2fs_grab_bio(inode, block_nr, nr_pages);
+ if (IS_ERR(bio)) {
+ bio = NULL;
goto set_error_page;
}
- bio->bi_bdev = bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr);
- bio->bi_end_io = f2fs_read_end_io;
- bio->bi_private = ctx;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
}
@@ -1102,7 +1112,7 @@ set_error_page:
goto next_page;
confused:
if (bio) {
- __submit_bio(F2FS_I_SB(inode), bio);
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
unlock_page(page);
@@ -1112,7 +1122,7 @@ next_page:
}
BUG_ON(pages && !list_empty(pages));
if (bio)
- __submit_bio(F2FS_I_SB(inode), bio);
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
return 0;
}
@@ -1201,14 +1211,14 @@ retry_encrypt:
!IS_ATOMIC_WRITTEN_PAGE(page) &&
need_inplace_update(inode))) {
rewrite_data_page(fio);
- set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
+ set_inode_flag(inode, FI_UPDATE_WRITE);
trace_f2fs_do_write_data_page(page, IPU);
} else {
write_data_page(&dn, fio);
trace_f2fs_do_write_data_page(page, OPU);
- set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+ set_inode_flag(inode, FI_APPEND_WRITE);
if (page->index == 0)
- set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
}
out_writepage:
f2fs_put_dnode(&dn);
@@ -1223,6 +1233,7 @@ static int f2fs_write_data_page(struct page *page,
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_SHIFT;
+ loff_t psize = (page->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
@@ -1260,20 +1271,18 @@ write:
available_free_memory(sbi, BASE_CHECK))))
goto redirty_out;
+ /* we should bypass data pages to proceed the kworkder jobs */
+ if (unlikely(f2fs_cp_error(sbi))) {
+ mapping_set_error(page->mapping, -EIO);
+ goto out;
+ }
+
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
- if (unlikely(f2fs_cp_error(sbi)))
- goto redirty_out;
err = do_write_data_page(&fio);
goto done;
}
- /* we should bypass data pages to proceed the kworkder jobs */
- if (unlikely(f2fs_cp_error(sbi))) {
- SetPageError(page);
- goto out;
- }
-
if (!wbc->for_reclaim)
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0))
@@ -1285,6 +1294,8 @@ write:
err = f2fs_write_inline_data(inode, page);
if (err == -EAGAIN)
err = do_write_data_page(&fio);
+ if (F2FS_I(inode)->last_disk_size < psize)
+ F2FS_I(inode)->last_disk_size = psize;
f2fs_unlock_op(sbi);
done:
if (err && err != -ENOENT)
@@ -1311,16 +1322,8 @@ out:
redirty_out:
redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
-}
-
-static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
-{
- struct address_space *mapping = data;
- int ret = mapping->a_ops->writepage(page, wbc);
- mapping_set_error(mapping, ret);
- return ret;
+ unlock_page(page);
+ return err;
}
/*
@@ -1329,8 +1332,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
* warm/hot data page.
*/
static int f2fs_write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data)
+ struct writeback_control *wbc)
{
int ret = 0;
int done = 0;
@@ -1343,10 +1345,9 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int cycled;
int range_whole = 0;
int tag;
- int step = 0;
pagevec_init(&pvec, 0);
-next:
+
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
@@ -1401,9 +1402,6 @@ continue_unlock:
goto continue_unlock;
}
- if (step == is_cold_data(page))
- goto continue_unlock;
-
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE)
f2fs_wait_on_page_writeback(page,
@@ -1416,16 +1414,11 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- ret = (*writepage)(page, wbc, data);
+ ret = mapping->a_ops->writepage(page, wbc);
if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(page);
- ret = 0;
- } else {
- done_index = page->index + 1;
- done = 1;
- break;
- }
+ done_index = page->index + 1;
+ done = 1;
+ break;
}
if (--wbc->nr_to_write <= 0 &&
@@ -1438,11 +1431,6 @@ continue_unlock:
cond_resched();
}
- if (step < 1) {
- step++;
- goto next;
- }
-
if (!cycled && !done) {
cycled = 1;
index = 0;
@@ -1460,9 +1448,8 @@ static int f2fs_write_data_pages(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- bool locked = false;
+ struct blk_plug plug;
int ret;
- long diff;
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
@@ -1478,7 +1465,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
goto skip_write;
/* skip writing during file defragment */
- if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
+ if (is_inode_flag_set(inode, FI_DO_DEFRAG))
goto skip_write;
/* during POR, we don't need to trigger writepage at all. */
@@ -1487,20 +1474,16 @@ static int f2fs_write_data_pages(struct address_space *mapping,
trace_f2fs_writepages(mapping->host, wbc, DATA);
- diff = nr_pages_to_write(sbi, DATA, wbc);
-
- if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) {
- mutex_lock(&sbi->writepages);
- locked = true;
- }
- ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
- f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
- if (locked)
- mutex_unlock(&sbi->writepages);
+ blk_start_plug(&plug);
+ ret = f2fs_write_cache_pages(mapping, wbc);
+ blk_finish_plug(&plug);
+ /*
+ * if some pages were truncated, we cannot guarantee its mapping->host
+ * to detect pending bios.
+ */
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
remove_dirty_inode(inode);
-
- wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return ret;
skip_write:
@@ -1558,7 +1541,7 @@ restart:
if (f2fs_has_inline_data(inode)) {
if (pos + len <= MAX_INLINE_DATA) {
read_inline_data(page, ipage);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
set_inline_node(ipage);
} else {
@@ -1668,39 +1651,35 @@ repeat:
if (blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_SIZE);
} else {
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .type = DATA,
- .op = REQ_OP_READ,
- .op_flags = READ_SYNC,
- .old_blkaddr = blkaddr,
- .new_blkaddr = blkaddr,
- .page = page,
- .encrypted_page = NULL,
- };
- err = f2fs_submit_page_bio(&fio);
- if (err)
- goto fail;
+ struct bio *bio;
- lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- err = -EIO;
+ bio = f2fs_grab_bio(inode, blkaddr, 1);
+ if (IS_ERR(bio)) {
+ err = PTR_ERR(bio);
goto fail;
}
+ bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ bio_put(bio);
+ err = -EFAULT;
+ goto fail;
+ }
+
+ __submit_bio(sbi, bio, DATA);
+
+ lock_page(page);
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
-
- /* avoid symlink page */
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- err = fscrypt_decrypt_page(page);
- if (err)
- goto fail;
+ if (unlikely(!PageUptodate(page))) {
+ err = -EIO;
+ goto fail;
}
}
out_update:
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
out_clear:
clear_cold_data(page);
return 0;
@@ -1721,13 +1700,11 @@ static int f2fs_write_end(struct file *file,
trace_f2fs_write_end(inode, pos, len, copied);
set_page_dirty(page);
+ f2fs_put_page(page, 1);
- if (pos + copied > i_size_read(inode)) {
- i_size_write(inode, pos + copied);
- mark_inode_dirty(inode);
- }
+ if (pos + copied > i_size_read(inode))
+ f2fs_i_size_write(inode, pos + copied);
- f2fs_put_page(page, 1);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
@@ -1752,6 +1729,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
+ int rw = iov_iter_rw(iter);
int err;
err = check_direct_IO(inode, iter, offset);
@@ -1760,18 +1738,23 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return 0;
+ if (test_opt(F2FS_I_SB(inode), LFS))
+ return 0;
- trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+ trace_f2fs_direct_IO_enter(inode, offset, count, rw);
+ down_read(&F2FS_I(inode)->dio_rwsem[rw]);
err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
- if (iov_iter_rw(iter) == WRITE) {
+ up_read(&F2FS_I(inode)->dio_rwsem[rw]);
+
+ if (rw == WRITE) {
if (err > 0)
- set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
+ set_inode_flag(inode, FI_UPDATE_WRITE);
else if (err < 0)
f2fs_write_failed(mapping, offset + count);
}
- trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err);
+ trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
return err;
}
@@ -1818,6 +1801,35 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 1;
}
+/*
+ * This was copied from __set_page_dirty_buffers which gives higher performance
+ * in very high speed storages. (e.g., pmem)
+ */
+void f2fs_set_page_dirty_nobuffers(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ unsigned long flags;
+
+ if (unlikely(!mapping))
+ return;
+
+ spin_lock(&mapping->private_lock);
+ lock_page_memcg(page);
+ SetPageDirty(page);
+ spin_unlock(&mapping->private_lock);
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ WARN_ON_ONCE(!PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ unlock_page_memcg(page);
+
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ return;
+}
+
static int f2fs_set_data_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -1825,7 +1837,8 @@ static int f2fs_set_data_page_dirty(struct page *page)
trace_f2fs_set_page_dirty(page, DATA);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (f2fs_is_atomic_file(inode)) {
if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
@@ -1840,7 +1853,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
}
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
update_dirty_page(inode, page);
return 1;
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index d89a425055d0..badd407bb622 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -47,6 +47,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
+ si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->wb_bios = atomic_read(&sbi->nr_wb_bios);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
@@ -304,8 +305,8 @@ static int stat_show(struct seq_file *s, void *v)
si->inmem_pages, si->wb_bios);
seq_printf(s, " - nodes: %4lld in %4d\n",
si->ndirty_node, si->node_pages);
- seq_printf(s, " - dents: %4lld in dirs:%4d\n",
- si->ndirty_dent, si->ndirty_dirs);
+ seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n",
+ si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
seq_printf(s, " - datas: %4lld in files:%4d\n",
si->ndirty_data, si->ndirty_files);
seq_printf(s, " - meta: %4lld in %4d\n",
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index f9313f684540..a485f68a76b1 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -185,8 +185,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
/* no need to allocate new dentry pages to all the indices */
dentry_page = find_data_page(dir, bidx);
if (IS_ERR(dentry_page)) {
- room = true;
- continue;
+ if (PTR_ERR(dentry_page) == -ENOENT) {
+ room = true;
+ continue;
+ } else {
+ *res_page = dentry_page;
+ break;
+ }
}
de = find_in_block(dentry_page, fname, namehash, &max_slots,
@@ -223,19 +228,22 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct fscrypt_name fname;
int err;
- *res_page = NULL;
-
err = fscrypt_setup_filename(dir, child, 1, &fname);
- if (err)
+ if (err) {
+ *res_page = ERR_PTR(err);
return NULL;
+ }
if (f2fs_has_inline_dentry(dir)) {
+ *res_page = NULL;
de = find_in_inline_dir(dir, &fname, res_page);
goto out;
}
- if (npages == 0)
+ if (npages == 0) {
+ *res_page = NULL;
goto out;
+ }
max_depth = F2FS_I(dir)->i_current_depth;
if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
@@ -243,13 +251,13 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
"Corrupted max_depth of %lu: %u",
dir->i_ino, max_depth);
max_depth = MAX_DIR_HASH_DEPTH;
- F2FS_I(dir)->i_current_depth = max_depth;
- mark_inode_dirty(dir);
+ f2fs_i_depth_write(dir, max_depth);
}
for (level = 0; level < max_depth; level++) {
+ *res_page = NULL;
de = find_in_level(dir, level, &fname, res_page);
- if (de)
+ if (de || IS_ERR(*res_page))
break;
}
out:
@@ -259,35 +267,22 @@ out:
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
{
- struct page *page;
- struct f2fs_dir_entry *de;
- struct f2fs_dentry_block *dentry_blk;
-
- if (f2fs_has_inline_dentry(dir))
- return f2fs_parent_inline_dir(dir, p);
-
- page = get_lock_data_page(dir, 0, false);
- if (IS_ERR(page))
- return NULL;
+ struct qstr dotdot = QSTR_INIT("..", 2);
- dentry_blk = kmap(page);
- de = &dentry_blk->dentry[1];
- *p = page;
- unlock_page(page);
- return de;
+ return f2fs_find_entry(dir, &dotdot, p);
}
-ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
+ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr,
+ struct page **page)
{
ino_t res = 0;
struct f2fs_dir_entry *de;
- struct page *page;
- de = f2fs_find_entry(dir, qstr, &page);
+ de = f2fs_find_entry(dir, qstr, page);
if (de) {
res = le32_to_cpu(de->ino);
- f2fs_dentry_kunmap(dir, page);
- f2fs_put_page(page, 0);
+ f2fs_dentry_kunmap(dir, *page);
+ f2fs_put_page(*page, 0);
}
return res;
@@ -303,9 +298,9 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
set_de_type(de, inode->i_mode);
f2fs_dentry_kunmap(dir, page);
set_page_dirty(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- mark_inode_dirty(dir);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ f2fs_mark_inode_dirty_sync(dir);
f2fs_put_page(page, 1);
}
@@ -385,7 +380,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
struct page *page;
int err;
- if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+ if (is_inode_flag_set(inode, FI_NEW_INODE)) {
page = new_inode_page(inode);
if (IS_ERR(page))
return page;
@@ -429,7 +424,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
* This file should be checkpointed during fsync.
* We lost i_pino from now on.
*/
- if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ if (is_inode_flag_set(inode, FI_INC_LINK)) {
file_lost_pino(inode);
/*
* If link the tmpfile to alias through linkat path,
@@ -437,14 +432,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
*/
if (inode->i_nlink == 0)
remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
- inc_nlink(inode);
+ f2fs_i_links_write(inode, true);
}
return page;
put_error:
- /* truncate empty dir pages */
- truncate_inode_pages(&inode->i_data, 0);
-
clear_nlink(inode);
update_inode(inode, page);
f2fs_put_page(page, 1);
@@ -454,23 +446,19 @@ put_error:
void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
- if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
- if (S_ISDIR(inode->i_mode)) {
- inc_nlink(dir);
- set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
- clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+ if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) {
+ if (S_ISDIR(inode->i_mode))
+ f2fs_i_links_write(dir, true);
+ clear_inode_flag(inode, FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- mark_inode_dirty(dir);
+ f2fs_mark_inode_dirty_sync(dir);
- if (F2FS_I(dir)->i_current_depth != current_depth) {
- F2FS_I(dir)->i_current_depth = current_depth;
- set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
+ if (F2FS_I(dir)->i_current_depth != current_depth)
+ f2fs_i_depth_write(dir, current_depth);
- if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ if (inode && is_inode_flag_set(inode, FI_INC_LINK))
+ clear_inode_flag(inode, FI_INC_LINK);
}
int room_for_filename(const void *bitmap, int slots, int max_slots)
@@ -596,9 +584,7 @@ add_dentry:
set_page_dirty(dentry_page);
if (inode) {
- /* we don't need to mark_inode_dirty now */
- F2FS_I(inode)->i_pino = dir->i_ino;
- update_inode(inode, page);
+ f2fs_i_pino_write(inode, dir->i_ino);
f2fs_put_page(page, 1);
}
@@ -607,10 +593,6 @@ fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
- if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
- update_inode_page(dir);
- clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
@@ -657,42 +639,34 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
err = PTR_ERR(page);
goto fail;
}
- /* we don't need to mark_inode_dirty now */
- update_inode(inode, page);
f2fs_put_page(page, 1);
- clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+ clear_inode_flag(inode, FI_NEW_INODE);
fail:
up_write(&F2FS_I(inode)->i_sem);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
-void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page)
+void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
down_write(&F2FS_I(inode)->i_sem);
- if (S_ISDIR(inode->i_mode)) {
- drop_nlink(dir);
- if (page)
- update_inode(dir, page);
- else
- update_inode_page(dir);
- }
+ if (S_ISDIR(inode->i_mode))
+ f2fs_i_links_write(dir, false);
inode->i_ctime = CURRENT_TIME;
- drop_nlink(inode);
+ f2fs_i_links_write(inode, false);
if (S_ISDIR(inode->i_mode)) {
- drop_nlink(inode);
- i_size_write(inode, 0);
+ f2fs_i_links_write(inode, false);
+ f2fs_i_size_write(inode, 0);
}
up_write(&F2FS_I(inode)->i_sem);
- update_inode_page(inode);
if (inode->i_nlink == 0)
- add_orphan_inode(sbi, inode->i_ino);
+ add_orphan_inode(inode);
else
release_orphan_inode(sbi);
}
@@ -730,9 +704,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ f2fs_mark_inode_dirty_sync(dir);
if (inode)
- f2fs_drop_nlink(dir, inode, NULL);
+ f2fs_drop_nlink(dir, inode);
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!truncate_hole(dir, page->index, page->index + 1)) {
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 5bfcdb9b69f2..2b06d4fcd954 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -170,8 +170,10 @@ static void __drop_largest_extent(struct inode *inode,
{
struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
- if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs)
+ if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) {
largest->len = 0;
+ f2fs_mark_inode_dirty_sync(inode);
+ }
}
/* return true, if inode page is changed */
@@ -335,11 +337,12 @@ lookup_neighbors:
return en;
}
-static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
+static struct extent_node *__try_merge_extent_node(struct inode *inode,
struct extent_tree *et, struct extent_info *ei,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_node *en = NULL;
if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
@@ -360,7 +363,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
if (!en)
return NULL;
- __try_update_largest_extent(et, en);
+ __try_update_largest_extent(inode, et, en);
spin_lock(&sbi->extent_lock);
if (!list_empty(&en->list)) {
@@ -371,11 +374,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
return en;
}
-static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+static struct extent_node *__insert_extent_tree(struct inode *inode,
struct extent_tree *et, struct extent_info *ei,
struct rb_node **insert_p,
struct rb_node *insert_parent)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct rb_node **p = &et->root.rb_node;
struct rb_node *parent = NULL;
struct extent_node *en = NULL;
@@ -402,7 +406,7 @@ do_insert:
if (!en)
return NULL;
- __try_update_largest_extent(et, en);
+ __try_update_largest_extent(inode, et, en);
/* update in global extent list */
spin_lock(&sbi->extent_lock);
@@ -431,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
write_lock(&et->lock);
- if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
+ if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
write_unlock(&et->lock);
return false;
}
@@ -473,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
set_extent_info(&ei, end,
end - dei.fofs + dei.blk,
org_end - end);
- en1 = __insert_extent_tree(sbi, et, &ei,
+ en1 = __insert_extent_tree(inode, et, &ei,
NULL, NULL);
next_en = en1;
} else {
@@ -494,7 +498,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
}
if (parts)
- __try_update_largest_extent(et, en);
+ __try_update_largest_extent(inode, et, en);
else
__release_extent_node(sbi, et, en);
@@ -514,20 +518,20 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (blkaddr) {
set_extent_info(&ei, fofs, blkaddr, len);
- if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
- __insert_extent_tree(sbi, et, &ei,
+ if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en))
+ __insert_extent_tree(inode, et, &ei,
insert_p, insert_parent);
/* give up extent_cache, if split and small updates happen */
if (dei.len >= 1 &&
prev.len < F2FS_MIN_EXTENT_LEN &&
et->largest.len < F2FS_MIN_EXTENT_LEN) {
- et->largest.len = 0;
- set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
+ __drop_largest_extent(inode, 0, UINT_MAX);
+ set_inode_flag(inode, FI_NO_EXTENT);
}
}
- if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
__free_extent_tree(sbi, et);
write_unlock(&et->lock);
@@ -627,6 +631,19 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
return node_cnt;
}
+void f2fs_drop_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+
+ set_inode_flag(inode, FI_NO_EXTENT);
+
+ write_lock(&et->lock);
+ __free_extent_tree(sbi, et);
+ __drop_largest_extent(inode, 0, UINT_MAX);
+ write_unlock(&et->lock);
+}
+
void f2fs_destroy_extent_tree(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -685,9 +702,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn)
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
dn->ofs_in_node;
-
- if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1))
- sync_inode_page(dn);
+ f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1);
}
void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
@@ -697,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
if (!f2fs_may_extent_tree(dn->inode))
return;
- if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len))
- sync_inode_page(dn);
+ f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len);
}
void init_extent_cache_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 23ae6a81ccd6..7890e9071499 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -45,6 +45,7 @@ enum {
FAULT_ORPHAN,
FAULT_BLOCK,
FAULT_DIR_DEPTH,
+ FAULT_EVICT_INODE,
FAULT_MAX,
};
@@ -74,6 +75,8 @@ static inline bool time_to_inject(int type)
return false;
else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type))
return false;
+ else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type))
+ return false;
atomic_inc(&f2fs_fault.inject_ops);
if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) {
@@ -108,6 +111,8 @@ static inline bool time_to_inject(int type)
#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
+#define F2FS_MOUNT_ADAPTIVE 0x00020000
+#define F2FS_MOUNT_LFS 0x00040000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -128,6 +133,7 @@ struct f2fs_mount_info {
};
#define F2FS_FEATURE_ENCRYPT 0x0001
+#define F2FS_FEATURE_HMSMR 0x0002
#define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -158,7 +164,7 @@ enum {
#define BATCHED_TRIM_BLOCKS(sbi) \
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
#define DEF_CP_INTERVAL 60 /* 60 secs */
-#define DEF_IDLE_INTERVAL 120 /* 2 mins */
+#define DEF_IDLE_INTERVAL 5 /* 5 secs */
struct cp_control {
int reason;
@@ -262,6 +268,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
+#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
+ struct f2fs_move_range)
#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -291,6 +299,13 @@ struct f2fs_defragment {
u64 len;
};
+struct f2fs_move_range {
+ u32 dst_fd; /* destination fd */
+ u64 pos_in; /* start position in src_fd */
+ u64 pos_out; /* start position in dst_fd */
+ u64 len; /* size to move */
+};
+
/*
* For INODE and NODE manager
*/
@@ -441,11 +456,14 @@ struct f2fs_inode_info {
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
+ loff_t last_disk_size; /* lastly written file size */
- struct list_head dirty_list; /* linked in global dirty list */
+ struct list_head dirty_list; /* dirty list for dirs and files */
+ struct list_head gdirty_list; /* linked in global dirty list */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
struct extent_tree *extent_tree; /* cached extent_tree entry */
+ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -498,11 +516,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
return __is_extent_mergeable(cur, front);
}
-static inline void __try_update_largest_extent(struct extent_tree *et,
- struct extent_node *en)
+extern void f2fs_mark_inode_dirty_sync(struct inode *);
+static inline void __try_update_largest_extent(struct inode *inode,
+ struct extent_tree *et, struct extent_node *en)
{
- if (en->ei.len > et->largest.len)
+ if (en->ei.len > et->largest.len) {
et->largest = en->ei;
+ f2fs_mark_inode_dirty_sync(inode);
+ }
}
struct f2fs_nm_info {
@@ -517,7 +538,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
+ struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
struct list_head nat_entries; /* cached nat entry list (clean) */
unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
@@ -599,6 +620,7 @@ struct flush_cmd {
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
+ atomic_t submit_flush; /* # of issued flushes */
struct llist_head issue_list; /* list for command issue */
struct llist_node *dispatch_list; /* list for command dispatch */
};
@@ -655,6 +677,7 @@ enum count_type {
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
+ F2FS_DIRTY_IMETA,
NR_COUNT_TYPE,
};
@@ -706,6 +729,7 @@ struct f2fs_bio_info {
enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
+ DIRTY_META, /* for all dirtied inode metadata */
NR_INODE_TYPE,
};
@@ -757,14 +781,14 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info read_io; /* for read bios */
struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
+ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
- struct rw_semaphore cp_rwsem; /* blocking FS operations */
+ struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
- struct mutex writepages; /* mutex for writepages() */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
@@ -1050,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- down_read(&sbi->cp_rwsem);
+ percpu_down_read(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- up_read(&sbi->cp_rwsem);
+ percpu_up_read(&sbi->cp_rwsem);
}
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- down_write(&sbi->cp_rwsem);
+ percpu_down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->cp_rwsem);
+ percpu_up_write(&sbi->cp_rwsem);
}
static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
@@ -1120,34 +1144,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs)
return ofs == XATTR_NODE_OFFSET;
}
+static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool);
static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
struct inode *inode, blkcnt_t *count)
{
- block_t valid_block_count;
+ blkcnt_t diff;
- spin_lock(&sbi->stat_lock);
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(FAULT_BLOCK)) {
- spin_unlock(&sbi->stat_lock);
+ if (time_to_inject(FAULT_BLOCK))
return false;
- }
#endif
- valid_block_count =
- sbi->total_valid_block_count + (block_t)(*count);
- if (unlikely(valid_block_count > sbi->user_block_count)) {
- *count = sbi->user_block_count - sbi->total_valid_block_count;
+ /*
+ * let's increase this in prior to actual block count change in order
+ * for f2fs_sync_file to avoid data races when deciding checkpoint.
+ */
+ percpu_counter_add(&sbi->alloc_valid_block_count, (*count));
+
+ spin_lock(&sbi->stat_lock);
+ sbi->total_valid_block_count += (block_t)(*count);
+ if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) {
+ diff = sbi->total_valid_block_count - sbi->user_block_count;
+ *count -= diff;
+ sbi->total_valid_block_count = sbi->user_block_count;
if (!*count) {
spin_unlock(&sbi->stat_lock);
+ percpu_counter_sub(&sbi->alloc_valid_block_count, diff);
return false;
}
}
- /* *count can be recalculated */
- inode->i_blocks += *count;
- sbi->total_valid_block_count =
- sbi->total_valid_block_count + (block_t)(*count);
spin_unlock(&sbi->stat_lock);
- percpu_counter_add(&sbi->alloc_valid_block_count, (*count));
+ f2fs_i_blocks_write(inode, *count, true);
return true;
}
@@ -1158,9 +1185,9 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
f2fs_bug_on(sbi, inode->i_blocks < count);
- inode->i_blocks -= count;
sbi->total_valid_block_count -= (block_t)count;
spin_unlock(&sbi->stat_lock);
+ f2fs_i_blocks_write(inode, count, false);
}
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -1295,7 +1322,7 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
}
if (inode)
- inode->i_blocks++;
+ f2fs_i_blocks_write(inode, 1, true);
sbi->total_valid_node_count++;
sbi->total_valid_block_count++;
@@ -1314,7 +1341,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, !sbi->total_valid_node_count);
f2fs_bug_on(sbi, !inode->i_blocks);
- inode->i_blocks--;
+ f2fs_i_blocks_write(inode, 1, false);
sbi->total_valid_node_count--;
sbi->total_valid_block_count--;
@@ -1511,12 +1538,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
FI_DIRTY_INODE, /* indicate inode is dirty or not */
+ FI_AUTO_RECOVER, /* indicate inode is recoverable */
FI_DIRTY_DIR, /* indicate directory has dirty pages */
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
FI_FREE_NID, /* free allocated nide */
- FI_UPDATE_DIR, /* should update inode block for consistency */
FI_NO_EXTENT, /* not to use the extent cache */
FI_INLINE_XATTR, /* used for inline xattr */
FI_INLINE_DATA, /* used for inline data*/
@@ -1534,64 +1561,143 @@ enum {
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
};
-static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
+static inline void __mark_inode_dirty_flag(struct inode *inode,
+ int flag, bool set)
+{
+ switch (flag) {
+ case FI_INLINE_XATTR:
+ case FI_INLINE_DATA:
+ case FI_INLINE_DENTRY:
+ if (set)
+ return;
+ case FI_DATA_EXIST:
+ case FI_INLINE_DOTS:
+ f2fs_mark_inode_dirty_sync(inode);
+ }
+}
+
+static inline void set_inode_flag(struct inode *inode, int flag)
+{
+ if (!test_bit(flag, &F2FS_I(inode)->flags))
+ set_bit(flag, &F2FS_I(inode)->flags);
+ __mark_inode_dirty_flag(inode, flag, true);
+}
+
+static inline int is_inode_flag_set(struct inode *inode, int flag)
+{
+ return test_bit(flag, &F2FS_I(inode)->flags);
+}
+
+static inline void clear_inode_flag(struct inode *inode, int flag)
+{
+ if (test_bit(flag, &F2FS_I(inode)->flags))
+ clear_bit(flag, &F2FS_I(inode)->flags);
+ __mark_inode_dirty_flag(inode, flag, false);
+}
+
+static inline void set_acl_inode(struct inode *inode, umode_t mode)
{
- if (!test_bit(flag, &fi->flags))
- set_bit(flag, &fi->flags);
+ F2FS_I(inode)->i_acl_mode = mode;
+ set_inode_flag(inode, FI_ACL_MODE);
+ f2fs_mark_inode_dirty_sync(inode);
}
-static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
+static inline void f2fs_i_links_write(struct inode *inode, bool inc)
{
- return test_bit(flag, &fi->flags);
+ if (inc)
+ inc_nlink(inode);
+ else
+ drop_nlink(inode);
+ f2fs_mark_inode_dirty_sync(inode);
+}
+
+static inline void f2fs_i_blocks_write(struct inode *inode,
+ blkcnt_t diff, bool add)
+{
+ bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
+ bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER);
+
+ inode->i_blocks = add ? inode->i_blocks + diff :
+ inode->i_blocks - diff;
+ f2fs_mark_inode_dirty_sync(inode);
+ if (clean || recover)
+ set_inode_flag(inode, FI_AUTO_RECOVER);
+}
+
+static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
+{
+ bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
+ bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER);
+
+ if (i_size_read(inode) == i_size)
+ return;
+
+ i_size_write(inode, i_size);
+ f2fs_mark_inode_dirty_sync(inode);
+ if (clean || recover)
+ set_inode_flag(inode, FI_AUTO_RECOVER);
}
-static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+static inline bool f2fs_skip_inode_update(struct inode *inode)
{
- if (test_bit(flag, &fi->flags))
- clear_bit(flag, &fi->flags);
+ if (!is_inode_flag_set(inode, FI_AUTO_RECOVER))
+ return false;
+ return F2FS_I(inode)->last_disk_size == i_size_read(inode);
}
-static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
+static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
{
- fi->i_acl_mode = mode;
- set_inode_flag(fi, FI_ACL_MODE);
+ F2FS_I(inode)->i_current_depth = depth;
+ f2fs_mark_inode_dirty_sync(inode);
}
-static inline void get_inline_info(struct f2fs_inode_info *fi,
- struct f2fs_inode *ri)
+static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid)
{
+ F2FS_I(inode)->i_xattr_nid = xnid;
+ f2fs_mark_inode_dirty_sync(inode);
+}
+
+static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino)
+{
+ F2FS_I(inode)->i_pino = pino;
+ f2fs_mark_inode_dirty_sync(inode);
+}
+
+static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
if (ri->i_inline & F2FS_INLINE_XATTR)
- set_inode_flag(fi, FI_INLINE_XATTR);
+ set_bit(FI_INLINE_XATTR, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DATA)
- set_inode_flag(fi, FI_INLINE_DATA);
+ set_bit(FI_INLINE_DATA, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DENTRY)
- set_inode_flag(fi, FI_INLINE_DENTRY);
+ set_bit(FI_INLINE_DENTRY, &fi->flags);
if (ri->i_inline & F2FS_DATA_EXIST)
- set_inode_flag(fi, FI_DATA_EXIST);
+ set_bit(FI_DATA_EXIST, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DOTS)
- set_inode_flag(fi, FI_INLINE_DOTS);
+ set_bit(FI_INLINE_DOTS, &fi->flags);
}
-static inline void set_raw_inline(struct f2fs_inode_info *fi,
- struct f2fs_inode *ri)
+static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
{
ri->i_inline = 0;
- if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+ if (is_inode_flag_set(inode, FI_INLINE_XATTR))
ri->i_inline |= F2FS_INLINE_XATTR;
- if (is_inode_flag_set(fi, FI_INLINE_DATA))
+ if (is_inode_flag_set(inode, FI_INLINE_DATA))
ri->i_inline |= F2FS_INLINE_DATA;
- if (is_inode_flag_set(fi, FI_INLINE_DENTRY))
+ if (is_inode_flag_set(inode, FI_INLINE_DENTRY))
ri->i_inline |= F2FS_INLINE_DENTRY;
- if (is_inode_flag_set(fi, FI_DATA_EXIST))
+ if (is_inode_flag_set(inode, FI_DATA_EXIST))
ri->i_inline |= F2FS_DATA_EXIST;
- if (is_inode_flag_set(fi, FI_INLINE_DOTS))
+ if (is_inode_flag_set(inode, FI_INLINE_DOTS))
ri->i_inline |= F2FS_INLINE_DOTS;
}
static inline int f2fs_has_inline_xattr(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
+ return is_inode_flag_set(inode, FI_INLINE_XATTR);
}
static inline unsigned int addrs_per_inode(struct inode *inode)
@@ -1618,43 +1724,43 @@ static inline int inline_xattr_size(struct inode *inode)
static inline int f2fs_has_inline_data(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
+ return is_inode_flag_set(inode, FI_INLINE_DATA);
}
static inline void f2fs_clear_inline_inode(struct inode *inode)
{
- clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ clear_inode_flag(inode, FI_INLINE_DATA);
+ clear_inode_flag(inode, FI_DATA_EXIST);
}
static inline int f2fs_exist_data(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST);
+ return is_inode_flag_set(inode, FI_DATA_EXIST);
}
static inline int f2fs_has_inline_dots(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS);
+ return is_inode_flag_set(inode, FI_INLINE_DOTS);
}
static inline bool f2fs_is_atomic_file(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+ return is_inode_flag_set(inode, FI_ATOMIC_FILE);
}
static inline bool f2fs_is_volatile_file(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+ return is_inode_flag_set(inode, FI_VOLATILE_FILE);
}
static inline bool f2fs_is_first_block_written(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+ return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN);
}
static inline bool f2fs_is_drop_cache(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
+ return is_inode_flag_set(inode, FI_DROP_CACHE);
}
static inline void *inline_data_addr(struct page *page)
@@ -1665,7 +1771,7 @@ static inline void *inline_data_addr(struct page *page)
static inline int f2fs_has_inline_dentry(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY);
+ return is_inode_flag_set(inode, FI_INLINE_DENTRY);
}
static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
@@ -1682,11 +1788,13 @@ static inline int is_file(struct inode *inode, int type)
static inline void set_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise |= type;
+ f2fs_mark_inode_dirty_sync(inode);
}
static inline void clear_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise &= ~type;
+ f2fs_mark_inode_dirty_sync(inode);
}
static inline int f2fs_readonly(struct super_block *sb)
@@ -1713,7 +1821,7 @@ static inline bool is_dot_dotdot(const struct qstr *str)
static inline bool f2fs_may_extent_tree(struct inode *inode)
{
if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
- is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+ is_inode_flag_set(inode, FI_NO_EXTENT))
return false;
return S_ISREG(inode->i_mode);
@@ -1749,7 +1857,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
}
#define get_inode_mode(i) \
- ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
+ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
/* get offset of first page in next direct node */
@@ -1764,7 +1872,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
int truncate_blocks(struct inode *, u64, bool);
-int f2fs_truncate(struct inode *, bool);
+int f2fs_truncate(struct inode *);
int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
int truncate_hole(struct inode *, pgoff_t, pgoff_t);
@@ -1805,11 +1913,11 @@ struct page *init_inode_metadata(struct inode *, struct inode *,
const struct qstr *, struct page *);
void update_parent_metadata(struct inode *, struct inode *, unsigned int);
int room_for_filename(const void *, int, int);
-void f2fs_drop_nlink(struct inode *, struct inode *, struct page *);
+void f2fs_drop_nlink(struct inode *, struct inode *);
struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
struct page **);
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
-ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
+ino_t f2fs_inode_by_name(struct inode *, struct qstr *, struct page **);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
@@ -1833,6 +1941,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
/*
* super.c
*/
+int f2fs_inode_dirtied(struct inode *);
+void f2fs_inode_synced(struct inode *);
int f2fs_commit_super(struct f2fs_sb_info *, bool);
int f2fs_sync_fs(struct super_block *, int);
extern __printf(3, 4)
@@ -1866,11 +1976,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
void ra_node_page(struct f2fs_sb_info *, nid_t);
struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_node_page_ra(struct page *, int);
-void sync_inode_page(struct dnode_of_data *);
void move_node_page(struct page *, int);
-int fsync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *,
- bool);
+int fsync_node_pages(struct f2fs_sb_info *, struct inode *,
+ struct writeback_control *, bool);
int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *);
+void build_free_nids(struct f2fs_sb_info *);
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
@@ -1944,9 +2054,10 @@ void add_ino_entry(struct f2fs_sb_info *, nid_t, int type);
void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type);
void release_ino_entry(struct f2fs_sb_info *, bool);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
+int f2fs_sync_inode_meta(struct f2fs_sb_info *);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
-void add_orphan_inode(struct f2fs_sb_info *, nid_t);
+void add_orphan_inode(struct inode *);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
@@ -1981,6 +2092,7 @@ struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int do_write_data_page(struct f2fs_io_info *);
int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void f2fs_set_page_dirty_nobuffers(struct page *);
void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
int f2fs_release_page(struct page *, gfp_t);
@@ -2012,7 +2124,7 @@ struct f2fs_stat_info {
unsigned long long hit_total, total_ext;
int ext_tree, zombie_tree, ext_node;
s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages;
- unsigned int ndirty_dirs, ndirty_files;
+ unsigned int ndirty_dirs, ndirty_files, ndirty_all;
int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
int bg_gc, wb_bios;
@@ -2181,7 +2293,6 @@ int f2fs_write_inline_data(struct inode *, struct page *);
bool recover_inline_data(struct inode *, struct page *);
struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
struct fscrypt_name *, struct page **);
-struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
nid_t, umode_t);
@@ -2206,6 +2317,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *);
*/
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
+void f2fs_drop_extent_tree(struct inode *);
unsigned int f2fs_destroy_extent_node(struct inode *);
void f2fs_destroy_extent_tree(struct inode *);
bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
@@ -2241,6 +2353,26 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb)
return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
}
+static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR);
+}
+
+static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
+{
+ clear_opt(sbi, ADAPTIVE);
+ clear_opt(sbi, LFS);
+
+ switch (mt) {
+ case F2FS_MOUNT_ADAPTIVE:
+ set_opt(sbi, ADAPTIVE);
+ break;
+ case F2FS_MOUNT_LFS:
+ set_opt(sbi, LFS);
+ break;
+ }
+}
+
static inline bool f2fs_may_encrypt(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f4c0086655c4..0e493f63ea41 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -21,6 +21,7 @@
#include <linux/mount.h>
#include <linux/pagevec.h>
#include <linux/uuid.h>
+#include <linux/file.h>
#include "f2fs.h"
#include "node.h"
@@ -81,7 +82,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
zero_user_segment(page, offset, PAGE_SIZE);
}
set_page_dirty(page);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
@@ -171,22 +173,16 @@ static void try_to_fix_pino(struct inode *inode)
fi->xattr_ver = 0;
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
- fi->i_pino = pino;
+ f2fs_i_pino_write(inode, pino);
file_got_pino(inode);
- up_write(&fi->i_sem);
-
- mark_inode_dirty_sync(inode);
- f2fs_write_inode(inode, NULL);
- } else {
- up_write(&fi->i_sem);
}
+ up_write(&fi->i_sem);
}
static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
int datasync, bool atomic)
{
struct inode *inode = file->f_mapping->host;
- struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t ino = inode->i_ino;
int ret = 0;
@@ -204,9 +200,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
/* if fdatasync is triggered, let's do in-place-update */
if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
- set_inode_flag(fi, FI_NEED_IPU);
+ set_inode_flag(inode, FI_NEED_IPU);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- clear_inode_flag(fi, FI_NEED_IPU);
+ clear_inode_flag(inode, FI_NEED_IPU);
if (ret) {
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -214,7 +210,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
}
/* if the inode is dirty, let's recover all the time */
- if (!datasync) {
+ if (!datasync && !f2fs_skip_inode_update(inode)) {
f2fs_write_inode(inode, NULL);
goto go_write;
}
@@ -222,14 +218,14 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
/*
* if there is no written data, don't waste time to write recovery info.
*/
- if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
+ if (!is_inode_flag_set(inode, FI_APPEND_WRITE) &&
!exist_written_data(sbi, ino, APPEND_INO)) {
/* it may call write_inode just prior to fsync */
if (need_inode_page_update(sbi, ino))
goto go_write;
- if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE) ||
exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
@@ -239,9 +235,9 @@ go_write:
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- down_read(&fi->i_sem);
+ down_read(&F2FS_I(inode)->i_sem);
need_cp = need_do_checkpoint(inode);
- up_read(&fi->i_sem);
+ up_read(&F2FS_I(inode)->i_sem);
if (need_cp) {
/* all the dirty node pages should be flushed for POR */
@@ -252,12 +248,12 @@ go_write:
* will be used only for fsynced inodes after checkpoint.
*/
try_to_fix_pino(inode);
- clear_inode_flag(fi, FI_APPEND_WRITE);
- clear_inode_flag(fi, FI_UPDATE_WRITE);
+ clear_inode_flag(inode, FI_APPEND_WRITE);
+ clear_inode_flag(inode, FI_UPDATE_WRITE);
goto out;
}
sync_nodes:
- ret = fsync_node_pages(sbi, ino, &wbc, atomic);
+ ret = fsync_node_pages(sbi, inode, &wbc, atomic);
if (ret)
goto out;
@@ -268,7 +264,7 @@ sync_nodes:
}
if (need_inode_block_update(sbi, ino)) {
- mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode);
f2fs_write_inode(inode, NULL);
goto sync_nodes;
}
@@ -279,10 +275,10 @@ sync_nodes:
/* once recovery info is written, don't need to tack this */
remove_ino_entry(sbi, ino, APPEND_INO);
- clear_inode_flag(fi, FI_APPEND_WRITE);
+ clear_inode_flag(inode, FI_APPEND_WRITE);
flush_out:
remove_ino_entry(sbi, ino, UPDATE_INO);
- clear_inode_flag(fi, FI_UPDATE_WRITE);
+ clear_inode_flag(inode, FI_UPDATE_WRITE);
ret = f2fs_issue_flush(sbi);
f2fs_update_time(sbi, REQ_TIME);
out:
@@ -360,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+ err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE);
if (err && err != -ENOENT) {
goto fail;
} else if (err == -ENOENT) {
@@ -487,8 +483,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
set_data_blkaddr(dn);
invalidate_blocks(sbi, blkaddr);
if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
- clear_inode_flag(F2FS_I(dn->inode),
- FI_FIRST_BLOCK_WRITTEN);
+ clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
nr_free++;
}
@@ -502,7 +497,6 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
dn->inode) + ofs;
f2fs_update_extent_cache_range(dn, fofs, 0, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
- sync_inode_page(dn);
}
dn->ofs_in_node = ofs;
@@ -616,7 +610,7 @@ free_partial:
return err;
}
-int f2fs_truncate(struct inode *inode, bool lock)
+int f2fs_truncate(struct inode *inode)
{
int err;
@@ -633,12 +627,12 @@ int f2fs_truncate(struct inode *inode, bool lock)
return err;
}
- err = truncate_blocks(inode, i_size_read(inode), lock);
+ err = truncate_blocks(inode, i_size_read(inode), true);
if (err)
return err;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+ f2fs_mark_inode_dirty_sync(inode);
return 0;
}
@@ -654,7 +648,6 @@ int f2fs_getattr(struct vfsmount *mnt,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
static void __setattr_copy(struct inode *inode, const struct iattr *attr)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int ia_valid = attr->ia_valid;
if (ia_valid & ATTR_UID)
@@ -675,7 +668,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
mode &= ~S_ISGID;
- set_acl_inode(fi, mode);
+ set_acl_inode(inode, mode);
}
}
#else
@@ -685,7 +678,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
- struct f2fs_inode_info *fi = F2FS_I(inode);
int err;
err = inode_change_ok(inode, attr);
@@ -699,7 +691,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size <= i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
- err = f2fs_truncate(inode, true);
+ err = f2fs_truncate(inode);
if (err)
return err;
f2fs_balance_fs(F2FS_I_SB(inode), true);
@@ -724,13 +716,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_MODE) {
err = posix_acl_chmod(inode, get_inode_mode(inode));
- if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
- inode->i_mode = fi->i_acl_mode;
- clear_inode_flag(fi, FI_ACL_MODE);
+ if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
+ clear_inode_flag(inode, FI_ACL_MODE);
}
}
- mark_inode_dirty(inode);
+ f2fs_mark_inode_dirty_sync(inode);
return err;
}
@@ -859,79 +851,199 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
-static int __exchange_data_block(struct inode *inode, pgoff_t src,
- pgoff_t dst, bool full)
+static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr,
+ int *do_replace, pgoff_t off, pgoff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- block_t new_addr;
- bool do_replace = false;
- int ret;
+ int ret, done, i;
+next_dnode:
set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA);
+ ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA);
if (ret && ret != -ENOENT) {
return ret;
} else if (ret == -ENOENT) {
- new_addr = NULL_ADDR;
- } else {
- new_addr = dn.data_blkaddr;
- if (!is_checkpointed_data(sbi, new_addr)) {
+ if (dn.max_level == 0)
+ return -ENOENT;
+ done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len);
+ blkaddr += done;
+ do_replace += done;
+ goto next;
+ }
+
+ done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
+ dn.ofs_in_node, len);
+ for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
+ *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ if (!is_checkpointed_data(sbi, *blkaddr)) {
+
+ if (test_opt(sbi, LFS)) {
+ f2fs_put_dnode(&dn);
+ return -ENOTSUPP;
+ }
+
/* do not invalidate this block address */
f2fs_update_data_blkaddr(&dn, NULL_ADDR);
- do_replace = true;
+ *do_replace = 1;
}
- f2fs_put_dnode(&dn);
}
+ f2fs_put_dnode(&dn);
+next:
+ len -= done;
+ off += done;
+ if (len)
+ goto next_dnode;
+ return 0;
+}
- if (new_addr == NULL_ADDR)
- return full ? truncate_hole(inode, dst, dst + 1) : 0;
+static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr,
+ int *do_replace, pgoff_t off, int len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ int ret, i;
- if (do_replace) {
- struct page *ipage = get_node_page(sbi, inode->i_ino);
- struct node_info ni;
+ for (i = 0; i < len; i++, do_replace++, blkaddr++) {
+ if (*do_replace == 0)
+ continue;
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- goto err_out;
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA);
+ if (ret) {
+ dec_valid_block_count(sbi, inode, 1);
+ invalidate_blocks(sbi, *blkaddr);
+ } else {
+ f2fs_update_data_blkaddr(&dn, *blkaddr);
}
+ f2fs_put_dnode(&dn);
+ }
+ return 0;
+}
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, dst);
- if (ret)
- goto err_out;
+static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
+ block_t *blkaddr, int *do_replace,
+ pgoff_t src, pgoff_t dst, pgoff_t len, bool full)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode);
+ pgoff_t i = 0;
+ int ret;
+
+ while (i < len) {
+ if (blkaddr[i] == NULL_ADDR && !full) {
+ i++;
+ continue;
+ }
- truncate_data_blocks_range(&dn, 1);
+ if (do_replace[i] || blkaddr[i] == NULL_ADDR) {
+ struct dnode_of_data dn;
+ struct node_info ni;
+ size_t new_size;
+ pgoff_t ilen;
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
- ni.version, true, false);
- f2fs_put_dnode(&dn);
- } else {
- struct page *psrc, *pdst;
+ set_new_dnode(&dn, dst_inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE);
+ if (ret)
+ return ret;
+
+ get_node_info(sbi, dn.nid, &ni);
+ ilen = min((pgoff_t)
+ ADDRS_PER_PAGE(dn.node_page, dst_inode) -
+ dn.ofs_in_node, len - i);
+ do {
+ dn.data_blkaddr = datablock_addr(dn.node_page,
+ dn.ofs_in_node);
+ truncate_data_blocks_range(&dn, 1);
+
+ if (do_replace[i]) {
+ f2fs_i_blocks_write(src_inode,
+ 1, false);
+ f2fs_i_blocks_write(dst_inode,
+ 1, true);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+ blkaddr[i], ni.version, true, false);
+
+ do_replace[i] = 0;
+ }
+ dn.ofs_in_node++;
+ i++;
+ new_size = (dst + i) << PAGE_SHIFT;
+ if (dst_inode->i_size < new_size)
+ f2fs_i_size_write(dst_inode, new_size);
+ } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen);
- psrc = get_lock_data_page(inode, src, true);
- if (IS_ERR(psrc))
- return PTR_ERR(psrc);
- pdst = get_new_data_page(inode, NULL, dst, true);
- if (IS_ERR(pdst)) {
+ f2fs_put_dnode(&dn);
+ } else {
+ struct page *psrc, *pdst;
+
+ psrc = get_lock_data_page(src_inode, src + i, true);
+ if (IS_ERR(psrc))
+ return PTR_ERR(psrc);
+ pdst = get_new_data_page(dst_inode, NULL, dst + i,
+ true);
+ if (IS_ERR(pdst)) {
+ f2fs_put_page(psrc, 1);
+ return PTR_ERR(pdst);
+ }
+ f2fs_copy_page(psrc, pdst);
+ set_page_dirty(pdst);
+ f2fs_put_page(pdst, 1);
f2fs_put_page(psrc, 1);
- return PTR_ERR(pdst);
- }
- f2fs_copy_page(psrc, pdst);
- set_page_dirty(pdst);
- f2fs_put_page(pdst, 1);
- f2fs_put_page(psrc, 1);
- return truncate_hole(inode, src, src + 1);
+ ret = truncate_hole(src_inode, src + i, src + i + 1);
+ if (ret)
+ return ret;
+ i++;
+ }
}
return 0;
+}
-err_out:
- if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
- f2fs_update_data_blkaddr(&dn, new_addr);
- f2fs_put_dnode(&dn);
+static int __exchange_data_block(struct inode *src_inode,
+ struct inode *dst_inode, pgoff_t src, pgoff_t dst,
+ pgoff_t len, bool full)
+{
+ block_t *src_blkaddr;
+ int *do_replace;
+ pgoff_t olen;
+ int ret;
+
+ while (len) {
+ olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len);
+
+ src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
+ if (!src_blkaddr)
+ return -ENOMEM;
+
+ do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL);
+ if (!do_replace) {
+ kvfree(src_blkaddr);
+ return -ENOMEM;
+ }
+
+ ret = __read_out_blkaddrs(src_inode, src_blkaddr,
+ do_replace, src, olen);
+ if (ret)
+ goto roll_back;
+
+ ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr,
+ do_replace, src, dst, olen, full);
+ if (ret)
+ goto roll_back;
+
+ src += olen;
+ dst += olen;
+ len -= olen;
+
+ kvfree(src_blkaddr);
+ kvfree(do_replace);
}
+ return 0;
+
+roll_back:
+ __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len);
+ kvfree(src_blkaddr);
+ kvfree(do_replace);
return ret;
}
@@ -939,16 +1051,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
- int ret = 0;
+ int ret;
- for (; end < nrpages; start++, end++) {
- f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
- ret = __exchange_data_block(inode, end, start, true);
- f2fs_unlock_op(sbi);
- if (ret)
- break;
- }
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+
+ f2fs_drop_extent_tree(inode);
+
+ ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
+ f2fs_unlock_op(sbi);
return ret;
}
@@ -992,7 +1103,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ret = truncate_blocks(inode, new_size, true);
if (!ret)
- i_size_write(inode, new_size);
+ f2fs_i_size_write(inode, new_size);
return ret;
}
@@ -1128,11 +1239,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
}
out:
- if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) {
- i_size_write(inode, new_size);
- mark_inode_dirty(inode);
- update_inode_page(inode);
- }
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
+ f2fs_i_size_write(inode, new_size);
return ret;
}
@@ -1140,7 +1248,7 @@ out:
static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- pgoff_t pg_start, pg_end, delta, nrpages, idx;
+ pgoff_t nr, pg_start, pg_end, delta, idx;
loff_t new_size;
int ret = 0;
@@ -1175,14 +1283,20 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
pg_start = offset >> PAGE_SHIFT;
pg_end = (offset + len) >> PAGE_SHIFT;
delta = pg_end - pg_start;
- nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ while (!ret && idx > pg_start) {
+ nr = idx - pg_start;
+ if (nr > delta)
+ nr = delta;
+ idx -= nr;
- for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) {
f2fs_lock_op(sbi);
- ret = __exchange_data_block(inode, idx, idx + delta, false);
+ f2fs_drop_extent_tree(inode);
+
+ ret = __exchange_data_block(inode, inode, idx,
+ idx + delta, nr, false);
f2fs_unlock_op(sbi);
- if (ret)
- break;
}
/* write out all moved pages, if possible */
@@ -1190,7 +1304,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache(inode, offset);
if (!ret)
- i_size_write(inode, new_size);
+ f2fs_i_size_write(inode, new_size);
return ret;
}
@@ -1238,11 +1352,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
}
- if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) {
- i_size_write(inode, new_size);
- mark_inode_dirty(inode);
- update_inode_page(inode);
- }
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
+ f2fs_i_size_write(inode, new_size);
return ret;
}
@@ -1285,7 +1396,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!ret) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+ f2fs_mark_inode_dirty_sync(inode);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -1310,10 +1421,10 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
if (f2fs_is_atomic_file(inode))
drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
- clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
- set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+ clear_inode_flag(inode, FI_VOLATILE_FILE);
+ set_inode_flag(inode, FI_DROP_CACHE);
filemap_fdatawrite(inode->i_mapping);
- clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+ clear_inode_flag(inode, FI_DROP_CACHE);
}
return 0;
}
@@ -1376,9 +1487,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
fi->i_flags = flags;
inode_unlock(inode);
- f2fs_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+ f2fs_set_inode_flags(inode);
out:
mnt_drop_write_file(filp);
return ret;
@@ -1412,7 +1522,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (ret)
goto out;
- set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ set_inode_flag(inode, FI_ATOMIC_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
if (!get_dirty_pages(inode))
@@ -1423,7 +1533,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret)
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
out:
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -1448,10 +1558,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
goto err_out;
if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
ret = commit_inmem_pages(inode);
if (ret) {
- set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ set_inode_flag(inode, FI_ATOMIC_FILE);
goto err_out;
}
}
@@ -1484,7 +1594,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (ret)
goto out;
- set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ set_inode_flag(inode, FI_VOLATILE_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
out:
inode_unlock(inode);
@@ -1538,7 +1648,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
if (f2fs_is_atomic_file(inode))
drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
- clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ clear_inode_flag(inode, FI_VOLATILE_FILE);
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
}
@@ -1871,7 +1981,7 @@ do_map:
continue;
}
- set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_DO_DEFRAG);
idx = map.m_lblk;
while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
@@ -1896,14 +2006,14 @@ do_map:
if (idx < pg_end && cnt < blk_per_seg)
goto do_map;
- clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_DO_DEFRAG);
err = filemap_fdatawrite(inode->i_mapping);
if (err)
goto out;
}
clear_out:
- clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_DO_DEFRAG);
out:
inode_unlock(inode);
if (!err)
@@ -1959,6 +2069,133 @@ out:
return err;
}
+static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, size_t len)
+{
+ struct inode *src = file_inode(file_in);
+ struct inode *dst = file_inode(file_out);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(src);
+ size_t olen = len, dst_max_i_size = 0;
+ size_t dst_osize;
+ int ret;
+
+ if (file_in->f_path.mnt != file_out->f_path.mnt ||
+ src->i_sb != dst->i_sb)
+ return -EXDEV;
+
+ if (unlikely(f2fs_readonly(src->i_sb)))
+ return -EROFS;
+
+ if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode))
+ return -EISDIR;
+
+ if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
+ return -EOPNOTSUPP;
+
+ inode_lock(src);
+ if (src != dst)
+ inode_lock(dst);
+
+ ret = -EINVAL;
+ if (pos_in + len > src->i_size || pos_in + len < pos_in)
+ goto out_unlock;
+ if (len == 0)
+ olen = len = src->i_size - pos_in;
+ if (pos_in + len == src->i_size)
+ len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in;
+ if (len == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ dst_osize = dst->i_size;
+ if (pos_out + olen > dst->i_size)
+ dst_max_i_size = pos_out + olen;
+
+ /* verify the end result is block aligned */
+ if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) ||
+ !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) ||
+ !IS_ALIGNED(pos_out, F2FS_BLKSIZE))
+ goto out_unlock;
+
+ ret = f2fs_convert_inline_inode(src);
+ if (ret)
+ goto out_unlock;
+
+ ret = f2fs_convert_inline_inode(dst);
+ if (ret)
+ goto out_unlock;
+
+ /* write out all dirty pages from offset */
+ ret = filemap_write_and_wait_range(src->i_mapping,
+ pos_in, pos_in + len);
+ if (ret)
+ goto out_unlock;
+
+ ret = filemap_write_and_wait_range(dst->i_mapping,
+ pos_out, pos_out + len);
+ if (ret)
+ goto out_unlock;
+
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+ ret = __exchange_data_block(src, dst, pos_in,
+ pos_out, len >> F2FS_BLKSIZE_BITS, false);
+
+ if (!ret) {
+ if (dst_max_i_size)
+ f2fs_i_size_write(dst, dst_max_i_size);
+ else if (dst_osize != dst->i_size)
+ f2fs_i_size_write(dst, dst_osize);
+ }
+ f2fs_unlock_op(sbi);
+out_unlock:
+ if (src != dst)
+ inode_unlock(dst);
+ inode_unlock(src);
+ return ret;
+}
+
+static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
+{
+ struct f2fs_move_range range;
+ struct fd dst;
+ int err;
+
+ if (!(filp->f_mode & FMODE_READ) ||
+ !(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(&range, (struct f2fs_move_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ dst = fdget(range.dst_fd);
+ if (!dst.file)
+ return -EBADF;
+
+ if (!(dst.file->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ goto err_out;
+ }
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto err_out;
+
+ err = f2fs_move_file_range(filp, range.pos_in, dst.file,
+ range.pos_out, range.len);
+
+ mnt_drop_write_file(filp);
+
+ if (copy_to_user((struct f2fs_move_range __user *)arg,
+ &range, sizeof(range)))
+ err = -EFAULT;
+err_out:
+ fdput(dst);
+ return err;
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -1994,6 +2231,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_write_checkpoint(filp, arg);
case F2FS_IOC_DEFRAGMENT:
return f2fs_ioc_defragment(filp, arg);
+ case F2FS_IOC_MOVE_RANGE:
+ return f2fs_ioc_move_range(filp, arg);
default:
return -ENOTTY;
}
@@ -2003,6 +2242,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct blk_plug plug;
ssize_t ret;
if (f2fs_encrypted_inode(inode) &&
@@ -2014,8 +2254,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = generic_write_checks(iocb, from);
if (ret > 0) {
ret = f2fs_preallocate_blocks(iocb, from);
- if (!ret)
+ if (!ret) {
+ blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
+ blk_finish_plug(&plug);
+ }
}
inode_unlock(inode);
@@ -2050,6 +2293,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_WRITE_CHECKPOINT:
case F2FS_IOC_DEFRAGMENT:
break;
+ case F2FS_IOC_MOVE_RANGE:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index f06ed73adf99..8f7fa326ce95 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -594,11 +594,11 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
/* write page */
lock_page(fio.encrypted_page);
- if (unlikely(!PageUptodate(fio.encrypted_page))) {
+ if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
err = -EIO;
goto put_page_out;
}
- if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
+ if (unlikely(!PageUptodate(fio.encrypted_page))) {
err = -EIO;
goto put_page_out;
}
@@ -619,9 +619,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
f2fs_submit_page_mbio(&fio);
f2fs_update_data_blkaddr(&dn, newaddr);
- set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+ set_inode_flag(inode, FI_APPEND_WRITE);
if (page->index == 0)
- set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
recover_block:
@@ -656,12 +656,23 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
.page = page,
.encrypted_page = NULL,
};
+ bool is_dirty = PageDirty(page);
+ int err;
+
+retry:
set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page))
inode_dec_dirty_pages(inode);
+
set_cold_data(page);
- do_write_data_page(&fio);
+
+ err = do_write_data_page(&fio);
+ if (err == -ENOMEM && is_dirty) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+
clear_cold_data(page);
}
out:
@@ -748,12 +759,32 @@ next_step:
/* phase 3 */
inode = find_gc_inode(gc_list, dni.ino);
if (inode) {
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ bool locked = false;
+
+ if (S_ISREG(inode->i_mode)) {
+ if (!down_write_trylock(&fi->dio_rwsem[READ]))
+ continue;
+ if (!down_write_trylock(
+ &fi->dio_rwsem[WRITE])) {
+ up_write(&fi->dio_rwsem[READ]);
+ continue;
+ }
+ locked = true;
+ }
+
start_bidx = start_bidx_of_node(nofs, inode)
+ ofs_in_node;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
move_encrypted_block(inode, start_bidx);
else
move_data_page(inode, start_bidx, gc_type);
+
+ if (locked) {
+ up_write(&fi->dio_rwsem[WRITE]);
+ up_write(&fi->dio_rwsem[READ]);
+ }
+
stat_inc_data_blk_count(sbi, 1, gc_type);
}
}
@@ -802,6 +833,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
blk_start_plug(&plug);
for (segno = start_segno; segno < end_segno; segno++) {
+
+ if (get_valid_blocks(sbi, segno, 1) == 0)
+ continue;
+
/* find segment summary of victim */
sum_page = find_get_page(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
@@ -877,10 +912,13 @@ gc_more:
* enough free sections, we should flush dent/node blocks and do
* garbage collections.
*/
- if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
+ if (__get_victim(sbi, &segno, gc_type) ||
+ prefree_segments(sbi)) {
write_checkpoint(sbi, &cpc);
- else if (has_not_enough_free_secs(sbi, 0))
+ segno = NULL_SEGNO;
+ } else if (has_not_enough_free_secs(sbi, 0)) {
write_checkpoint(sbi, &cpc);
+ }
}
if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index c15e53c1d794..ccea8735de59 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -59,7 +59,8 @@ void read_inline_data(struct page *page, struct page *ipage)
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
flush_dcache_page(page);
kunmap_atomic(dst_addr);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
}
bool truncate_inline_inode(struct page *ipage, u64 from)
@@ -73,7 +74,7 @@ bool truncate_inline_inode(struct page *ipage, u64 from)
f2fs_wait_on_page_writeback(ipage, NODE, true);
memset(addr + from, 0, MAX_INLINE_DATA - from);
-
+ set_page_dirty(ipage);
return true;
}
@@ -97,7 +98,8 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
else
read_inline_data(page, ipage);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
f2fs_put_page(ipage, 1);
unlock_page(page);
return 0;
@@ -139,7 +141,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
inode_dec_dirty_pages(dn->inode);
/* this converted inline_data should be recovered. */
- set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
+ set_inode_flag(dn->inode, FI_APPEND_WRITE);
/* clear inline data and flag after data writeback */
truncate_inline_inode(dn->inode_page, 0);
@@ -147,7 +149,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
clear_out:
stat_dec_inline_inode(dn->inode);
f2fs_clear_inline_inode(dn->inode);
- sync_inode_page(dn);
f2fs_put_dnode(dn);
return 0;
}
@@ -213,11 +214,11 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
dst_addr = inline_data_addr(dn.inode_page);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
kunmap_atomic(src_addr);
+ set_page_dirty(dn.inode_page);
- set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ set_inode_flag(inode, FI_DATA_EXIST);
- sync_inode_page(&dn);
clear_inline_node(dn.inode_page);
f2fs_put_dnode(&dn);
return 0;
@@ -253,10 +254,10 @@ process_inline:
dst_addr = inline_data_addr(ipage);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ set_inode_flag(inode, FI_INLINE_DATA);
+ set_inode_flag(inode, FI_DATA_EXIST);
- update_inode(inode, ipage);
+ set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
return true;
}
@@ -267,7 +268,6 @@ process_inline:
if (!truncate_inline_inode(ipage, 0))
return false;
f2fs_clear_inline_inode(inode);
- update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
if (truncate_blocks(inode, 0, false))
@@ -289,8 +289,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
f2fs_hash_t namehash;
ipage = get_node_page(sbi, dir->i_ino);
- if (IS_ERR(ipage))
+ if (IS_ERR(ipage)) {
+ *res_page = ipage;
return NULL;
+ }
namehash = f2fs_dentry_hash(&name);
@@ -307,25 +309,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
return de;
}
-struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir,
- struct page **p)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct page *ipage;
- struct f2fs_dir_entry *de;
- struct f2fs_inline_dentry *dentry_blk;
-
- ipage = get_node_page(sbi, dir->i_ino);
- if (IS_ERR(ipage))
- return NULL;
-
- dentry_blk = inline_data_addr(ipage);
- de = &dentry_blk->dentry[1];
- *p = ipage;
- unlock_page(ipage);
- return de;
-}
-
int make_empty_inline_dir(struct inode *inode, struct inode *parent,
struct page *ipage)
{
@@ -340,10 +323,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
set_page_dirty(ipage);
/* update i_size to MAX_INLINE_DATA */
- if (i_size_read(inode) < MAX_INLINE_DATA) {
- i_size_write(inode, MAX_INLINE_DATA);
- set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
- }
+ if (i_size_read(inode) < MAX_INLINE_DATA)
+ f2fs_i_size_write(inode, MAX_INLINE_DATA);
return 0;
}
@@ -392,22 +373,19 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
NR_INLINE_DENTRY * F2FS_SLOT_LEN);
kunmap_atomic(dentry_blk);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
set_page_dirty(page);
/* clear inline dir and flag after data writeback */
truncate_inline_inode(ipage, 0);
stat_dec_inline_dir(dir);
- clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
+ clear_inode_flag(dir, FI_INLINE_DENTRY);
- F2FS_I(dir)->i_current_depth = 1;
- if (i_size_read(dir) < PAGE_SIZE) {
- i_size_write(dir, PAGE_SIZE);
- set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
-
- sync_inode_page(&dn);
+ f2fs_i_depth_write(dir, 1);
+ if (i_size_read(dir) < PAGE_SIZE)
+ f2fs_i_size_write(dir, PAGE_SIZE);
out:
f2fs_put_page(page, 1);
return err;
@@ -465,7 +443,6 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
struct f2fs_inline_dentry *inline_dentry)
{
struct f2fs_inline_dentry *backup_dentry;
- struct f2fs_inode_info *fi = F2FS_I(dir);
int err;
backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry),
@@ -487,16 +464,15 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
lock_page(ipage);
stat_dec_inline_dir(dir);
- clear_inode_flag(fi, FI_INLINE_DENTRY);
- update_inode(dir, ipage);
+ clear_inode_flag(dir, FI_INLINE_DENTRY);
kfree(backup_dentry);
return 0;
recover:
lock_page(ipage);
memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA);
- fi->i_current_depth = 0;
- i_size_write(dir, MAX_INLINE_DATA);
- update_inode(dir, ipage);
+ f2fs_i_depth_write(dir, 0);
+ f2fs_i_size_write(dir, MAX_INLINE_DATA);
+ set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
kfree(backup_dentry);
@@ -560,8 +536,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
/* we don't need to mark_inode_dirty now */
if (inode) {
- F2FS_I(inode)->i_pino = dir->i_ino;
- update_inode(inode, page);
+ f2fs_i_pino_write(inode, dir->i_ino);
f2fs_put_page(page, 1);
}
@@ -569,11 +544,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
-
- if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
- update_inode(dir, ipage);
- clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
out:
f2fs_put_page(ipage, 1);
return err;
@@ -597,13 +567,13 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
&inline_dentry->dentry_bitmap);
set_page_dirty(page);
+ f2fs_put_page(page, 1);
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ f2fs_mark_inode_dirty_sync(dir);
if (inode)
- f2fs_drop_nlink(dir, inode, page);
-
- f2fs_put_page(page, 1);
+ f2fs_drop_nlink(dir, inode);
}
bool f2fs_empty_inline_dir(struct inode *dir)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2e68adab0d64..9ac5efc15347 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -18,6 +18,13 @@
#include <trace/events/f2fs.h>
+void f2fs_mark_inode_dirty_sync(struct inode *inode)
+{
+ if (f2fs_inode_dirtied(inode))
+ return;
+ mark_inode_dirty_sync(inode);
+}
+
void f2fs_set_inode_flags(struct inode *inode)
{
unsigned int flags = F2FS_I(inode)->i_flags;
@@ -35,6 +42,7 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_DIRSYNC;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ f2fs_mark_inode_dirty_sync(inode);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -85,8 +93,8 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
if (*start++) {
f2fs_wait_on_page_writeback(ipage, NODE, true);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
- set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
+ set_inode_flag(inode, FI_DATA_EXIST);
+ set_raw_inline(inode, F2FS_INODE(ipage));
set_page_dirty(ipage);
return;
}
@@ -141,7 +149,7 @@ static int do_read_inode(struct inode *inode)
if (f2fs_init_extent_tree(inode, &ri->i_ext))
set_page_dirty(node_page);
- get_inline_info(fi, ri);
+ get_inline_info(inode, ri);
/* check data exist */
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
@@ -151,7 +159,10 @@ static int do_read_inode(struct inode *inode)
__get_inode_rdev(inode, ri);
if (__written_first_block(ri))
- set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+
+ if (!need_inode_block_update(sbi, inode->i_ino))
+ fi->last_disk_size = inode->i_size;
f2fs_put_page(node_page, 1);
@@ -227,6 +238,8 @@ int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
+ f2fs_inode_synced(inode);
+
f2fs_wait_on_page_writeback(node_page, NODE, true);
ri = F2FS_INODE(node_page);
@@ -244,7 +257,7 @@ int update_inode(struct inode *inode, struct page *node_page)
&ri->i_ext);
else
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
- set_raw_inline(F2FS_I(inode), ri);
+ set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
@@ -261,7 +274,6 @@ int update_inode(struct inode *inode, struct page *node_page)
__set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
- clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
/* deleted inode */
if (inode->i_nlink == 0)
@@ -285,6 +297,7 @@ retry:
} else if (err != -ENOENT) {
f2fs_stop_checkpoint(sbi, false);
}
+ f2fs_inode_synced(inode);
return 0;
}
ret = update_inode(inode, node_page);
@@ -300,7 +313,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
- if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+ if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
/*
@@ -318,8 +331,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
void f2fs_evict_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
- nid_t xnid = fi->i_xattr_nid;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
int err = 0;
/* some remained atomic pages should discarded */
@@ -341,12 +353,17 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(FAULT_EVICT_INODE))
+ goto no_delete;
+#endif
+
sb_start_intwrite(inode->i_sb);
- set_inode_flag(fi, FI_NO_ALLOC);
+ set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
retry:
if (F2FS_HAS_BLOCKS(inode))
- err = f2fs_truncate(inode, true);
+ err = f2fs_truncate(inode);
if (!err) {
f2fs_lock_op(sbi);
@@ -360,6 +377,8 @@ retry:
goto retry;
}
+ if (err)
+ update_inode_page(inode);
sb_end_intwrite(inode->i_sb);
no_delete:
stat_dec_inline_xattr(inode);
@@ -369,13 +388,13 @@ no_delete:
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
- if (is_inode_flag_set(fi, FI_APPEND_WRITE))
+ if (is_inode_flag_set(inode, FI_APPEND_WRITE))
add_ino_entry(sbi, inode->i_ino, APPEND_INO);
- if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
- if (is_inode_flag_set(fi, FI_FREE_NID)) {
+ if (is_inode_flag_set(inode, FI_FREE_NID)) {
alloc_nid_failed(sbi, inode->i_ino);
- clear_inode_flag(fi, FI_FREE_NID);
+ clear_inode_flag(inode, FI_FREE_NID);
}
f2fs_bug_on(sbi, err &&
!exist_written_data(sbi, inode->i_ino, ORPHAN_INO));
@@ -407,11 +426,11 @@ void handle_failed_inode(struct inode *inode)
f2fs_msg(sbi->sb, KERN_WARNING,
"Too many orphan inodes, run fsck to fix.");
} else {
- add_orphan_inode(sbi, inode->i_ino);
+ add_orphan_inode(inode);
}
alloc_nid_done(sbi, inode->i_ino);
} else {
- set_inode_flag(F2FS_I(inode), FI_FREE_NID);
+ set_inode_flag(inode, FI_FREE_NID);
}
f2fs_unlock_op(sbi);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 324ed3812f30..73fa356f8fbb 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
+ set_inode_flag(inode, FI_NEW_INODE);
+
+ if (test_opt(sbi, INLINE_XATTR))
+ set_inode_flag(inode, FI_INLINE_XATTR);
if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ set_inode_flag(inode, FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
- set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
+ set_inode_flag(inode, FI_INLINE_DENTRY);
f2fs_init_extent_tree(inode, NULL);
@@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
stat_inc_inline_dir(inode);
trace_f2fs_new_inode(inode, 0);
- mark_inode_dirty(inode);
return inode;
fail:
trace_f2fs_new_inode(inode, err);
make_bad_inode(inode);
if (nid_free)
- set_inode_flag(F2FS_I(inode), FI_FREE_NID);
+ set_inode_flag(inode, FI_FREE_NID);
iput(inode);
return ERR_PTR(err);
}
@@ -177,7 +180,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
inode->i_ctime = CURRENT_TIME;
ihold(inode);
- set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -190,7 +193,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_sync_fs(sbi->sb, 1);
return 0;
out:
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ clear_inode_flag(inode, FI_INC_LINK);
iput(inode);
f2fs_unlock_op(sbi);
return err;
@@ -199,9 +202,13 @@ out:
struct dentry *f2fs_get_parent(struct dentry *child)
{
struct qstr dotdot = QSTR_INIT("..", 2);
- unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot);
- if (!ino)
+ struct page *page;
+ unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page);
+ if (!ino) {
+ if (IS_ERR(page))
+ return ERR_CAST(page);
return ERR_PTR(-ENOENT);
+ }
return d_obtain_alias(f2fs_iget(child->d_sb, ino));
}
@@ -229,6 +236,9 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
if (de) {
f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto out;
} else {
err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
if (err)
@@ -239,14 +249,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
if (de) {
f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
} else {
err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
}
out:
- if (!err) {
- clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS);
- mark_inode_dirty(dir);
- }
+ if (!err)
+ clear_inode_flag(dir, FI_INLINE_DOTS);
f2fs_unlock_op(sbi);
return err;
@@ -281,8 +291,11 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENAMETOOLONG);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
- if (!de)
+ if (!de) {
+ if (IS_ERR(page))
+ return (struct dentry *)page;
return d_splice_alias(inode, dentry);
+ }
ino = le32_to_cpu(de->ino);
f2fs_dentry_kunmap(dir, page);
@@ -329,8 +342,11 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
trace_f2fs_unlink_enter(dir, dentry);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
- if (!de)
+ if (!de) {
+ if (IS_ERR(page))
+ err = PTR_ERR(page);
goto fail;
+ }
f2fs_balance_fs(sbi, true);
@@ -345,9 +361,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
f2fs_delete_entry(de, page, dir, inode);
f2fs_unlock_op(sbi);
- /* In order to evict this inode, we set it dirty */
- mark_inode_dirty(inode);
-
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
fail:
@@ -492,7 +505,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
f2fs_balance_fs(sbi, true);
- set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -509,7 +522,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return 0;
out_fail:
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ clear_inode_flag(inode, FI_INC_LINK);
handle_failed_inode(inode);
return err;
}
@@ -592,17 +605,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
* add this non-linked tmpfile to orphan list, in this way we could
* remove all unused data of tmpfile after abnormal power-off.
*/
- add_orphan_inode(sbi, inode->i_ino);
- f2fs_unlock_op(sbi);
-
+ add_orphan_inode(inode);
alloc_nid_done(sbi, inode->i_ino);
if (whiteout) {
- inode_dec_link_count(inode);
+ f2fs_i_links_write(inode, false);
*whiteout = inode;
} else {
d_tmpfile(dentry, inode);
}
+ /* link_count was changed by d_tmpfile as well. */
+ f2fs_unlock_op(sbi);
unlock_new_inode(inode);
return 0;
@@ -652,14 +665,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
- if (!old_entry)
+ if (!old_entry) {
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
goto out;
+ }
if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
- if (!old_dir_entry)
+ if (!old_dir_entry) {
+ if (IS_ERR(old_dir_page))
+ err = PTR_ERR(old_dir_page);
goto out_old;
+ }
}
if (flags & RENAME_WHITEOUT) {
@@ -677,8 +695,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
err = -ENOENT;
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
&new_page);
- if (!new_entry)
+ if (!new_entry) {
+ if (IS_ERR(new_page))
+ err = PTR_ERR(new_page);
goto out_whiteout;
+ }
f2fs_balance_fs(sbi, true);
@@ -700,19 +721,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_inode->i_ctime = CURRENT_TIME;
down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
- drop_nlink(new_inode);
- drop_nlink(new_inode);
+ f2fs_i_links_write(new_inode, false);
+ f2fs_i_links_write(new_inode, false);
up_write(&F2FS_I(new_inode)->i_sem);
- mark_inode_dirty(new_inode);
-
if (!new_inode->i_nlink)
- add_orphan_inode(sbi, new_inode->i_ino);
+ add_orphan_inode(new_inode);
else
release_orphan_inode(sbi);
-
- update_inode_page(old_inode);
- update_inode_page(new_inode);
} else {
f2fs_balance_fs(sbi, true);
@@ -724,10 +740,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_whiteout;
}
- if (old_dir_entry) {
- inc_nlink(new_dir);
- update_inode_page(new_dir);
- }
+ if (old_dir_entry)
+ f2fs_i_links_write(new_dir, true);
/*
* old entry and new entry can locate in the same inline
@@ -743,7 +757,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_entry = f2fs_find_entry(old_dir,
&old_dentry->d_name, &old_page);
if (!old_entry) {
- err = -EIO;
+ err = -ENOENT;
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
f2fs_unlock_op(sbi);
goto out_whiteout;
}
@@ -757,13 +773,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(old_inode);
+ f2fs_mark_inode_dirty_sync(old_inode);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
if (whiteout) {
whiteout->i_state |= I_LINKABLE;
- set_inode_flag(F2FS_I(whiteout), FI_INC_LINK);
+ set_inode_flag(whiteout, FI_INC_LINK);
err = f2fs_add_link(old_dentry, whiteout);
if (err)
goto put_out_dir;
@@ -775,14 +791,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dir != new_dir && !whiteout) {
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
- update_inode_page(old_inode);
} else {
f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
- drop_nlink(old_dir);
- mark_inode_dirty(old_dir);
- update_inode_page(old_dir);
+ f2fs_i_links_write(old_dir, false);
}
f2fs_unlock_op(sbi);
@@ -832,29 +845,39 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
return -EPERM;
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
- if (!old_entry)
+ if (!old_entry) {
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
goto out;
+ }
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page);
- if (!new_entry)
+ if (!new_entry) {
+ if (IS_ERR(new_page))
+ err = PTR_ERR(new_page);
goto out_old;
+ }
/* prepare for updating ".." directory entry info later */
if (old_dir != new_dir) {
if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
old_dir_entry = f2fs_parent_dir(old_inode,
&old_dir_page);
- if (!old_dir_entry)
+ if (!old_dir_entry) {
+ if (IS_ERR(old_dir_page))
+ err = PTR_ERR(old_dir_page);
goto out_new;
+ }
}
if (S_ISDIR(new_inode->i_mode)) {
- err = -EIO;
new_dir_entry = f2fs_parent_dir(new_inode,
&new_dir_page);
- if (!new_dir_entry)
+ if (!new_dir_entry) {
+ if (IS_ERR(new_dir_page))
+ err = PTR_ERR(new_dir_page);
goto out_old_dir;
+ }
}
}
@@ -904,19 +927,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
- update_inode_page(old_inode);
-
old_dir->i_ctime = CURRENT_TIME;
if (old_nlink) {
down_write(&F2FS_I(old_dir)->i_sem);
- if (old_nlink < 0)
- drop_nlink(old_dir);
- else
- inc_nlink(old_dir);
+ f2fs_i_links_write(old_dir, old_nlink > 0);
up_write(&F2FS_I(old_dir)->i_sem);
}
- mark_inode_dirty(old_dir);
- update_inode_page(old_dir);
+ f2fs_mark_inode_dirty_sync(old_dir);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
@@ -925,19 +942,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(new_inode);
up_write(&F2FS_I(new_inode)->i_sem);
- update_inode_page(new_inode);
-
new_dir->i_ctime = CURRENT_TIME;
if (new_nlink) {
down_write(&F2FS_I(new_dir)->i_sem);
- if (new_nlink < 0)
- drop_nlink(new_dir);
- else
- inc_nlink(new_dir);
+ f2fs_i_links_write(new_dir, new_nlink > 0);
up_write(&F2FS_I(new_dir)->i_sem);
}
- mark_inode_dirty(new_dir);
- update_inode_page(new_dir);
+ f2fs_mark_inode_dirty_sync(new_dir);
f2fs_unlock_op(sbi);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d1867698e601..b2fa4b615925 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -52,6 +52,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
+ if (excess_cached_nats(sbi))
+ res = false;
+ if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD)
+ res = false;
} else if (type == DIRTY_DENTS) {
if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
@@ -202,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool need = false;
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
need = true;
}
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
return need;
}
@@ -219,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@@ -233,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
return need_update;
}
@@ -280,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- down_write(&nm_i->nat_tree_lock);
+ percpu_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = grab_nat_entry(nm_i, ni->nid);
@@ -330,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
}
int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -338,8 +342,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
- if (!down_write_trylock(&nm_i->nat_tree_lock))
- return 0;
+ percpu_down_write(&nm_i->nat_tree_lock);
while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
struct nat_entry *ne;
@@ -348,7 +351,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
__del_from_nat_cache(nm_i, ne);
nr_shrink--;
}
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
@@ -370,13 +373,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
ni->nid = nid;
/* Check nat cache */
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
return;
}
@@ -400,11 +403,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
/* cache nat entry */
- down_write(&nm_i->nat_tree_lock);
+ percpu_down_write(&nm_i->nat_tree_lock);
cache_nat_entry(sbi, nid, &ne);
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
}
/*
@@ -646,6 +649,7 @@ release_out:
if (err == -ENOENT) {
dn->cur_level = i;
dn->max_level = level;
+ dn->ofs_in_node = offset[level];
}
return err;
}
@@ -670,8 +674,7 @@ static void truncate_node(struct dnode_of_data *dn)
if (dn->nid == dn->inode->i_ino) {
remove_orphan_inode(sbi, dn->nid);
dec_valid_inode_count(sbi);
- } else {
- sync_inode_page(dn);
+ f2fs_inode_synced(dn->inode);
}
invalidate:
clear_node_page_dirty(dn->node_page);
@@ -953,7 +956,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
if (IS_ERR(npage))
return PTR_ERR(npage);
- F2FS_I(inode)->i_xattr_nid = 0;
+ f2fs_i_xnid_write(inode, 0);
/* need to do checkpoint during fsync */
F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
@@ -1019,7 +1022,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
struct page *page;
int err;
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return ERR_PTR(-EPERM);
page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
@@ -1042,21 +1045,16 @@ struct page *new_node_page(struct dnode_of_data *dn,
f2fs_wait_on_page_writeback(page, NODE, true);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (set_page_dirty(page))
dn->node_changed = true;
if (f2fs_has_xattr_block(ofs))
- F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
+ f2fs_i_xnid_write(dn->inode, dn->nid);
- dn->node_page = page;
- if (ipage)
- update_inode(dn->inode, ipage);
- else
- sync_inode_page(dn);
if (ofs == 0)
inc_valid_inode_count(sbi);
-
return page;
fail:
@@ -1083,6 +1081,9 @@ static int read_node_page(struct page *page, int op_flags)
.encrypted_page = NULL,
};
+ if (PageUptodate(page))
+ return LOCKED_PAGE;
+
get_node_info(sbi, page->index, &ni);
if (unlikely(ni.blk_addr == NULL_ADDR)) {
@@ -1090,9 +1091,6 @@ static int read_node_page(struct page *page, int op_flags)
return -ENOENT;
}
- if (PageUptodate(page))
- return LOCKED_PAGE;
-
fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
return f2fs_submit_page_bio(&fio);
}
@@ -1150,16 +1148,21 @@ repeat:
lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto repeat;
}
+
+ if (unlikely(!PageUptodate(page)))
+ goto out_err;
page_hit:
- f2fs_bug_on(sbi, nid != nid_of_node(page));
+ if(unlikely(nid != nid_of_node(page))) {
+ f2fs_bug_on(sbi, 1);
+ ClearPageUptodate(page);
+out_err:
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
return page;
}
@@ -1176,24 +1179,6 @@ struct page *get_node_page_ra(struct page *parent, int start)
return __get_node_page(sbi, nid, parent, start);
}
-void sync_inode_page(struct dnode_of_data *dn)
-{
- int ret = 0;
-
- if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
- ret = update_inode(dn->inode, dn->node_page);
- } else if (dn->inode_page) {
- if (!dn->inode_page_locked)
- lock_page(dn->inode_page);
- ret = update_inode(dn->inode, dn->inode_page);
- if (!dn->inode_page_locked)
- unlock_page(dn->inode_page);
- } else {
- ret = update_inode_page(dn->inode);
- }
- dn->node_changed = ret ? true: false;
-}
-
static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode;
@@ -1319,7 +1304,7 @@ continue_unlock:
return last_page;
}
-int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
+int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic)
{
pgoff_t index, end;
@@ -1327,6 +1312,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
int ret = 0;
struct page *last_page = NULL;
bool marked = false;
+ nid_t ino = inode->i_ino;
if (atomic) {
last_page = last_fsync_dnode(sbi, ino);
@@ -1380,9 +1366,13 @@ continue_unlock:
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
- if (IS_INODE(page))
+ if (IS_INODE(page)) {
+ if (is_inode_flag_set(inode,
+ FI_DIRTY_INODE))
+ update_inode(inode, page);
set_dentry_mark(page,
need_dentry_mark(sbi, ino));
+ }
/* may be written by other thread */
if (!PageDirty(page))
set_page_dirty(page);
@@ -1630,6 +1620,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ struct blk_plug plug;
long diff;
/* balancing f2fs's metadata in background */
@@ -1643,7 +1634,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
diff = nr_pages_to_write(sbi, NODE, wbc);
wbc->sync_mode = WB_SYNC_NONE;
+ blk_start_plug(&plug);
sync_node_pages(sbi, wbc);
+ blk_finish_plug(&plug);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return 0;
@@ -1657,9 +1650,10 @@ static int f2fs_set_node_page_dirty(struct page *page)
{
trace_f2fs_set_page_dirty(page, NODE);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
SetPagePrivate(page);
f2fs_trace_pid(page);
@@ -1778,7 +1772,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
}
}
-static void build_free_nids(struct f2fs_sb_info *sbi)
+void build_free_nids(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -1787,14 +1781,14 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
nid_t nid = nm_i->next_scan_nid;
/* Enough entries */
- if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
+ if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK)
return;
/* readahead nat pages to be scanned */
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
- down_read(&nm_i->nat_tree_lock);
+ percpu_down_read(&nm_i->nat_tree_lock);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1826,7 +1820,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
remove_free_nid(nm_i, nid);
}
up_read(&curseg->journal_rwsem);
- up_read(&nm_i->nat_tree_lock);
+ percpu_up_read(&nm_i->nat_tree_lock);
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@@ -1925,12 +1919,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
struct free_nid *i, *next;
int nr = nr_shrink;
+ if (nm_i->fcnt <= MAX_FREE_NIDS)
+ return 0;
+
if (!mutex_trylock(&nm_i->build_lock))
return 0;
spin_lock(&nm_i->free_nid_list_lock);
list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
- if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK)
+ if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS)
break;
if (i->state == NID_ALLOC)
continue;
@@ -1957,7 +1954,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
ri = F2FS_INODE(page);
if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
- clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
+ clear_inode_flag(inode, FI_INLINE_XATTR);
goto update_inode;
}
@@ -1999,13 +1996,11 @@ recover_xnid:
get_node_info(sbi, new_xnid, &ni);
ni.ino = inode->i_ino;
set_node_addr(sbi, &ni, NEW_ADDR, false);
- F2FS_I(inode)->i_xattr_nid = new_xnid;
+ f2fs_i_xnid_write(inode, new_xnid);
/* 3: update xattr blkaddr */
refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
set_node_addr(sbi, &ni, blkaddr, false);
-
- update_inode_page(inode);
}
int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -2027,7 +2022,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
/* Should not use this inode from free nid list */
remove_free_nid(NM_I(sbi), ino);
- SetPageUptodate(ipage);
+ if (!PageUptodate(ipage))
+ SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
src = F2FS_INODE(page);
@@ -2213,7 +2209,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!nm_i->dirty_nat_cnt)
return;
- down_write(&nm_i->nat_tree_lock);
+ percpu_down_write(&nm_i->nat_tree_lock);
/*
* if there are no enough space in journal to store dirty nat
@@ -2236,7 +2232,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
list_for_each_entry_safe(set, tmp, &sets, set_list)
__flush_nat_entry_set(sbi, set);
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}
@@ -2272,7 +2268,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
- init_rwsem(&nm_i->nat_tree_lock);
+ if (percpu_init_rwsem(&nm_i->nat_tree_lock))
+ return -ENOMEM;
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -2329,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->free_nid_list_lock);
/* destroy nat cache */
- down_write(&nm_i->nat_tree_lock);
+ percpu_down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -2354,8 +2351,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
}
}
- up_write(&nm_i->nat_tree_lock);
+ percpu_up_write(&nm_i->nat_tree_lock);
+ percpu_free_rwsem(&nm_i->nat_tree_lock);
kfree(nm_i->nat_bitmap);
sbi->nm_info = NULL;
kfree(nm_i);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 1f4f9d4569d9..fc7684554b1a 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -15,18 +15,21 @@
#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
/* # of pages to perform synchronous readahead before building free nids */
-#define FREE_NID_PAGES 4
+#define FREE_NID_PAGES 8
+#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
-#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */
+#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */
/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128
/* control the memory footprint threshold (10MB per 1GB ram) */
-#define DEF_RAM_THRESHOLD 10
+#define DEF_RAM_THRESHOLD 1
/* control dirty nats ratio threshold (default: 10% over max nid count) */
#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10
+/* control total # of nats */
+#define DEF_NAT_CACHE_THRESHOLD 100000
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
@@ -126,6 +129,11 @@ static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
NM_I(sbi)->dirty_nats_ratio / 100;
}
+static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
+{
+ return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
+}
+
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 3d7216d7a288..9e652d5a659b 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -153,9 +153,12 @@ retry:
f2fs_delete_entry(de, page, dir, einode);
iput(einode);
goto retry;
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ } else {
+ err = __f2fs_add_link(dir, &name, inode,
+ inode->i_ino, inode->i_mode);
}
- err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode);
-
goto out;
out_unmap_put:
@@ -175,7 +178,7 @@ static void recover_inode(struct inode *inode, struct page *page)
char *name;
inode->i_mode = le16_to_cpu(raw->i_mode);
- i_size_write(inode, le64_to_cpu(raw->i_size));
+ f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
@@ -455,6 +458,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
continue;
}
+ if ((start + 1) << PAGE_SHIFT > i_size_read(inode))
+ f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT);
+
/*
* dest is reserved block, invalidate src block
* and then reserve one new block in dnode page.
@@ -476,6 +482,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
#endif
/* We should not get -ENOSPC */
f2fs_bug_on(sbi, err);
+ if (err)
+ goto err;
}
/* Check the previous node page having this index */
@@ -490,9 +498,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
}
- if (IS_INODE(dn.node_page))
- sync_inode_page(&dn);
-
copy_node_footer(dn.node_page, page);
fill_node_footer(dn.node_page, dn.nid, ni.ino,
ofs_of_node(page), false);
@@ -624,8 +629,12 @@ out:
if (err) {
bool invalidate = false;
- if (discard_next_dnode(sbi, blkaddr))
+ if (test_opt(sbi, LFS)) {
+ update_meta_page(sbi, NULL, blkaddr);
+ invalidate = true;
+ } else if (discard_next_dnode(sbi, blkaddr)) {
invalidate = true;
+ }
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4c2d1fa1e0e2..a46296f57b02 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -241,7 +241,7 @@ void drop_inmem_pages(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
mutex_lock(&fi->inmem_lock);
__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
@@ -346,6 +346,11 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
if (!need)
return;
+
+ /* balance_fs_bg is able to be pending */
+ if (excess_cached_nats(sbi))
+ f2fs_balance_fs_bg(sbi);
+
/*
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
@@ -367,7 +372,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
if (!available_free_memory(sbi, FREE_NIDS))
- try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES);
+ try_to_free_nids(sbi, MAX_FREE_NIDS);
+ else
+ build_free_nids(sbi);
/* checkpoint is the only way to shrink partial cached entries */
if (!available_free_memory(sbi, NAT_ENTRIES) ||
@@ -435,25 +442,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
if (test_opt(sbi, NOBARRIER))
return 0;
- if (!test_opt(sbi, FLUSH_MERGE)) {
+ if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
struct bio *bio = f2fs_bio_alloc(0);
int ret;
+ atomic_inc(&fcc->submit_flush);
bio->bi_bdev = sbi->sb->s_bdev;
bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
ret = submit_bio_wait(bio);
+ atomic_dec(&fcc->submit_flush);
bio_put(bio);
return ret;
}
init_completion(&cmd.wait);
+ atomic_inc(&fcc->submit_flush);
llist_add(&cmd.llnode, &fcc->issue_list);
if (!fcc->dispatch_list)
wake_up(&fcc->flush_wait_queue);
wait_for_completion(&cmd.wait);
+ atomic_dec(&fcc->submit_flush);
return cmd.ret;
}
@@ -467,6 +478,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
+ atomic_set(&fcc->submit_flush, 0);
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
SM_I(sbi)->cmd_control_info = fcc;
@@ -668,6 +680,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
break;
end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
+ if (force && start && end != max_blocks
+ && (end - start) < cpc->trim_minlen)
+ continue;
+
__add_discard_entry(sbi, cpc, se, start, end);
}
}
@@ -705,6 +721,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int start = 0, end = -1;
+ unsigned int secno, start_segno;
+ bool force = (cpc->reason == CP_DISCARD);
mutex_lock(&dirty_i->seglist_lock);
@@ -721,17 +739,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
dirty_i->nr_dirty[PRE] -= end - start;
- if (!test_opt(sbi, DISCARD))
+ if (force || !test_opt(sbi, DISCARD))
continue;
- f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
+ if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) {
+ f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
(end - start) << sbi->log_blocks_per_seg);
+ continue;
+ }
+next:
+ secno = GET_SECNO(sbi, start);
+ start_segno = secno * sbi->segs_per_sec;
+ if (!IS_CURSEC(sbi, secno) &&
+ !get_valid_blocks(sbi, start, sbi->segs_per_sec))
+ f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
+ sbi->segs_per_sec << sbi->log_blocks_per_seg);
+
+ start = start_segno + sbi->segs_per_sec;
+ if (start < end)
+ goto next;
}
mutex_unlock(&dirty_i->seglist_lock);
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
- if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen)
+ if (force && entry->len < cpc->trim_minlen)
goto skip;
f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
cpc->trimmed += entry->len;
@@ -1219,6 +1251,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
+ if (test_opt(sbi, LFS))
+ return;
+
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
__allocate_new_segments(sbi, i);
}
@@ -1392,11 +1427,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
int type = __get_segment_type(fio->page, fio->type);
+ if (fio->type == NODE || fio->type == DATA)
+ mutex_lock(&fio->sbi->wio_mutex[fio->type]);
+
allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type);
/* writeout dirty page into bdev */
f2fs_submit_page_mbio(fio);
+
+ if (fio->type == NODE || fio->type == DATA)
+ mutex_unlock(&fio->sbi->wio_mutex[fio->type]);
}
void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -2377,7 +2418,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
sm_info->rec_prefree_segments = sm_info->main_segments *
DEF_RECLAIM_PREFREE_SEGMENTS / 100;
- sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
+ if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
+ sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
+
+ if (!test_opt(sbi, LFS))
+ sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7a756ff5a36d..b33f73ec60a4 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -16,6 +16,7 @@
#define NULL_SECNO ((unsigned int)(~0))
#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
+#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
@@ -470,6 +471,10 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+
+ if (test_opt(sbi, LFS))
+ return false;
+
return free_sections(sbi) <= (node_secs + 2 * dent_secs +
reserved_sections(sbi) + 1);
}
@@ -479,6 +484,8 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
@@ -531,6 +538,9 @@ static inline bool need_inplace_update(struct inode *inode)
if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
return false;
+ if (test_opt(sbi, LFS))
+ return false;
+
if (policy & (0x1 << F2FS_IPU_FORCE))
return true;
if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
@@ -544,7 +554,7 @@ static inline bool need_inplace_update(struct inode *inode)
/* this is only set during fdatasync */
if (policy & (0x1 << F2FS_IPU_FSYNC) &&
- is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
+ is_inode_flag_set(inode, FI_NEED_IPU))
return true;
return false;
@@ -706,9 +716,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
if (type == DATA)
return sbi->blocks_per_seg;
else if (type == NODE)
- return 3 * sbi->blocks_per_seg;
+ return 8 * sbi->blocks_per_seg;
else if (type == META)
- return MAX_BIO_BLOCKS(sbi);
+ return 8 * MAX_BIO_BLOCKS(sbi);
else
return 0;
}
@@ -726,10 +736,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
nr_to_write = wbc->nr_to_write;
- if (type == DATA)
- desired = 4096;
- else if (type == NODE)
- desired = 3 * max_hw_blocks(sbi);
+ if (type == NODE)
+ desired = 2 * max_hw_blocks(sbi);
else
desired = MAX_BIO_BLOCKS(sbi);
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 93606f281bf9..46c915425923 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -13,6 +13,7 @@
#include <linux/f2fs_fs.h>
#include "f2fs.h"
+#include "node.h"
static LIST_HEAD(f2fs_list);
static DEFINE_SPINLOCK(f2fs_list_lock);
@@ -25,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
{
- if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK)
- return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK;
+ if (NM_I(sbi)->fcnt > MAX_FREE_NIDS)
+ return NM_I(sbi)->fcnt - MAX_FREE_NIDS;
return 0;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 74cc8520b8b1..b97c065cbe74 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = {
[FAULT_ORPHAN] = "orphan",
[FAULT_BLOCK] = "no more block",
[FAULT_DIR_DEPTH] = "too big dir depth",
+ [FAULT_EVICT_INODE] = "evict_inode fail",
};
static void f2fs_build_fault_attr(unsigned int rate)
@@ -75,6 +76,7 @@ enum {
Opt_disable_roll_forward,
Opt_norecovery,
Opt_discard,
+ Opt_nodiscard,
Opt_noheap,
Opt_user_xattr,
Opt_nouser_xattr,
@@ -86,13 +88,17 @@ enum {
Opt_inline_data,
Opt_inline_dentry,
Opt_flush_merge,
+ Opt_noflush_merge,
Opt_nobarrier,
Opt_fastboot,
Opt_extent_cache,
Opt_noextent_cache,
Opt_noinline_data,
Opt_data_flush,
+ Opt_mode,
Opt_fault_injection,
+ Opt_lazytime,
+ Opt_nolazytime,
Opt_err,
};
@@ -101,6 +107,7 @@ static match_table_t f2fs_tokens = {
{Opt_disable_roll_forward, "disable_roll_forward"},
{Opt_norecovery, "norecovery"},
{Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_noheap, "no_heap"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
@@ -112,13 +119,17 @@ static match_table_t f2fs_tokens = {
{Opt_inline_data, "inline_data"},
{Opt_inline_dentry, "inline_dentry"},
{Opt_flush_merge, "flush_merge"},
+ {Opt_noflush_merge, "noflush_merge"},
{Opt_nobarrier, "nobarrier"},
{Opt_fastboot, "fastboot"},
{Opt_extent_cache, "extent_cache"},
{Opt_noextent_cache, "noextent_cache"},
{Opt_noinline_data, "noinline_data"},
{Opt_data_flush, "data_flush"},
+ {Opt_mode, "mode=%s"},
{Opt_fault_injection, "fault_injection=%u"},
+ {Opt_lazytime, "lazytime"},
+ {Opt_nolazytime, "nolazytime"},
{Opt_err, NULL},
};
@@ -417,6 +428,8 @@ static int parse_options(struct super_block *sb, char *options)
"the device does not support discard");
}
break;
+ case Opt_nodiscard:
+ clear_opt(sbi, DISCARD);
case Opt_noheap:
set_opt(sbi, NOHEAP);
break;
@@ -478,6 +491,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_flush_merge:
set_opt(sbi, FLUSH_MERGE);
break;
+ case Opt_noflush_merge:
+ clear_opt(sbi, FLUSH_MERGE);
+ break;
case Opt_nobarrier:
set_opt(sbi, NOBARRIER);
break;
@@ -496,6 +512,23 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_data_flush:
set_opt(sbi, DATA_FLUSH);
break;
+ case Opt_mode:
+ name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (strlen(name) == 8 &&
+ !strncmp(name, "adaptive", 8)) {
+ set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ } else if (strlen(name) == 3 &&
+ !strncmp(name, "lfs", 3)) {
+ set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
case Opt_fault_injection:
if (args->from && match_int(args, &arg))
return -EINVAL;
@@ -506,6 +539,12 @@ static int parse_options(struct super_block *sb, char *options)
"FAULT_INJECTION was not selected");
#endif
break;
+ case Opt_lazytime:
+ sb->s_flags |= MS_LAZYTIME;
+ break;
+ case Opt_nolazytime:
+ sb->s_flags &= ~MS_LAZYTIME;
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -537,13 +576,11 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
fi->i_advise = 0;
init_rwsem(&fi->i_sem);
INIT_LIST_HEAD(&fi->dirty_list);
+ INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
-
- set_inode_flag(fi, FI_NEW_INODE);
-
- if (test_opt(F2FS_SB(sb), INLINE_XATTR))
- set_inode_flag(fi, FI_INLINE_XATTR);
+ init_rwsem(&fi->dio_rwsem[READ]);
+ init_rwsem(&fi->dio_rwsem[WRITE]);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
@@ -559,7 +596,7 @@ static int f2fs_drop_inode(struct inode *inode)
* - f2fs_gc -> iput -> evict
* - inode_wait_for_writeback(inode)
*/
- if (!inode_unhashed(inode) && inode->i_state & I_SYNC) {
+ if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) {
if (!inode->i_nlink && !is_bad_inode(inode)) {
/* to avoid evict_inode call simultaneously */
atomic_inc(&inode->i_count);
@@ -573,10 +610,10 @@ static int f2fs_drop_inode(struct inode *inode)
f2fs_destroy_extent_node(inode);
sb_start_intwrite(inode->i_sb);
- i_size_write(inode, 0);
+ f2fs_i_size_write(inode, 0);
if (F2FS_HAS_BLOCKS(inode))
- f2fs_truncate(inode, true);
+ f2fs_truncate(inode);
sb_end_intwrite(inode->i_sb);
@@ -586,9 +623,47 @@ static int f2fs_drop_inode(struct inode *inode)
}
return 0;
}
+
return generic_drop_inode(inode);
}
+int f2fs_inode_dirtied(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return 1;
+ }
+
+ set_inode_flag(inode, FI_DIRTY_INODE);
+ list_add_tail(&F2FS_I(inode)->gdirty_list,
+ &sbi->inode_list[DIRTY_META]);
+ inc_page_count(sbi, F2FS_DIRTY_IMETA);
+ stat_inc_dirty_inode(sbi, DIRTY_META);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+
+ return 0;
+}
+
+void f2fs_inode_synced(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return;
+ }
+ list_del_init(&F2FS_I(inode)->gdirty_list);
+ clear_inode_flag(inode, FI_DIRTY_INODE);
+ clear_inode_flag(inode, FI_AUTO_RECOVER);
+ dec_page_count(sbi, F2FS_DIRTY_IMETA);
+ stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+}
+
/*
* f2fs_dirty_inode() is called from __mark_inode_dirty()
*
@@ -596,7 +671,19 @@ static int f2fs_drop_inode(struct inode *inode)
*/
static void f2fs_dirty_inode(struct inode *inode, int flags)
{
- set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_META_INO(sbi))
+ return;
+
+ if (flags == I_DIRTY_TIME)
+ return;
+
+ if (is_inode_flag_set(inode, FI_AUTO_RECOVER))
+ clear_inode_flag(inode, FI_AUTO_RECOVER);
+
+ f2fs_inode_dirtied(inode);
}
static void f2fs_i_callback(struct rcu_head *head)
@@ -619,6 +706,8 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
percpu_counter_destroy(&sbi->nr_pages[i]);
percpu_counter_destroy(&sbi->alloc_valid_block_count);
percpu_counter_destroy(&sbi->total_valid_inode_count);
+
+ percpu_free_rwsem(&sbi->cp_rwsem);
}
static void f2fs_put_super(struct super_block *sb)
@@ -738,7 +827,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = sbi->blocksize;
buf->f_blocks = total_count - start_count;
- buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
+ buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
buf->f_bavail = user_block_count - valid_user_blocks(sbi);
buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
@@ -803,6 +892,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",noextent_cache");
if (test_opt(sbi, DATA_FLUSH))
seq_puts(seq, ",data_flush");
+
+ seq_puts(seq, ",mode=");
+ if (test_opt(sbi, ADAPTIVE))
+ seq_puts(seq, "adaptive");
+ else if (test_opt(sbi, LFS))
+ seq_puts(seq, "lfs");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -884,6 +979,14 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, EXTENT_CACHE);
+ sbi->sb->s_flags |= MS_LAZYTIME;
+ set_opt(sbi, FLUSH_MERGE);
+ if (f2fs_sb_mounted_hmsmr(sbi->sb)) {
+ set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ set_opt(sbi, DISCARD);
+ } else {
+ set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ }
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -1367,6 +1470,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
+ mutex_init(&sbi->wio_mutex[NODE]);
+ mutex_init(&sbi->wio_mutex[DATA]);
#ifdef CONFIG_F2FS_FS_ENCRYPTION
memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
@@ -1379,6 +1484,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
{
int i, err;
+ if (percpu_init_rwsem(&sbi->cp_rwsem))
+ return -ENOMEM;
+
for (i = 0; i < NR_COUNT_TYPE; i++) {
err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
if (err)
@@ -1530,6 +1638,8 @@ try_onemore:
goto free_sbi;
sb->s_fs_info = sbi;
+ sbi->raw_super = raw_super;
+
default_options(sbi);
/* parse mount options */
options = kstrdup((const char *)data, GFP_KERNEL);
@@ -1559,10 +1669,8 @@ try_onemore:
memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
/* init f2fs-specific super block info */
- sbi->raw_super = raw_super;
sbi->valid_super_block = valid_super_block;
mutex_init(&sbi->gc_mutex);
- mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
init_rwsem(&sbi->node_write);
@@ -1579,7 +1687,6 @@ try_onemore:
sbi->write_io[i].bio = NULL;
}
- init_rwsem(&sbi->cp_rwsem);
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
@@ -1762,6 +1869,7 @@ try_onemore:
return 0;
free_kobj:
+ f2fs_sync_inode_meta(sbi);
kobject_del(&sbi->s_kobj);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index e3decae3acfb..c8898b5148eb 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
return -EINVAL;
F2FS_I(inode)->i_advise |= *(char *)value;
- mark_inode_dirty(inode);
+ f2fs_mark_inode_dirty_sync(inode);
return 0;
}
@@ -299,6 +299,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
if (ipage) {
inline_addr = inline_xattr_addr(ipage);
f2fs_wait_on_page_writeback(ipage, NODE, true);
+ set_page_dirty(ipage);
} else {
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -441,13 +442,12 @@ static int __f2fs_setxattr(struct inode *inode, int index,
const char *name, const void *value, size_t size,
struct page *ipage, int flags)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_xattr_entry *here, *last;
void *base_addr;
int found, newsize;
size_t len;
__u32 new_hsize;
- int error = -ENOMEM;
+ int error = 0;
if (name == NULL)
return -EINVAL;
@@ -465,7 +465,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
base_addr = read_all_xattrs(inode, ipage);
if (!base_addr)
- goto exit;
+ return -ENOMEM;
/* find entry with wanted name. */
here = __find_xattr(base_addr, index, len, name);
@@ -539,19 +539,15 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (error)
goto exit;
- if (is_inode_flag_set(fi, FI_ACL_MODE)) {
- inode->i_mode = fi->i_acl_mode;
+ if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
inode->i_ctime = CURRENT_TIME;
- clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(inode, FI_ACL_MODE);
}
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
-
- if (ipage)
- update_inode(inode, ipage);
- else
- update_inode_page(inode);
+ f2fs_mark_inode_dirty_sync(inode);
exit:
kzfree(base_addr);
return error;
diff --git a/fs/internal.h b/fs/internal.h
index f57ced528cde..cef0913e5d41 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
struct super_block;
struct file_system_type;
+struct iomap;
struct linux_binprm;
struct path;
struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
* buffer.c
*/
extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, struct iomap *iomap);
/*
* char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 000000000000..48141b8eff5f
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/dax.h>
+#include "internal.h"
+
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+ void *data, struct iomap *iomap);
+
+/*
+ * Execute a iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ */
+static loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+ struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+ struct iomap iomap = { 0 };
+ loff_t written = 0, ret;
+
+ /*
+ * Need to map a range from start position for length bytes. This can
+ * span multiple pages - it is only guaranteed to return a range of a
+ * single type of pages (e.g. all into a hole, all mapped or all
+ * unwritten). Failure at this point has nothing to undo.
+ *
+ * If allocation is required for this range, reserve the space now so
+ * that the allocation is guaranteed to succeed later on. Once we copy
+ * the data into the page cache pages, then we cannot fail otherwise we
+ * expose transient stale data. If the reserve fails, we can safely
+ * back out at this point as there is nothing to undo.
+ */
+ ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+ if (ret)
+ return ret;
+ if (WARN_ON(iomap.offset > pos))
+ return -EIO;
+
+ /*
+ * Cut down the length to the one actually provided by the filesystem,
+ * as it might not be able to give us the whole size that we requested.
+ */
+ if (iomap.offset + iomap.length < pos + length)
+ length = iomap.offset + iomap.length - pos;
+
+ /*
+ * Now that we have guaranteed that the space allocation will succeed.
+ * we can do the copy-in page by page without having to worry about
+ * failures exposing transient data.
+ */
+ written = actor(inode, pos, length, data, &iomap);
+
+ /*
+ * Now the data has been copied, commit the range we've copied. This
+ * should not fail unless the filesystem has had a fatal error.
+ */
+ ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+ flags, &iomap);
+
+ return written ? written : ret;
+}
+
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+ loff_t i_size = i_size_read(inode);
+
+ /*
+ * Only truncate newly allocated pages beyoned EOF, even if the
+ * write started inside the existing inode size.
+ */
+ if (pos + len > i_size)
+ truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+}
+
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, struct iomap *iomap)
+{
+ pgoff_t index = pos >> PAGE_SHIFT;
+ struct page *page;
+ int status = 0;
+
+ BUG_ON(pos + len > iomap->offset + iomap->length);
+
+ page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+
+ status = __block_write_begin_int(page, pos, len, NULL, iomap);
+ if (unlikely(status)) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+
+ iomap_write_failed(inode, pos, len);
+ }
+
+ *pagep = page;
+ return status;
+}
+
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct page *page)
+{
+ int ret;
+
+ ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+ copied, page, NULL);
+ if (ret < len)
+ iomap_write_failed(inode, pos, len);
+ return ret;
+}
+
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct iov_iter *i = data;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = AOP_FLAG_NOFS;
+
+ /*
+ * Copies from kernel address space cannot fail (NFSD is a big user).
+ */
+ if (!iter_is_iovec(i))
+ flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ do {
+ struct page *page;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+
+ offset = (pos & (PAGE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_count(i));
+again:
+ if (bytes > length)
+ bytes = length;
+
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+
+ status = iomap_write_begin(inode, pos, bytes, flags, &page,
+ iomap);
+ if (unlikely(status))
+ break;
+
+ if (mapping_writably_mapped(inode->i_mapping))
+ flush_dcache_page(page);
+
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
+
+ flush_dcache_page(page);
+ mark_page_accessed(page);
+
+ status = iomap_write_end(inode, pos, bytes, copied, page);
+ if (unlikely(status < 0))
+ break;
+ copied = status;
+
+ cond_resched();
+
+ iov_iter_advance(i, copied);
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ pos += copied;
+ written += copied;
+ length -= copied;
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ } while (iov_iter_count(i) && length);
+
+ return written ? written : status;
+}
+
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct iomap_ops *ops)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ loff_t pos = iocb->ki_pos, ret = 0, written = 0;
+
+ while (iov_iter_count(iter)) {
+ ret = iomap_apply(inode, pos, iov_iter_count(iter),
+ IOMAP_WRITE, ops, iter, iomap_write_actor);
+ if (ret <= 0)
+ break;
+ pos += ret;
+ written += ret;
+ }
+
+ return written ? written : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+
+static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
+ unsigned bytes, struct iomap *iomap)
+{
+ struct page *page;
+ int status;
+
+ status = iomap_write_begin(inode, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
+ if (status)
+ return status;
+
+ zero_user(page, offset, bytes);
+ mark_page_accessed(page);
+
+ return iomap_write_end(inode, pos, bytes, bytes, page);
+}
+
+static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
+ struct iomap *iomap)
+{
+ sector_t sector = iomap->blkno +
+ (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
+
+ return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+}
+
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+ void *data, struct iomap *iomap)
+{
+ bool *did_zero = data;
+ loff_t written = 0;
+ int status;
+
+ /* already zeroed? we're done. */
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+ return count;
+
+ do {
+ unsigned offset, bytes;
+
+ offset = pos & (PAGE_SIZE - 1); /* Within page */
+ bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+
+ if (IS_DAX(inode))
+ status = iomap_dax_zero(pos, offset, bytes, iomap);
+ else
+ status = iomap_zero(inode, pos, offset, bytes, iomap);
+ if (status < 0)
+ return status;
+
+ pos += bytes;
+ count -= bytes;
+ written += bytes;
+ if (did_zero)
+ *did_zero = true;
+ } while (count > 0);
+
+ return written;
+}
+
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ struct iomap_ops *ops)
+{
+ loff_t ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
+ ops, did_zero, iomap_zero_range_actor);
+ if (ret <= 0)
+ return ret;
+
+ pos += ret;
+ len -= ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ struct iomap_ops *ops)
+{
+ unsigned blocksize = (1 << inode->i_blkbits);
+ unsigned off = pos & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!off)
+ return 0;
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
+
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ struct page *page = data;
+ int ret;
+
+ ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
+ NULL, iomap);
+ if (ret)
+ return ret;
+
+ block_commit_write(page, 0, length);
+ return length;
+}
+
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ unsigned long length;
+ loff_t offset, size;
+ ssize_t ret;
+
+ lock_page(page);
+ size = i_size_read(inode);
+ if ((page->mapping != inode->i_mapping) ||
+ (page_offset(page) > size)) {
+ /* We overload EFAULT to mean page got truncated */
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ /* page is wholly or partially inside EOF */
+ if (((page->index + 1) << PAGE_SHIFT) > size)
+ length = size & ~PAGE_MASK;
+ else
+ length = PAGE_SIZE;
+
+ offset = page_offset(page);
+ while (length > 0) {
+ ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
+ ops, page, iomap_page_mkwrite_actor);
+ if (unlikely(ret <= 0))
+ goto out_unlock;
+ offset += ret;
+ length -= ret;
+ }
+
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+ return 0;
+out_unlock:
+ unlock_page(page);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
+
+struct fiemap_ctx {
+ struct fiemap_extent_info *fi;
+ struct iomap prev;
+};
+
+static int iomap_to_fiemap(struct fiemap_extent_info *fi,
+ struct iomap *iomap, u32 flags)
+{
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ /* skip holes */
+ return 0;
+ case IOMAP_DELALLOC:
+ flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
+ break;
+ case IOMAP_UNWRITTEN:
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ break;
+ case IOMAP_MAPPED:
+ break;
+ }
+
+ return fiemap_fill_next_extent(fi, iomap->offset,
+ iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
+ iomap->length, flags | FIEMAP_EXTENT_MERGED);
+
+}
+
+static loff_t
+iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct fiemap_ctx *ctx = data;
+ loff_t ret = length;
+
+ if (iomap->type == IOMAP_HOLE)
+ return length;
+
+ ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
+ ctx->prev = *iomap;
+ switch (ret) {
+ case 0: /* success */
+ return length;
+ case 1: /* extent array full */
+ return 0;
+ default:
+ return ret;
+ }
+}
+
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+ loff_t start, loff_t len, struct iomap_ops *ops)
+{
+ struct fiemap_ctx ctx;
+ loff_t ret;
+
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.fi = fi;
+ ctx.prev.type = IOMAP_HOLE;
+
+ ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ return ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+ iomap_fiemap_actor);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ break;
+
+ start += ret;
+ len -= ret;
+ }
+
+ if (ctx.prev.type != IOMAP_HOLE) {
+ ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_fiemap);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 31f3df193bdb..ad2c05e80a83 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -2,6 +2,7 @@
* Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/exportfs.h>
+#include <linux/iomap.h>
#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6c3b316f932e..4ebaaf4b8d8a 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -3,6 +3,7 @@
*/
#include <linux/sunrpc/svc.h>
#include <linux/exportfs.h>
+#include <linux/iomap.h>
#include <linux/nfs4.h>
#include "nfsd.h"
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5d47b4df61ea..35faf128f36d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
depends on (64BIT || LBDAF)
select EXPORTFS
select LIBCRC32C
+ select FS_IOMAP
help
XFS is a high performance journaling filesystem which originated
on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a708e38b494c..88c26b827a2d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -84,7 +84,7 @@ xfs_alloc_lookup_ge(
* Lookup the first record less than or equal to [bno, len]
* in the btree given by cur.
*/
-int /* error */
+static int /* error */
xfs_alloc_lookup_le(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
@@ -1839,19 +1839,8 @@ void
xfs_alloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- int level;
- uint maxblocks;
- uint maxleafents;
- int minleafrecs;
- int minnoderecs;
-
- maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
- minleafrecs = mp->m_alloc_mnr[0];
- minnoderecs = mp->m_alloc_mnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
- for (level = 1; maxblocks > 1; level++)
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
- mp->m_ag_maxlevels = level;
+ mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
+ (mp->m_sb.sb_agblocks + 1) / 2);
}
/*
@@ -2658,55 +2647,79 @@ error0:
return error;
}
-/*
- * Free an extent.
- * Just break up the extent address and hand off to xfs_free_ag_extent
- * after fixing up the freelist.
- */
-int /* error */
-xfs_free_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t bno, /* starting block number of extent */
- xfs_extlen_t len) /* length of extent */
+/* Ensure that the freelist is at full capacity. */
+int
+xfs_free_extent_fix_freelist(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **agbp)
{
- xfs_alloc_arg_t args;
- int error;
+ struct xfs_alloc_arg args;
+ int error;
- ASSERT(len != 0);
- memset(&args, 0, sizeof(xfs_alloc_arg_t));
+ memset(&args, 0, sizeof(struct xfs_alloc_arg));
args.tp = tp;
args.mp = tp->t_mountp;
+ args.agno = agno;
/*
* validate that the block number is legal - the enables us to detect
* and handle a silent filesystem corruption rather than crashing.
*/
- args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
if (args.agno >= args.mp->m_sb.sb_agcount)
return -EFSCORRUPTED;
- args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
- if (args.agbno >= args.mp->m_sb.sb_agblocks)
- return -EFSCORRUPTED;
-
args.pag = xfs_perag_get(args.mp, args.agno);
ASSERT(args.pag);
error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
if (error)
- goto error0;
+ goto out;
+
+ *agbp = args.agbp;
+out:
+ xfs_perag_put(args.pag);
+ return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int /* error */
+xfs_free_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ int error;
+
+ ASSERT(len != 0);
+
+ error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+ if (error)
+ return error;
+
+ XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
/* validate the extent size is legal now we have the agf locked */
- if (args.agbno + len >
- be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
- error = -EFSCORRUPTED;
- goto error0;
- }
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
+ err);
- error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
- if (!error)
- xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
-error0:
- xfs_perag_put(args.pag);
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
+ if (error)
+ goto err;
+
+ xfs_extent_busy_insert(tp, agno, agbno, len, 0);
+ return 0;
+
+err:
+ xfs_trans_brelse(tp, agbp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 135eb3d24db7..cf268b2d0b6c 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -212,13 +212,6 @@ xfs_free_extent(
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len); /* length of extent */
-int /* error */
-xfs_alloc_lookup_le(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat); /* success/failure */
-
int /* error */
xfs_alloc_lookup_ge(
struct xfs_btree_cur *cur, /* btree cursor */
@@ -236,5 +229,7 @@ xfs_alloc_get_rec(
int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **agbp);
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 882c8d338891..4f2aed04f827 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
-int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
@@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
struct xfs_da_state_blk *drop_blk,
struct xfs_da_state_blk *save_blk);
-int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
-
/*
* Utility routines.
*/
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 932381caef1b..2f2c85cc8117 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -570,14 +570,12 @@ xfs_bmap_validate_ret(
*/
void
xfs_bmap_add_free(
+ struct xfs_mount *mp, /* mount point structure */
+ struct xfs_bmap_free *flist, /* list of extents */
xfs_fsblock_t bno, /* fs block number of extent */
- xfs_filblks_t len, /* length of extent */
- xfs_bmap_free_t *flist, /* list of extents */
- xfs_mount_t *mp) /* mount point structure */
+ xfs_filblks_t len) /* length of extent */
{
- xfs_bmap_free_item_t *cur; /* current (next) element */
- xfs_bmap_free_item_t *new; /* new element */
- xfs_bmap_free_item_t *prev; /* previous element */
+ struct xfs_bmap_free_item *new; /* new element */
#ifdef DEBUG
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -597,17 +595,7 @@ xfs_bmap_add_free(
new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
new->xbfi_startblock = bno;
new->xbfi_blockcount = (xfs_extlen_t)len;
- for (prev = NULL, cur = flist->xbf_first;
- cur != NULL;
- prev = cur, cur = cur->xbfi_next) {
- if (cur->xbfi_startblock >= bno)
- break;
- }
- if (prev)
- prev->xbfi_next = new;
- else
- flist->xbf_first = new;
- new->xbfi_next = cur;
+ list_add(&new->xbfi_list, &flist->xbf_flist);
flist->xbf_count++;
}
@@ -617,14 +605,10 @@ xfs_bmap_add_free(
*/
void
xfs_bmap_del_free(
- xfs_bmap_free_t *flist, /* free item list header */
- xfs_bmap_free_item_t *prev, /* previous item on list, if any */
- xfs_bmap_free_item_t *free) /* list item to be freed */
+ struct xfs_bmap_free *flist, /* free item list header */
+ struct xfs_bmap_free_item *free) /* list item to be freed */
{
- if (prev)
- prev->xbfi_next = free->xbfi_next;
- else
- flist->xbf_first = free->xbfi_next;
+ list_del(&free->xbfi_list);
flist->xbf_count--;
kmem_zone_free(xfs_bmap_free_item_zone, free);
}
@@ -634,17 +618,16 @@ xfs_bmap_del_free(
*/
void
xfs_bmap_cancel(
- xfs_bmap_free_t *flist) /* list of bmap_free_items */
+ struct xfs_bmap_free *flist) /* list of bmap_free_items */
{
- xfs_bmap_free_item_t *free; /* free list item */
- xfs_bmap_free_item_t *next;
+ struct xfs_bmap_free_item *free; /* free list item */
if (flist->xbf_count == 0)
return;
- ASSERT(flist->xbf_first != NULL);
- for (free = flist->xbf_first; free; free = next) {
- next = free->xbfi_next;
- xfs_bmap_del_free(flist, NULL, free);
+ while (!list_empty(&flist->xbf_flist)) {
+ free = list_first_entry(&flist->xbf_flist,
+ struct xfs_bmap_free_item, xbfi_list);
+ xfs_bmap_del_free(flist, free);
}
ASSERT(flist->xbf_count == 0);
}
@@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents(
cblock = XFS_BUF_TO_BLOCK(cbp);
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
- xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+ xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
ip->i_d.di_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
@@ -5073,8 +5056,8 @@ xfs_bmap_del_extent(
* If we need to, add to list of extents to delete.
*/
if (do_fx)
- xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
- mp);
+ xfs_bmap_add_free(mp, flist, del->br_startblock,
+ del->br_blockcount);
/*
* Adjust inode # blocks in the file.
*/
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 423a34e832bd..f1f3ae6c0a3f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -62,12 +62,12 @@ struct xfs_bmalloca {
* List of extents to be free "later".
* The list is kept sorted on xbf_startblock.
*/
-typedef struct xfs_bmap_free_item
+struct xfs_bmap_free_item
{
xfs_fsblock_t xbfi_startblock;/* starting fs block number */
xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
- struct xfs_bmap_free_item *xbfi_next; /* link to next entry */
-} xfs_bmap_free_item_t;
+ struct list_head xbfi_list;
+};
/*
* Header for free extent list.
@@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item
*/
typedef struct xfs_bmap_free
{
- xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
+ struct list_head xbf_flist; /* list of to-be-free extents */
int xbf_count; /* count of items on list */
int xbf_low; /* alloc in low mode */
} xfs_bmap_free_t;
@@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w)
static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
{
- ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
- (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
+ INIT_LIST_HEAD(&flp->xbf_flist);
+ flp->xbf_count = 0;
+ flp->xbf_low = 0;
+ *fbp = NULLFSBLOCK;
}
/*
@@ -191,8 +193,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
- struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
+ xfs_fsblock_t bno, xfs_filblks_t len);
void xfs_bmap_cancel(struct xfs_bmap_free *flist);
int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6282f6e708af..db0c71e470c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -526,7 +526,7 @@ xfs_bmbt_free_block(
struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
- xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+ xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
ip->i_d.di_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 1f88e1ce770f..07eeb0b4ca74 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -543,12 +543,12 @@ xfs_btree_ptr_addr(
*/
STATIC struct xfs_btree_block *
xfs_btree_get_iroot(
- struct xfs_btree_cur *cur)
+ struct xfs_btree_cur *cur)
{
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
- return (struct xfs_btree_block *)ifp->if_broot;
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+ return (struct xfs_btree_block *)ifp->if_broot;
}
/*
@@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify(
return true;
}
+
+/*
+ * Calculate the number of btree levels needed to store a given number of
+ * records in a short-format btree.
+ */
+uint
+xfs_btree_compute_maxlevels(
+ struct xfs_mount *mp,
+ uint *limits,
+ unsigned long len)
+{
+ uint level;
+ unsigned long maxblocks;
+
+ maxblocks = (len + limits[0] - 1) / limits[0];
+ for (level = 1; maxblocks > 1; level++)
+ maxblocks = (maxblocks + limits[1] - 1) / limits[1];
+ return level;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 2e874be70209..785a99682159 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
+ unsigned long len);
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 097bf7717d80..0f1f165f4048 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -356,7 +356,6 @@ xfs_da3_split(
struct xfs_da_state_blk *newblk;
struct xfs_da_state_blk *addblk;
struct xfs_da_intnode *node;
- struct xfs_buf *bp;
int max;
int action = 0;
int error;
@@ -397,7 +396,9 @@ xfs_da3_split(
break;
}
/*
- * Entry wouldn't fit, split the leaf again.
+ * Entry wouldn't fit, split the leaf again. The new
+ * extrablk will be consumed by xfs_da3_node_split if
+ * the node is split.
*/
state->extravalid = 1;
if (state->inleaf) {
@@ -446,6 +447,14 @@ xfs_da3_split(
return 0;
/*
+ * xfs_da3_node_split() should have consumed any extra blocks we added
+ * during a double leaf split in the attr fork. This is guaranteed as
+ * we can't be here if the attr fork only has a single leaf block.
+ */
+ ASSERT(state->extravalid == 0 ||
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /*
* Split the root node.
*/
ASSERT(state->path.active == 0);
@@ -457,43 +466,33 @@ xfs_da3_split(
}
/*
- * Update pointers to the node which used to be block 0 and
- * just got bumped because of the addition of a new root node.
- * There might be three blocks involved if a double split occurred,
- * and the original block 0 could be at any position in the list.
+ * Update pointers to the node which used to be block 0 and just got
+ * bumped because of the addition of a new root node. Note that the
+ * original block 0 could be at any position in the list of blocks in
+ * the tree.
*
- * Note: the magic numbers and sibling pointers are in the same
- * physical place for both v2 and v3 headers (by design). Hence it
- * doesn't matter which version of the xfs_da_intnode structure we use
- * here as the result will be the same using either structure.
+ * Note: the magic numbers and sibling pointers are in the same physical
+ * place for both v2 and v3 headers (by design). Hence it doesn't matter
+ * which version of the xfs_da_intnode structure we use here as the
+ * result will be the same using either structure.
*/
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
- if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
- bp = addblk->bp;
- } else {
- ASSERT(state->extravalid);
- bp = state->extrablk.bp;
- }
- node = bp->b_addr;
+ ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+ node = addblk->bp->b_addr;
node->hdr.info.back = cpu_to_be32(oldblk->blkno);
- xfs_trans_log_buf(state->args->trans, bp,
- XFS_DA_LOGRANGE(node, &node->hdr.info,
- sizeof(node->hdr.info)));
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
}
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
- if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
- bp = addblk->bp;
- } else {
- ASSERT(state->extravalid);
- bp = state->extrablk.bp;
- }
- node = bp->b_addr;
+ ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+ node = addblk->bp->b_addr;
node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
- xfs_trans_log_buf(state->args->trans, bp,
- XFS_DA_LOGRANGE(node, &node->hdr.info,
- sizeof(node->hdr.info)));
+ xfs_trans_log_buf(state->args->trans, addblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
}
addblk->bp = NULL;
return 0;
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index 9d624a622946..f1e8d4dbb600 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -40,8 +40,7 @@ xfs_dir2_sf_entsize(
int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
count += len; /* name */
- count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
- sizeof(xfs_dir2_ino4_t); /* ino # */
+ count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
return count;
}
@@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype(
static xfs_ino_t
xfs_dir2_sf_get_ino(
struct xfs_dir2_sf_hdr *hdr,
- xfs_dir2_inou_t *from)
+ __uint8_t *from)
{
if (hdr->i8count)
- return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+ return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
else
- return get_unaligned_be32(&from->i4.i);
+ return get_unaligned_be32(from);
}
static void
xfs_dir2_sf_put_ino(
struct xfs_dir2_sf_hdr *hdr,
- xfs_dir2_inou_t *to,
+ __uint8_t *to,
xfs_ino_t ino)
{
ASSERT((ino & 0xff00000000000000ULL) == 0);
if (hdr->i8count)
- put_unaligned_be64(ino, &to->i8.i);
+ put_unaligned_be64(ino, to);
else
- put_unaligned_be32(ino, &to->i4.i);
+ put_unaligned_be32(ino, to);
}
static xfs_ino_t
xfs_dir2_sf_get_parent_ino(
struct xfs_dir2_sf_hdr *hdr)
{
- return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+ return xfs_dir2_sf_get_ino(hdr, hdr->parent);
}
static void
@@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino(
struct xfs_dir2_sf_hdr *hdr,
xfs_ino_t ino)
{
- xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+ xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
}
/*
@@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino(
struct xfs_dir2_sf_hdr *hdr,
struct xfs_dir2_sf_entry *sfep)
{
- return xfs_dir2_sf_get_ino(hdr,
- (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+ return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
}
static void
@@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino(
struct xfs_dir2_sf_entry *sfep,
xfs_ino_t ino)
{
- xfs_dir2_sf_put_ino(hdr,
- (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+ xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
}
static xfs_ino_t
@@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino(
struct xfs_dir2_sf_hdr *hdr,
struct xfs_dir2_sf_entry *sfep)
{
- return xfs_dir2_sf_get_ino(hdr,
- (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+ return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
}
static void
@@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino(
struct xfs_dir2_sf_entry *sfep,
xfs_ino_t ino)
{
- xfs_dir2_sf_put_ino(hdr,
- (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+ xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
}
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 8d4d8bce41bf..685f23b67056 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t;
typedef uint xfs_dir2_data_aoff_t; /* argument form */
/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only need 16 bits, this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
-
-/*
* Offset in data space of a data entry.
*/
typedef __uint32_t xfs_dir2_dataptr_t;
@@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t;
*/
typedef __uint32_t xfs_dir2_db_t;
-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
-
-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+#define XFS_INO32_SIZE 4
+#define XFS_INO64_SIZE 8
+#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
-typedef union {
- xfs_dir2_ino8_t i8;
- xfs_dir2_ino4_t i4;
-} xfs_dir2_inou_t;
#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
/*
@@ -246,39 +228,38 @@ typedef union {
typedef struct xfs_dir2_sf_hdr {
__uint8_t count; /* count of entries */
__uint8_t i8count; /* count of 8-byte inode #s */
- xfs_dir2_inou_t parent; /* parent dir inode number */
-} __arch_pack xfs_dir2_sf_hdr_t;
+ __uint8_t parent[8]; /* parent dir inode number */
+} __packed xfs_dir2_sf_hdr_t;
typedef struct xfs_dir2_sf_entry {
__u8 namelen; /* actual name length */
- xfs_dir2_sf_off_t offset; /* saved offset */
+ __u8 offset[2]; /* saved offset */
__u8 name[]; /* name, variable size */
/*
* A single byte containing the file type field follows the inode
* number for version 3 directory entries.
*
- * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
- * variable offset after the name.
+ * A 64-bit or 32-bit inode number follows here, at a variable offset
+ * after the name.
*/
-} __arch_pack xfs_dir2_sf_entry_t;
+} xfs_dir2_sf_entry_t;
static inline int xfs_dir2_sf_hdr_size(int i8count)
{
return sizeof(struct xfs_dir2_sf_hdr) -
- (i8count == 0) *
- (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+ (i8count == 0) * XFS_INO64_DIFF;
}
static inline xfs_dir2_data_aoff_t
xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
{
- return get_unaligned_be16(&sfep->offset.i);
+ return get_unaligned_be16(sfep->offset);
}
static inline void
xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
{
- put_unaligned_be16(off, &sfep->offset.i);
+ put_unaligned_be16(off, sfep->offset);
}
static inline struct xfs_dir2_sf_entry *
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index e5bb9cc3b243..c6809ff41197 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -126,13 +126,12 @@ xfs_dir2_block_sfsize(
/*
* Calculate the new size, see if we should give up yet.
*/
- size = xfs_dir2_sf_hdr_size(i8count) + /* header */
- count + /* namelen */
- count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
- namelen + /* name */
- (i8count ? /* inumber */
- (uint)sizeof(xfs_dir2_ino8_t) * count :
- (uint)sizeof(xfs_dir2_ino4_t) * count);
+ size = xfs_dir2_sf_hdr_size(i8count) + /* header */
+ count * 3 * sizeof(u8) + /* namelen + offset */
+ namelen + /* name */
+ (i8count ? /* inumber */
+ count * XFS_INO64_SIZE :
+ count * XFS_INO32_SIZE);
if (size > XFS_IFORK_DSIZE(dp))
return size; /* size value is a failure */
}
@@ -319,10 +318,7 @@ xfs_dir2_sf_addname(
/*
* Yes, adjust the inode size. old count + (parent + new)
*/
- incr_isize +=
- (sfp->count + 2) *
- ((uint)sizeof(xfs_dir2_ino8_t) -
- (uint)sizeof(xfs_dir2_ino4_t));
+ incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
objchange = 1;
}
@@ -897,11 +893,7 @@ xfs_dir2_sf_replace(
int error; /* error return value */
int newsize; /* new inode size */
- newsize =
- dp->i_df.if_bytes +
- (sfp->count + 1) *
- ((uint)sizeof(xfs_dir2_ino8_t) -
- (uint)sizeof(xfs_dir2_ino4_t));
+ newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
/*
* Won't fit as shortform, convert to block then do replace.
*/
@@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4(
/*
* Compute the new inode size.
*/
- newsize =
- oldsize -
- (oldsfp->count + 1) *
- ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+ newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
/*
@@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4(
i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
- sfep->offset = oldsfep->offset;
+ memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
memcpy(sfep->name, oldsfep->name, sfep->namelen);
dp->d_ops->sf_put_ino(sfp, sfep,
dp->d_ops->sf_get_ino(oldsfp, oldsfep));
@@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8(
/*
* Compute the new inode size (nb: entry count + 1 for parent)
*/
- newsize =
- oldsize +
- (oldsfp->count + 1) *
- ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+ newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
/*
@@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8(
i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
- sfep->offset = oldsfep->offset;
+ memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
memcpy(sfep->name, oldsfep->name, sfep->namelen);
dp->d_ops->sf_put_ino(sfp, sfep,
dp->d_ops->sf_get_ino(oldsfp, oldsfep));
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index dc97eb21af07..adb204d40f22 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
* with the crc feature bit, and all accesses to them must be conditional on
* that flag.
*/
+/* short form block header */
+struct xfs_btree_block_shdr {
+ __be32 bb_leftsib;
+ __be32 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be32 bb_owner;
+ __le32 bb_crc;
+};
+
+/* long form block header */
+struct xfs_btree_block_lhdr {
+ __be64 bb_leftsib;
+ __be64 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be64 bb_owner;
+ __le32 bb_crc;
+ __be32 bb_pad; /* padding for alignment */
+};
+
struct xfs_btree_block {
__be32 bb_magic; /* magic number for block type */
__be16 bb_level; /* 0 is a leaf */
__be16 bb_numrecs; /* current # of data records */
union {
- struct {
- __be32 bb_leftsib;
- __be32 bb_rightsib;
-
- __be64 bb_blkno;
- __be64 bb_lsn;
- uuid_t bb_uuid;
- __be32 bb_owner;
- __le32 bb_crc;
- } s; /* short form pointers */
- struct {
- __be64 bb_leftsib;
- __be64 bb_rightsib;
-
- __be64 bb_blkno;
- __be64 bb_lsn;
- uuid_t bb_uuid;
- __be64 bb_owner;
- __le32 bb_crc;
- __be32 bb_pad; /* padding for alignment */
- } l; /* long form pointers */
+ struct xfs_btree_block_shdr s;
+ struct xfs_btree_block_lhdr l;
} bb_u; /* rest */
};
-#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
+/* size of a short form block */
+#define XFS_BTREE_SBLOCK_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ offsetof(struct xfs_btree_block_shdr, bb_blkno))
+/* size of a long form block */
+#define XFS_BTREE_LBLOCK_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ offsetof(struct xfs_btree_block_lhdr, bb_blkno))
/* sizes of CRC enabled btree blocks */
-#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
-#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
+#define XFS_BTREE_SBLOCK_CRC_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ sizeof(struct xfs_btree_block_shdr))
+#define XFS_BTREE_LBLOCK_CRC_LEN \
+ (offsetof(struct xfs_btree_block, bb_u) + \
+ sizeof(struct xfs_btree_block_lhdr))
#define XFS_BTREE_SBLOCK_CRC_OFF \
offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index fffe3d01bd9f..f5ec9c5ccae6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -521,12 +521,8 @@ typedef struct xfs_swapext
#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
-/* XFS_IOC_FREEZE -- FIFREEZE 119 */
-/* XFS_IOC_THAW -- FITHAW 120 */
-#ifndef FIFREEZE
-#define XFS_IOC_FREEZE _IOWR('X', 119, int)
-#define XFS_IOC_THAW _IOWR('X', 120, int)
-#endif
+#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
+#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 22297f9b0fd5..4b1e408169a8 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
- XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
- mp->m_ialloc_blks, flist, mp);
+ xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ mp->m_ialloc_blks);
return;
}
@@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
- flist, mp);
+ xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
+ contigblk);
/* reset range to current bit and carry on... */
startidx = endidx = nextbit;
@@ -2395,20 +2394,11 @@ void
xfs_ialloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- int level;
- uint maxblocks;
- uint maxleafents;
- int minleafrecs;
- int minnoderecs;
-
- maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
- XFS_INODES_PER_CHUNK_LOG;
- minleafrecs = mp->m_inobt_mnr[0];
- minnoderecs = mp->m_inobt_mnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
- for (level = 1; maxblocks > 1; level++)
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
- mp->m_in_maxlevels = level;
+ uint inodes;
+
+ inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+ mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
+ inodes);
}
/*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 951c044e24e4..e2e1106c9fad 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
-int
+static int
xfs_rtbuf_get(
xfs_mount_t *mp, /* file system mount structure */
xfs_trans_t *tp, /* transaction pointer */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 87d2b215cbbd..7575cfc3ad15 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
* We're now finished for good with this page. Update the page state via the
* associated buffer_heads, paying attention to the start and end offsets that
* we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
*/
static void
xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
int error)
{
unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
- struct buffer_head *head, *bh;
+ struct buffer_head *head, *bh, *next;
unsigned int off = 0;
+ unsigned int bsize;
ASSERT(bvec->bv_offset < PAGE_SIZE);
ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
bh = head = page_buffers(bvec->bv_page);
+ bsize = bh->b_size;
do {
+ next = bh->b_this_page;
if (off < bvec->bv_offset)
goto next_bh;
if (off > end)
break;
bh->b_end_io(bh, !error);
next_bh:
- off += bh->b_size;
- } while ((bh = bh->b_this_page) != head);
+ off += bsize;
+ } while ((bh = next) != head);
}
/*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+ /*
+ * mm accommodates an old ext3 case where clean pages might not have had
+ * the dirty bit cleared. Thus, it can send actual dirty pages to
+ * ->releasepage() via shrink_active_list(). Conversely,
+ * block_invalidatepage() can send pages that are still marked dirty
+ * but otherwise have invalidated buffers.
+ *
+ * We've historically freed buffers on the latter. Instead, quietly
+ * filter out all dirty pages to avoid spurious buffer state warnings.
+ * This can likely be removed once shrink_active_list() is fixed.
+ */
+ if (PageDirty(page))
+ return 0;
+
xfs_count_page_state(page, &delalloc, &unwritten);
if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
ssize_t size;
int new = 0;
+ BUG_ON(create && !direct);
+
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
- if (!create && direct && offset >= i_size_read(inode))
+ if (!create && offset >= i_size_read(inode))
return 0;
/*
* Direct I/O is usually done on preallocated files, so try getting
- * a block mapping without an exclusive lock first. For buffered
- * writes we already have the exclusive iolock anyway, so avoiding
- * a lock roundtrip here by taking the ilock exclusive from the
- * beginning is a useful micro optimization.
+ * a block mapping without an exclusive lock first.
*/
- if (create && !direct) {
- lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, lockmode);
- } else {
- lockmode = xfs_ilock_data_map_shared(ip);
- }
+ lockmode = xfs_ilock_data_map_shared(ip);
ASSERT(offset <= mp->m_super->s_maxbytes);
if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
(imap.br_startblock == HOLESTARTBLOCK ||
imap.br_startblock == DELAYSTARTBLOCK) ||
(IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
- if (direct || xfs_get_extsz_hint(ip)) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
-
- error = xfs_iomap_write_direct(ip, offset, size,
- &imap, nimaps);
- if (error)
- return error;
- new = 1;
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
- } else {
- /*
- * Delalloc reservations do not require a transaction,
- * we can go on without dropping the lock here. If we
- * are allocating a new delalloc block, make sure that
- * we set the new flag so that we mark the buffer new so
- * that we know that it is newly allocated if the write
- * fails.
- */
- if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
- new = 1;
- error = xfs_iomap_write_delay(ip, offset, size, &imap);
- if (error)
- goto out_unlock;
+ error = xfs_iomap_write_direct(ip, offset, size,
+ &imap, nimaps);
+ if (error)
+ return error;
+ new = 1;
- xfs_iunlock(ip, lockmode);
- }
trace_xfs_get_blocks_alloc(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
}
/* trim mapping down to size requested */
- if (direct || size > (1 << inode->i_blkbits))
- xfs_map_trim_size(inode, iblock, bh_result,
- &imap, offset, size);
+ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
/*
* For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
- if (create && direct) {
+ if (create) {
if (dax_fault)
ASSERT(!ISUNWRITTEN(&imap));
else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
(new || ISUNWRITTEN(&imap))))
set_buffer_new(bh_result);
- if (imap.br_startblock == DELAYSTARTBLOCK) {
- BUG_ON(direct);
- if (create) {
- set_buffer_uptodate(bh_result);
- set_buffer_mapped(bh_result);
- set_buffer_delay(bh_result);
- }
- }
+ BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
return 0;
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
* whereas if we have flags set we will always be called in task context
* (i.e. from a workqueue).
*/
-STATIC int
+int
xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
struct kiocb *iocb,
struct iov_iter *iter)
{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
- dio_iodone_t *endio = NULL;
- int flags = 0;
- struct block_device *bdev;
-
- if (iov_iter_rw(iter) == WRITE) {
- endio = xfs_end_io_direct_write;
- flags = DIO_ASYNC_EXTEND;
- }
-
- if (IS_DAX(inode)) {
- return dax_do_io(iocb, inode, iter,
- xfs_get_blocks_direct, endio, 0);
- }
-
- bdev = xfs_find_bdev_for_inode(inode);
- return __blockdev_direct_IO(iocb, inode, bdev, iter,
- xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
- struct inode *inode,
- loff_t start,
- loff_t end)
-{
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error;
-
- start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
- end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
- if (end_fsb <= start_fsb)
- return;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error) {
- /* something screwed, just bail */
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_alert(ip->i_mount,
- "xfs_vm_write_failed: unable to clean up ino %lld",
- ip->i_ino);
- }
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
- struct inode *inode,
- struct page *page,
- loff_t pos,
- unsigned len)
-{
- loff_t block_offset;
- loff_t block_start;
- loff_t block_end;
- loff_t from = pos & (PAGE_SIZE - 1);
- loff_t to = from + len;
- struct buffer_head *bh, *head;
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
-
/*
- * The request pos offset might be 32 or 64 bit, this is all fine
- * on 64-bit platform. However, for 64-bit pos request on 32-bit
- * platform, the high 32-bit will be masked off if we evaluate the
- * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
- * 0xfffff000 as an unsigned long, hence the result is incorrect
- * which could cause the following ASSERT failed in most cases.
- * In order to avoid this, we can evaluate the block_offset of the
- * start of the page by using shifts rather than masks the mismatch
- * problem.
+ * We just need the method present so that open/fcntl allow direct I/O.
*/
- block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
- ASSERT(block_offset + from == pos);
-
- head = page_buffers(page);
- block_start = 0;
- for (bh = head; bh != head || !block_start;
- bh = bh->b_this_page, block_start = block_end,
- block_offset += bh->b_size) {
- block_end = block_start + bh->b_size;
-
- /* skip buffers before the write */
- if (block_end <= from)
- continue;
-
- /* if the buffer is after the write, we're done */
- if (block_start >= to)
- break;
-
- /*
- * Process delalloc and unwritten buffers beyond EOF. We can
- * encounter unwritten buffers in the event that a file has
- * post-EOF unwritten extents and an extending write happens to
- * fail (e.g., an unaligned write that also involves a delalloc
- * to the same page).
- */
- if (!buffer_delay(bh) && !buffer_unwritten(bh))
- continue;
-
- if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
- block_offset < i_size_read(inode))
- continue;
-
- if (buffer_delay(bh))
- xfs_vm_kill_delalloc_range(inode, block_offset,
- block_offset + bh->b_size);
-
- /*
- * This buffer does not contain data anymore. make sure anyone
- * who finds it knows that for certain.
- */
- clear_buffer_delay(bh);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
- clear_buffer_new(bh);
- clear_buffer_dirty(bh);
- clear_buffer_unwritten(bh);
- }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned len,
- unsigned flags,
- struct page **pagep,
- void **fsdata)
-{
- pgoff_t index = pos >> PAGE_SHIFT;
- struct page *page;
- int status;
- struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
-
- ASSERT(len <= PAGE_SIZE);
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
-
- status = __block_write_begin(page, pos, len, xfs_get_blocks);
- if (xfs_mp_fail_writes(mp))
- status = -EIO;
- if (unlikely(status)) {
- struct inode *inode = mapping->host;
- size_t isize = i_size_read(inode);
-
- xfs_vm_write_failed(inode, page, pos, len);
- unlock_page(page);
-
- /*
- * If the write is beyond EOF, we only want to kill blocks
- * allocated in this write, not blocks that were previously
- * written successfully.
- */
- if (xfs_mp_fail_writes(mp))
- isize = 0;
- if (pos + len > isize) {
- ssize_t start = max_t(ssize_t, pos, isize);
-
- truncate_pagecache_range(inode, start, pos + len);
- }
-
- put_page(page);
- page = NULL;
- }
-
- *pagep = page;
- return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned len,
- unsigned copied,
- struct page *page,
- void *fsdata)
-{
- int ret;
-
- ASSERT(len <= PAGE_SIZE);
-
- ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (unlikely(ret < len)) {
- struct inode *inode = mapping->host;
- size_t isize = i_size_read(inode);
- loff_t to = pos + len;
-
- if (to > isize) {
- /* only kill blocks in this write beyond EOF */
- if (pos > isize)
- isize = pos;
- xfs_vm_kill_delalloc_range(inode, isize, to);
- truncate_pagecache_range(inode, isize, to);
- }
- }
- return ret;
+ return -EINVAL;
}
STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
.set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
- .write_begin = xfs_vm_write_begin,
- .write_end = xfs_vm_write_end,
.bmap = xfs_vm_bmap,
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 814aab790713..bf2d9a141a73 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
+int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private);
+
extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 55d214981ed2..be0b79d8900f 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -322,7 +322,7 @@ xfs_attr3_node_inactive(
* Recurse (gasp!) through the attribute nodes until we find leaves.
* We're doing a depth-first traversal in order to invalidate everything.
*/
-int
+static int
xfs_attr3_root_inactive(
struct xfs_trans **trans,
struct xfs_inode *dp)
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index d25f26b22ac9..25e76cd6c053 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
* we have to calculate each entries' hashvalue and sort them before
* we can begin returning them to the user.
*/
-int
+static int
xfs_attr_shortform_list(xfs_attr_list_context_t *context)
{
attrlist_cursor_kern_t *cursor;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 586bb64e674b..cd4a850564f2 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -79,6 +79,23 @@ xfs_zero_extent(
GFP_NOFS, true);
}
+/* Sort bmap items by AG. */
+static int
+xfs_bmap_free_list_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_bmap_free_item *ra;
+ struct xfs_bmap_free_item *rb;
+
+ ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
+ rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
+ return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
+}
+
/*
* Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
* caller. Frees all the extents that need freeing, which must be done
@@ -99,14 +116,15 @@ xfs_bmap_finish(
int error; /* error return value */
int committed;/* xact committed or not */
struct xfs_bmap_free_item *free; /* free extent item */
- struct xfs_bmap_free_item *next; /* next item on free list */
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
if (flist->xbf_count == 0)
return 0;
+ list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
+
efi = xfs_trans_get_efi(*tp, flist->xbf_count);
- for (free = flist->xbf_first; free; free = free->xbfi_next)
+ list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
free->xbfi_blockcount);
@@ -125,9 +143,7 @@ xfs_bmap_finish(
if (committed) {
xfs_efi_release(efi);
xfs_force_shutdown((*tp)->t_mountp,
- (error == -EFSCORRUPTED) ?
- SHUTDOWN_CORRUPT_INCORE :
- SHUTDOWN_META_IO_ERROR);
+ SHUTDOWN_META_IO_ERROR);
}
return error;
}
@@ -138,15 +154,15 @@ xfs_bmap_finish(
* on error.
*/
efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
- for (free = flist->xbf_first; free != NULL; free = next) {
- next = free->xbfi_next;
-
+ while (!list_empty(&flist->xbf_flist)) {
+ free = list_first_entry(&flist->xbf_flist,
+ struct xfs_bmap_free_item, xbfi_list);
error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
free->xbfi_blockcount);
if (error)
return error;
- xfs_bmap_del_free(flist, NULL, free);
+ xfs_bmap_del_free(flist, free);
}
return 0;
@@ -409,7 +425,7 @@ xfs_bmap_count_tree(
/*
* Count fsblocks of the given fork.
*/
-int /* error */
+static int /* error */
xfs_bmap_count_blocks(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode */
@@ -799,7 +815,7 @@ xfs_bmap_punch_delalloc_range(
if (error)
break;
- ASSERT(!flist.xbf_count && !flist.xbf_first);
+ ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
next_block:
start_fsb++;
remaining--;
@@ -1089,99 +1105,120 @@ error1: /* Just cancel transaction */
return error;
}
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
- xfs_inode_t *ip,
- xfs_off_t startoff,
- xfs_off_t endoff)
+static int
+xfs_unmap_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t startoffset_fsb,
+ xfs_filblks_t len_fsb,
+ int *done)
{
- xfs_bmbt_irec_t imap;
- xfs_fileoff_t offset_fsb;
- xfs_off_t lastoffset;
- xfs_off_t offset;
- xfs_buf_t *bp;
- xfs_mount_t *mp = ip->i_mount;
- int nimap;
- int error = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t firstfsb;
+ uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ int error;
- /*
- * Avoid doing I/O beyond eof - it's not necessary
- * since nothing can read beyond eof. The space will
- * be zeroed when the file is extended anyway.
- */
- if (startoff >= XFS_ISIZE(ip))
- return 0;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ if (error) {
+ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ return error;
+ }
- if (endoff > XFS_ISIZE(ip))
- endoff = XFS_ISIZE(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
+ ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out_trans_cancel;
- for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
- uint lock_mode;
+ xfs_trans_ijoin(tp, ip, 0);
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- nimap = 1;
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
+ &free_list, done);
+ if (error)
+ goto out_bmap_cancel;
- lock_mode = xfs_ilock_data_map_shared(ip);
- error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
- xfs_iunlock(ip, lock_mode);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error)
+ goto out_bmap_cancel;
- if (error || nimap < 1)
- break;
- ASSERT(imap.br_blockcount >= 1);
- ASSERT(imap.br_startoff == offset_fsb);
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ error = xfs_trans_commit(tp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
- if (imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_state == XFS_EXT_UNWRITTEN) {
- /* skip the entire extent */
- lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
- imap.br_blockcount) - 1;
- continue;
- }
+out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
- lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
- if (lastoffset > endoff)
- lastoffset = endoff;
+static int
+xfs_adjust_extent_unmap_boundaries(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *startoffset_fsb,
+ xfs_fileoff_t *endoffset_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ int nimap, error;
+ xfs_extlen_t mod = 0;
- /* DAX can just zero the backing device directly */
- if (IS_DAX(VFS_I(ip))) {
- error = dax_zero_page_range(VFS_I(ip), offset,
- lastoffset - offset + 1,
- xfs_get_blocks_direct);
- if (error)
- return error;
- continue;
- }
+ nimap = 1;
+ error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
+ if (error)
+ return error;
- error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp,
- xfs_fsb_to_db(ip, imap.br_startblock),
- BTOBB(mp->m_sb.sb_blocksize),
- 0, &bp, NULL);
- if (error)
- return error;
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ xfs_daddr_t block;
- memset(bp->b_addr +
- (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
- 0, lastoffset - offset + 1);
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ block = imap.br_startblock;
+ mod = do_div(block, mp->m_sb.sb_rextsize);
+ if (mod)
+ *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+ }
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- return error;
+ nimap = 1;
+ error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
+ if (error)
+ return error;
+
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ mod++;
+ if (mod && mod != mp->m_sb.sb_rextsize)
+ *endoffset_fsb -= mod;
}
- return error;
+
+ return 0;
+}
+
+static int
+xfs_flush_unmap_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ xfs_off_t rounding, start, end;
+ int error;
+
+ /* wait for the completion of any pending DIOs */
+ inode_dio_wait(inode);
+
+ rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
+ start = round_down(offset, rounding);
+ end = round_up(offset + len, rounding) - 1;
+
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return error;
+ truncate_pagecache_range(inode, start, end);
+ return 0;
}
int
@@ -1190,24 +1227,10 @@ xfs_free_file_space(
xfs_off_t offset,
xfs_off_t len)
{
- int done;
- xfs_fileoff_t endoffset_fsb;
- int error;
- xfs_fsblock_t firstfsb;
- xfs_bmap_free_t free_list;
- xfs_bmbt_irec_t imap;
- xfs_off_t ioffset;
- xfs_off_t iendoffset;
- xfs_extlen_t mod=0;
- xfs_mount_t *mp;
- int nimap;
- uint resblks;
- xfs_off_t rounding;
- int rt;
+ struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
- xfs_trans_t *tp;
-
- mp = ip->i_mount;
+ xfs_fileoff_t endoffset_fsb;
+ int done = 0, error;
trace_xfs_free_file_space(ip);
@@ -1215,135 +1238,45 @@ xfs_free_file_space(
if (error)
return error;
- error = 0;
if (len <= 0) /* if nothing being freed */
- return error;
- rt = XFS_IS_REALTIME_INODE(ip);
- startoffset_fsb = XFS_B_TO_FSB(mp, offset);
- endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
- /* wait for the completion of any pending DIOs */
- inode_dio_wait(VFS_I(ip));
+ return 0;
- rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
- ioffset = round_down(offset, rounding);
- iendoffset = round_up(offset + len, rounding) - 1;
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
- iendoffset);
+ error = xfs_flush_unmap_range(ip, offset, len);
if (error)
- goto out;
- truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
+ return error;
+
+ startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+ endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/*
- * Need to zero the stuff we're not freeing, on disk.
- * If it's a realtime file & can't use unwritten extents then we
- * actually need to zero the extent edges. Otherwise xfs_bunmapi
- * will take care of it for us.
+ * Need to zero the stuff we're not freeing, on disk. If it's a RT file
+ * and we can't use unwritten extents then we actually need to ensure
+ * to zero the whole extent, otherwise we just need to take of block
+ * boundaries, and xfs_bunmapi will handle the rest.
*/
- if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
- nimap = 1;
- error = xfs_bmapi_read(ip, startoffset_fsb, 1,
- &imap, &nimap, 0);
- if (error)
- goto out;
- ASSERT(nimap == 0 || nimap == 1);
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- xfs_daddr_t block;
-
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- block = imap.br_startblock;
- mod = do_div(block, mp->m_sb.sb_rextsize);
- if (mod)
- startoffset_fsb += mp->m_sb.sb_rextsize - mod;
- }
- nimap = 1;
- error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
- &imap, &nimap, 0);
+ if (XFS_IS_REALTIME_INODE(ip) &&
+ !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
+ &endoffset_fsb);
if (error)
- goto out;
- ASSERT(nimap == 0 || nimap == 1);
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- mod++;
- if (mod && (mod != mp->m_sb.sb_rextsize))
- endoffset_fsb -= mod;
- }
- }
- if ((done = (endoffset_fsb <= startoffset_fsb)))
- /*
- * One contiguous piece to clear
- */
- error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
- else {
- /*
- * Some full blocks, possibly two pieces to clear
- */
- if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
- error = xfs_zero_remaining_bytes(ip, offset,
- XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
- if (!error &&
- XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
- error = xfs_zero_remaining_bytes(ip,
- XFS_FSB_TO_B(mp, endoffset_fsb),
- offset + len - 1);
+ return error;
}
- /*
- * free file space until done or until there is an error
- */
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- while (!error && !done) {
-
- /*
- * allocate and setup the transaction. Allow this
- * transaction to dip into the reserve blocks to ensure
- * the freeing of the space succeeds at ENOSPC.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
- &tp);
- if (error) {
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- break;
+ if (endoffset_fsb > startoffset_fsb) {
+ while (!done) {
+ error = xfs_unmap_extent(ip, startoffset_fsb,
+ endoffset_fsb - startoffset_fsb, &done);
+ if (error)
+ return error;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp,
- ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
- resblks, 0, XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto error1;
-
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * issue the bunmapi() call to free the blocks
- */
- xfs_bmap_init(&free_list, &firstfsb);
- error = xfs_bunmapi(tp, ip, startoffset_fsb,
- endoffset_fsb - startoffset_fsb,
- 0, 2, &firstfsb, &free_list, &done);
- if (error)
- goto error0;
-
- /*
- * complete the transaction
- */
- error = xfs_bmap_finish(&tp, &free_list, NULL);
- if (error)
- goto error0;
-
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
- out:
- return error;
-
- error0:
- xfs_bmap_cancel(&free_list);
- error1:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out;
+ /*
+ * Now that we've unmap all full blocks we'll have to zero out any
+ * partial block at the beginning and/or end. xfs_zero_range is
+ * smart enough to skip any holes, including those we just created.
+ */
+ return xfs_zero_range(ip, offset, len, NULL);
}
/*
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index af97d9a1dfb4..f20071432ca6 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -31,8 +31,6 @@ struct xfs_bmalloca;
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
int whichfork, int *eof);
-int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
- int whichfork, int *count);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_fileoff_t start_fsb, xfs_fileoff_t length);
@@ -43,7 +41,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
void xfs_bmap_del_free(struct xfs_bmap_free *flist,
- struct xfs_bmap_free_item *prev,
struct xfs_bmap_free_item *free);
int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index a87a0d5477bd..47a318ce82e0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,6 +80,47 @@ xfs_buf_vmap_len(
}
/*
+ * Bump the I/O in flight count on the buftarg if we haven't yet done so for
+ * this buffer. The count is incremented once per buffer (per hold cycle)
+ * because the corresponding decrement is deferred to buffer release. Buffers
+ * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
+ * tracking adds unnecessary overhead. This is used for sychronization purposes
+ * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+ * in-flight buffers.
+ *
+ * Buffers that are never released (e.g., superblock, iclog buffers) must set
+ * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
+ * never reaches zero and unmount hangs indefinitely.
+ */
+static inline void
+xfs_buf_ioacct_inc(
+ struct xfs_buf *bp)
+{
+ if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
+ return;
+
+ ASSERT(bp->b_flags & XBF_ASYNC);
+ bp->b_flags |= _XBF_IN_FLIGHT;
+ percpu_counter_inc(&bp->b_target->bt_io_count);
+}
+
+/*
+ * Clear the in-flight state on a buffer about to be released to the LRU or
+ * freed and unaccount from the buftarg.
+ */
+static inline void
+xfs_buf_ioacct_dec(
+ struct xfs_buf *bp)
+{
+ if (!(bp->b_flags & _XBF_IN_FLIGHT))
+ return;
+
+ ASSERT(bp->b_flags & XBF_ASYNC);
+ bp->b_flags &= ~_XBF_IN_FLIGHT;
+ percpu_counter_dec(&bp->b_target->bt_io_count);
+}
+
+/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
* reference count falls to zero. If the buffer is already on the LRU, we need
@@ -102,6 +143,14 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
+ /*
+ * Once the buffer is marked stale and unlocked, a subsequent lookup
+ * could reset b_flags. There is no guarantee that the buffer is
+ * unaccounted (released to LRU) before that occurs. Drop in-flight
+ * status now to preserve accounting consistency.
+ */
+ xfs_buf_ioacct_dec(bp);
+
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
- bp = _xfs_buf_alloc(target, &map, 1, 0);
+ /* flags might contain irrelevant bits, pass only what we care about */
+ bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
if (unlikely(bp == NULL))
goto fail;
@@ -866,63 +916,85 @@ xfs_buf_hold(
}
/*
- * Releases a hold on the specified buffer. If the
- * the hold count is 1, calls xfs_buf_free.
+ * Release a hold on the specified buffer. If the hold count is 1, the buffer is
+ * placed on LRU or freed (depending on b_lru_ref).
*/
void
xfs_buf_rele(
xfs_buf_t *bp)
{
struct xfs_perag *pag = bp->b_pag;
+ bool release;
+ bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
if (!pag) {
ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
- if (atomic_dec_and_test(&bp->b_hold))
+ if (atomic_dec_and_test(&bp->b_hold)) {
+ xfs_buf_ioacct_dec(bp);
xfs_buf_free(bp);
+ }
return;
}
ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
ASSERT(atomic_read(&bp->b_hold) > 0);
- if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
- spin_lock(&bp->b_lock);
- if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
- /*
- * If the buffer is added to the LRU take a new
- * reference to the buffer for the LRU and clear the
- * (now stale) dispose list state flag
- */
- if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
- bp->b_state &= ~XFS_BSTATE_DISPOSE;
- atomic_inc(&bp->b_hold);
- }
- spin_unlock(&bp->b_lock);
- spin_unlock(&pag->pag_buf_lock);
- } else {
- /*
- * most of the time buffers will already be removed from
- * the LRU, so optimise that case by checking for the
- * XFS_BSTATE_DISPOSE flag indicating the last list the
- * buffer was on was the disposal list
- */
- if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
- } else {
- ASSERT(list_empty(&bp->b_lru));
- }
- spin_unlock(&bp->b_lock);
- ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
- xfs_buf_free(bp);
+ release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ spin_lock(&bp->b_lock);
+ if (!release) {
+ /*
+ * Drop the in-flight state if the buffer is already on the LRU
+ * and it holds the only reference. This is racy because we
+ * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
+ * ensures the decrement occurs only once per-buf.
+ */
+ if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+ xfs_buf_ioacct_dec(bp);
+ goto out_unlock;
+ }
+
+ /* the last reference has been dropped ... */
+ xfs_buf_ioacct_dec(bp);
+ if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ /*
+ * If the buffer is added to the LRU take a new reference to the
+ * buffer for the LRU and clear the (now stale) dispose list
+ * state flag
+ */
+ if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ bp->b_state &= ~XFS_BSTATE_DISPOSE;
+ atomic_inc(&bp->b_hold);
+ }
+ spin_unlock(&pag->pag_buf_lock);
+ } else {
+ /*
+ * most of the time buffers will already be removed from the
+ * LRU, so optimise that case by checking for the
+ * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
+ * was on was the disposal list
+ */
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+ list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ } else {
+ ASSERT(list_empty(&bp->b_lru));
}
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+ rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+ freebuf = true;
}
+
+out_unlock:
+ spin_unlock(&bp->b_lock);
+
+ if (freebuf)
+ xfs_buf_free(bp);
}
@@ -944,10 +1016,12 @@ xfs_buf_trylock(
int locked;
locked = down_trylock(&bp->b_sema) == 0;
- if (locked)
+ if (locked) {
XB_SET_OWNER(bp);
-
- trace_xfs_buf_trylock(bp, _RET_IP_);
+ trace_xfs_buf_trylock(bp, _RET_IP_);
+ } else {
+ trace_xfs_buf_trylock_fail(bp, _RET_IP_);
+ }
return locked;
}
@@ -1339,6 +1413,7 @@ xfs_buf_submit(
* xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
+ xfs_buf_ioacct_inc(bp);
_xfs_buf_ioapply(bp);
/*
@@ -1524,13 +1599,19 @@ xfs_wait_buftarg(
int loop = 0;
/*
- * We need to flush the buffer workqueue to ensure that all IO
- * completion processing is 100% done. Just waiting on buffer locks is
- * not sufficient for async IO as the reference count held over IO is
- * not released until after the buffer lock is dropped. Hence we need to
- * ensure here that all reference counts have been dropped before we
- * start walking the LRU list.
+ * First wait on the buftarg I/O count for all in-flight buffers to be
+ * released. This is critical as new buffers do not make the LRU until
+ * they are released.
+ *
+ * Next, flush the buffer workqueue to ensure all completion processing
+ * has finished. Just waiting on buffer locks is not sufficient for
+ * async IO as the reference count held over IO is not released until
+ * after the buffer lock is dropped. Hence we need to ensure here that
+ * all reference counts have been dropped before we start walking the
+ * LRU list.
*/
+ while (percpu_counter_sum(&btp->bt_io_count))
+ delay(100);
drain_workqueue(btp->bt_mount->m_buf_workqueue);
/* loop until there is nothing left on the lru list. */
@@ -1627,6 +1708,8 @@ xfs_free_buftarg(
struct xfs_buftarg *btp)
{
unregister_shrinker(&btp->bt_shrinker);
+ ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+ percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
if (mp->m_flags & XFS_MOUNT_BARRIER)
@@ -1691,6 +1774,9 @@ xfs_alloc_buftarg(
if (list_lru_init(&btp->bt_lru))
goto error;
+ if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+ goto error;
+
btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1774,18 +1860,33 @@ xfs_buf_cmp(
return 0;
}
+/*
+ * submit buffers for write.
+ *
+ * When we have a large buffer list, we do not want to hold all the buffers
+ * locked while we block on the request queue waiting for IO dispatch. To avoid
+ * this problem, we lock and submit buffers in groups of 50, thereby minimising
+ * the lock hold times for lists which may contain thousands of objects.
+ *
+ * To do this, we sort the buffer list before we walk the list to lock and
+ * submit buffers, and we plug and unplug around each group of buffers we
+ * submit.
+ */
static int
-__xfs_buf_delwri_submit(
+xfs_buf_delwri_submit_buffers(
struct list_head *buffer_list,
- struct list_head *io_list,
- bool wait)
+ struct list_head *wait_list)
{
- struct blk_plug plug;
struct xfs_buf *bp, *n;
+ LIST_HEAD (submit_list);
int pinned = 0;
+ struct blk_plug plug;
+ list_sort(NULL, buffer_list, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
- if (!wait) {
+ if (!wait_list) {
if (xfs_buf_ispinned(bp)) {
pinned++;
continue;
@@ -1808,25 +1909,21 @@ __xfs_buf_delwri_submit(
continue;
}
- list_move_tail(&bp->b_list, io_list);
trace_xfs_buf_delwri_split(bp, _RET_IP_);
- }
-
- list_sort(NULL, io_list, xfs_buf_cmp);
-
- blk_start_plug(&plug);
- list_for_each_entry_safe(bp, n, io_list, b_list) {
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
- bp->b_flags |= XBF_WRITE | XBF_ASYNC;
/*
- * we do all Io submission async. This means if we need to wait
- * for IO completion we need to take an extra reference so the
- * buffer is still valid on the other side.
+ * We do all IO submission async. This means if we need
+ * to wait for IO completion we need to take an extra
+ * reference so the buffer is still valid on the other
+ * side. We need to move the buffer onto the io_list
+ * at this point so the caller can still access it.
*/
- if (wait)
+ bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+ if (wait_list) {
xfs_buf_hold(bp);
- else
+ list_move_tail(&bp->b_list, wait_list);
+ } else
list_del_init(&bp->b_list);
xfs_buf_submit(bp);
@@ -1849,8 +1946,7 @@ int
xfs_buf_delwri_submit_nowait(
struct list_head *buffer_list)
{
- LIST_HEAD (io_list);
- return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+ return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
}
/*
@@ -1865,15 +1961,15 @@ int
xfs_buf_delwri_submit(
struct list_head *buffer_list)
{
- LIST_HEAD (io_list);
+ LIST_HEAD (wait_list);
int error = 0, error2;
struct xfs_buf *bp;
- __xfs_buf_delwri_submit(buffer_list, &io_list, true);
+ xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
/* Wait for IO to complete. */
- while (!list_empty(&io_list)) {
- bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+ while (!list_empty(&wait_list)) {
+ bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8bfb974f0772..1c2e52b2d926 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -43,6 +43,7 @@ typedef enum {
#define XBF_READ (1 << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
@@ -62,6 +63,7 @@ typedef enum {
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
#define _XBF_COMPOUND (1 << 23)/* compound buffer */
+#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */
typedef unsigned int xfs_buf_flags_t;
@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }
+ { _XBF_COMPOUND, "COMPOUND" }, \
+ { _XBF_IN_FLIGHT, "IN_FLIGHT" }
/*
@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
/* LRU control structures */
struct shrinker bt_shrinker;
struct list_lru bt_lru;
+
+ struct percpu_counter bt_io_count;
} xfs_buftarg_t;
struct xfs_buf;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 34257992934c..e455f9098d49 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -359,7 +359,7 @@ xfs_buf_item_format(
for (i = 0; i < bip->bli_format_count; i++) {
xfs_buf_item_format_segment(bip, lv, &vecp, offset,
&bip->bli_formats[i]);
- offset += bp->b_maps[i].bm_len;
+ offset += BBTOB(bp->b_maps[i].bm_len);
}
/*
@@ -915,20 +915,28 @@ xfs_buf_item_log(
for (i = 0; i < bip->bli_format_count; i++) {
if (start > last)
break;
- end = start + BBTOB(bp->b_maps[i].bm_len);
+ end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
+
+ /* skip to the map that includes the first byte to log */
if (first > end) {
start += BBTOB(bp->b_maps[i].bm_len);
continue;
}
+
+ /*
+ * Trim the range to this segment and mark it in the bitmap.
+ * Note that we must convert buffer offsets to segment relative
+ * offsets (e.g., the first byte of each segment is byte 0 of
+ * that segment).
+ */
if (first < start)
first = start;
if (end > last)
end = last;
-
- xfs_buf_item_log_segment(first, end,
+ xfs_buf_item_log_segment(first - start, end - start,
&bip->bli_formats[i].blf_data_map[0]);
- start += bp->b_maps[i].bm_len;
+ start += BBTOB(bp->b_maps[i].bm_len);
}
}
@@ -949,6 +957,7 @@ xfs_buf_item_free(
xfs_buf_log_item_t *bip)
{
xfs_buf_item_free_format(bip);
+ kmem_free(bip->bli_item.li_lv_shadow);
kmem_zone_free(xfs_buf_item_zone, bip);
}
@@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error(
trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
ASSERT(bp->b_iodone != NULL);
+ cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+
/*
* If the write was asynchronous then no one will be looking for the
* error. If this is the first failure of this type, clear the error
@@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error(
* async write failure at least once, but we also need to set the buffer
* up to behave correctly now for repeated failures.
*/
- if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
+ if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
bp->b_last_error != bp->b_error) {
- bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
- XBF_DONE | XBF_WRITE_FAIL);
+ bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
bp->b_last_error = bp->b_error;
- bp->b_retries = 0;
- bp->b_first_retry_time = jiffies;
+ if (cfg->retry_timeout && !bp->b_first_retry_time)
+ bp->b_first_retry_time = jiffies;
xfs_buf_ioerror(bp, 0);
xfs_buf_submit(bp);
@@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error(
* Repeated failure on an async write. Take action according to the
* error configuration we have been set up to use.
*/
- cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
++bp->b_retries > cfg->max_retries)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index e0646659ce16..ccb0811963b2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
{
ASSERT(list_empty(&dqp->q_lru));
+ kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 814cff94e78f..2c7a1629e064 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
spin_lock(&ailp->xa_lock);
xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+ kmem_free(qfs->qql_item.li_lv_shadow);
+ kmem_free(lip->li_lv_shadow);
kmem_free(qfs);
kmem_free(qfe);
return (xfs_lsn_t)-1;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 88693a98fac5..ed7ee4e8af73 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
}
int
-xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
{
int i;
int len;
int64_t fsid;
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return -EINVAL;
+
memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 4ed3042a0f16..2e4f67f68856 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
(rf))))
-extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
#else
#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 4aa0153214f9..ab779460ecbf 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,6 +40,7 @@ void
xfs_efi_item_free(
struct xfs_efi_log_item *efip)
{
+ kmem_free(efip->efi_item.li_lv_shadow);
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
STATIC void
xfs_efd_item_free(struct xfs_efd_log_item *efdp)
{
+ kmem_free(efdp->efd_item.li_lv_shadow);
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
else
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1b3dc9dd8861..ed95e5bb04e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
#include <linux/dcache.h>
#include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
}
/*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
*/
int
-xfs_iozero(
- struct xfs_inode *ip, /* inode */
- loff_t pos, /* offset in file */
- size_t count) /* size of data to zero */
+xfs_zero_range(
+ struct xfs_inode *ip,
+ xfs_off_t pos,
+ xfs_off_t count,
+ bool *did_zero)
{
- struct page *page;
- struct address_space *mapping;
- int status = 0;
-
-
- mapping = VFS_I(ip)->i_mapping;
- do {
- unsigned offset, bytes;
- void *fsdata;
-
- offset = (pos & (PAGE_SIZE -1)); /* Within page */
- bytes = PAGE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- if (IS_DAX(VFS_I(ip))) {
- status = dax_zero_page_range(VFS_I(ip), pos, bytes,
- xfs_get_blocks_direct);
- if (status)
- break;
- } else {
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
-
- zero_user(page, offset, bytes);
-
- status = pagecache_write_end(NULL, mapping, pos, bytes,
- bytes, page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
- status = 0;
- }
- pos += bytes;
- count -= bytes;
- } while (count);
-
- return status;
+ return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
}
int
@@ -282,48 +239,35 @@ xfs_file_fsync(
}
STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- size_t size = iov_iter_count(to);
+ loff_t isize = i_size_read(inode);
+ size_t count = iov_iter_count(to);
+ struct iov_iter data;
+ struct xfs_buftarg *target;
ssize_t ret = 0;
- int ioflags = 0;
- xfs_fsize_t n;
- loff_t pos = iocb->ki_pos;
- XFS_STATS_INC(mp, xs_read_calls);
-
- if (unlikely(iocb->ki_flags & IOCB_DIRECT))
- ioflags |= XFS_IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
- if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
- /* DIO must be aligned to device logical sector size */
- if ((pos | size) & target->bt_logical_sectormask) {
- if (pos == i_size_read(inode))
- return 0;
- return -EINVAL;
- }
- }
+ trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
- n = mp->m_super->s_maxbytes - pos;
- if (n <= 0 || size == 0)
- return 0;
+ if (!count)
+ return 0; /* skip atime */
- if (n < size)
- size = n;
+ if (XFS_IS_REALTIME_INODE(ip))
+ target = ip->i_mount->m_rtdev_targp;
+ else
+ target = ip->i_mount->m_ddev_targp;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
+ /* DIO must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+ if (iocb->ki_pos == isize)
+ return 0;
+ return -EINVAL;
+ }
/*
* Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
* serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+ if (mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
@@ -351,8 +295,8 @@ xfs_file_read_iter(
* flush and reduce the chances of repeated iolock cycles going
* forward.
*/
- if (inode->i_mapping->nrpages) {
- ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (mapping->nrpages) {
+ ret = filemap_write_and_wait(mapping);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
@@ -363,20 +307,95 @@ xfs_file_read_iter(
* we fail to invalidate a page, but this should never
* happen on XFS. Warn if it does fail.
*/
- ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+ ret = invalidate_inode_pages2(mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
- trace_xfs_file_read(ip, size, pos, ioflags);
+ data = *to;
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, NULL, NULL, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(to, ret);
+ }
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct iov_iter data = *to;
+ size_t count = iov_iter_count(to);
+ ssize_t ret = 0;
+
+ trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+ if (!count)
+ return 0; /* skip atime */
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(to, ret);
+ }
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
ret = generic_file_read_iter(iocb, to);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ ssize_t ret = 0;
+
+ XFS_STATS_INC(mp, xs_read_calls);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_read(iocb, to);
+ else if (iocb->ki_flags & IOCB_DIRECT)
+ ret = xfs_file_dio_aio_read(iocb, to);
+ else
+ ret = xfs_file_buffered_aio_read(iocb, to);
+
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -389,18 +408,14 @@ xfs_file_splice_read(
unsigned int flags)
{
struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
- int ioflags = 0;
ssize_t ret;
XFS_STATS_INC(ip->i_mount, xs_read_calls);
- if (infilp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+ trace_xfs_file_splice_read(ip, count, *ppos);
/*
* DAX inodes cannot ues the page cache for splice, so we have to push
@@ -424,49 +439,6 @@ out:
}
/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF. We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int /* error (positive) */
-xfs_zero_last_block(
- struct xfs_inode *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize,
- bool *did_zeroing)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
- int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- int zero_len;
- int nimaps = 1;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
-
- /*
- * If the block underlying isize is just a hole, then there
- * is nothing to zero.
- */
- if (imap.br_startblock == HOLESTARTBLOCK)
- return 0;
-
- zero_len = mp->m_sb.sb_blocksize - zero_offset;
- if (isize + zero_len > offset)
- zero_len = offset - isize;
- *did_zeroing = true;
- return xfs_iozero(ip, isize, zero_len);
-}
-
-/*
* Zero any on disk space between the current EOF and the new, larger EOF.
*
* This handles the normal case of zeroing the remainder of the last block in
@@ -484,94 +456,11 @@ xfs_zero_eof(
xfs_fsize_t isize, /* current inode size */
bool *did_zeroing)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- struct xfs_bmbt_irec imap;
-
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
trace_xfs_zero_eof(ip, isize, offset - isize);
-
- /*
- * First handle zeroing the block on which isize resides.
- *
- * We only zero a part of that block so it is handled specially.
- */
- if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
- error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
- if (error)
- return error;
- }
-
- /*
- * Calculate the range between the new size and the old where blocks
- * needing to be zeroed may exist.
- *
- * To get the block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back to a block
- * boundary. We subtract 1 in case the size is exactly on a block
- * boundary.
- */
- last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
- start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
- ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
- if (last_fsb == end_zero_fsb) {
- /*
- * The size was only incremented on its last block.
- * We took care of that above, so just return.
- */
- return 0;
- }
-
- ASSERT(start_zero_fsb <= end_zero_fsb);
- while (start_zero_fsb <= end_zero_fsb) {
- nimaps = 1;
- zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
- &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- ASSERT(nimaps > 0);
-
- if (imap.br_state == XFS_EXT_UNWRITTEN ||
- imap.br_startblock == HOLESTARTBLOCK) {
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- continue;
- }
-
- /*
- * There are blocks we need to zero.
- */
- zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
- zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
- if ((zero_off + zero_len) > offset)
- zero_len = offset - zero_off;
-
- error = xfs_iozero(ip, zero_off, zero_len);
- if (error)
- return error;
-
- *did_zeroing = true;
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- }
-
- return 0;
+ return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
}
/*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
- if (!IS_DAX(inode) &&
- ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
end = iocb->ki_pos + count - 1;
/*
- * See xfs_file_read_iter() for why we do a full-file flush here.
+ * See xfs_file_dio_aio_read() for why we do a full-file flush here.
*/
if (mapping->nrpages) {
ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
data = *from;
- ret = mapping->a_ops->direct_IO(iocb, &data);
+ ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+ xfs_get_blocks_direct, xfs_end_io_direct_write,
+ NULL, DIO_ASYNC_EXTEND);
/* see generic_file_direct_write() for why this is necessary */
if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
xfs_rw_iunlock(ip, iolock);
/*
- * No fallback to buffered IO on errors for XFS. DAX can result in
- * partial writes, but direct IO will either complete fully or fail.
+ * No fallback to buffered IO on errors for XFS, direct IO will either
+ * complete fully or fail.
*/
- ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+ ASSERT(ret < 0 || ret == count);
+ return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ int unaligned_io = 0;
+ int iolock;
+ struct iov_iter data;
+
+ /* "unaligned" here means not aligned to a filesystem block */
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+ unaligned_io = 1;
+ iolock = XFS_IOLOCK_EXCL;
+ } else if (mapping->nrpages) {
+ iolock = XFS_IOLOCK_EXCL;
+ } else {
+ iolock = XFS_IOLOCK_SHARED;
+ }
+ xfs_rw_ilock(ip, iolock);
+
+ ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out;
+
+ /*
+ * Yes, even DAX files can have page cache attached to them: A zeroed
+ * page is inserted into the pagecache when we have to serve a write
+ * fault on a hole. It should never be dirtied and can simply be
+ * dropped from the pagecache once we get real data for the page.
+ */
+ if (mapping->nrpages) {
+ ret = invalidate_inode_pages2(mapping);
+ WARN_ON_ONCE(ret);
+ }
+
+ if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+ data = *from;
+ ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+ xfs_end_io_direct_write, 0);
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ iov_iter_advance(from, ret);
+ }
+out:
+ xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
current->backing_dev_info = inode_to_bdi(inode);
write_retry:
- trace_xfs_file_buffered_write(ip, iov_iter_count(from),
- iocb->ki_pos, 0);
- ret = generic_perform_write(file, from, iocb->ki_pos);
+ trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
@@ -895,7 +844,9 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+ if (IS_DAX(inode))
+ ret = xfs_file_dax_write(iocb, from);
+ else if (iocb->ki_flags & IOCB_DIRECT)
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
if (IS_DAX(inode)) {
ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
} else {
- ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b4d75825ae37..7191c3878b4a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -667,8 +667,11 @@ xfs_reserve_blocks(
__uint64_t *inval,
xfs_fsop_resblks_t *outval)
{
- __int64_t lcounter, delta, fdblks_delta;
+ __int64_t lcounter, delta;
+ __int64_t fdblks_delta = 0;
__uint64_t request;
+ __int64_t free;
+ int error = 0;
/* If inval is null, report current values and return */
if (inval == (__uint64_t *)NULL) {
@@ -682,24 +685,23 @@ xfs_reserve_blocks(
request = *inval;
/*
- * With per-cpu counters, this becomes an interesting
- * problem. we needto work out if we are freeing or allocation
- * blocks first, then we can do the modification as necessary.
+ * With per-cpu counters, this becomes an interesting problem. we need
+ * to work out if we are freeing or allocation blocks first, then we can
+ * do the modification as necessary.
*
- * We do this under the m_sb_lock so that if we are near
- * ENOSPC, we will hold out any changes while we work out
- * what to do. This means that the amount of free space can
- * change while we do this, so we need to retry if we end up
- * trying to reserve more space than is available.
+ * We do this under the m_sb_lock so that if we are near ENOSPC, we will
+ * hold out any changes while we work out what to do. This means that
+ * the amount of free space can change while we do this, so we need to
+ * retry if we end up trying to reserve more space than is available.
*/
-retry:
spin_lock(&mp->m_sb_lock);
/*
* If our previous reservation was larger than the current value,
- * then move any unused blocks back to the free pool.
+ * then move any unused blocks back to the free pool. Modify the resblks
+ * counters directly since we shouldn't have any problems unreserving
+ * space.
*/
- fdblks_delta = 0;
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
if (lcounter > 0) { /* release unused blocks */
@@ -707,54 +709,67 @@ retry:
mp->m_resblks_avail -= lcounter;
}
mp->m_resblks = request;
- } else {
- __int64_t free;
+ if (fdblks_delta) {
+ spin_unlock(&mp->m_sb_lock);
+ error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+ spin_lock(&mp->m_sb_lock);
+ }
+
+ goto out;
+ }
+ /*
+ * If the request is larger than the current reservation, reserve the
+ * blocks before we update the reserve counters. Sample m_fdblocks and
+ * perform a partial reservation if the request exceeds free space.
+ */
+ error = -ENOSPC;
+ do {
free = percpu_counter_sum(&mp->m_fdblocks) -
XFS_ALLOC_SET_ASIDE(mp);
if (!free)
- goto out; /* ENOSPC and fdblks_delta = 0 */
+ break;
delta = request - mp->m_resblks;
lcounter = free - delta;
- if (lcounter < 0) {
+ if (lcounter < 0)
/* We can't satisfy the request, just get what we can */
- mp->m_resblks += free;
- mp->m_resblks_avail += free;
- fdblks_delta = -free;
- } else {
- fdblks_delta = -delta;
- mp->m_resblks = request;
- mp->m_resblks_avail += delta;
- }
- }
-out:
- if (outval) {
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- }
- spin_unlock(&mp->m_sb_lock);
+ fdblks_delta = free;
+ else
+ fdblks_delta = delta;
- if (fdblks_delta) {
/*
- * If we are putting blocks back here, m_resblks_avail is
- * already at its max so this will put it in the free pool.
- *
- * If we need space, we'll either succeed in getting it
- * from the free block count or we'll get an enospc. If
- * we get a ENOSPC, it means things changed while we were
- * calculating fdblks_delta and so we should try again to
- * see if there is anything left to reserve.
+ * We'll either succeed in getting space from the free block
+ * count or we'll get an ENOSPC. If we get a ENOSPC, it means
+ * things changed while we were calculating fdblks_delta and so
+ * we should try again to see if there is anything left to
+ * reserve.
*
* Don't set the reserved flag here - we don't want to reserve
* the extra reserve blocks from the reserve.....
*/
- int error;
- error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
- if (error == -ENOSPC)
- goto retry;
+ spin_unlock(&mp->m_sb_lock);
+ error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ spin_lock(&mp->m_sb_lock);
+ } while (error == -ENOSPC);
+
+ /*
+ * Update the reserve counters if blocks have been successfully
+ * allocated.
+ */
+ if (!error && fdblks_delta) {
+ mp->m_resblks += fdblks_delta;
+ mp->m_resblks_avail += fdblks_delta;
}
- return 0;
+
+out:
+ if (outval) {
+ outval->resblks = mp->m_resblks;
+ outval->resblks_avail = mp->m_resblks_avail;
+ }
+
+ spin_unlock(&mp->m_sb_lock);
+ return error;
}
int
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 99ee6eee5e0b..fb39a66914dd 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -765,7 +765,7 @@ restart:
* Background scanning to trim post-EOF preallocated space. This is queued
* based on the 'speculative_prealloc_lifetime' tunable (5m by default).
*/
-STATIC void
+void
xfs_queue_eofblocks(
struct xfs_mount *mp)
{
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 62f1f91c32cb..05bac99bef75 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
void xfs_eofblocks_worker(struct work_struct *);
+void xfs_queue_eofblocks(struct xfs_mount *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ee6799e0476f..8825bcfd314c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
* lock more than one at a time, lockdep will report false positives saying we
* have violated locking orders.
*/
-void
+static void
xfs_lock_inodes(
xfs_inode_t **ips,
int inodes,
@@ -667,14 +667,6 @@ xfs_ip2xflags(
return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
}
-uint
-xfs_dic2xflags(
- struct xfs_dinode *dip)
-{
- return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
- be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
-}
-
/*
* Lookups up an inode from "name". If ci_name is not NULL, then a CI match
* is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -748,7 +740,7 @@ out_unlock:
* are not linked into the directory structure - they are attached
* directly to the superblock - and so have no parent.
*/
-int
+static int
xfs_ialloc(
xfs_trans_t *tp,
xfs_inode_t *pip,
@@ -1085,7 +1077,7 @@ xfs_dir_ialloc(
* link count to go to zero, move the inode to AGI unlinked list so that it can
* be freed when the last active reference goes away via xfs_inactive().
*/
-int /* error */
+static int /* error */
xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
@@ -1104,7 +1096,7 @@ xfs_droplink(
/*
* Increment the link count on an inode & log the change.
*/
-int
+static int
xfs_bumplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e52d7c7aeb5b..8eb78ec4a6e2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
int xfs_isilocked(xfs_inode_t *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
-int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
- xfs_nlink_t, xfs_dev_t, prid_t, int,
- struct xfs_buf **, xfs_inode_t **);
uint xfs_ip2xflags(struct xfs_inode *);
-uint xfs_dic2xflags(struct xfs_dinode *);
int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_bmap_free *);
int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
@@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
-void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
@@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
struct xfs_inode **, int *);
-int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
-int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
/* from xfs_file.c */
enum xfs_prealloc_flags {
@@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags);
int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
xfs_fsize_t isize, bool *did_zeroing);
-int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
+ bool *did_zero);
loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
loff_t eof, int whence);
@@ -479,14 +473,4 @@ do { \
extern struct kmem_zone *xfs_inode_zone;
-/*
- * Flags for read/write calls
- */
-#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
-#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
- { XFS_IO_ISDIRECT, "DIRECT" }, \
- { XFS_IO_INVIS, "INVIS"}
-
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a1b07612224c..892c2aced207 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,6 +651,7 @@ void
xfs_inode_item_destroy(
xfs_inode_t *ip)
{
+ kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
kmem_zone_free(xfs_ili_zone, ip->i_itemp);
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 63a6ff2cfc68..9a7c87809d3b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -595,13 +595,12 @@ xfs_attrmulti_by_handle(
int
xfs_ioc_space(
- struct xfs_inode *ip,
- struct inode *inode,
struct file *filp,
- int ioflags,
unsigned int cmd,
xfs_flock64_t *bf)
{
+ struct inode *inode = file_inode(filp);
+ struct xfs_inode *ip = XFS_I(inode);
struct iattr iattr;
enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL;
@@ -626,7 +625,7 @@ xfs_ioc_space(
if (filp->f_flags & O_DSYNC)
flags |= XFS_PREALLOC_SYNC;
- if (ioflags & XFS_IO_INVIS)
+ if (filp->f_mode & FMODE_NOCMTIME)
flags |= XFS_PREALLOC_INVISIBLE;
error = mnt_want_write_file(filp);
@@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
STATIC int
xfs_ioc_getbmap(
- struct xfs_inode *ip,
- int ioflags,
+ struct file *file,
unsigned int cmd,
void __user *arg)
{
@@ -1479,10 +1477,10 @@ xfs_ioc_getbmap(
return -EINVAL;
bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
- if (ioflags & XFS_IO_INVIS)
+ if (file->f_mode & FMODE_NOCMTIME)
bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
- error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+ error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
(__force struct getbmap *)arg+1);
if (error)
return error;
@@ -1575,6 +1573,11 @@ xfs_ioc_swapext(
goto out_put_tmp_file;
}
+ /*
+ * We need to ensure that the fds passed in point to XFS inodes
+ * before we cast and access them as XFS structures as we have no
+ * control over what the user passes us here.
+ */
if (f.file->f_op != &xfs_file_operations ||
tmp.file->f_op != &xfs_file_operations) {
error = -EINVAL;
@@ -1625,12 +1628,8 @@ xfs_file_ioctl(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
- int ioflags = 0;
int error;
- if (filp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
trace_xfs_file_ioctl(ip);
switch (cmd) {
@@ -1649,7 +1648,7 @@ xfs_file_ioctl(
if (copy_from_user(&bf, arg, sizeof(bf)))
return -EFAULT;
- return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ return xfs_ioc_space(filp, cmd, &bf);
}
case XFS_IOC_DIOINFO: {
struct dioattr da;
@@ -1708,7 +1707,7 @@ xfs_file_ioctl(
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
- return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+ return xfs_ioc_getbmap(filp, cmd, arg);
case XFS_IOC_GETBMAPX:
return xfs_ioc_getbmapx(ip, arg);
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 77c02c7900b6..8b52881bfd90 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -20,10 +20,7 @@
extern int
xfs_ioc_space(
- struct xfs_inode *ip,
- struct inode *inode,
struct file *filp,
- int ioflags,
unsigned int cmd,
xfs_flock64_t *bf);
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1a05d8ae327d..321f57721b92 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
- int ioflags = 0;
int error;
- if (filp->f_mode & FMODE_NOCMTIME)
- ioflags |= XFS_IO_INVIS;
-
trace_xfs_file_compat_ioctl(ip);
switch (cmd) {
@@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
if (xfs_compat_flock64_copyin(&bf, arg))
return -EFAULT;
cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ return xfs_ioc_space(filp, cmd, &bf);
}
case XFS_IOC_FSGEOMETRY_V1_32:
return xfs_compat_ioc_fsgeometry_v1(mp, arg);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 58391355a44d..620fc9120444 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
@@ -940,3 +941,173 @@ error_on_bmapi_transaction:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
+
+void
+xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (imap->br_startblock == HOLESTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_DELALLOC;
+ } else {
+ iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ if (imap->br_state == XFS_EXT_UNWRITTEN)
+ iomap->type = IOMAP_UNWRITTEN;
+ else
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+ iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+ return !nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int nimaps = 1, error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+ length = mp->m_super->s_maxbytes - offset;
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, XFS_BMAPI_ENTIRE);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+ }
+
+ if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done where somewhat symmetric
+ * with the work writeback does. This is a completely arbitrary
+ * number pulled out of thin air as a best guess for initial
+ * testing.
+ *
+ * Note that the values needs to be less than 32-bits wide until
+ * the lower level functions are updated.
+ */
+ length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+ if (xfs_get_extsz_hint(ip)) {
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+ error = xfs_iomap_write_direct(ip, offset, length, &imap,
+ nimaps);
+ } else {
+ error = xfs_iomap_write_delay(ip, offset, length, &imap);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ } else if (nimaps) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ } else {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ iomap->offset = offset;
+ iomap->length = length;
+ }
+
+ return 0;
+}
+
+static int
+xfs_file_iomap_end_delalloc(
+ struct xfs_inode *ip,
+ loff_t offset,
+ loff_t length,
+ ssize_t written)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+
+ start_fsb = XFS_B_TO_FSB(mp, offset + written);
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ /*
+ * Trim back delalloc blocks if we didn't manage to write the whole
+ * range reserved.
+ *
+ * We don't need to care about racing delalloc as we hold i_mutex
+ * across the reserve/allocate/unreserve calls. If there are delalloc
+ * blocks in the range, they are ours.
+ */
+ if (start_fsb < end_fsb) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino %lld",
+ __func__, ip->i_ino);
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+static int
+xfs_file_iomap_end(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+ return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+ length, written);
+ return 0;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+ .iomap_begin = xfs_file_iomap_begin,
+ .iomap_end = xfs_file_iomap_end,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8688e663d744..e066d045e2ff 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,8 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
+#include <linux/iomap.h>
+
struct xfs_inode;
struct xfs_bmbt_irec;
@@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
struct xfs_bmbt_irec *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *);
+
+extern struct iomap_ops xfs_iomap_ops;
+
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c5d4eba6972e..ab820f84ed50 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,12 +38,13 @@
#include "xfs_dir2.h"
#include "xfs_trans_space.h"
#include "xfs_pnfs.h"
+#include "xfs_iomap.h"
#include <linux/capability.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
-#include <linux/fiemap.h>
+#include <linux/iomap.h>
#include <linux/slab.h>
/*
@@ -801,20 +802,30 @@ xfs_setattr_size(
return error;
/*
+ * Wait for all direct I/O to complete.
+ */
+ inode_dio_wait(inode);
+
+ /*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
* the transaction because the inode cannot be unlocked once it is a
* part of the transaction.
*
- * Start with zeroing any data block beyond EOF that we may expose on
- * file extension.
+ * Start with zeroing any data beyond EOF that we may expose on file
+ * extension, or zeroing out the rest of the block on a downward
+ * truncate.
*/
if (newsize > oldsize) {
error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
- if (error)
- return error;
+ } else {
+ error = iomap_truncate_page(inode, newsize, &did_zeroing,
+ &xfs_iomap_ops);
}
+ if (error)
+ return error;
+
/*
* We are going to log the inode size change in this transaction so
* any previous writes that are beyond the on disk EOF and the new
@@ -823,17 +834,14 @@ xfs_setattr_size(
* problem. Note that this includes any block zeroing we did above;
* otherwise those blocks may not be zeroed after a crash.
*/
- if (newsize > ip->i_d.di_size &&
- (oldsize != ip->i_d.di_size || did_zeroing)) {
+ if (did_zeroing ||
+ (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
return error;
}
- /* Now wait for all direct I/O to complete. */
- inode_dio_wait(inode);
-
/*
* We've already locked out new page faults, so now we can safely remove
* pages from the page cache knowing they won't get refaulted until we
@@ -851,13 +859,6 @@ xfs_setattr_size(
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
*/
- if (IS_DAX(inode))
- error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
- else
- error = block_truncate_page(inode->i_mapping, newsize,
- xfs_get_blocks);
- if (error)
- return error;
truncate_setsize(inode, newsize);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
@@ -998,51 +999,6 @@ xfs_vn_update_time(
return xfs_trans_commit(tp);
}
-#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-
-/*
- * Call fiemap helper to fill in user data.
- * Returns positive errors to xfs_getbmap.
- */
-STATIC int
-xfs_fiemap_format(
- void **arg,
- struct getbmapx *bmv,
- int *full)
-{
- int error;
- struct fiemap_extent_info *fieinfo = *arg;
- u32 fiemap_flags = 0;
- u64 logical, physical, length;
-
- /* Do nothing for a hole */
- if (bmv->bmv_block == -1LL)
- return 0;
-
- logical = BBTOB(bmv->bmv_offset);
- physical = BBTOB(bmv->bmv_block);
- length = BBTOB(bmv->bmv_length);
-
- if (bmv->bmv_oflags & BMV_OF_PREALLOC)
- fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
- else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
- fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- physical = 0; /* no block yet */
- }
- if (bmv->bmv_oflags & BMV_OF_LAST)
- fiemap_flags |= FIEMAP_EXTENT_LAST;
-
- error = fiemap_fill_next_extent(fieinfo, logical, physical,
- length, fiemap_flags);
- if (error > 0) {
- error = 0;
- *full = 1; /* user array now full */
- }
-
- return error;
-}
-
STATIC int
xfs_vn_fiemap(
struct inode *inode,
@@ -1050,38 +1006,13 @@ xfs_vn_fiemap(
u64 start,
u64 length)
{
- xfs_inode_t *ip = XFS_I(inode);
- struct getbmapx bm;
int error;
- error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
- if (error)
- return error;
-
- /* Set up bmap header for xfs internal routine */
- bm.bmv_offset = BTOBBT(start);
- /* Special case for whole file */
- if (length == FIEMAP_MAX_OFFSET)
- bm.bmv_length = -1LL;
- else
- bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
-
- /* We add one because in getbmap world count includes the header */
- bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
- fieinfo->fi_extents_max + 1;
- bm.bmv_count = min_t(__s32, bm.bmv_count,
- (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
- bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
- if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
- bm.bmv_iflags |= BMV_IF_ATTRFORK;
- if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
- bm.bmv_iflags |= BMV_IF_DELALLOC;
-
- error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
- if (error)
- return error;
+ xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
+ error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+ xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
- return 0;
+ return error;
}
STATIC int
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index a8192dc797dc..b8d64d520e12 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
return x;
}
-/* ARM old ABI has some weird alignment/padding */
-#if defined(__arm__) && !defined(__ARM_EABI__)
-#define __arch_pack __attribute__((packed))
-#else
-#define __arch_pack
-#endif
-
#define ASSERT_ALWAYS(expr) \
(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bde02f1fba73..3b74fa011bb1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -788,7 +788,7 @@ xfs_log_mount_cancel(
* As far as I know, there weren't any dependencies on the old behaviour.
*/
-int
+static int
xfs_log_unmount_write(xfs_mount_t *mp)
{
struct xlog *log = mp->m_log;
@@ -1036,7 +1036,7 @@ xfs_log_space_wake(
* there's no point in running a dummy transaction at this point because we
* can't start trying to idle the log until both the CIL and AIL are empty.
*/
-int
+static int
xfs_log_need_covered(xfs_mount_t *mp)
{
struct xlog *log = mp->m_log;
@@ -1177,7 +1177,7 @@ xlog_space_left(
* The log manager needs its own routine, in order to control what
* happens with the buffer after the write completes.
*/
-void
+static void
xlog_iodone(xfs_buf_t *bp)
{
struct xlog_in_core *iclog = bp->b_fspriv;
@@ -1302,7 +1302,7 @@ xfs_log_work_queue(
* disk. If there is nothing dirty, then we might need to cover the log to
* indicate that the filesystem is idle.
*/
-void
+static void
xfs_log_worker(
struct work_struct *work)
{
@@ -1415,7 +1415,7 @@ xlog_alloc_log(
*/
error = -ENOMEM;
bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
- BTOBB(log->l_iclog_size), 0);
+ BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
if (!bp)
goto out_free_log;
@@ -1454,7 +1454,8 @@ xlog_alloc_log(
prev_iclog = iclog;
bp = xfs_buf_get_uncached(mp->m_logdev_targp,
- BTOBB(log->l_iclog_size), 0);
+ BTOBB(log->l_iclog_size),
+ XBF_NO_IOACCT);
if (!bp)
goto out_free_iclog;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 80ba0c047090..b5e71072fde5 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -163,12 +163,8 @@ int xfs_log_reserve(struct xfs_mount *mp,
__uint8_t clientid,
bool permanent);
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-int xfs_log_unmount_write(struct xfs_mount *mp);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
-int xfs_log_need_covered(struct xfs_mount *mp);
-
-void xlog_iodone(struct xfs_buf *);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -178,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
-void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5e54e7955ea6..a4ab192e1792 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
log->l_cilp->xc_ctx->sequence = 1;
}
+static inline int
+xlog_cil_iovec_space(
+ uint niovecs)
+{
+ return round_up((sizeof(struct xfs_log_vec) +
+ niovecs * sizeof(struct xfs_log_iovec)),
+ sizeof(uint64_t));
+}
+
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ * a) does not exist; or
+ * b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * it is needed, and this is the buffer that is reallocated to match the size of
+ * the incoming modification. Then during the formatting of the item we can swap
+ * the active buffer with the new one if we can't reuse the existing buffer. We
+ * don't free the old buffer as it may be reused on the next modification if
+ * it's size is right, otherwise we'll free and reallocate it at that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+ struct xlog *log,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item_desc *lidp;
+
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+ struct xfs_log_vec *lv;
+ int niovecs = 0;
+ int nbytes = 0;
+ int buf_size;
+ bool ordered = false;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
+
+ /* get number of vecs and size of data to be stored */
+ lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+ /*
+ * Ordered items need to be tracked but we do not wish to write
+ * them. We need a logvec to track the object, but we do not
+ * need an iovec or buffer to be allocated for copying data.
+ */
+ if (niovecs == XFS_LOG_VEC_ORDERED) {
+ ordered = true;
+ niovecs = 0;
+ nbytes = 0;
+ }
+
+ /*
+ * We 64-bit align the length of each iovec so that the start
+ * of the next one is naturally aligned. We'll need to
+ * account for that slack space here. Then round nbytes up
+ * to 64-bit alignment so that the initial buffer alignment is
+ * easy to calculate and verify.
+ */
+ nbytes += niovecs * sizeof(uint64_t);
+ nbytes = round_up(nbytes, sizeof(uint64_t));
+
+ /*
+ * The data buffer needs to start 64-bit aligned, so round up
+ * that space to ensure we can align it appropriately and not
+ * overrun the buffer.
+ */
+ buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+
+ /*
+ * if we have no shadow buffer, or it is too small, we need to
+ * reallocate it.
+ */
+ if (!lip->li_lv_shadow ||
+ buf_size > lip->li_lv_shadow->lv_size) {
+
+ /*
+ * We free and allocate here as a realloc would copy
+ * unecessary data. We don't use kmem_zalloc() for the
+ * same reason - we don't need to zero the data area in
+ * the buffer, only the log vector header and the iovec
+ * storage.
+ */
+ kmem_free(lip->li_lv_shadow);
+
+ lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+ memset(lv, 0, xlog_cil_iovec_space(niovecs));
+
+ lv->lv_item = lip;
+ lv->lv_size = buf_size;
+ if (ordered)
+ lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ else
+ lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+ lip->li_lv_shadow = lv;
+ } else {
+ /* same or smaller, optimise common overwrite case */
+ lv = lip->li_lv_shadow;
+ if (ordered)
+ lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ else
+ lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
+ lv->lv_next = NULL;
+ }
+
+ /* Ensure the lv is set up according to ->iop_size */
+ lv->lv_niovecs = niovecs;
+
+ /* The allocated data region lies beyond the iovec region */
+ lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+ }
+
+}
+
/*
* Prepare the log item for insertion into the CIL. Calculate the difference in
* log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
/*
* If there is no old LV, this is the first time we've seen the item in
* this CIL context and so we need to pin it. If we are replacing the
- * old_lv, then remove the space it accounts for and free it.
+ * old_lv, then remove the space it accounts for and make it the shadow
+ * buffer for later freeing. In both cases we are now switching to the
+ * shadow buffer, so update the the pointer to it appropriately.
*/
- if (!old_lv)
+ if (!old_lv) {
lv->lv_item->li_ops->iop_pin(lv->lv_item);
- else if (old_lv != lv) {
+ lv->lv_item->li_lv_shadow = NULL;
+ } else if (old_lv != lv) {
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
*diff_len -= old_lv->lv_bytes;
*diff_iovecs -= old_lv->lv_niovecs;
- kmem_free(old_lv);
+ lv->lv_item->li_lv_shadow = old_lv;
}
/* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
* write it out asynchronously without needing to relock the object that was
* modified at the time it gets written into the iclog.
*
- * This function builds a vector for the changes in each log item in the
- * transaction. It then works out the length of the buffer needed for each log
- * item, allocates them and formats the vector for the item into the buffer.
- * The buffer is then attached to the log item are then inserted into the
- * Committed Item List for tracking until the next checkpoint is written out.
+ * This function takes the prepared log vectors attached to each log item, and
+ * formats the changes into the log vector buffer. The buffer it uses is
+ * dependent on the current state of the vector in the CIL - the shadow lv is
+ * guaranteed to be large enough for the current modification, but we will only
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simple swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by th enext modification or the freeing of the log item.
*
* We don't set up region headers during this process; we simply copy the
* regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
list_for_each_entry(lidp, &tp->t_items, lid_trans) {
struct xfs_log_item *lip = lidp->lid_item;
struct xfs_log_vec *lv;
- struct xfs_log_vec *old_lv;
- int niovecs = 0;
- int nbytes = 0;
- int buf_size;
+ struct xfs_log_vec *old_lv = NULL;
+ struct xfs_log_vec *shadow;
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
- /* get number of vecs and size of data to be stored */
- lip->li_ops->iop_size(lip, &niovecs, &nbytes);
-
- /* Skip items that do not have any vectors for writing */
- if (!niovecs)
- continue;
-
/*
- * Ordered items need to be tracked but we do not wish to write
- * them. We need a logvec to track the object, but we do not
- * need an iovec or buffer to be allocated for copying data.
+ * The formatting size information is already attached to
+ * the shadow lv on the log item.
*/
- if (niovecs == XFS_LOG_VEC_ORDERED) {
+ shadow = lip->li_lv_shadow;
+ if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
ordered = true;
- niovecs = 0;
- nbytes = 0;
- }
- /*
- * We 64-bit align the length of each iovec so that the start
- * of the next one is naturally aligned. We'll need to
- * account for that slack space here. Then round nbytes up
- * to 64-bit alignment so that the initial buffer alignment is
- * easy to calculate and verify.
- */
- nbytes += niovecs * sizeof(uint64_t);
- nbytes = round_up(nbytes, sizeof(uint64_t));
-
- /* grab the old item if it exists for reservation accounting */
- old_lv = lip->li_lv;
-
- /*
- * The data buffer needs to start 64-bit aligned, so round up
- * that space to ensure we can align it appropriately and not
- * overrun the buffer.
- */
- buf_size = nbytes +
- round_up((sizeof(struct xfs_log_vec) +
- niovecs * sizeof(struct xfs_log_iovec)),
- sizeof(uint64_t));
+ /* Skip items that do not have any vectors for writing */
+ if (!shadow->lv_niovecs && !ordered)
+ continue;
/* compare to existing item size */
- if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+ old_lv = lip->li_lv;
+ if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
/* same or smaller, optimise common overwrite case */
lv = lip->li_lv;
lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
*/
*diff_iovecs -= lv->lv_niovecs;
*diff_len -= lv->lv_bytes;
+
+ /* Ensure the lv is set up according to ->iop_size */
+ lv->lv_niovecs = shadow->lv_niovecs;
+
+ /* reset the lv buffer information for new formatting */
+ lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
+ lv->lv_buf = (char *)lv +
+ xlog_cil_iovec_space(lv->lv_niovecs);
} else {
- /* allocate new data chunk */
- lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+ /* switch to shadow buffer! */
+ lv = shadow;
lv->lv_item = lip;
- lv->lv_size = buf_size;
if (ordered) {
/* track as an ordered logvec */
ASSERT(lip->li_lv == NULL);
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
goto insert;
}
- lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
}
- /* Ensure the lv is set up according to ->iop_size */
- lv->lv_niovecs = niovecs;
-
- /* The allocated data region lies beyond the iovec region */
- lv->lv_buf_len = 0;
- lv->lv_bytes = 0;
- lv->lv_buf = (char *)lv + buf_size - nbytes;
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
-
lip->li_ops->iop_format(lip, lv);
insert:
- ASSERT(lv->lv_buf_len <= nbytes);
xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
}
}
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
+ /*
+ * Do all necessary memory allocation before we lock the CIL.
+ * This ensures the allocation does not deadlock with a CIL
+ * push in memory reclaim (e.g. from kswapd).
+ */
+ xlog_cil_alloc_shadow_bufs(log, tp);
+
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e39b02351b4a..970c19ba2f56 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -272,13 +272,15 @@ xfs_readsb(
buf_ops = NULL;
/*
- * Allocate a (locked) buffer to hold the superblock.
- * This will be kept around at all times to optimize
- * access to the superblock.
+ * Allocate a (locked) buffer to hold the superblock. This will be kept
+ * around at all times to optimize access to the superblock. Therefore,
+ * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
+ * elevated.
*/
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0, &bp, buf_ops);
+ BTOBB(sector_size), XBF_NO_IOACCT, &bp,
+ buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 184c44effdd5..0cc8d8f74356 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -22,6 +22,11 @@
BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
#structname ") is wrong, expected " #size)
+#define XFS_CHECK_OFFSET(structname, member, off) \
+ BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+ "XFS: offsetof(" #structname ", " #member ") is wrong, " \
+ "expected " #off)
+
static inline void __init
xfs_check_ondisk_structs(void)
{
@@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
@@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
*/
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
+ XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
+ XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6);
+ XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0);
+ XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
+ XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index d5b756669fb5..0f14b2e4bf6c 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
+#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
@@ -79,32 +80,6 @@ xfs_fs_get_uuid(
return 0;
}
-static void
-xfs_bmbt_to_iomap(
- struct xfs_inode *ip,
- struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
-{
- struct xfs_mount *mp = ip->i_mount;
-
- if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_DELALLOC;
- } else {
- iomap->blkno =
- XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
- if (imap->br_state == XFS_EXT_UNWRITTEN)
- iomap->type = IOMAP_UNWRITTEN;
- else
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
- iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-}
-
/*
* Get a layout for the pNFS client.
*/
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 76c0a4a9bb17..355dd9e1cb64 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -98,8 +98,6 @@ xfs_growfs_rt(
/*
* From xfs_rtbitmap.c
*/
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len, int val,
xfs_rtblock_t *new, int *stat);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 11ea5d51db56..0303f1005f88 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -546,7 +546,7 @@ xfs_showargs(
return 0;
}
-__uint64_t
+static __uint64_t
xfs_max_file_offset(
unsigned int blockshift)
{
@@ -1294,6 +1294,7 @@ xfs_fs_remount(
*/
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
+ xfs_queue_eofblocks(mp);
}
/* rw -> ro */
@@ -1306,6 +1307,13 @@ xfs_fs_remount(
* return it to the same size.
*/
xfs_save_resvblks(mp);
+
+ /*
+ * Cancel background eofb scanning so it cannot race with the
+ * final log force+buftarg wait and deadlock the remount.
+ */
+ cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
}
@@ -1565,10 +1573,6 @@ xfs_fs_fill_super(
}
}
- if (xfs_sb_version_hassparseinodes(&mp->m_sb))
- xfs_alert(mp,
- "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
-
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1692,8 +1696,9 @@ xfs_init_zones(void)
if (!xfs_log_ticket_zone)
goto out_free_ioend_bioset;
- xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
- "xfs_bmap_free_item");
+ xfs_bmap_free_item_zone = kmem_zone_init(
+ sizeof(struct xfs_bmap_free_item),
+ "xfs_bmap_free_item");
if (!xfs_bmap_free_item_zone)
goto out_destroy_log_ticket_zone;
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2dfb1ce4585f..529bce9fc37e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -61,8 +61,6 @@ struct xfs_mount;
struct xfs_buftarg;
struct block_device;
-extern __uint64_t xfs_max_file_offset(unsigned int);
-
extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 4c2c55086208..79cfd3fc5324 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -634,6 +634,9 @@ xfs_error_get_cfg(
{
struct xfs_error_cfg *cfg;
+ if (error < 0)
+ error = -error;
+
switch (error) {
case EIO:
cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ea94ee0fe5ea..145169093fe0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait);
DEFINE_BUF_EVENT(xfs_buf_bawrite);
DEFINE_BUF_EVENT(xfs_buf_lock);
DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
DEFINE_BUF_EVENT(xfs_buf_trylock);
DEFINE_BUF_EVENT(xfs_buf_unlock);
DEFINE_BUF_EVENT(xfs_buf_iowait);
@@ -1134,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
)
DECLARE_EVENT_CLASS(xfs_file_class,
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
- TP_ARGS(ip, count, offset, flags),
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
+ TP_ARGS(ip, count, offset),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, flags)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1150,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
- "offset 0x%llx count 0x%zx ioflags %s",
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
- __entry->count,
- __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+ __entry->count)
)
#define DEFINE_RW_EVENT(name) \
DEFINE_EVENT(xfs_file_class, name, \
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
- TP_ARGS(ip, count, offset, flags))
-DEFINE_RW_EVENT(xfs_file_read);
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
+ TP_ARGS(ip, count, offset))
+DEFINE_RW_EVENT(xfs_file_buffered_read);
+DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
DECLARE_EVENT_CLASS(xfs_page_class,
@@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9a462e892e4f..9b2b9fa89331 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
/* delayed logging */
struct list_head li_cil; /* CIL pointers */
struct xfs_log_vec *li_lv; /* active log vector */
+ struct xfs_log_vec *li_lv_shadow; /* standby vector */
xfs_lsn_t li_seq; /* CIL commit seq */
} xfs_log_item_t;
OpenPOWER on IntegriCloud