summaryrefslogtreecommitdiffstats
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Kconfig24
-rw-r--r--fs/ext4/Makefile3
-rw-r--r--fs/ext4/balloc.c5
-rw-r--r--fs/ext4/block_validity.c194
-rw-r--r--fs/ext4/dir.c33
-rw-r--r--fs/ext4/ext4.h205
-rw-r--r--fs/ext4/ext4_extents.h5
-rw-r--r--fs/ext4/ext4_jbd2.c57
-rw-r--r--fs/ext4/ext4_jbd2.h126
-rw-r--r--fs/ext4/extents.c452
-rw-r--r--fs/ext4/extents_status.c521
-rw-r--r--fs/ext4/extents_status.h14
-rw-r--r--fs/ext4/file.c537
-rw-r--r--fs/ext4/fsync.c72
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/ialloc.c15
-rw-r--r--fs/ext4/indirect.c151
-rw-r--r--fs/ext4/inline.c6
-rw-r--r--fs/ext4/inode-test.c274
-rw-r--r--fs/ext4/inode.c1148
-rw-r--r--fs/ext4/ioctl.c146
-rw-r--r--fs/ext4/mballoc.c4
-rw-r--r--fs/ext4/migrate.c103
-rw-r--r--fs/ext4/mmp.c18
-rw-r--r--fs/ext4/namei.c117
-rw-r--r--fs/ext4/page-io.c186
-rw-r--r--fs/ext4/readpage.c225
-rw-r--r--fs/ext4/resize.c56
-rw-r--r--fs/ext4/super.c351
-rw-r--r--fs/ext4/sysfs.c94
-rw-r--r--fs/ext4/verity.c410
-rw-r--r--fs/ext4/xattr.c100
32 files changed, 3800 insertions, 1854 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index cbb5ca830e57..2a592e38cdfe 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -4,12 +4,7 @@
# kernels after the removal of ext3 driver.
config EXT3_FS
tristate "The Extended 3 (ext3) filesystem"
- # These must match EXT4_FS selects...
select EXT4_FS
- select JBD2
- select CRC16
- select CRYPTO
- select CRYPTO_CRC32C
help
This config option is here only for backward compatibility. ext3
filesystem is now handled by the ext4 driver.
@@ -33,12 +28,12 @@ config EXT3_FS_SECURITY
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
- # Please update EXT3_FS selects when changing these
select JBD2
select CRC16
select CRYPTO
select CRYPTO_CRC32C
select FS_IOMAP
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
help
This is the next generation of the ext3 filesystem.
@@ -106,3 +101,20 @@ config EXT4_DEBUG
If you select Y here, then you will be able to turn on debugging
with a command such as:
echo 1 > /sys/module/ext4/parameters/mballoc_debug
+
+config EXT4_KUNIT_TESTS
+ tristate "KUnit tests for ext4"
+ select EXT4_FS
+ depends on KUNIT
+ help
+ This builds the ext4 KUnit tests.
+
+ KUnit tests run during boot and output the results to the debug log
+ in TAP format (http://testanything.org/). Only useful for kernel devs
+ running KUnit test harness and are not for inclusion into a production
+ build.
+
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8fdfcd3c3e04..4ccb3c9189d8 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,3 +13,6 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+ext4-inode-test-objs += inode-test.o
+obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o
+ext4-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0b202e00d93f..5f993a411251 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
if (buffer_verified(bh))
goto verified;
if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
- desc, bh))) {
+ desc, bh) ||
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
ext4_mark_group_bitmap_corrupted(sb, block_group,
@@ -505,7 +506,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
if (!desc)
return -EFSCORRUPTED;
wait_on_buffer(bh);
+ ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
+ ext4_set_errno(sb, EIO);
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8e83741b02e0..0a734ffb4310 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -38,6 +38,7 @@ int __init ext4_init_system_zone(void)
void ext4_exit_system_zone(void)
{
+ rcu_barrier();
kmem_cache_destroy(ext4_system_zone_cachep);
}
@@ -49,17 +50,26 @@ static inline int can_merge(struct ext4_system_zone *entry1,
return 0;
}
+static void release_system_zone(struct ext4_system_blocks *system_blks)
+{
+ struct ext4_system_zone *entry, *n;
+
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &system_blks->root, node)
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+}
+
/*
* Mark a range of blocks as belonging to the "system zone" --- that
* is, filesystem metadata blocks which should never be used by
* inodes.
*/
-static int add_system_zone(struct ext4_sb_info *sbi,
+static int add_system_zone(struct ext4_system_blocks *system_blks,
ext4_fsblk_t start_blk,
unsigned int count)
{
struct ext4_system_zone *new_entry = NULL, *entry;
- struct rb_node **n = &sbi->system_blks.rb_node, *node;
+ struct rb_node **n = &system_blks->root.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
while (*n) {
@@ -91,7 +101,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
new_node = &new_entry->node;
rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &sbi->system_blks);
+ rb_insert_color(new_node, &system_blks->root);
}
/* Can we merge to the left? */
@@ -101,7 +111,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
if (can_merge(entry, new_entry)) {
new_entry->start_blk = entry->start_blk;
new_entry->count += entry->count;
- rb_erase(node, &sbi->system_blks);
+ rb_erase(node, &system_blks->root);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
@@ -112,7 +122,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
entry = rb_entry(node, struct ext4_system_zone, node);
if (can_merge(new_entry, entry)) {
new_entry->count += entry->count;
- rb_erase(node, &sbi->system_blks);
+ rb_erase(node, &system_blks->root);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
@@ -123,10 +133,13 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
{
struct rb_node *node;
struct ext4_system_zone *entry;
+ struct ext4_system_blocks *system_blks;
int first = 1;
printk(KERN_INFO "System zones: ");
- node = rb_first(&sbi->system_blks);
+ rcu_read_lock();
+ system_blks = rcu_dereference(sbi->system_blks);
+ node = rb_first(&system_blks->root);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
@@ -134,10 +147,51 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
first = 0;
node = rb_next(node);
}
+ rcu_read_unlock();
printk(KERN_CONT "\n");
}
-static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ */
+static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
+ struct ext4_system_blocks *system_blks,
+ ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ struct ext4_system_zone *entry;
+ struct rb_node *n;
+
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+ sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ return 0;
+ }
+
+ if (system_blks == NULL)
+ return 1;
+
+ n = system_blks->root.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int ext4_protect_reserved_inode(struct super_block *sb,
+ struct ext4_system_blocks *system_blks,
+ u32 ino)
{
struct inode *inode;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -153,6 +207,7 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
return PTR_ERR(inode);
num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
while (i < num) {
+ cond_resched();
map.m_lblk = i;
map.m_len = num - i;
n = ext4_map_blocks(NULL, inode, &map, 0);
@@ -163,14 +218,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
if (n == 0) {
i++;
} else {
- if (!ext4_data_block_valid(sbi, map.m_pblk, n)) {
+ if (!ext4_data_block_valid_rcu(sbi, system_blks,
+ map.m_pblk, n)) {
ext4_error(sb, "blocks %llu-%llu from inode %u "
"overlap system zone", map.m_pblk,
map.m_pblk + map.m_len - 1, ino);
err = -EFSCORRUPTED;
break;
}
- err = add_system_zone(sbi, map.m_pblk, n);
+ err = add_system_zone(system_blks, map.m_pblk, n);
if (err < 0)
break;
i += n;
@@ -180,94 +236,130 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
return err;
}
+static void ext4_destroy_system_zone(struct rcu_head *rcu)
+{
+ struct ext4_system_blocks *system_blks;
+
+ system_blks = container_of(rcu, struct ext4_system_blocks, rcu);
+ release_system_zone(system_blks);
+ kfree(system_blks);
+}
+
+/*
+ * Build system zone rbtree which is used for block validity checking.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. That's why we first build the rbtree and then
+ * swap it in place.
+ */
int ext4_setup_system_zone(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_system_blocks *system_blks;
struct ext4_group_desc *gdp;
ext4_group_t i;
int flex_size = ext4_flex_bg_size(sbi);
int ret;
if (!test_opt(sb, BLOCK_VALIDITY)) {
- if (sbi->system_blks.rb_node)
+ if (sbi->system_blks)
ext4_release_system_zone(sb);
return 0;
}
- if (sbi->system_blks.rb_node)
+ if (sbi->system_blks)
return 0;
+ system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
+ if (!system_blks)
+ return -ENOMEM;
+
for (i=0; i < ngroups; i++) {
cond_resched();
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
- add_system_zone(sbi, ext4_group_first_block_no(sb, i),
+ add_system_zone(system_blks,
+ ext4_group_first_block_no(sb, i),
ext4_bg_num_gdb(sb, i) + 1);
gdp = ext4_get_group_desc(sb, i, NULL);
- ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+ ret = add_system_zone(system_blks,
+ ext4_block_bitmap(sb, gdp), 1);
if (ret)
- return ret;
- ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+ goto err;
+ ret = add_system_zone(system_blks,
+ ext4_inode_bitmap(sb, gdp), 1);
if (ret)
- return ret;
- ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+ goto err;
+ ret = add_system_zone(system_blks,
+ ext4_inode_table(sb, gdp),
sbi->s_itb_per_group);
if (ret)
- return ret;
+ goto err;
}
if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
- ret = ext4_protect_reserved_inode(sb,
+ ret = ext4_protect_reserved_inode(sb, system_blks,
le32_to_cpu(sbi->s_es->s_journal_inum));
if (ret)
- return ret;
+ goto err;
}
+ /*
+ * System blks rbtree complete, announce it once to prevent racing
+ * with ext4_data_block_valid() accessing the rbtree at the same
+ * time.
+ */
+ rcu_assign_pointer(sbi->system_blks, system_blks);
+
if (test_opt(sb, DEBUG))
debug_print_tree(sbi);
return 0;
+err:
+ release_system_zone(system_blks);
+ kfree(system_blks);
+ return ret;
}
-/* Called when the filesystem is unmounted */
+/*
+ * Called when the filesystem is unmounted or when remounting it with
+ * noblock_validity specified.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. So we first clear the system_blks pointer and
+ * then free the rbtree only after RCU grace period expires.
+ */
void ext4_release_system_zone(struct super_block *sb)
{
- struct ext4_system_zone *entry, *n;
+ struct ext4_system_blocks *system_blks;
- rbtree_postorder_for_each_entry_safe(entry, n,
- &EXT4_SB(sb)->system_blks, node)
- kmem_cache_free(ext4_system_zone_cachep, entry);
+ system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+ lockdep_is_held(&sb->s_umount));
+ rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
- EXT4_SB(sb)->system_blks = RB_ROOT;
+ if (system_blks)
+ call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with filesystem metadata blocks.
- */
int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
unsigned int count)
{
- struct ext4_system_zone *entry;
- struct rb_node *n = sbi->system_blks.rb_node;
+ struct ext4_system_blocks *system_blks;
+ int ret;
- if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
- (start_blk + count < start_blk) ||
- (start_blk + count > ext4_blocks_count(sbi->s_es))) {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
- }
- while (n) {
- entry = rb_entry(n, struct ext4_system_zone, node);
- if (start_blk + count - 1 < entry->start_blk)
- n = n->rb_left;
- else if (start_blk >= (entry->start_blk + entry->count))
- n = n->rb_right;
- else {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
- }
- }
- return 1;
+ /*
+ * Lock the system zone to prevent it being released concurrently
+ * when doing a remount which inverse current "[no]block_validity"
+ * mount option.
+ */
+ rcu_read_lock();
+ system_blks = rcu_dereference(sbi->system_blks);
+ ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
+ count);
+ rcu_read_unlock();
+ return ret;
}
int ext4_check_blockref(const char *function, unsigned int line,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86054f31fe4d..9aa1f75409b0 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -72,6 +72,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
const char *error_msg = NULL;
const int rlen = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
+ const int next_offset = ((char *) de - buf) + rlen;
if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
@@ -79,8 +80,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
error_msg = "rec_len % 4 != 0";
else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (unlikely(((char *) de - buf) + rlen > size))
+ else if (unlikely(next_offset > size))
error_msg = "directory entry overrun";
+ else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) &&
+ next_offset != size))
+ error_msg = "directory entry too close to block end";
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
@@ -116,7 +120,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
- if (err && err != -ENOKEY)
+ if (err)
return err;
}
@@ -125,12 +129,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (err != ERR_BAD_DX_DIR) {
return err;
}
- /*
- * We don't set the inode dirty flag since it's not
- * critical that it get flushed back to the disk.
- */
- ext4_clear_inode_flag(file_inode(file),
- EXT4_INODE_INDEX);
+ /* Can we just clear INDEX flag to ignore htree information? */
+ if (!ext4_has_metadata_csum(sb)) {
+ /*
+ * We don't set the inode dirty flag since it's not
+ * critical that it gets flushed back to the disk.
+ */
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
}
if (ext4_has_inline_data(inode)) {
@@ -458,7 +464,6 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
new_fn->name_len = ent_name->len;
new_fn->file_type = dirent->file_type;
memcpy(new_fn->name, ent_name->name, ent_name->len);
- new_fn->name[ent_name->len] = 0;
while (*p) {
parent = *p;
@@ -668,24 +673,28 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
struct qstr qstr = {.name = str, .len = len };
+ const struct dentry *parent = READ_ONCE(dentry->d_parent);
+ const struct inode *inode = READ_ONCE(parent->d_inode);
- if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) {
+ if (!inode || !IS_CASEFOLDED(inode) ||
+ !EXT4_SB(inode->i_sb)->s_encoding) {
if (len != name->len)
return -1;
return memcmp(str, name->name, len);
}
- return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr, false);
+ return ext4_ci_compare(inode, name, &qstr, false);
}
static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
const struct unicode_map *um = sbi->s_encoding;
+ const struct inode *inode = READ_ONCE(dentry->d_inode);
unsigned char *norm;
int len, ret = 0;
- if (!IS_CASEFOLDED(dentry->d_inode))
+ if (!inode || !IS_CASEFOLDED(inode) || !um)
return 0;
norm = kmalloc(PATH_MAX, GFP_ATOMIC);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa7a9e0..4441331d06cc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -41,6 +41,7 @@
#endif
#include <linux/fscrypt.h>
+#include <linux/fsverity.h>
#include <linux/compiler.h>
@@ -185,10 +186,24 @@ struct ext4_map_blocks {
};
/*
+ * Block validity checking, system zone rbtree.
+ */
+struct ext4_system_blocks {
+ struct rb_root root;
+ struct rcu_head rcu;
+};
+
+/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
+struct ext4_io_end_vec {
+ struct list_head list; /* list of io_end_vec */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+};
+
/*
* For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
@@ -202,8 +217,7 @@ typedef struct ext4_io_end {
* bios covering the extent */
unsigned int flag; /* unwritten or not */
atomic_t count; /* reference counter */
- loff_t offset; /* offset in the file */
- ssize_t size; /* size of the extent */
+ struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -284,6 +298,9 @@ struct ext4_io_submit {
~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Fill in the low bits to get the last block of the cluster */
+#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \
+ ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
/* Get the cluster offset */
#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
@@ -395,6 +412,7 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
@@ -402,7 +420,7 @@ struct flex_groups {
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x704BDFFF /* User visible flags */
+#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
@@ -467,6 +485,7 @@ enum {
EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
@@ -512,6 +531,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(TOPDIR);
CHECK_FLAG_VALUE(HUGE_FILE);
CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(VERITY);
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
@@ -649,6 +669,10 @@ enum {
#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
+/* ioctl codes 19--39 are reserved for fscrypt */
+#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
+#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
+#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
@@ -662,6 +686,16 @@ enum {
#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+/*
+ * Flags returned by EXT4_IOC_GETSTATE
+ *
+ * We only expose to userspace a subset of the state flags in
+ * i_state_flags
+ */
+#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001
+#define EXT4_STATE_FLAG_NEW 0x00000002
+#define EXT4_STATE_FLAG_NEWENTRY 0x00000004
+#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -679,6 +713,12 @@ enum {
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
#endif
+/*
+ * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
+ * It indicates that the entry in extent status cache is for a hole.
+ */
+#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000
+
/* Max physical block we can address w/o extents */
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
@@ -808,31 +848,20 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
static inline void ext4_decode_extra_time(struct timespec64 *time,
__le32 extra)
{
- if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) {
-
-#if 1
- /* Handle legacy encoding of pre-1970 dates with epoch
- * bits 1,1. (This backwards compatibility may be removed
- * at the discretion of the ext4 developers.)
- */
- u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
- if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
- extra_bits = 0;
- time->tv_sec += extra_bits << 32;
-#else
+ if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
-#endif
- }
time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
do { \
- (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
+ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
(raw_inode)->xtime ## _extra = \
ext4_encode_extra_time(&(inode)->xtime); \
} \
+ else \
+ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \
} while (0)
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
@@ -1023,8 +1052,6 @@ struct ext4_inode_info {
/* allocation reservation info for delalloc */
/* In case of bigalloc, this refer to clusters rather than blocks */
unsigned int i_reserved_data_blocks;
- ext4_lblk_t i_da_metadata_calc_last_lblock;
- int i_da_metadata_calc_len;
/* pending cluster reservations for bigalloc file systems */
struct ext4_pending_tree i_pending_tree;
@@ -1314,7 +1341,8 @@ struct ext4_super_block {
__u8 s_lastcheck_hi;
__u8 s_first_error_time_hi;
__u8 s_last_error_time_hi;
- __u8 s_pad[2];
+ __u8 s_first_error_errcode;
+ __u8 s_last_error_errcode;
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_reserved[95]; /* Padding to the end of the block */
@@ -1421,7 +1449,7 @@ struct ext4_sb_info {
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
- struct rb_root system_blks;
+ struct ext4_system_blocks __rcu *system_blks;
#ifdef EXTENTS_STATS
/* ext4 extents stats */
@@ -1527,6 +1555,9 @@ struct ext4_sb_info {
/* Barrier between changing inodes' journal flags and writepages ops. */
struct percpu_rw_semaphore s_journal_flag_rwsem;
struct dax_device *s_daxdev;
+#ifdef CONFIG_EXT4_DEBUG
+ unsigned long s_simulate_fail;
+#endif
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1546,6 +1577,66 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
}
/*
+ * Simulate_fail codes
+ */
+#define EXT4_SIM_BBITMAP_EIO 1
+#define EXT4_SIM_BBITMAP_CRC 2
+#define EXT4_SIM_IBITMAP_EIO 3
+#define EXT4_SIM_IBITMAP_CRC 4
+#define EXT4_SIM_INODE_EIO 5
+#define EXT4_SIM_INODE_CRC 6
+#define EXT4_SIM_DIRBLOCK_EIO 7
+#define EXT4_SIM_DIRBLOCK_CRC 8
+
+static inline bool ext4_simulate_fail(struct super_block *sb,
+ unsigned long code)
+{
+#ifdef CONFIG_EXT4_DEBUG
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (unlikely(sbi->s_simulate_fail == code)) {
+ sbi->s_simulate_fail = 0;
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline void ext4_simulate_fail_bh(struct super_block *sb,
+ struct buffer_head *bh,
+ unsigned long code)
+{
+ if (!IS_ERR(bh) && ext4_simulate_fail(sb, code))
+ clear_buffer_uptodate(bh);
+}
+
+/*
+ * Error number codes for s_{first,last}_error_errno
+ *
+ * Linux errno numbers are architecture specific, so we need to translate
+ * them into something which is architecture independent. We don't define
+ * codes for all errno's; just the ones which are most likely to be the cause
+ * of an ext4_error() call.
+ */
+#define EXT4_ERR_UNKNOWN 1
+#define EXT4_ERR_EIO 2
+#define EXT4_ERR_ENOMEM 3
+#define EXT4_ERR_EFSBADCRC 4
+#define EXT4_ERR_EFSCORRUPTED 5
+#define EXT4_ERR_ENOSPC 6
+#define EXT4_ERR_ENOKEY 7
+#define EXT4_ERR_EROFS 8
+#define EXT4_ERR_EFBIG 9
+#define EXT4_ERR_EEXIST 10
+#define EXT4_ERR_ERANGE 11
+#define EXT4_ERR_EOVERFLOW 12
+#define EXT4_ERR_EBUSY 13
+#define EXT4_ERR_ENOTDIR 14
+#define EXT4_ERR_ENOTEMPTY 15
+#define EXT4_ERR_ESHUTDOWN 16
+#define EXT4_ERR_EFAULT 17
+
+/*
* Inode dynamic state flags
*/
enum {
@@ -1555,11 +1646,11 @@ enum {
EXT4_STATE_NO_EXPAND, /* No space for expansion */
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
- EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
+ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1610,6 +1701,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_SB(sb) (sb)
#endif
+static inline bool ext4_verity_in_progress(struct inode *inode)
+{
+ return IS_ENABLED(CONFIG_FS_VERITY) &&
+ ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+}
+
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
@@ -1632,6 +1729,10 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_GOOD_OLD_INODE_SIZE 128
+#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN)
+#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX
+#define EXT4_TIMESTAMP_MIN S32_MIN
+
/*
* Feature set definitions
*/
@@ -1643,6 +1744,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
+#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1662,6 +1764,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
+#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1743,6 +1846,7 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
@@ -1756,6 +1860,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC)
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
+EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
@@ -1813,7 +1918,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA |\
- EXT4_FEATURE_RO_COMPAT_PROJECT)
+ EXT4_FEATURE_RO_COMPAT_PROJECT |\
+ EXT4_FEATURE_RO_COMPAT_VERITY)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -2438,8 +2544,11 @@ void ext4_insert_dentry(struct inode *inode,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!ext4_has_feature_dir_index(inode->i_sb))
+ if (!ext4_has_feature_dir_index(inode->i_sb)) {
+ /* ext4_iget() should have caught this... */
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
}
static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -2522,8 +2631,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
@@ -2566,7 +2673,6 @@ extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
@@ -2587,7 +2693,6 @@ extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
@@ -2638,8 +2743,6 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
-extern void *ext4_kvmalloc(size_t size, gfp_t flags);
-extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -2647,6 +2750,7 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno,
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
ext4_group_t block_group,
unsigned int flags);
+extern void ext4_set_errno(struct super_block *sb, int err);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
@@ -3177,6 +3281,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
extern int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages, bool is_readahead);
+extern int __init ext4_init_post_read_processing(void);
+extern void ext4_exit_post_read_processing(void);
/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
@@ -3211,7 +3317,6 @@ struct ext4_extent;
#define EXT_MAX_BLOCKS 0xffffffff
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
-extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3224,16 +3329,13 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+ ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
- ext4_lblk_t lblocks);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
- struct ext4_extent *ex1,
- struct ext4_extent *ex2);
extern int ext4_ext_insert_extent(handle_t *, struct inode *,
struct ext4_ext_path **,
struct ext4_extent *, int);
@@ -3245,14 +3347,19 @@ extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+extern int ext4_get_es_cache(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
-extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
-extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
struct inode *inode2, ext4_lblk_t lblk1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred);
+
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -3279,10 +3386,15 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
int len,
struct writeback_control *wbc,
bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+/* verity.c */
+extern const struct fsverity_operations ext4_verityops;
+
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
@@ -3333,6 +3445,21 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
extern const struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_overwrite_ops;
+extern const struct iomap_ops ext4_iomap_report_ops;
+
+static inline int ext4_buffer_uptodate(struct buffer_head *bh)
+{
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out data in the block. In this case, we don't
+ * have to read the block because we may read the old data
+ * successfully.
+ */
+ if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
+ set_buffer_uptodate(bh);
+ return buffer_uptodate(bh);
+}
#endif /* __KERNEL__ */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 98bd0e9ee7df..1c216fcc202a 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -267,10 +267,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
-#define ext4_ext_dirty(handle, inode, path) \
- __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
- struct inode *inode, struct ext4_ext_path *path);
-
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7c70b08d104c..1f53d64e42a5 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,6 +7,28 @@
#include <trace/events/ext4.h>
+int ext4_inode_journal_mode(struct inode *inode)
+{
+ if (EXT4_JOURNAL(inode) == NULL)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /* We do not support data journalling with delayed allocation */
+ if (!S_ISREG(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))) {
+ /* We do not support data journalling for encrypted data */
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ }
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ BUG();
+}
+
/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
@@ -58,6 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb)
* take the FS itself readonly cleanly.
*/
if (journal && is_journal_aborted(journal)) {
+ ext4_set_errno(sb, -journal->j_errno);
ext4_abort(sb, "Detected aborted journal");
return -EROFS;
}
@@ -65,12 +88,14 @@ static int ext4_journal_check_start(struct super_block *sb)
}
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks)
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds)
{
journal_t *journal;
int err;
- trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
@@ -78,8 +103,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
journal = EXT4_SB(sb)->s_journal;
if (!journal)
return ext4_get_nojournal();
- return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
- type, line);
+ return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
+ GFP_NOFS, type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -119,8 +144,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return ext4_get_nojournal();
sb = handle->h_journal->j_private;
- trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
- _RET_IP_);
+ trace_ext4_journal_start_reserved(sb,
+ jbd2_handle_buffer_credits(handle), _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0) {
jbd2_journal_free_reserved(handle);
@@ -133,6 +158,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return handle;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred)
+{
+ if (!ext4_handle_valid(handle))
+ return 0;
+ if (jbd2_handle_buffer_credits(handle) >= check_cred &&
+ handle->h_revoke_credits >= revoke_cred)
+ return 0;
+ extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle));
+ revoke_cred = max(0, revoke_cred - handle->h_revoke_credits);
+ return ext4_journal_extend(handle, extend_cred, revoke_cred);
+}
+
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh,
@@ -234,6 +272,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
if (err) {
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
+ ext4_set_errno(inode->i_sb, -err);
__ext4_abort(inode->i_sb, where, line,
"error %d when attempting revoke", err);
}
@@ -278,7 +317,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle), err);
return err;
}
ext4_error_inode(inode, where, line,
@@ -289,7 +328,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle),
+ err);
}
} else {
if (inode)
@@ -304,6 +344,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_block =
cpu_to_le64(bh->b_blocknr);
+ ext4_set_errno(inode->i_sb, EIO);
ext4_error_inode(inode, where, line,
bh->b_blocknr,
"IO error syncing itable block");
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index ef8fcf7d0d3b..7ea4f6fa173b 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks);
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle)
return 0;
}
-static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
+ int blocks)
{
- if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
- return 0;
- return 1;
+ /* Freeing each metadata block can result in freeing one cluster */
+ return blocks * EXT4_SB(sb)->s_cluster_ratio;
+}
+
+static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
+{
+ return ext4_free_metadata_revoke_credits(sb, 8);
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits((inode)->i_sb))
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
+ ext4_trans_default_revoke_credits((inode)->i_sb))
-#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
- __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \
+ (revoke_creds))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int blocks, int rsv_blocks)
+ int blocks, int rsv_blocks,
+ int revoke_creds)
{
return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
- rsv_blocks);
+ rsv_blocks, revoke_creds);
}
#define ext4_journal_stop(handle) \
@@ -332,20 +346,68 @@ static inline handle_t *ext4_journal_current_handle(void)
return journal_current_handle();
}
-static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_extend(handle, nblocks);
+ return jbd2_journal_extend(handle, nblocks, revoke);
return 0;
}
-static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+static inline int ext4_journal_restart(handle_t *handle, int nblocks,
+ int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_restart(handle, nblocks);
+ return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
return 0;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred);
+
+
+/*
+ * Ensure @handle has at least @check_creds credits available. If not,
+ * transaction will be extended or restarted to contain at least @extend_cred
+ * credits. Before restarting transaction @fn is executed to allow for cleanup
+ * before the transaction is restarted.
+ *
+ * The return value is < 0 in case of error, 0 in case the handle has enough
+ * credits or transaction extension succeeded, 1 in case transaction had to be
+ * restarted.
+ */
+#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \
+ revoke_cred, fn) \
+({ \
+ __label__ __ensure_end; \
+ int err = __ext4_journal_ensure_credits((handle), (check_cred), \
+ (extend_cred), (revoke_cred)); \
+ \
+ if (err <= 0) \
+ goto __ensure_end; \
+ err = (fn); \
+ if (err < 0) \
+ goto __ensure_end; \
+ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
+ if (err == 0) \
+ err = 1; \
+__ensure_end: \
+ err; \
+})
+
+/*
+ * Ensure given handle has at least requested amount of credits available,
+ * possibly restarting transaction if needed. We also make sure the transaction
+ * has space for at least ext4_trans_default_revoke_credits(sb) revoke records
+ * as freeing one or two blocks is very common pattern and requesting this is
+ * very cheap.
+ */
+static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
+ int revoke_creds)
+{
+ return ext4_journal_ensure_credits_fn(handle, credits, credits,
+ revoke_creds, 0);
+}
+
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
@@ -401,26 +463,7 @@ int ext4_force_commit(struct super_block *sb);
#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */
-static inline int ext4_inode_journal_mode(struct inode *inode)
-{
- if (EXT4_JOURNAL(inode) == NULL)
- return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
- /* We do not support data journalling with delayed allocation */
- if (!S_ISREG(inode->i_mode) ||
- test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
- /* We do not support data journalling for encrypted data */
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
- return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
- return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
- }
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
- return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
- BUG();
-}
+int ext4_inode_journal_mode(struct inode *inode);
static inline int ext4_should_journal_data(struct inode *inode)
{
@@ -437,6 +480,19 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
}
+static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
+{
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return 0;
+ if (!ext4_should_journal_data(inode))
+ return 0;
+ /*
+ * Data blocks in one extent are contiguous, just account for partial
+ * clusters at extent boundaries
+ */
+ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
+}
+
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92266a2da7d6..954013d6076b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle,
static int ext4_find_delayed_extent(struct inode *inode,
struct extent_status *newes);
-static int ext4_ext_truncate_extend_restart(handle_t *handle,
- struct inode *inode,
- int needed)
+static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
- int err;
-
- if (!ext4_handle_valid(handle))
- return 0;
- if (handle->h_buffer_credits >= needed)
- return 0;
/*
- * If we need to extend the journal get a few extra blocks
- * while we're at it for efficiency's sake.
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
*/
- needed += 3;
- err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
- if (err <= 0)
- return err;
- err = ext4_truncate_restart_trans(handle, inode, needed);
- if (err == 0)
- err = -EAGAIN;
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
- return err;
+/*
+ * Make sure 'handle' has at least 'check_cred' credits. If not, restart
+ * transaction with 'restart_cred' credits. The function drops i_data_sem
+ * when restarting transaction and gets it after transaction is restarted.
+ *
+ * The function returns 0 on success, 1 if transaction had to be restarted,
+ * and < 0 in case of fatal error.
+ */
+int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred)
+{
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
+ revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
}
/*
@@ -149,8 +161,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
* - ENOMEM
* - EIO
*/
-int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
- struct inode *inode, struct ext4_ext_path *path)
+static int __ext4_ext_dirty(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
{
int err;
@@ -167,6 +180,9 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
return err;
}
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t block)
@@ -297,53 +313,6 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
}
-/*
- * Calculate the number of metadata blocks needed
- * to allocate @blocks
- * Worse case is one block per extent
- */
-int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- int idxs;
-
- idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
- / sizeof(struct ext4_extent_idx));
-
- /*
- * If the new delayed allocation block is contiguous with the
- * previous da block, it can share index blocks with the
- * previous block, so we only need to allocate a new index
- * block every idxs leaf blocks. At ldxs**2 blocks, we need
- * an additional index block, and at ldxs**3 blocks, yet
- * another index blocks.
- */
- if (ei->i_da_metadata_calc_len &&
- ei->i_da_metadata_calc_last_lblock+1 == lblock) {
- int num = 0;
-
- if ((ei->i_da_metadata_calc_len % idxs) == 0)
- num++;
- if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
- num++;
- if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
- num++;
- ei->i_da_metadata_calc_len = 0;
- } else
- ei->i_da_metadata_calc_len++;
- ei->i_da_metadata_calc_last_lblock++;
- return num;
- }
-
- /*
- * In the worst case we need a new set of index blocks at
- * every level of the inode's extent tree.
- */
- ei->i_da_metadata_calc_len = 1;
- ei->i_da_metadata_calc_last_lblock = lblock;
- return ext_depth(inode) + 1;
-}
-
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -480,6 +449,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
return 0;
corrupted:
+ ext4_set_errno(inode->i_sb, -err);
ext4_error_inode(inode, function, line, 0,
"pblk %llu bad header/extent: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
@@ -498,6 +468,30 @@ int ext4_ext_check_inode(struct inode *inode)
return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}
+static void ext4_cache_extents(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_unwritten(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+}
+
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
struct inode *inode, ext4_fsblk_t pblk, int depth,
@@ -532,26 +526,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
*/
if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
struct ext4_extent_header *eh = ext_block_hdr(bh);
- struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
- ext4_lblk_t prev = 0;
- int i;
-
- for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
- unsigned int status = EXTENT_STATUS_WRITTEN;
- ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
- int len = ext4_ext_get_actual_len(ex);
-
- if (prev && (prev != lblk))
- ext4_es_cache_extent(inode, prev,
- lblk - prev, ~0,
- EXTENT_STATUS_HOLE);
-
- if (ext4_ext_is_unwritten(ex))
- status = EXTENT_STATUS_UNWRITTEN;
- ext4_es_cache_extent(inode, lblk, len,
- ext4_ext_pblock(ex), status);
- prev = lblk + len;
- }
+ ext4_cache_extents(inode, eh);
}
return bh;
errout:
@@ -637,8 +612,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
ext_debug("path:");
for (k = 0; k <= l; k++, path++) {
if (path->p_idx) {
- ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
- ext4_idx_pblock(path->p_idx));
+ ext_debug(" %d->%llu",
+ le32_to_cpu(path->p_idx->ei_block),
+ ext4_idx_pblock(path->p_idx));
} else if (path->p_ext) {
ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
@@ -719,11 +695,12 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path)
if (!path)
return;
depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++)
+ for (i = 0; i <= depth; i++, path++) {
if (path->p_bh) {
brelse(path->p_bh);
path->p_bh = NULL;
}
+ }
}
/*
@@ -765,8 +742,8 @@ ext4_ext_binsearch_idx(struct inode *inode,
chix = ix = EXT_FIRST_INDEX(eh);
for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
- if (k != 0 &&
- le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
+ if (k != 0 && le32_to_cpu(ix->ei_block) <=
+ le32_to_cpu(ix[-1].ei_block)) {
printk(KERN_DEBUG "k=%d, ix=0x%p, "
"first=0x%p\n", k,
ix, EXT_FIRST_INDEX(eh));
@@ -899,6 +876,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[0].p_bh = NULL;
i = depth;
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
+ ext4_cache_extents(inode, eh);
/* walk through the tree */
while (i) {
ext_debug("depth %d: num %d, max %d\n",
@@ -1620,17 +1599,16 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
return EXT_MAX_BLOCKS;
while (depth >= 0) {
+ struct ext4_ext_path *p = &path[depth];
+
if (depth == path->p_depth) {
/* leaf */
- if (path[depth].p_ext &&
- path[depth].p_ext !=
- EXT_LAST_EXTENT(path[depth].p_hdr))
- return le32_to_cpu(path[depth].p_ext[1].ee_block);
+ if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
+ return le32_to_cpu(p->p_ext[1].ee_block);
} else {
/* index */
- if (path[depth].p_idx !=
- EXT_LAST_INDEX(path[depth].p_hdr))
- return le32_to_cpu(path[depth].p_idx[1].ei_block);
+ if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
+ return le32_to_cpu(p->p_idx[1].ei_block);
}
depth--;
}
@@ -1730,9 +1708,9 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
return err;
}
-int
-ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
- struct ext4_extent *ex2)
+static int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2)
{
unsigned short ext1_ee_len, ext2_ee_len;
@@ -1746,23 +1724,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
le32_to_cpu(ex2->ee_block))
return 0;
- /*
- * To allow future support for preallocated extents to be added
- * as an RO_COMPAT feature, refuse to merge to extents if
- * this can result in the top bit of ee_len being set.
- */
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
- /*
- * The check for IO to unwritten extent is somewhat racy as we
- * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
- * dropping i_data_sem. But reserved blocks should save us in that
- * case.
- */
+
if (ext4_ext_is_unwritten(ex1) &&
- (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
- atomic_read(&EXT4_I(inode)->i_unwritten) ||
- (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+ ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
@@ -1840,7 +1806,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
* group descriptor to release the extent tree block. If we
* can't get the journal credits, give up.
*/
- if (ext4_journal_extend(handle, 2))
+ if (ext4_journal_extend(handle, 2,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
return;
/*
@@ -1864,13 +1831,14 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
}
/*
- * This function tries to merge the @ex extent to neighbours in the tree.
- * return 1 if merge left else 0.
+ * This function tries to merge the @ex extent to neighbours in the tree, then
+ * tries to collapse the extent tree into the inode.
*/
static void ext4_ext_try_to_merge(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *ex) {
+ struct ext4_extent *ex)
+{
struct ext4_extent_header *eh;
unsigned int depth;
int merge_done = 0;
@@ -2315,6 +2283,52 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
return err;
}
+static int ext4_fill_es_cache_info(struct inode *inode,
+ ext4_lblk_t block, ext4_lblk_t num,
+ struct fiemap_extent_info *fieinfo)
+{
+ ext4_lblk_t next, end = block + num - 1;
+ struct extent_status es;
+ unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
+ unsigned int flags;
+ int err;
+
+ while (block <= end) {
+ next = 0;
+ flags = 0;
+ if (!ext4_es_lookup_extent(inode, block, &next, &es))
+ break;
+ if (ext4_es_is_unwritten(&es))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ if (ext4_es_is_delayed(&es))
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ext4_es_is_hole(&es))
+ flags |= EXT4_FIEMAP_EXTENT_HOLE;
+ if (next == 0)
+ flags |= FIEMAP_EXTENT_LAST;
+ if (flags & (FIEMAP_EXTENT_DELALLOC|
+ EXT4_FIEMAP_EXTENT_HOLE))
+ es.es_pblk = 0;
+ else
+ es.es_pblk = ext4_es_pblock(&es);
+ err = fiemap_fill_next_extent(fieinfo,
+ (__u64)es.es_lblk << blksize_bits,
+ (__u64)es.es_pblk << blksize_bits,
+ (__u64)es.es_len << blksize_bits,
+ flags);
+ if (next == 0)
+ break;
+ block = next;
+ if (err < 0)
+ return err;
+ if (err == 1)
+ return 0;
+ }
+ return 0;
+}
+
+
/*
* ext4_ext_determine_hole - determine hole around given block
* @inode: inode we lookup in
@@ -2681,7 +2695,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int err = 0, correct_index = 0;
- int depth = ext_depth(inode), credits;
+ int depth = ext_depth(inode), credits, revoke_credits;
struct ext4_extent_header *eh;
ext4_lblk_t a, b;
unsigned num;
@@ -2773,10 +2787,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += (ext_depth(inode)) + 1;
}
credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- if (err)
+ /*
+ * We may end up freeing some index blocks and data from the
+ * punched range. Note that partial clusters are accounted for
+ * by ext4_free_data_revoke_credits().
+ */
+ revoke_credits =
+ ext4_free_metadata_revoke_credits(inode->i_sb,
+ ext_depth(inode)) +
+ ext4_free_data_revoke_credits(inode, b - a + 1);
+
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ credits, revoke_credits);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2902,7 +2929,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
+ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
+ depth + 1,
+ ext4_free_metadata_revoke_credits(inode->i_sb, depth));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -3651,9 +3680,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
- if (IS_ENCRYPTED(inode))
- max_zeroout = 0;
-
/*
* five cases:
* 1. split the extent into three extents.
@@ -3813,8 +3839,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
* illegal.
*/
if (ee_block != map->m_lblk || ee_len > map->m_len) {
-#ifdef EXT4_DEBUG
- ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+#ifdef CONFIG_EXT4_DEBUG
+ ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
" len %u; IO logical block %llu, len %u",
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
@@ -4639,6 +4665,10 @@ retry:
return ret > 0 ? ret2 : ret;
}
+static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+
+static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
@@ -4656,9 +4686,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
trace_ext4_zero_range(inode, offset, len, mode);
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
/* Call ext4_force_commit to flush all data in case of data=journal. */
if (ext4_should_journal_data(inode)) {
ret = ext4_force_commit(inode->i_sb);
@@ -4698,7 +4725,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > i_size_read(inode) ||
+ (offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
@@ -4782,7 +4809,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* Mark that we allocate beyond EOF so the subsequent truncate
* can proceed even if the new size is the same as i_size.
*/
- if ((offset + len) > i_size_read(inode))
+ if (offset + len > inode->i_size)
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
ext4_mark_inode_dirty(handle, inode);
@@ -4823,14 +4850,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
* range since we would need to re-encrypt blocks with a
* different IV or XTS tweak (which are based on the logical
* block number).
- *
- * XXX It's not clear why zero range isn't working, but we'll
- * leave it disabled for encrypted inodes for now. This is a
- * bug we should fix....
*/
if (IS_ENCRYPTED(inode) &&
- (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
- FALLOC_FL_ZERO_RANGE)))
+ (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
/* Return error if mode is not supported */
@@ -4874,7 +4896,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > i_size_read(inode) ||
+ (offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
@@ -4916,23 +4938,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
int ret = 0;
int ret2 = 0;
struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
map.m_lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- /*
- * This is somewhat ugly but the idea is clear: When transaction is
- * reserved, everything goes into it. Otherwise we rather start several
- * smaller transactions for conversion of each extent separately.
- */
- if (handle) {
- handle = ext4_journal_start_reserved(handle,
- EXT4_HT_EXT_CONVERT);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- credits = 0;
- } else {
+ if (!handle) {
/*
* credits to insert 1 extent into extent tree
*/
@@ -4963,11 +4975,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
if (ret <= 0 || ret2)
break;
}
- if (!credits)
- ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
+int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
+{
+ int ret, err = 0;
+ struct ext4_io_end_vec *io_end_vec;
+
+ /*
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
+ */
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ }
+
+ list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+ ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+ io_end_vec->offset,
+ io_end_vec->size);
+ if (ret)
+ break;
+ }
+
+ if (handle)
+ err = ext4_journal_stop(handle);
+
+ return ret < 0 ? ret : err;
+}
+
/*
* If newes is not existing extent (newes->ec_pblk equals zero) find
* delayed extent at start of newes and update newes accordingly and
@@ -5017,8 +5058,6 @@ static int ext4_find_delayed_extent(struct inode *inode,
return next_del;
}
-/* fiemap flags we can handle specified here */
-#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
static int ext4_xattr_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo)
@@ -5055,10 +5094,16 @@ static int ext4_xattr_fiemap(struct inode *inode,
return (error < 0 ? error : 0);
}
-int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+static int _ext4_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len,
+ int (*fill)(struct inode *, ext4_lblk_t,
+ ext4_lblk_t,
+ struct fiemap_extent_info *))
{
ext4_lblk_t start_blk;
+ u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
+
int error = 0;
if (ext4_has_inline_data(inode)) {
@@ -5075,14 +5120,18 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
error = ext4_ext_precache(inode);
if (error)
return error;
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
/* fallback to generic here if not in extents fmt */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
+ fill == ext4_fill_fiemap_extents)
return generic_block_fiemap(inode, fieinfo, start, len,
ext4_get_block);
- if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+ if (fill == ext4_fill_es_cache_info)
+ ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
+ if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
return -EBADR;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
@@ -5101,12 +5150,36 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Walk the extent tree gathering extent information
* and pushing extents back to the user.
*/
- error = ext4_fill_fiemap_extents(inode, start_blk,
- len_blks, fieinfo);
+ error = fill(inode, start_blk, len_blks, fieinfo);
}
return error;
}
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ return _ext4_fiemap(inode, fieinfo, start, len,
+ ext4_fill_fiemap_extents);
+}
+
+int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ if (ext4_has_inline_data(inode)) {
+ int has_inline;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ has_inline = ext4_has_inline_data(inode);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (has_inline)
+ return 0;
+ }
+
+ return _ext4_fiemap(inode, fieinfo, start, len,
+ ext4_fill_es_cache_info);
+}
+
+
/*
* ext4_access_path:
* Function to access the path buffer for marking it dirty.
@@ -5128,13 +5201,10 @@ ext4_access_path(handle_t *handle, struct inode *inode,
* descriptor) for each block group; assume two block
* groups
*/
- if (handle->h_buffer_credits < 7) {
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- /* EAGAIN is success */
- if (err && err != -EAGAIN)
- return err;
- }
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
+ if (err < 0)
+ return err;
err = ext4_ext_get_access(handle, inode, path);
return err;
@@ -5153,7 +5223,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
{
int depth, err = 0;
struct ext4_extent *ex_start, *ex_last;
- bool update = 0;
+ bool update = false;
depth = path->p_depth;
while (depth >= 0) {
@@ -5169,7 +5239,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
goto out;
if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
- update = 1;
+ update = true;
while (ex_start <= ex_last) {
if (SHIFT == SHIFT_LEFT) {
@@ -5357,7 +5427,7 @@ out:
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
ext4_lblk_t punch_start, punch_stop;
@@ -5374,12 +5444,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
- /* Collapse range works only on fs block size aligned offsets. */
- if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
- len & (EXT4_CLUSTER_SIZE(sb) - 1))
- return -EINVAL;
-
- if (!S_ISREG(inode->i_mode))
+ /* Collapse range works only on fs cluster size aligned regions. */
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
trace_ext4_collapse_range(inode, offset, len);
@@ -5399,7 +5465,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= i_size_read(inode)) {
+ if (offset + len >= inode->i_size) {
ret = -EINVAL;
goto out_mutex;
}
@@ -5477,7 +5543,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_stop;
}
- new_size = i_size_read(inode) - len;
+ new_size = inode->i_size - len;
i_size_write(inode, new_size);
EXT4_I(inode)->i_disksize = new_size;
@@ -5505,7 +5571,7 @@ out_mutex:
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
handle_t *handle;
@@ -5524,14 +5590,10 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
- /* Insert range works only on fs block size aligned offsets. */
- if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
- len & (EXT4_CLUSTER_SIZE(sb) - 1))
+ /* Insert range works only on fs cluster size aligned regions. */
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
-
trace_ext4_insert_range(inode, offset, len);
offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -5551,14 +5613,14 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- /* Check for wrap through zero */
- if (inode->i_size + len > inode->i_sb->s_maxbytes) {
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size) {
ret = -EFBIG;
goto out_mutex;
}
- /* Offset should be less than i_size */
- if (offset >= i_size_read(inode)) {
+ /* Offset must be less than i_size */
+ if (offset >= inode->i_size) {
ret = -EINVAL;
goto out_mutex;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 7521de2dcf3a..d996b44d2265 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -146,7 +146,7 @@ static struct kmem_cache *ext4_pending_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end);
+ ext4_lblk_t end, int *reserved);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
@@ -836,7 +836,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end);
+ err = __es_remove_extent(inode, lblk, end, NULL);
if (err != 0)
goto error;
retry:
@@ -899,6 +899,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
* Return: 1 on found, 0 on not
*/
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t *next_lblk,
struct extent_status *es)
{
struct ext4_es_tree *tree;
@@ -947,9 +948,18 @@ out:
es->es_pblk = es1->es_pblk;
if (!ext4_es_is_referenced(es1))
ext4_es_set_referenced(es1);
- stats->es_stats_cache_hits++;
+ percpu_counter_inc(&stats->es_stats_cache_hits);
+ if (next_lblk) {
+ node = rb_next(&es1->rb_node);
+ if (node) {
+ es1 = rb_entry(node, struct extent_status,
+ rb_node);
+ *next_lblk = es1->es_lblk;
+ } else
+ *next_lblk = 0;
+ }
} else {
- stats->es_stats_cache_misses++;
+ percpu_counter_inc(&stats->es_stats_cache_misses);
}
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -958,8 +968,322 @@ out:
return found;
}
+struct rsvd_count {
+ int ndelonly;
+ bool first_do_lblk_found;
+ ext4_lblk_t first_do_lblk;
+ ext4_lblk_t last_do_lblk;
+ struct extent_status *left_es;
+ bool partial;
+ ext4_lblk_t lclu;
+};
+
+/*
+ * init_rsvd - initialize reserved count data before removing block range
+ * in file from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @es - pointer to first extent in range
+ * @rc - pointer to reserved count data
+ *
+ * Assumes es is not NULL
+ */
+static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es, struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct rb_node *node;
+
+ rc->ndelonly = 0;
+
+ /*
+ * for bigalloc, note the first delonly block in the range has not
+ * been found, record the extent containing the block to the left of
+ * the region to be removed, if any, and note that there's no partial
+ * cluster to track
+ */
+ if (sbi->s_cluster_ratio > 1) {
+ rc->first_do_lblk_found = false;
+ if (lblk > es->es_lblk) {
+ rc->left_es = es;
+ } else {
+ node = rb_prev(&es->rb_node);
+ rc->left_es = node ? rb_entry(node,
+ struct extent_status,
+ rb_node) : NULL;
+ }
+ rc->partial = false;
+ }
+}
+
+/*
+ * count_rsvd - count the clusters containing delayed and not unwritten
+ * (delonly) blocks in a range within an extent and add to
+ * the running tally in rsvd_count
+ *
+ * @inode - file containing extent
+ * @lblk - first block in range
+ * @len - length of range in blocks
+ * @es - pointer to extent containing clusters to be counted
+ * @rc - pointer to reserved count data
+ *
+ * Tracks partial clusters found at the beginning and end of extents so
+ * they aren't overcounted when they span adjacent extents
+ */
+static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
+ struct extent_status *es, struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t i, end, nclu;
+
+ if (!ext4_es_is_delonly(es))
+ return;
+
+ WARN_ON(len <= 0);
+
+ if (sbi->s_cluster_ratio == 1) {
+ rc->ndelonly += (int) len;
+ return;
+ }
+
+ /* bigalloc */
+
+ i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
+ end = lblk + (ext4_lblk_t) len - 1;
+ end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
+
+ /* record the first block of the first delonly extent seen */
+ if (rc->first_do_lblk_found == false) {
+ rc->first_do_lblk = i;
+ rc->first_do_lblk_found = true;
+ }
+
+ /* update the last lblk in the region seen so far */
+ rc->last_do_lblk = end;
+
+ /*
+ * if we're tracking a partial cluster and the current extent
+ * doesn't start with it, count it and stop tracking
+ */
+ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
+ rc->ndelonly++;
+ rc->partial = false;
+ }
+
+ /*
+ * if the first cluster doesn't start on a cluster boundary but
+ * ends on one, count it
+ */
+ if (EXT4_LBLK_COFF(sbi, i) != 0) {
+ if (end >= EXT4_LBLK_CFILL(sbi, i)) {
+ rc->ndelonly++;
+ rc->partial = false;
+ i = EXT4_LBLK_CFILL(sbi, i) + 1;
+ }
+ }
+
+ /*
+ * if the current cluster starts on a cluster boundary, count the
+ * number of whole delonly clusters in the extent
+ */
+ if ((i + sbi->s_cluster_ratio - 1) <= end) {
+ nclu = (end - i + 1) >> sbi->s_cluster_bits;
+ rc->ndelonly += nclu;
+ i += nclu << sbi->s_cluster_bits;
+ }
+
+ /*
+ * start tracking a partial cluster if there's a partial at the end
+ * of the current extent and we're not already tracking one
+ */
+ if (!rc->partial && i <= end) {
+ rc->partial = true;
+ rc->lclu = EXT4_B2C(sbi, i);
+ }
+}
+
+/*
+ * __pr_tree_search - search for a pending cluster reservation
+ *
+ * @root - root of pending reservation tree
+ * @lclu - logical cluster to search for
+ *
+ * Returns the pending reservation for the cluster identified by @lclu
+ * if found. If not, returns a reservation for the next cluster if any,
+ * and if not, returns NULL.
+ */
+static struct pending_reservation *__pr_tree_search(struct rb_root *root,
+ ext4_lblk_t lclu)
+{
+ struct rb_node *node = root->rb_node;
+ struct pending_reservation *pr = NULL;
+
+ while (node) {
+ pr = rb_entry(node, struct pending_reservation, rb_node);
+ if (lclu < pr->lclu)
+ node = node->rb_left;
+ else if (lclu > pr->lclu)
+ node = node->rb_right;
+ else
+ return pr;
+ }
+ if (pr && lclu < pr->lclu)
+ return pr;
+ if (pr && lclu > pr->lclu) {
+ node = rb_next(&pr->rb_node);
+ return node ? rb_entry(node, struct pending_reservation,
+ rb_node) : NULL;
+ }
+ return NULL;
+}
+
+/*
+ * get_rsvd - calculates and returns the number of cluster reservations to be
+ * released when removing a block range from the extent status tree
+ * and releases any pending reservations within the range
+ *
+ * @inode - file containing block range
+ * @end - last block in range
+ * @right_es - pointer to extent containing next block beyond end or NULL
+ * @rc - pointer to reserved count data
+ *
+ * The number of reservations to be released is equal to the number of
+ * clusters containing delayed and not unwritten (delonly) blocks within
+ * the range, minus the number of clusters still containing delonly blocks
+ * at the ends of the range, and minus the number of pending reservations
+ * within the range.
+ */
+static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
+ struct extent_status *right_es,
+ struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct pending_reservation *pr;
+ struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
+ struct rb_node *node;
+ ext4_lblk_t first_lclu, last_lclu;
+ bool left_delonly, right_delonly, count_pending;
+ struct extent_status *es;
+
+ if (sbi->s_cluster_ratio > 1) {
+ /* count any remaining partial cluster */
+ if (rc->partial)
+ rc->ndelonly++;
+
+ if (rc->ndelonly == 0)
+ return 0;
+
+ first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
+ last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
+
+ /*
+ * decrease the delonly count by the number of clusters at the
+ * ends of the range that still contain delonly blocks -
+ * these clusters still need to be reserved
+ */
+ left_delonly = right_delonly = false;
+
+ es = rc->left_es;
+ while (es && ext4_es_end(es) >=
+ EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
+ if (ext4_es_is_delonly(es)) {
+ rc->ndelonly--;
+ left_delonly = true;
+ break;
+ }
+ node = rb_prev(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+ if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+ if (end < ext4_es_end(right_es)) {
+ es = right_es;
+ } else {
+ node = rb_next(&right_es->rb_node);
+ es = node ? rb_entry(node, struct extent_status,
+ rb_node) : NULL;
+ }
+ while (es && es->es_lblk <=
+ EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
+ if (ext4_es_is_delonly(es)) {
+ rc->ndelonly--;
+ right_delonly = true;
+ break;
+ }
+ node = rb_next(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status,
+ rb_node);
+ }
+ }
+
+ /*
+ * Determine the block range that should be searched for
+ * pending reservations, if any. Clusters on the ends of the
+ * original removed range containing delonly blocks are
+ * excluded. They've already been accounted for and it's not
+ * possible to determine if an associated pending reservation
+ * should be released with the information available in the
+ * extents status tree.
+ */
+ if (first_lclu == last_lclu) {
+ if (left_delonly | right_delonly)
+ count_pending = false;
+ else
+ count_pending = true;
+ } else {
+ if (left_delonly)
+ first_lclu++;
+ if (right_delonly)
+ last_lclu--;
+ if (first_lclu <= last_lclu)
+ count_pending = true;
+ else
+ count_pending = false;
+ }
+
+ /*
+ * a pending reservation found between first_lclu and last_lclu
+ * represents an allocated cluster that contained at least one
+ * delonly block, so the delonly total must be reduced by one
+ * for each pending reservation found and released
+ */
+ if (count_pending) {
+ pr = __pr_tree_search(&tree->root, first_lclu);
+ while (pr && pr->lclu <= last_lclu) {
+ rc->ndelonly--;
+ node = rb_next(&pr->rb_node);
+ rb_erase(&pr->rb_node, &tree->root);
+ kmem_cache_free(ext4_pending_cachep, pr);
+ if (!node)
+ break;
+ pr = rb_entry(node, struct pending_reservation,
+ rb_node);
+ }
+ }
+ }
+ return rc->ndelonly;
+}
+
+
+/*
+ * __es_remove_extent - removes block range from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @end - last block in range
+ * @reserved - number of cluster reservations released
+ *
+ * If @reserved is not NULL and delayed allocation is enabled, counts
+ * block/cluster reservations freed by removing range and if bigalloc
+ * enabled cancels pending reservations as needed. Returns 0 on success,
+ * error code on failure.
+ */
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end)
+ ext4_lblk_t end, int *reserved)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node *node;
@@ -968,9 +1292,14 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
int err;
+ bool count_reserved = true;
+ struct rsvd_count rc;
+ if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
+ count_reserved = false;
retry:
err = 0;
+
es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
@@ -979,6 +1308,8 @@ retry:
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
+ if (count_reserved)
+ init_rsvd(inode, lblk, es, &rc);
orig_es.es_lblk = es->es_lblk;
orig_es.es_len = es->es_len;
@@ -1020,10 +1351,16 @@ retry:
ext4_es_store_pblock(es, block);
}
}
+ if (count_reserved)
+ count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
+ &orig_es, &rc);
goto out;
}
if (len1 > 0) {
+ if (count_reserved)
+ count_rsvd(inode, lblk, orig_es.es_len - len1,
+ &orig_es, &rc);
node = rb_next(&es->rb_node);
if (node)
es = rb_entry(node, struct extent_status, rb_node);
@@ -1032,6 +1369,8 @@ retry:
}
while (es && ext4_es_end(es) <= end) {
+ if (count_reserved)
+ count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
node = rb_next(&es->rb_node);
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
@@ -1046,6 +1385,9 @@ retry:
ext4_lblk_t orig_len = es->es_len;
len1 = ext4_es_end(es) - end;
+ if (count_reserved)
+ count_rsvd(inode, es->es_lblk, orig_len - len1,
+ es, &rc);
es->es_lblk = end + 1;
es->es_len = len1;
if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
@@ -1054,20 +1396,28 @@ retry:
}
}
+ if (count_reserved)
+ *reserved = get_rsvd(inode, end, es, &rc);
out:
return err;
}
/*
- * ext4_es_remove_extent() removes a space from a extent status tree.
+ * ext4_es_remove_extent - removes block range from extent status tree
*
- * Return 0 on success, error code on failure.
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @len - number of blocks to remove
+ *
+ * Reduces block/cluster reservation count and for bigalloc cancels pending
+ * reservations as needed. Returns 0 on success, error code on failure.
*/
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len)
{
ext4_lblk_t end;
int err = 0;
+ int reserved = 0;
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
@@ -1085,9 +1435,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end);
+ err = __es_remove_extent(inode, lblk, end, &reserved);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
+ ext4_da_release_space(inode, reserved);
return err;
}
@@ -1235,9 +1586,9 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
- seq_printf(seq, " %lu/%lu cache hits/misses\n",
- es_stats->es_stats_cache_hits,
- es_stats->es_stats_cache_misses);
+ seq_printf(seq, " %lld/%lld cache hits/misses\n",
+ percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
+ percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
if (inode_cnt)
seq_printf(seq, " %d inodes on list\n", inode_cnt);
@@ -1264,35 +1615,46 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_nr_inode = 0;
spin_lock_init(&sbi->s_es_lock);
sbi->s_es_stats.es_stats_shrunk = 0;
- sbi->s_es_stats.es_stats_cache_hits = 0;
- sbi->s_es_stats.es_stats_cache_misses = 0;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
+ GFP_KERNEL);
+ if (err)
+ return err;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
+ GFP_KERNEL);
+ if (err)
+ goto err1;
sbi->s_es_stats.es_stats_scan_time = 0;
sbi->s_es_stats.es_stats_max_scan_time = 0;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
if (err)
- return err;
+ goto err2;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
if (err)
- goto err1;
+ goto err3;
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
err = register_shrinker(&sbi->s_es_shrinker);
if (err)
- goto err2;
+ goto err4;
return 0;
-
-err2:
+err4:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
-err1:
+err3:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+err2:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
+err1:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
return err;
}
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
@@ -1317,6 +1679,7 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
if (!es)
goto out_wrap;
+
while (*nr_to_scan > 0) {
if (es->es_lblk > end) {
ei->i_es_shrink_lblk = end + 1;
@@ -1374,6 +1737,34 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
return nr_shrunk;
}
+/*
+ * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove
+ * discretionary entries from the extent status cache. (Some entries
+ * must be present for proper operations.)
+ */
+void ext4_clear_inode_es(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct extent_status *es;
+ struct ext4_es_tree *tree;
+ struct rb_node *node;
+
+ write_lock(&ei->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+ tree->cache_es = NULL;
+ node = rb_first(&tree->root);
+ while (node) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(node);
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ }
+ }
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+ write_unlock(&ei->i_es_lock);
+}
+
#ifdef ES_DEBUG__
static void ext4_print_pending_tree(struct inode *inode)
{
@@ -1590,7 +1981,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, lblk);
+ err = __es_remove_extent(inode, lblk, lblk, NULL);
if (err != 0)
goto error;
retry:
@@ -1779,93 +2170,3 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
__remove_pending(inode, last);
}
}
-
-/*
- * ext4_es_remove_blks - remove block range from extents status tree and
- * reduce reservation count or cancel pending
- * reservation as needed
- *
- * @inode - file containing range
- * @lblk - first block in range
- * @len - number of blocks to remove
- *
- */
-void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
-{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned int clu_size, reserved = 0;
- ext4_lblk_t last_lclu, first, length, remainder, last;
- bool delonly;
- int err = 0;
- struct pending_reservation *pr;
- struct ext4_pending_tree *tree;
-
- /*
- * Process cluster by cluster for bigalloc - there may be up to
- * two clusters in a 4k page with a 1k block size and two blocks
- * per cluster. Also necessary for systems with larger page sizes
- * and potentially larger block sizes.
- */
- clu_size = sbi->s_cluster_ratio;
- last_lclu = EXT4_B2C(sbi, lblk + len - 1);
-
- write_lock(&EXT4_I(inode)->i_es_lock);
-
- for (first = lblk, remainder = len;
- remainder > 0;
- first += length, remainder -= length) {
-
- if (EXT4_B2C(sbi, first) == last_lclu)
- length = remainder;
- else
- length = clu_size - EXT4_LBLK_COFF(sbi, first);
-
- /*
- * The BH_Delay flag, which triggers calls to this function,
- * and the contents of the extents status tree can be
- * inconsistent due to writepages activity. So, note whether
- * the blocks to be removed actually belong to an extent with
- * delayed only status.
- */
- delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);
-
- /*
- * because of the writepages effect, written and unwritten
- * blocks could be removed here
- */
- last = first + length - 1;
- err = __es_remove_extent(inode, first, last);
- if (err)
- ext4_warning(inode->i_sb,
- "%s: couldn't remove page (err = %d)",
- __func__, err);
-
- /* non-bigalloc case: simply count the cluster for release */
- if (sbi->s_cluster_ratio == 1 && delonly) {
- reserved++;
- continue;
- }
-
- /*
- * bigalloc case: if all delayed allocated only blocks have
- * just been removed from a cluster, either cancel a pending
- * reservation if it exists or count a cluster for release
- */
- if (delonly &&
- !__es_scan_clu(inode, &ext4_es_is_delonly, first)) {
- pr = __get_pending(inode, EXT4_B2C(sbi, first));
- if (pr != NULL) {
- tree = &EXT4_I(inode)->i_pending_tree;
- rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
- } else {
- reserved++;
- }
- }
- }
-
- write_unlock(&EXT4_I(inode)->i_es_lock);
-
- ext4_da_release_space(inode, reserved);
-}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 131a8b7df265..4ec30a798260 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -70,8 +70,8 @@ struct ext4_es_tree {
struct ext4_es_stats {
unsigned long es_stats_shrunk;
- unsigned long es_stats_cache_hits;
- unsigned long es_stats_cache_misses;
+ struct percpu_counter es_stats_cache_hits;
+ struct percpu_counter es_stats_cache_misses;
u64 es_stats_scan_time;
u64 es_stats_max_scan_time;
struct percpu_counter es_stats_all_cnt;
@@ -140,6 +140,7 @@ extern void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t *next_lblk,
struct extent_status *es);
extern bool ext4_es_scan_range(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
@@ -208,6 +209,12 @@ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
return es->es_pblk & ~ES_MASK;
}
+static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es)
+{
+ ext4_fsblk_t pblock = ext4_es_pblock(es);
+ return pblock == ~ES_MASK ? 0 : pblock;
+}
+
static inline void ext4_es_store_pblock(struct extent_status *es,
ext4_fsblk_t pb)
{
@@ -246,7 +253,6 @@ extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
-extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+extern void ext4_clear_inode_es(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 70b0438dbc94..5f225881176b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,10 +29,58 @@
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
+#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ return false;
+ if (fsverity_active(inode))
+ return false;
+ if (ext4_should_journal_data(inode))
+ return false;
+ if (ext4_has_inline_data(inode))
+ return false;
+ return true;
+}
+
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ if (!ext4_dio_supported(inode)) {
+ inode_unlock_shared(inode);
+ /*
+ * Fallback to buffered I/O if the operation being performed on
+ * the inode is not supported by direct I/O. The IOCB_DIRECT
+ * flag needs to be cleared here in order to ensure that the
+ * direct I/O path within generic_file_read_iter() is not
+ * taken.
+ */
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ return generic_file_read_iter(iocb, to);
+ }
+
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -40,9 +88,10 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
- if (!inode_trylock_shared(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
return -EAGAIN;
+ } else {
inode_lock_shared(inode);
}
/*
@@ -64,16 +113,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
- if (IS_DAX(file_inode(iocb->ki_filp)))
+ if (IS_DAX(inode))
return ext4_dax_read_iter(iocb, to);
#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_read_iter(iocb, to);
+
return generic_file_read_iter(iocb, to);
}
@@ -103,13 +157,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_unwritten_wait(struct inode *inode)
-{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
-}
-
/*
* This tests whether the IO in question is block-aligned or not.
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -119,19 +166,25 @@ static void ext4_unwritten_wait(struct inode *inode)
* threads are at work on the same unwritten block, they must be synchronized
* or one thread will zero the other's data, causing corruption.
*/
-static int
-ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+static bool
+ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
- int blockmask = sb->s_blocksize - 1;
-
- if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
- return 0;
+ unsigned long blockmask = sb->s_blocksize - 1;
if ((pos | iov_iter_alignment(from)) & blockmask)
- return 1;
+ return true;
- return 0;
+ return false;
+}
+
+static bool
+ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
+{
+ if (offset + len > i_size_read(inode) ||
+ offset + len > EXT4_I(inode)->i_disksize)
+ return true;
+ return false;
}
/* Is IO overwriting allocated and initialized blocks? */
@@ -157,18 +210,19 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}
-static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
-
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -180,112 +234,413 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return -EFBIG;
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
+
return iov_iter_count(from);
}
-#ifdef CONFIG_FS_DAX
-static ssize_t
-ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret, count;
+
+ count = ext4_generic_write_checks(iocb, from);
+ if (count <= 0)
+ return count;
+
+ ret = file_modified(iocb->ki_filp);
+ if (ret)
+ return ret;
+ return count;
+}
+
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
- inode_lock(inode);
- }
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
- ret = file_remove_privs(iocb->ki_filp);
- if (ret)
- goto out;
- ret = file_update_time(iocb->ki_filp);
- if (ret)
- goto out;
- ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+
out:
inode_unlock(inode);
- if (ret > 0)
+ if (likely(ret > 0)) {
+ iocb->ki_pos += ret;
ret = generic_write_sync(iocb, ret);
+ }
+
return ret;
}
-#endif
-static ssize_t
-ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+ ssize_t written, size_t count)
{
- struct inode *inode = file_inode(iocb->ki_filp);
- int o_direct = iocb->ki_flags & IOCB_DIRECT;
- int unaligned_aio = 0;
- int overwrite = 0;
- ssize_t ret;
+ handle_t *handle;
+ bool truncate = false;
+ u8 blkbits = inode->i_blkbits;
+ ext4_lblk_t written_blk, end_blk;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ /*
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But, the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended.
+ */
+ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
+ if (offset + count <= EXT4_I(inode)->i_disksize) {
+ /*
+ * We need to ensure that the inode is removed from the orphan
+ * list if it has been added prematurely, due to writeback of
+ * delalloc blocks.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ return PTR_ERR(handle);
+ }
+
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
-#ifdef CONFIG_FS_DAX
- if (IS_DAX(inode))
- return ext4_dax_write_iter(iocb, from);
-#endif
- if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
- return -EOPNOTSUPP;
+ return written;
+ }
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
- inode_lock(inode);
+ if (written < 0)
+ goto truncate;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ written = PTR_ERR(handle);
+ goto truncate;
}
- ret = ext4_write_checks(iocb, from);
+ if (ext4_update_inode_size(inode, offset + written))
+ ext4_mark_inode_dirty(handle, inode);
+
+ /*
+ * We may need to truncate allocated but not written blocks beyond EOF.
+ */
+ written_blk = ALIGN(offset + written, 1 << blkbits);
+ end_blk = ALIGN(offset + count, 1 << blkbits);
+ if (written_blk < end_blk && ext4_can_truncate(inode))
+ truncate = true;
+
+ /*
+ * Remove the inode from the orphan list if it has been extended and
+ * everything went OK.
+ */
+ if (!truncate && inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+
+ if (truncate) {
+truncate:
+ ext4_truncate_failed_write(inode);
+ /*
+ * If the truncate operation failed early, then the inode may
+ * still be on the orphan list. In that case, we need to try
+ * remove the inode from the in-memory linked list.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ return written;
+}
+
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned int flags)
+{
+ loff_t offset = iocb->ki_pos;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (error)
+ return error;
+
+ if (size && flags & IOMAP_DIO_UNWRITTEN)
+ return ext4_convert_unwritten_extents(NULL, inode,
+ offset, size);
+
+ return 0;
+}
+
+static const struct iomap_dio_ops ext4_dio_write_ops = {
+ .end_io = ext4_dio_write_end_io,
+};
+
+/*
+ * The intention here is to start with shared lock acquired then see if any
+ * condition requires an exclusive inode lock. If yes, then we restart the
+ * whole operation by releasing the shared lock and acquiring exclusive lock.
+ *
+ * - For unaligned_io we never take shared lock as it may cause data corruption
+ * when two unaligned IO tries to modify the same block e.g. while zeroing.
+ *
+ * - For extending writes case we don't take the shared lock, since it requires
+ * updating inode i_disksize and/or orphan handling with exclusive lock.
+ *
+ * - shared locking will only be true mostly with overwrites. Otherwise we will
+ * switch to exclusive i_rwsem lock.
+ */
+static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
+ bool *ilock_shared, bool *extend)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ loff_t offset;
+ size_t count;
+ ssize_t ret;
+
+restart:
+ ret = ext4_generic_write_checks(iocb, from);
if (ret <= 0)
goto out;
+ offset = iocb->ki_pos;
+ count = ret;
+ if (ext4_extending_io(inode, offset, count))
+ *extend = true;
/*
- * Unaligned direct AIO must be serialized among each other as zeroing
- * of partial blocks of two competing unaligned AIOs can result in data
- * corruption.
+ * Determine whether the IO operation will overwrite allocated
+ * and initialized blocks.
+ * We need exclusive i_rwsem for changing security info
+ * in file_modified().
*/
- if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb) &&
- ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
- unaligned_aio = 1;
- ext4_unwritten_wait(inode);
+ if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
+ !ext4_overwrite_io(inode, offset, count))) {
+ inode_unlock_shared(inode);
+ *ilock_shared = false;
+ inode_lock(inode);
+ goto restart;
}
- iocb->private = &overwrite;
- /* Check whether we do a DIO overwrite or not */
- if (o_direct && !unaligned_aio) {
- if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
- if (ext4_should_dioread_nolock(inode))
- overwrite = 1;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
+ ret = file_modified(file);
+ if (ret < 0)
+ goto out;
+
+ return count;
+out:
+ if (*ilock_shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+ return ret;
+}
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret;
+ handle_t *handle;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ loff_t offset = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
+ const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
+ bool extend = false, unaligned_io = false;
+ bool ilock_shared = true;
+
+ /*
+ * We initially start with shared inode lock unless it is
+ * unaligned IO which needs exclusive lock anyways.
+ */
+ if (ext4_unaligned_io(inode, from, offset)) {
+ unaligned_io = true;
+ ilock_shared = false;
+ }
+ /*
+ * Quick check here without any i_rwsem lock to see if it is extending
+ * IO. A more reliable check is done in ext4_dio_write_checks() with
+ * proper locking in place.
+ */
+ if (offset + count > i_size_read(inode))
+ ilock_shared = false;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (ilock_shared) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
}
+ } else {
+ if (ilock_shared)
+ inode_lock_shared(inode);
+ else
+ inode_lock(inode);
}
- ret = __generic_file_write_iter(iocb, from);
+ /* Fallback to buffered I/O if the inode does not support direct I/O. */
+ if (!ext4_dio_supported(inode)) {
+ if (ilock_shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+ return ext4_buffered_write_iter(iocb, from);
+ }
+
+ ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+ if (ret <= 0)
+ return ret;
+
+ offset = iocb->ki_pos;
+ count = ret;
+
/*
- * Unaligned direct AIO must be the only IO in flight. Otherwise
- * overlapping aligned IO after unaligned might result in data
+ * Unaligned direct IO must be serialized among each other as zeroing
+ * of partial blocks of two competing unaligned IOs can result in data
* corruption.
+ *
+ * So we make sure we don't allow any unaligned IO in flight.
+ * For IOs where we need not wait (like unaligned non-AIO DIO),
+ * below inode_dio_wait() may anyway become a no-op, since we start
+ * with exclusive lock.
*/
- if (ret == -EIOCBQUEUED && unaligned_aio)
- ext4_unwritten_wait(inode);
- inode_unlock(inode);
+ if (unaligned_io)
+ inode_dio_wait(inode);
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
+ if (extend) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+
+ ext4_journal_stop(handle);
+ }
+
+ if (ilock_shared)
+ iomap_ops = &ext4_iomap_overwrite_ops;
+ ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
+ is_sync_kiocb(iocb) || unaligned_io || extend);
+
+ if (extend)
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
+
+out:
+ if (ilock_shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+
+ if (ret >= 0 && iov_iter_count(from)) {
+ ssize_t err;
+ loff_t endbyte;
+
+ offset = iocb->ki_pos;
+ err = ext4_buffered_write_iter(iocb, from);
+ if (err < 0)
+ return err;
+
+ /*
+ * We need to ensure that the pages within the page cache for
+ * the range covered by this I/O are written to disk and
+ * invalidated. This is in attempt to preserve the expected
+ * direct I/O semantics in the case we fallback to buffered I/O
+ * to complete off the I/O request.
+ */
+ ret += err;
+ endbyte = offset + err - 1;
+ err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
+ offset, endbyte);
+ if (!err)
+ invalidate_mapping_pages(iocb->ki_filp->f_mapping,
+ offset >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+ }
return ret;
+}
+#ifdef CONFIG_FS_DAX
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret;
+ size_t count;
+ loff_t offset;
+ handle_t *handle;
+ bool extend = false;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ ret = ext4_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ offset = iocb->ki_pos;
+ count = iov_iter_count(from);
+
+ if (offset + count > EXT4_I(inode)->i_disksize) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+
+ extend = true;
+ ext4_journal_stop(handle);
+ }
+
+ ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+
+ if (extend)
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
inode_unlock(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
return ret;
}
+#endif
+
+static ssize_t
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ return -EIO;
+
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return ext4_dax_write_iter(iocb, from);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_write_iter(iocb, from);
+
+ return ext4_buffered_write_iter(iocb, from);
+}
#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
@@ -457,6 +812,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ret)
return ret;
+ ret = fsverity_file_open(inode, filp);
+ if (ret)
+ return ret;
+
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
@@ -492,12 +851,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
maxbytes, i_size_read(inode));
case SEEK_HOLE:
inode_lock_shared(inode);
- offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+ offset = iomap_seek_hole(inode, offset,
+ &ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
case SEEK_DATA:
inode_lock_shared(inode);
- offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+ offset = iomap_seek_data(inode, offset,
+ &ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5508baa11bb6..e10206e7f4bb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
+static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
+ bool *needs_barrier)
+{
+ int ret, err;
+
+ ret = sync_mapping_buffers(inode->i_mapping);
+ if (!(inode->i_state & I_DIRTY_ALL))
+ return ret;
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ return ret;
+
+ err = sync_inode_metadata(inode, 1);
+ if (!ret)
+ ret = err;
+
+ if (!ret)
+ ret = ext4_sync_parent(inode);
+ if (test_opt(inode->i_sb, BARRIER))
+ *needs_barrier = true;
+
+ return ret;
+}
+
+static int ext4_fsync_journal(struct inode *inode, bool datasync,
+ bool *needs_barrier)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ *needs_barrier = true;
+
+ return jbd2_complete_transaction(journal, commit_tid);
+}
+
/*
* akpm: A new design for ext4_sync_file().
*
@@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode)
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
*/
-
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0, err;
- tid_t commit_tid;
bool needs_barrier = false;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (sb_rdonly(inode->i_sb)) {
/* Make sure that we read updated s_mount_flags value */
smp_rmb();
- if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ret = -EROFS;
goto out;
}
- if (!journal) {
- ret = __generic_file_fsync(file, start, end, datasync);
- if (!ret)
- ret = ext4_sync_parent(inode);
- if (test_opt(inode->i_sb, BARRIER))
- goto issue_flush;
- goto out;
- }
-
ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
+
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode)) {
+ if (!sbi->s_journal)
+ ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
+ else if (ext4_should_journal_data(inode))
ret = ext4_force_commit(inode->i_sb);
- goto out;
- }
+ else
+ ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
- commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (journal->j_flags & JBD2_BARRIER &&
- !jbd2_trans_will_send_data_barrier(journal, commit_tid))
- needs_barrier = true;
- ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
- issue_flush:
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index d358bfcb6b3f..3e133793a5a3 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -280,7 +280,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
- if (len && IS_CASEFOLDED(dir)) {
+ if (len && IS_CASEFOLDED(dir) && um) {
buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 764ff4c56233..c66e8f9451a2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
goto verified;
blk = ext4_inode_bitmap(sb, desc);
if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8)) {
+ EXT4_INODES_PER_GROUP(sb) / 8) ||
+ ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, blk);
@@ -192,8 +193,10 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
get_bh(bh);
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
+ ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
if (!buffer_uptodate(bh)) {
put_bh(bh);
+ ext4_set_errno(sb, EIO);
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
@@ -265,13 +268,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ext4_debug("freeing inode %lu\n", ino);
trace_ext4_free_inode(inode);
- /*
- * Note: we must free any quota before locking the superblock,
- * as writing the quota to disk may need the lock as well.
- */
dquot_initialize(inode);
dquot_free_inode(inode);
- dquot_drop(inode);
is_directory = S_ISDIR(inode->i_mode);
@@ -926,8 +924,8 @@ repeat_in_this_group:
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks,
- 0);
+ handle_type, nblocks, 0,
+ ext4_trans_default_revoke_credits(sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
@@ -1228,6 +1226,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
+ ext4_set_errno(sb, -err);
ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
ino, err);
return inode;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36699a131168..569fc68e8975 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -331,11 +331,14 @@ static int ext4_alloc_branch(handle_t *handle,
for (i = 0; i <= indirect_blks; i++) {
if (i == indirect_blks) {
new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
- } else
+ } else {
ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
ar->inode, ar->goal,
ar->flags & EXT4_MB_DELALLOC_RESERVED,
NULL, &err);
+ /* Simplify error cleanup... */
+ branch[i+1].bh = NULL;
+ }
if (err) {
i--;
goto failed;
@@ -377,18 +380,25 @@ static int ext4_alloc_branch(handle_t *handle,
}
return 0;
failed:
+ if (i == indirect_blks) {
+ /* Free data blocks */
+ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+ ar->len, 0);
+ i--;
+ }
for (; i >= 0; i--) {
/*
* We want to ext4_forget() only freshly allocated indirect
- * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
- * buffer at branch[0].bh is indirect block / inode already
- * existing before ext4_alloc_branch() was called.
+ * blocks. Buffer for new_blocks[i] is at branch[i+1].bh
+ * (buffer at branch[0].bh is indirect block / inode already
+ * existing before ext4_alloc_branch() was called). Also
+ * because blocks are freshly allocated, we don't need to
+ * revoke them which is why we don't set
+ * EXT4_FREE_BLOCKS_METADATA.
*/
- if (i > 0 && i != indirect_blks && branch[i].bh)
- ext4_forget(handle, 1, ar->inode, branch[i].bh,
- branch[i].bh->b_blocknr);
- ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
- (i == indirect_blks) ? ar->len : 1, 0);
+ ext4_free_blocks(handle, ar->inode, branch[i+1].bh,
+ new_blocks[i], 1,
+ branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0);
}
return err;
}
@@ -650,32 +660,6 @@ out:
}
/*
- * Calculate the number of metadata blocks need to reserve
- * to allocate a new block at @lblocks for non extent file based file
- */
-int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
- int blk_bits;
-
- if (lblock < EXT4_NDIR_BLOCKS)
- return 0;
-
- lblock -= EXT4_NDIR_BLOCKS;
-
- if (ei->i_da_metadata_calc_len &&
- (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
- ei->i_da_metadata_calc_len++;
- return 0;
- }
- ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
- ei->i_da_metadata_calc_len = 1;
- blk_bits = order_base_2(lblock);
- return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
-}
-
-/*
* Calculate number of indirect blocks touched by mapping @nrblocks logically
* contiguous blocks
*/
@@ -689,27 +673,63 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
+static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int *dropped)
+{
+ int err;
+
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (unlikely(err))
+ return err;
+ }
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err))
+ return err;
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
+
/*
* Truncate transactions can be complex and absolutely huge. So we need to
* be able to restart the transaction at a conventient checkpoint to make
* sure we don't overflow the journal.
*
* Try to extend this transaction for the purposes of truncation. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * extend fails, we restart transaction.
*/
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_ind_truncate_ensure_credits(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh,
+ int revoke_creds)
{
- if (!ext4_handle_valid(handle))
- return 0;
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
- return 0;
- return 1;
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_blocks_for_truncate(inode), revoke_creds,
+ ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (ret <= 0)
+ return ret;
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ ret = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(ret))
+ return ret;
+ }
+ return 0;
}
/*
@@ -844,27 +864,10 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
return 1;
}
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (unlikely(err))
- goto out_err;
- }
- err = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(err))
- goto out_err;
- err = ext4_truncate_restart_trans(handle, inode,
- ext4_blocks_for_truncate(inode));
- if (unlikely(err))
- goto out_err;
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (unlikely(err))
- goto out_err;
- }
- }
+ err = ext4_ind_truncate_ensure_credits(handle, inode, bh,
+ ext4_free_data_revoke_credits(inode, count));
+ if (err < 0)
+ goto out_err;
for (p = first; p < last; p++)
*p = 0;
@@ -1047,11 +1050,11 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
*/
if (ext4_handle_is_aborted(handle))
return;
- if (try_to_extend_transaction(handle, inode)) {
- ext4_mark_inode_dirty(handle, inode);
- ext4_truncate_restart_trans(handle, inode,
- ext4_blocks_for_truncate(inode));
- }
+ if (ext4_ind_truncate_ensure_credits(handle, inode,
+ NULL,
+ ext4_free_metadata_revoke_credits(
+ inode->i_sb, 1)) < 0)
+ return;
/*
* The forget flag here is critical because if
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 88cdf3c90bd1..fad82d08fca5 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -98,6 +98,7 @@ int ext4_get_max_inline_size(struct inode *inode)
error = ext4_get_inode_loc(inode, &iloc);
if (error) {
+ ext4_set_errno(inode->i_sb, -error);
ext4_error_inode(inode, __func__, __LINE__, 0,
"can't get inode location %lu",
inode->i_ino);
@@ -849,7 +850,7 @@ out:
/*
* Prepare the write for the inline data.
- * If the the data can be written into the inode, we just read
+ * If the data can be written into the inode, we just read
* the page and make it uptodate, and start the journal.
* Otherwise read the page, makes it dirty so that it can be
* handle in writepages(the i_disksize update is left to the
@@ -1416,7 +1417,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
err = ext4_htree_store_dirent(dir_file, hinfo->hash,
hinfo->minor_hash, de, &tmp_str);
if (err) {
- count = err;
+ ret = err;
goto out;
}
count++;
@@ -1761,6 +1762,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
+ ext4_set_errno(dir->i_sb, -err);
EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
err, dir->i_ino);
return true;
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
new file mode 100644
index 000000000000..d62d802c9c12
--- /dev/null
+++ b/fs/ext4/inode-test.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 inode that verify the seconds part of [a/c/m]
+ * timestamps in ext4 inode structs are decoded correctly.
+ */
+
+#include <kunit/test.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+
+#include "ext4.h"
+
+/*
+ * For constructing the nonnegative timestamp lower bound value.
+ * binary: 00000000 00000000 00000000 00000000
+ */
+#define LOWER_MSB_0 0L
+/*
+ * For constructing the nonnegative timestamp upper bound value.
+ * binary: 01111111 11111111 11111111 11111111
+ *
+ */
+#define UPPER_MSB_0 0x7fffffffL
+/*
+ * For constructing the negative timestamp lower bound value.
+ * binary: 10000000 00000000 00000000 00000000
+ */
+#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */
+/*
+ * For constructing the negative timestamp upper bound value.
+ * binary: 11111111 11111111 11111111 11111111
+ */
+#define UPPER_MSB_1 (-1L)
+/*
+ * Upper bound for nanoseconds value supported by the encoding.
+ * binary: 00111111 11111111 11111111 11111111
+ */
+#define MAX_NANOSECONDS ((1L << 30) - 1)
+
+#define CASE_NAME_FORMAT "%s: msb:%x lower_bound:%x extra_bits: %x"
+
+#define LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE\
+ "1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits"
+#define UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE\
+ "1969-12-31 Upper bound of 32bit < 0 timestamp, no extra bits"
+#define LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\
+ "1970-01-01 Lower bound of 32bit >=0 timestamp, no extra bits"
+#define UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\
+ "2038-01-19 Upper bound of 32bit >=0 timestamp, no extra bits"
+#define LOWER_BOUND_NEG_LO_1_CASE\
+ "2038-01-19 Lower bound of 32bit <0 timestamp, lo extra sec bit on"
+#define UPPER_BOUND_NEG_LO_1_CASE\
+ "2106-02-07 Upper bound of 32bit <0 timestamp, lo extra sec bit on"
+#define LOWER_BOUND_NONNEG_LO_1_CASE\
+ "2106-02-07 Lower bound of 32bit >=0 timestamp, lo extra sec bit on"
+#define UPPER_BOUND_NONNEG_LO_1_CASE\
+ "2174-02-25 Upper bound of 32bit >=0 timestamp, lo extra sec bit on"
+#define LOWER_BOUND_NEG_HI_1_CASE\
+ "2174-02-25 Lower bound of 32bit <0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NEG_HI_1_CASE\
+ "2242-03-16 Upper bound of 32bit <0 timestamp, hi extra sec bit on"
+#define LOWER_BOUND_NONNEG_HI_1_CASE\
+ "2242-03-16 Lower bound of 32bit >=0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NONNEG_HI_1_CASE\
+ "2310-04-04 Upper bound of 32bit >=0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NONNEG_HI_1_NS_1_CASE\
+ "2310-04-04 Upper bound of 32bit>=0 timestamp, hi extra sec bit 1. 1 ns"
+#define LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE\
+ "2378-04-22 Lower bound of 32bit>= timestamp. Extra sec bits 1. Max ns"
+#define LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE\
+ "2378-04-22 Lower bound of 32bit >=0 timestamp. All extra sec bits on"
+#define UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE\
+ "2446-05-10 Upper bound of 32bit >=0 timestamp. All extra sec bits on"
+
+struct timestamp_expectation {
+ const char *test_case_name;
+ struct timespec64 expected;
+ u32 extra_bits;
+ bool msb_set;
+ bool lower_bound;
+};
+
+static time64_t get_32bit_time(const struct timestamp_expectation * const test)
+{
+ if (test->msb_set) {
+ if (test->lower_bound)
+ return LOWER_MSB_1;
+
+ return UPPER_MSB_1;
+ }
+
+ if (test->lower_bound)
+ return LOWER_MSB_0;
+ return UPPER_MSB_0;
+}
+
+
+/*
+ * Test data is derived from the table in the Inode Timestamps section of
+ * Documentation/filesystems/ext4/inodes.rst.
+ */
+static void inode_test_xtimestamp_decoding(struct kunit *test)
+{
+ const struct timestamp_expectation test_data[] = {
+ {
+ .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {0LL, 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 6,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0xFFFFFFFF,
+ .expected = {.tv_sec = 0x300000000LL,
+ .tv_nsec = MAX_NANOSECONDS},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
+ }
+ };
+
+ struct timespec64 timestamp;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_data); ++i) {
+ timestamp.tv_sec = get_32bit_time(&test_data[i]);
+ ext4_decode_extra_time(&timestamp,
+ cpu_to_le32(test_data[i].extra_bits));
+
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_data[i].expected.tv_sec,
+ timestamp.tv_sec,
+ CASE_NAME_FORMAT,
+ test_data[i].test_case_name,
+ test_data[i].msb_set,
+ test_data[i].lower_bound,
+ test_data[i].extra_bits);
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_data[i].expected.tv_nsec,
+ timestamp.tv_nsec,
+ CASE_NAME_FORMAT,
+ test_data[i].test_case_name,
+ test_data[i].msb_set,
+ test_data[i].lower_bound,
+ test_data[i].extra_bits);
+ }
+}
+
+static struct kunit_case ext4_inode_test_cases[] = {
+ KUNIT_CASE(inode_test_xtimestamp_decoding),
+ {}
+};
+
+static struct kunit_suite ext4_inode_test_suite = {
+ .name = "ext4_inode_test",
+ .test_cases = ext4_inode_test_cases,
+};
+
+kunit_test_suites(&ext4_inode_test_suite);
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 420fe3deed39..e60aca791d3f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -48,8 +48,6 @@
#include <trace/events/ext4.h>
-#define MPAGE_DA_EXTENT_TAIL 0x01
-
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
@@ -164,39 +162,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode)
}
/*
- * Restart the transaction associated with *handle. This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
- int nblocks)
-{
- int ret;
-
- /*
- * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
- * moment, get_block can be called only for blocks inside i_size since
- * page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
- */
- BUG_ON(EXT4_JOURNAL(inode) == NULL);
- jbd_debug(2, "restarting handle %p\n", handle);
- up_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_journal_restart(handle, nblocks);
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
-
- return ret;
-}
-
-/*
* Called at the last iput() if i_nlink is zero.
*/
void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
int err;
- int extra_credits = 3;
+ /*
+ * Credits for final inode cleanup and freeing:
+ * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+ * (xattr block freeing), bitmap, group descriptor (inode freeing)
+ */
+ int extra_credits = 6;
struct ext4_xattr_inode_array *ea_inode_array = NULL;
trace_ext4_evict_inode(inode);
@@ -252,8 +229,12 @@ void ext4_evict_inode(struct inode *inode)
if (!IS_NOQUOTA(inode))
extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+ /*
+ * Block bitmap, group descriptor, and inode are accounted in both
+ * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+ */
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode)+extra_credits);
+ ext4_blocks_for_truncate(inode) + extra_credits - 3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -288,6 +269,7 @@ void ext4_evict_inode(struct inode *inode)
if (inode->i_blocks) {
err = ext4_truncate(inode);
if (err) {
+ ext4_set_errno(inode->i_sb, -err);
ext4_error(inode->i_sb,
"couldn't truncate inode %lu (err %d)",
inode->i_ino, err);
@@ -419,7 +401,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
{
int ret;
- if (IS_ENCRYPTED(inode))
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -527,7 +509,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -695,7 +677,7 @@ found:
* extent status tree.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
- ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es))
goto out_sem;
}
@@ -827,136 +809,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
#define DIO_MAX_BLOCKS 4096
/*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int flags)
-{
- int dio_credits;
- handle_t *handle;
- int retries = 0;
- int ret;
-
- /* Trim mapping request to maximum we can map at once for DIO */
- if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
- bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
- dio_credits = ext4_chunk_trans_blocks(inode,
- bh_result->b_size >> inode->i_blkbits);
-retry:
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- ret = _ext4_get_block(inode, iblock, bh_result, flags);
- ext4_journal_stop(handle);
-
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- if (!create)
- return _ext4_get_block(inode, iblock, bh, 0);
- return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
- sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int ret;
-
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
- /*
- * When doing DIO using unwritten extents, we need io_end to convert
- * unwritten extents to written on IO completion. We allocate io_end
- * once we spot unwritten extent and store it in b_private. Generic
- * DIO code keeps b_private set and furthermore passes the value to
- * our completion callback in 'private' argument.
- */
- if (!ret && buffer_unwritten(bh_result)) {
- if (!bh_result->b_private) {
- ext4_io_end_t *io_end;
-
- io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!io_end)
- return -ENOMEM;
- bh_result->b_private = io_end;
- ext4_set_io_unwritten_flag(inode, io_end);
- }
- set_buffer_defer_completion(bh_result);
- }
-
- return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
- sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int ret;
-
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
- /*
- * Mark inode as having pending DIO writes to unwritten extents.
- * ext4_direct_IO_write() checks this flag and converts extents to
- * written.
- */
- if (!ret && buffer_unwritten(bh_result))
- ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
- return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
-
- ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
- inode->i_ino, create);
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * Blocks should have been preallocated! ext4_file_write_iter() checks
- * that.
- */
- WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
- return ret;
-}
-
-
-/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -1024,7 +876,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
- if (!bh || buffer_uptodate(bh))
+ if (!bh || ext4_buffer_uptodate(bh))
return bh;
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
@@ -1051,7 +903,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
- if (bhs[i] && !buffer_uptodate(bhs[i]))
+ if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
&bhs[i]);
@@ -1340,6 +1192,9 @@ retry_journal:
}
if (ret) {
+ bool extended = (pos + len > inode->i_size) &&
+ !ext4_verity_in_progress(inode);
+
unlock_page(page);
/*
* __block_write_begin may have instantiated a few blocks
@@ -1349,11 +1204,11 @@ retry_journal:
* Add inode to orphan list in case we crash before
* truncate finishes
*/
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (extended && ext4_can_truncate(inode))
ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
- if (pos + len > inode->i_size) {
+ if (extended) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
@@ -1406,6 +1261,7 @@ static int ext4_write_end(struct file *file,
int ret = 0, ret2;
int i_size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
if (inline_data) {
@@ -1423,12 +1279,16 @@ static int ext4_write_end(struct file *file,
/*
* it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
+ *
+ * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
+ * blocks are being written past EOF, so skip the i_size update.
*/
- i_size_changed = ext4_update_inode_size(inode, pos + copied);
+ if (!verity)
+ i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
put_page(page);
- if (old_size < pos)
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
@@ -1439,7 +1299,7 @@ static int ext4_write_end(struct file *file,
if (i_size_changed || inline_data)
ext4_mark_inode_dirty(handle, inode);
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
@@ -1450,7 +1310,7 @@ errout:
if (!ret)
ret = ret2;
- if (pos + len > inode->i_size) {
+ if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
@@ -1511,6 +1371,7 @@ static int ext4_journalled_write_end(struct file *file,
unsigned from, to;
int size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ bool verity = ext4_verity_in_progress(inode);
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_SIZE - 1);
@@ -1540,13 +1401,14 @@ static int ext4_journalled_write_end(struct file *file,
if (!partial)
SetPageUptodate(page);
}
- size_changed = ext4_update_inode_size(inode, pos + copied);
+ if (!verity)
+ size_changed = ext4_update_inode_size(inode, pos + copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
unlock_page(page);
put_page(page);
- if (old_size < pos)
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
if (size_changed || inline_data) {
@@ -1555,7 +1417,7 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
@@ -1566,7 +1428,7 @@ errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- if (pos + len > inode->i_size) {
+ if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
@@ -1646,49 +1508,6 @@ void ext4_da_release_space(struct inode *inode, int to_free)
dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}
-static void ext4_da_page_release_reservation(struct page *page,
- unsigned int offset,
- unsigned int length)
-{
- int contiguous_blks = 0;
- struct buffer_head *head, *bh;
- unsigned int curr_off = 0;
- struct inode *inode = page->mapping->host;
- unsigned int stop = offset + length;
- ext4_fsblk_t lblk;
-
- BUG_ON(stop > PAGE_SIZE || stop < length);
-
- head = page_buffers(page);
- bh = head;
- do {
- unsigned int next_off = curr_off + bh->b_size;
-
- if (next_off > stop)
- break;
-
- if ((offset <= curr_off) && (buffer_delay(bh))) {
- contiguous_blks++;
- clear_buffer_delay(bh);
- } else if (contiguous_blks) {
- lblk = page->index <<
- (PAGE_SHIFT - inode->i_blkbits);
- lblk += (curr_off >> inode->i_blkbits) -
- contiguous_blks;
- ext4_es_remove_blks(inode, lblk, contiguous_blks);
- contiguous_blks = 0;
- }
- curr_off = next_off;
- } while ((bh = bh->b_this_page) != head);
-
- if (contiguous_blks) {
- lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
- lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
- ext4_es_remove_blks(inode, lblk, contiguous_blks);
- }
-
-}
-
/*
* Delayed allocation stuff
*/
@@ -1868,7 +1687,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, &es)) {
+ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
if (ext4_es_is_hole(&es)) {
retval = 0;
down_read(&EXT4_I(inode)->i_data_sem);
@@ -2162,7 +1981,8 @@ static int ext4_writepage(struct page *page,
trace_ext4_writepage(page);
size = i_size_read(inode);
- if (page->index == size >> PAGE_SHIFT)
+ if (page->index == size >> PAGE_SHIFT &&
+ !ext4_verity_in_progress(inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
@@ -2246,7 +2066,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
* after page tables are updated.
*/
size = i_size_read(mpd->inode);
- if (page->index == size >> PAGE_SHIFT)
+ if (page->index == size >> PAGE_SHIFT &&
+ !ext4_verity_in_progress(mpd->inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
@@ -2345,6 +2166,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
>> inode->i_blkbits;
+ if (ext4_verity_in_progress(inode))
+ blocks = EXT_MAX_BLOCKS;
+
do {
BUG_ON(buffer_locked(bh));
@@ -2369,6 +2193,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
}
/*
+ * mpage_process_page - update page buffers corresponding to changed extent and
+ * may submit fully mapped page for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ * @m_lblk - logical block mapping.
+ * @m_pblk - corresponding physical mapping.
+ * @map_bh - determines on return whether this page requires any further
+ * mapping or not.
+ * Scan given page buffers corresponding to changed extent and update buffer
+ * state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits.
+ * If the given page is not fully mapped, we update @map to the next extent in
+ * the given page that needs mapping & return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+ ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+ bool *map_bh)
+{
+ struct buffer_head *head, *bh;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ ext4_lblk_t lblk = *m_lblk;
+ ext4_fsblk_t pblock = *m_pblk;
+ int err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ssize_t io_end_size = 0;
+ struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+
+ err = mpage_process_page_bufs(mpd, head, bh, lblk);
+ if (err > 0)
+ err = 0;
+ if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec)) {
+ err = PTR_ERR(io_end_vec);
+ goto out;
+ }
+ io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ }
+ *map_bh = true;
+ goto out;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ io_end_size += (1 << blkbits);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+ *map_bh = false;
+out:
+ *m_lblk = lblk;
+ *m_pblk = pblock;
+ return err;
+}
+
+/*
* mpage_map_buffers - update buffers corresponding to changed extent and
* submit fully mapped pages for IO
*
@@ -2387,12 +2284,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
struct pagevec pvec;
int nr_pages, i;
struct inode *inode = mpd->inode;
- struct buffer_head *head, *bh;
int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
- sector_t pblock;
+ ext4_fsblk_t pblock;
int err;
+ bool map_bh = false;
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2408,50 +2305,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- bh = head = page_buffers(page);
- do {
- if (lblk < mpd->map.m_lblk)
- continue;
- if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
- /*
- * Buffer after end of mapped extent.
- * Find next buffer in the page to map.
- */
- mpd->map.m_len = 0;
- mpd->map.m_flags = 0;
- /*
- * FIXME: If dioread_nolock supports
- * blocksize < pagesize, we need to make
- * sure we add size mapped so far to
- * io_end->size as the following call
- * can submit the page for IO.
- */
- err = mpage_process_page_bufs(mpd, head,
- bh, lblk);
- pagevec_release(&pvec);
- if (err > 0)
- err = 0;
- return err;
- }
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock++;
- }
- clear_buffer_unwritten(bh);
- } while (lblk++, (bh = bh->b_this_page) != head);
-
+ err = mpage_process_page(mpd, page, &lblk, &pblock,
+ &map_bh);
/*
- * FIXME: This is going to break if dioread_nolock
- * supports blocksize < pagesize as we will try to
- * convert potentially unmapped parts of inode.
+ * If map_bh is true, means page may require further bh
+ * mapping, or maybe the page was submitted for IO.
+ * So we return to call further extent mapping.
*/
- mpd->io_submit.io_end->size += PAGE_SIZE;
+ if (err < 0 || map_bh == true)
+ goto out;
/* Page fully mapped - let IO run! */
err = mpage_submit_page(mpd, page);
- if (err < 0) {
- pagevec_release(&pvec);
- return err;
- }
+ if (err < 0)
+ goto out;
}
pagevec_release(&pvec);
}
@@ -2459,6 +2325,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
}
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2538,9 +2407,13 @@ static int mpage_map_and_submit_extent(handle_t *handle,
int err;
loff_t disksize;
int progress = 0;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ struct ext4_io_end_vec *io_end_vec;
- mpd->io_submit.io_end->offset =
- ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec))
+ return PTR_ERR(io_end_vec);
+ io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
@@ -2604,10 +2477,12 @@ update_disksize:
EXT4_I(inode)->i_disksize = disksize;
up_write(&EXT4_I(inode)->i_data_sem);
err2 = ext4_mark_inode_dirty(handle, inode);
- if (err2)
+ if (err2) {
+ ext4_set_errno(inode->i_sb, -err2);
ext4_error(inode->i_sb,
"Failed to mark inode %lu dirty",
inode->i_ino);
+ }
if (!err)
err = err2;
}
@@ -2785,15 +2660,6 @@ static int ext4_writepages(struct address_space *mapping,
goto out_writepages;
}
- if (ext4_should_dioread_nolock(inode)) {
- /*
- * We may need to convert up to one extent per block in
- * the page and we may dirty the inode.
- */
- rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
- PAGE_SIZE >> inode->i_blkbits);
- }
-
/*
* If we have inline data and arrive here, it means that
* we will soon create the block for the 1st page, so
@@ -2812,6 +2678,15 @@ static int ext4_writepages(struct address_space *mapping,
ext4_journal_stop(handle);
}
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+ PAGE_SIZE >> inode->i_blkbits);
+ }
+
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
@@ -2992,7 +2867,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
percpu_down_read(&sbi->s_journal_flag_rwsem);
trace_ext4_writepages(inode, wbc);
- ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+ ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
percpu_up_read(&sbi->s_journal_flag_rwsem);
@@ -3061,8 +2936,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb) ||
- S_ISLNK(inode->i_mode)) {
+ if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
+ ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
len, flags, pagep, fsdata);
@@ -3227,24 +3102,6 @@ static int ext4_da_write_end(struct file *file,
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- /*
- * Drop reserved blocks
- */
- BUG_ON(!PageLocked(page));
- if (!page_has_buffers(page))
- goto out;
-
- ext4_da_page_release_reservation(page, offset, length);
-
-out:
- ext4_invalidatepage(page, offset, length);
-
- return;
-}
-
/*
* Force all delayed allocation blocks to be allocated for a given inode.
*/
@@ -3452,471 +3309,256 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
return inode->i_state & I_DIRTY_DATASYNC;
}
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned flags, struct iomap *iomap)
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+ struct ext4_map_blocks *map, loff_t offset,
+ loff_t length)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned int blkbits = inode->i_blkbits;
- unsigned long first_block, last_block;
- struct ext4_map_blocks map;
- bool delalloc = false;
- int ret;
-
- if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
- return -EINVAL;
- first_block = offset >> blkbits;
- last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
- EXT4_MAX_LOGICAL_BLOCK);
-
- if (flags & IOMAP_REPORT) {
- if (ext4_has_inline_data(inode)) {
- ret = ext4_inline_data_iomap(inode, iomap);
- if (ret != -EAGAIN) {
- if (ret == 0 && offset >= iomap->length)
- ret = -ENOENT;
- return ret;
- }
- }
- } else {
- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
- return -ERANGE;
- }
-
- map.m_lblk = first_block;
- map.m_len = last_block - first_block + 1;
-
- if (flags & IOMAP_REPORT) {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
-
- if (ret == 0) {
- ext4_lblk_t end = map.m_lblk + map.m_len - 1;
- struct extent_status es;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map.m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end) {
- /* entire range is a hole */
- } else if (es.es_lblk > map.m_lblk) {
- /* range starts with a hole */
- map.m_len = es.es_lblk - map.m_lblk;
- } else {
- ext4_lblk_t offs = 0;
-
- if (es.es_lblk < map.m_lblk)
- offs = map.m_lblk - es.es_lblk;
- map.m_lblk = es.es_lblk + offs;
- map.m_len = es.es_len - offs;
- delalloc = true;
- }
- }
- } else if (flags & IOMAP_WRITE) {
- int dio_credits;
- handle_t *handle;
- int retries = 0;
-
- /* Trim mapping request to maximum we can map at once for DIO */
- if (map.m_len > DIO_MAX_BLOCKS)
- map.m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-retry:
- /*
- * Either we allocate blocks and then we don't get unwritten
- * extent so we have reserved enough credits, or the blocks
- * are already allocated and unwritten and in that case
- * extent conversion fits in the credits as well.
- */
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- dio_credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_ZERO);
- if (ret < 0) {
- ext4_journal_stop(handle);
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return ret;
- }
-
- /*
- * If we added blocks beyond i_size, we need to make sure they
- * will get truncated if we crash before updating i_size in
- * ext4_iomap_end(). For faults we don't need to do that (and
- * even cannot because for orphan list operations inode_lock is
- * required) - if we happen to instantiate block beyond i_size,
- * it is because we race with truncate which has already added
- * the inode to the orphan list.
- */
- if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
- (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
- int err;
-
- err = ext4_orphan_add(handle, inode);
- if (err < 0) {
- ext4_journal_stop(handle);
- return err;
- }
- }
- ext4_journal_stop(handle);
- } else {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
- }
+ u8 blkbits = inode->i_blkbits;
+ /*
+ * Writes that span EOF might trigger an I/O size update on completion,
+ * so consider them to be dirty for the purpose of O_DSYNC, even if
+ * there is no other metadata changes being made or are pending.
+ */
iomap->flags = 0;
- if (ext4_inode_datasync_dirty(inode))
+ if (ext4_inode_datasync_dirty(inode) ||
+ offset + length > i_size_read(inode))
iomap->flags |= IOMAP_F_DIRTY;
+
+ if (map->m_flags & EXT4_MAP_NEW)
+ iomap->flags |= IOMAP_F_NEW;
+
iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = sbi->s_daxdev;
- iomap->offset = (u64)first_block << blkbits;
- iomap->length = (u64)map.m_len << blkbits;
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ iomap->offset = (u64) map->m_lblk << blkbits;
+ iomap->length = (u64) map->m_len << blkbits;
- if (ret == 0) {
- iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
+ /*
+ * Flags passed to ext4_map_blocks() for direct I/O writes can result
+ * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+ * set. In order for any allocated unwritten extents to be converted
+ * into written extents correctly within the ->end_io() handler, we
+ * need to ensure that the iomap->type is set appropriately. Hence, the
+ * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
+ * been set first.
+ */
+ if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+ iomap->type = IOMAP_UNWRITTEN;
+ iomap->addr = (u64) map->m_pblk << blkbits;
+ } else if (map->m_flags & EXT4_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (u64) map->m_pblk << blkbits;
} else {
- if (map.m_flags & EXT4_MAP_MAPPED) {
- iomap->type = IOMAP_MAPPED;
- } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- iomap->type = IOMAP_UNWRITTEN;
- } else {
- WARN_ON_ONCE(1);
- return -EIO;
- }
- iomap->addr = (u64)map.m_pblk << blkbits;
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
}
-
- if (map.m_flags & EXT4_MAP_NEW)
- iomap->flags |= IOMAP_F_NEW;
-
- return 0;
}
-static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
- ssize_t written, unsigned flags, struct iomap *iomap)
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int flags)
{
- int ret = 0;
handle_t *handle;
- int blkbits = inode->i_blkbits;
- bool truncate = false;
+ u8 blkbits = inode->i_blkbits;
+ int ret, dio_credits, m_flags = 0, retries = 0;
- if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
- return 0;
+ /*
+ * Trim the mapping request to the maximum value that we can map at
+ * once for direct I/O.
+ */
+ if (map->m_len > DIO_MAX_BLOCKS)
+ map->m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto orphan_del;
- }
- if (ext4_update_inode_size(inode, offset + written))
- ext4_mark_inode_dirty(handle, inode);
+retry:
/*
- * We may need to truncate allocated but not written blocks beyond EOF.
+ * Either we allocate blocks and then don't get an unwritten extent, so
+ * in that case we have reserved enough credits. Or, the blocks are
+ * already allocated and unwritten. In that case, the extent conversion
+ * fits into the credits as well.
*/
- if (iomap->offset + iomap->length >
- ALIGN(inode->i_size, 1 << blkbits)) {
- ext4_lblk_t written_blk, end_blk;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- written_blk = (offset + written) >> blkbits;
- end_blk = (offset + length) >> blkbits;
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
- }
/*
- * Remove inode from orphan list if we were extending a inode and
- * everything went fine.
+ * DAX and direct I/O are the only two operations that are currently
+ * supported with IOMAP_WRITE.
*/
- if (!truncate && inode->i_nlink &&
- !list_empty(&EXT4_I(inode)->i_orphan))
- ext4_orphan_del(handle, inode);
+ WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+ if (IS_DAX(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+ /*
+ * We use i_size instead of i_disksize here because delalloc writeback
+ * can complete at any point during the I/O and subsequently push the
+ * i_disksize out to i_size. This could be beyond where direct I/O is
+ * happening and thus expose allocated blocks to direct I/O reads.
+ */
+ else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE;
+ else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+
+ /*
+ * We cannot fill holes in indirect tree based inodes as that could
+ * expose stale data in the case of a crash. Use the magic error code
+ * to fallback to buffered I/O.
+ */
+ if (!m_flags && !ret)
+ ret = -ENOTBLK;
+
ext4_journal_stop(handle);
- if (truncate) {
- ext4_truncate_failed_write(inode);
-orphan_del:
- /*
- * If truncate failed early the inode might still be on the
- * orphan list; we need to make sure the inode is removed from
- * the orphan list in that case.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- }
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
return ret;
}
-const struct iomap_ops ext4_iomap_ops = {
- .iomap_begin = ext4_iomap_begin,
- .iomap_end = ext4_iomap_end,
-};
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private)
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
- ext4_io_end_t *io_end = private;
+ int ret;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
- /* if not async direct IO just return */
- if (!io_end)
- return 0;
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
- ext_debug("ext4_end_io_dio(): io_end 0x%p "
- "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
- io_end, io_end->inode->i_ino, iocb, offset, size);
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
/*
- * Error during AIO DIO. We cannot convert unwritten extents as the
- * data was not written. Just clear the unwritten flag and drop io_end.
+ * Calculate the first and last logical blocks respectively.
*/
- if (size <= 0) {
- ext4_clear_io_unwritten_flag(io_end);
- size = 0;
- }
- io_end->offset = offset;
- io_end->size = size;
- ext4_put_io_end(io_end);
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+ if (flags & IOMAP_WRITE)
+ ret = ext4_iomap_alloc(inode, &map, flags);
+ else
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+ if (ret < 0)
+ return ret;
+
+ ext4_set_iomap(inode, iomap, &map, offset, length);
return 0;
}
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- ssize_t ret;
- loff_t offset = iocb->ki_pos;
- size_t count = iov_iter_count(iter);
- int overwrite = 0;
- get_block_t *get_block_func = NULL;
- int dio_flags = 0;
- loff_t final_size = offset + count;
- int orphan = 0;
- handle_t *handle;
-
- if (final_size > inode->i_size || final_size > ei->i_disksize) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ext4_update_i_disksize(inode, inode->i_size);
- ext4_journal_stop(handle);
- }
-
- BUG_ON(iocb->private == NULL);
+ int ret;
/*
- * Make all waiters for direct IO properly wait also for extent
- * conversion. This also disallows race between truncate() and
- * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ * Even for writes we don't need to allocate blocks, so just pretend
+ * we are reading to save overhead of starting a transaction.
*/
- inode_dio_begin(inode);
-
- /* If we do a overwrite dio, i_mutex locking can be released */
- overwrite = *((int *)iocb->private);
-
- if (overwrite)
- inode_unlock(inode);
+ flags &= ~IOMAP_WRITE;
+ ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
+ WARN_ON_ONCE(iomap->type != IOMAP_MAPPED);
+ return ret;
+}
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap)
+{
/*
- * For extent mapped files we could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as unwritten to prevent
- * parallel buffered read to expose the stale data before DIO complete
- * the data IO.
- *
- * As to previously fallocated extents, ext4 get_block will just simply
- * mark the buffer mapped but still keep the extents unwritten.
- *
- * For non AIO case, we will convert those unwritten extents to written
- * after return back from blockdev_direct_IO. That way we save us from
- * allocating io_end structure and also the overhead of offloading
- * the extent convertion to a workqueue.
- *
- * For async DIO, the conversion needs to be deferred when the
- * IO is completed. The ext4 end_io callback function will be
- * called to take care of the conversion work. Here for async
- * case, we allocate an io_end structure to hook to the iocb.
+ * Check to see whether an error occurred while writing out the data to
+ * the allocated blocks. If so, return the magic error code so that we
+ * fallback to buffered I/O and attempt to complete the remainder of
+ * the I/O. Any blocks that may have been allocated in preparation for
+ * the direct I/O will be reused during buffered I/O.
*/
- iocb->private = NULL;
- if (overwrite)
- get_block_func = ext4_dio_get_block_overwrite;
- else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
- round_down(offset, i_blocksize(inode)) >= inode->i_size) {
- get_block_func = ext4_dio_get_block;
- dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
- } else if (is_sync_kiocb(iocb)) {
- get_block_func = ext4_dio_get_block_unwritten_sync;
- dio_flags = DIO_LOCKING;
- } else {
- get_block_func = ext4_dio_get_block_unwritten_async;
- dio_flags = DIO_LOCKING;
- }
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- get_block_func, ext4_end_io_dio, NULL,
- dio_flags);
+ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+ return -ENOTBLK;
- if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the conversion right here
- */
- err = ext4_convert_unwritten_extents(NULL, inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
- }
+ return 0;
+}
- inode_dio_end(inode);
- /* take i_mutex locking again if we do a ovewrite dio */
- if (overwrite)
- inode_lock(inode);
+const struct iomap_ops ext4_iomap_ops = {
+ .iomap_begin = ext4_iomap_begin,
+ .iomap_end = ext4_iomap_end,
+};
- if (ret < 0 && final_size > inode->i_size)
- ext4_truncate_failed_write(inode);
+const struct iomap_ops ext4_iomap_overwrite_ops = {
+ .iomap_begin = ext4_iomap_overwrite_begin,
+ .iomap_end = ext4_iomap_end,
+};
- /* Handle extending of i_size after direct IO write */
- if (orphan) {
- int err;
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ struct extent_status es;
+ ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- /*
- * We wrote the data but cannot extend
- * i_size. Bail out. In async io case, we do
- * not return error here because we have
- * already submmitted the corresponding
- * bio. Returning error here makes the caller
- * think that this IO is done and failed
- * resulting in race with bio's completion
- * handler.
- */
- if (!ret)
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ map->m_lblk, end, &es);
- goto out;
- }
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size || end > ei->i_disksize) {
- ext4_update_i_disksize(inode, end);
- if (end > inode->i_size)
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
- err = ext4_journal_stop(handle);
- if (ret == 0)
- ret = err;
+ if (!es.es_len || es.es_lblk > end)
+ return false;
+
+ if (es.es_lblk > map->m_lblk) {
+ map->m_len = es.es_lblk - map->m_lblk;
+ return false;
}
-out:
- return ret;
-}
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- size_t count = iov_iter_count(iter);
- ssize_t ret;
+ offset = map->m_lblk - es.es_lblk;
+ map->m_len = es.es_len - offset;
- /*
- * Shared inode_lock is enough for us - it protects against concurrent
- * writes & truncates and since we take care of writing back page cache,
- * we are protected against page writeback as well.
- */
- inode_lock_shared(inode);
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret)
- goto out_unlock;
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
- inode_unlock_shared(inode);
- return ret;
+ return true;
}
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- size_t count = iov_iter_count(iter);
- loff_t offset = iocb->ki_pos;
- ssize_t ret;
+ int ret;
+ bool delalloc = false;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
-#ifdef CONFIG_FS_ENCRYPTION
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
- return 0;
-#endif
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_inline_data_iomap(inode, iomap);
+ if (ret != -EAGAIN) {
+ if (ret == 0 && offset >= iomap->length)
+ ret = -ENOENT;
+ return ret;
+ }
+ }
/*
- * If we are doing data journalling we don't support O_DIRECT
+ * Calculate the first and last logical block respectively.
*/
- if (ext4_should_journal_data(inode))
- return 0;
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
- /* Let buffer I/O handle the inline data case. */
- if (ext4_has_inline_data(inode))
- return 0;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ delalloc = ext4_iomap_is_delalloc(inode, &map);
- trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (iov_iter_rw(iter) == READ)
- ret = ext4_direct_IO_read(iocb, iter);
- else
- ret = ext4_direct_IO_write(iocb, iter);
- trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
- return ret;
+ ext4_set_iomap(inode, iomap, &map, offset, length);
+ if (delalloc && iomap->type == IOMAP_HOLE)
+ iomap->type = IOMAP_DELALLOC;
+
+ return 0;
}
+const struct iomap_ops ext4_iomap_report_ops = {
+ .iomap_begin = ext4_iomap_begin_report,
+};
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3954,7 +3596,7 @@ static const struct address_space_operations ext4_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -3971,7 +3613,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -3985,9 +3627,9 @@ static const struct address_space_operations ext4_da_aops = {
.write_end = ext4_da_write_end,
.set_page_dirty = ext4_set_page_dirty,
.bmap = ext4_bmap,
- .invalidatepage = ext4_da_invalidatepage,
+ .invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -4081,8 +3723,12 @@ static int __ext4_block_zero_page_range(handle_t *handle,
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
- WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
- page, blocksize, bh_offset(bh)));
+ err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+ bh_offset(bh));
+ if (err) {
+ clear_buffer_uptodate(bh);
+ goto unlock;
+ }
}
}
if (ext4_should_journal_data(inode)) {
@@ -4292,11 +3938,17 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
unsigned int credits;
int ret = 0;
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
-
trace_ext4_punch_hole(inode, offset, length, 0);
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ if (ext4_has_inline_data(inode)) {
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_convert_inline_data(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ if (ret)
+ return ret;
+ }
+
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -4611,6 +4263,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
+ if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
+ goto simulate_eio;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -4709,6 +4363,8 @@ make_io:
blk_finish_plug(&plug);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
+ simulate_eio:
+ ext4_set_errno(inode->i_sb, EIO);
EXT4_ERROR_INODE_BLOCK(inode, block,
"unable to read itable block");
brelse(bh);
@@ -4739,6 +4395,8 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
+ return false;
return true;
}
@@ -4763,9 +4421,11 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
new_fl |= S_CASEFOLD;
+ if (flags & EXT4_VERITY_FL)
+ new_fl |= S_VERITY;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
- S_ENCRYPTED|S_CASEFOLD);
+ S_ENCRYPTED|S_CASEFOLD|S_VERITY);
}
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -4918,7 +4578,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
sizeof(gen));
}
- if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+ if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
+ ext4_set_errno(inode->i_sb, EFSBADCRC);
ext4_error_inode(inode, function, line, 0,
"iget: checksum invalid");
ret = -EFSBADCRC;
@@ -4982,6 +4644,18 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
+ /*
+ * If dir_index is not enabled but there's dir with INDEX flag set,
+ * we'd normally treat htree data as empty space. But with metadata
+ * checksumming that corrupts checksums so forbid that.
+ */
+ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
+ ext4_error_inode(inode, function, line, 0,
+ "iget: Dir with htree data on filesystem without dir_index feature.");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
@@ -5119,6 +4793,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
"iget: bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
+ if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
+ ext4_error_inode(inode, function, line, 0,
+ "casefold flag without casefold feature");
brelse(iloc.bh);
unlock_new_inode(inode);
@@ -5454,6 +5131,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ ext4_set_errno(inode->i_sb, EIO);
EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
"IO error syncing inode");
err = -EIO;
@@ -5478,11 +5156,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * All buffers in the last page remain valid? Then there's nothing to
- * do. We do the check mainly to optimize the common PAGE_SIZE ==
- * blocksize case
+ * If the page is fully truncated, we don't need to wait for any commit
+ * (and we even should not as __ext4_journalled_invalidatepage() may
+ * strip all buffers from the page but keep the page dirty which can then
+ * confuse e.g. concurrent ext4_writepage() seeing dirty page without
+ * buffers). Also we don't need to wait for any commit if all buffers in
+ * the page remain valid. This is most beneficial for the common case of
+ * blocksize == PAGESIZE.
*/
- if (offset > PAGE_SIZE - i_blocksize(inode))
+ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
page = find_lock_page(inode->i_mapping,
@@ -5555,6 +5237,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ error = fsverity_prepare_setattr(dentry, attr);
+ if (error)
+ return error;
+
if (is_quota_modification(inode, attr)) {
error = dquot_initialize(inode);
if (error)
@@ -5724,7 +5410,8 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
+ if ((request_mask & STATX_BTIME) &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ei->i_crtime.tv_sec;
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
@@ -5741,12 +5428,15 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (flags & EXT4_NODUMP_FL)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (flags & EXT4_VERITY_FL)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE |
- STATX_ATTR_NODUMP);
+ STATX_ATTR_NODUMP |
+ STATX_ATTR_VERITY);
generic_fillattr(inode, stat);
return 0;
@@ -5936,8 +5626,23 @@ static int __ext4_expand_extra_isize(struct inode *inode,
{
struct ext4_inode *raw_inode;
struct ext4_xattr_ibody_header *header;
+ unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
int error;
+ /* this was checked at iget time, but double check for good measure */
+ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+ (ei->i_extra_isize & 3)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+ ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ return -EFSCORRUPTED;
+ }
+ if ((new_extra_isize < ei->i_extra_isize) ||
+ (new_extra_isize < 4) ||
+ (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+ return -EINVAL; /* Should never happen */
+
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
@@ -5989,9 +5694,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode,
* If this is felt to be critical, then e2fsck should be run to
* force a large enough s_min_extra_isize.
*/
- if (ext4_handle_valid(handle) &&
- jbd2_journal_extend(handle,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+ if (ext4_journal_extend(handle,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
return -ENOSPC;
if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
@@ -6031,7 +5735,7 @@ int ext4_expand_extra_isize(struct inode *inode,
error = ext4_journal_get_write_access(handle, iloc->bh);
if (error) {
brelse(iloc->bh);
- goto out_stop;
+ goto out_unlock;
}
error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
@@ -6041,8 +5745,8 @@ int ext4_expand_extra_isize(struct inode *inode,
if (!error)
error = rc;
+out_unlock:
ext4_write_unlock_xattr(inode, &no_expand);
-out_stop:
ext4_journal_stop(handle);
return error;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 442f7ef873fc..a0ec750018dd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -745,6 +745,74 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
}
+/* copied from fs/ioctl.c */
+static int fiemap_check_ranges(struct super_block *sb,
+ u64 start, u64 len, u64 *new_len)
+{
+ u64 maxbytes = (u64) sb->s_maxbytes;
+
+ *new_len = len;
+
+ if (len == 0)
+ return -EINVAL;
+
+ if (start > maxbytes)
+ return -EFBIG;
+
+ /*
+ * Shrink request scope to what the fs can actually handle.
+ */
+ if (len > maxbytes || (maxbytes - len) < start)
+ *new_len = maxbytes - start;
+
+ return 0;
+}
+
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
+
+static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
+{
+ struct fiemap fiemap;
+ struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
+ struct fiemap_extent_info fieinfo = { 0, };
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ u64 len;
+ int error;
+
+ if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
+ return -EFAULT;
+
+ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+ return -EINVAL;
+
+ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+ &len);
+ if (error)
+ return error;
+
+ fieinfo.fi_flags = fiemap.fm_flags;
+ fieinfo.fi_extents_max = fiemap.fm_extent_count;
+ fieinfo.fi_extents_start = ufiemap->fm_extents;
+
+ if (fiemap.fm_extent_count != 0 &&
+ !access_ok(fieinfo.fi_extents_start,
+ fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+ return -EFAULT;
+
+ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+ filemap_write_and_wait(inode->i_mapping);
+
+ error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len);
+ fiemap.fm_flags = fieinfo.fi_flags;
+ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+ if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
+ error = -EFAULT;
+
+ return error;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1113,8 +1181,62 @@ resizefs_out:
#endif
}
case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg);
+
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_add_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key_all_users(filp,
+ (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+
+ case EXT4_IOC_CLEAR_ES_CACHE:
+ {
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+ ext4_clear_inode_es(inode);
+ return 0;
+ }
+
+ case EXT4_IOC_GETSTATE:
+ {
+ __u32 state = 0;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED))
+ state |= EXT4_STATE_FLAG_EXT_PRECACHED;
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ state |= EXT4_STATE_FLAG_NEW;
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+ state |= EXT4_STATE_FLAG_NEWENTRY;
+ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE))
+ state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE;
+
+ return put_user(state, (__u32 __user *) arg);
+ }
+
+ case EXT4_IOC_GET_ES_CACHE:
+ return ext4_ioctl_get_es_cache(filp, arg);
+
case EXT4_IOC_FSGETXATTR:
{
struct fsxattr fa;
@@ -1171,6 +1293,17 @@ out:
}
case EXT4_IOC_SHUTDOWN:
return ext4_shutdown(sb, arg);
+
+ case FS_IOC_ENABLE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_enable(filp, (const void __user *)arg);
+
+ case FS_IOC_MEASURE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_measure(filp, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -1227,12 +1360,25 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
case EXT4_IOC_MOVE_EXT:
case EXT4_IOC_RESIZE_FS:
+ case FITRIM:
case EXT4_IOC_PRECACHE_EXTENTS:
case EXT4_IOC_SET_ENCRYPTION_POLICY:
case EXT4_IOC_GET_ENCRYPTION_PWSALT:
case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
case EXT4_IOC_SHUTDOWN:
case FS_IOC_GETFSMAP:
+ case FS_IOC_ENABLE_VERITY:
+ case FS_IOC_MEASURE_VERITY:
+ case EXT4_IOC_CLEAR_ES_CACHE:
+ case EXT4_IOC_GETSTATE:
+ case EXT4_IOC_GET_ES_CACHE:
+ case EXT4_IOC_FSGETXATTR:
+ case EXT4_IOC_FSSETXATTR:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a3e2767bdf2f..f64838187559 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3895,6 +3895,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
+ ext4_set_errno(sb, -err);
ext4_error(sb, "Error %d reading block bitmap for %u",
err, group);
return 0;
@@ -4063,6 +4064,7 @@ repeat:
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
+ ext4_set_errno(sb, -err);
ext4_error(sb, "Error %d loading buddy information for %u",
err, group);
continue;
@@ -4071,6 +4073,7 @@ repeat:
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
+ ext4_set_errno(sb, -err);
ext4_error(sb, "Error %d reading block bitmap for %u",
err, group);
ext4_mb_unload_buddy(&e4b);
@@ -4325,6 +4328,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
+ ext4_set_errno(sb, -err);
ext4_error(sb, "Error %d loading buddy information for %u",
err, group);
continue;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b1e4d359f73b..89725fa42573 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -50,29 +50,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
needed = ext4_ext_calc_credits_for_single_extent(inode,
lb->last_block - lb->first_block + 1, path);
- /*
- * Make sure the credit we accumalated is not really high
- */
- if (needed && ext4_handle_has_enough_credits(handle,
- EXT4_RESERVE_TRANS_BLOCKS)) {
- up_write((&EXT4_I(inode)->i_data_sem));
- retval = ext4_journal_restart(handle, needed);
- down_write((&EXT4_I(inode)->i_data_sem));
- if (retval)
- goto err_out;
- } else if (needed) {
- retval = ext4_journal_extend(handle, needed);
- if (retval) {
- /*
- * IF not able to extend the journal restart the journal
- */
- up_write((&EXT4_I(inode)->i_data_sem));
- retval = ext4_journal_restart(handle, needed);
- down_write((&EXT4_I(inode)->i_data_sem));
- if (retval)
- goto err_out;
- }
- }
+ retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
+ if (retval < 0)
+ goto err_out;
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
@@ -196,42 +176,30 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
}
-static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
-{
- int retval = 0, needed;
-
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- /*
- * We are freeing a blocks. During this we touch
- * superblock, group descriptor and block bitmap.
- * So allocate a credit of 3. We may update
- * quota (user and group).
- */
- needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
- if (ext4_journal_extend(handle, needed) != 0)
- retval = ext4_journal_restart(handle, needed);
-
- return retval;
-}
-
static int free_dind_blocks(handle_t *handle,
struct inode *inode, __le32 i_data)
{
int i;
__le32 *tmp_idata;
struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+ int err;
- bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
- extend_credit_for_blkdel(handle, inode);
+ err = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0) {
+ put_bh(bh);
+ return err;
+ }
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(tmp_idata[i]), 1,
EXT4_FREE_BLOCKS_METADATA |
@@ -239,7 +207,10 @@ static int free_dind_blocks(handle_t *handle,
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0)
+ return err;
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -270,7 +241,10 @@ static int free_tind_blocks(handle_t *handle,
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -283,7 +257,11 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(i_data[0]), 1,
EXT4_FREE_BLOCKS_METADATA |
@@ -318,12 +296,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* One credit accounted for writing the
* i_data field of the original inode
*/
- retval = ext4_journal_extend(handle, 1);
- if (retval) {
- retval = ext4_journal_restart(handle, 1);
- if (retval)
- goto err_out;
- }
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto err_out;
i_data[0] = ei->i_data[EXT4_IND_BLOCK];
i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
@@ -391,15 +366,20 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
ix = EXT_FIRST_INDEX(eh);
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
retval = free_ext_idx(handle, inode, ix);
- if (retval)
- break;
+ if (retval) {
+ put_bh(bh);
+ return retval;
+ }
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
- return retval;
+ return 0;
}
/*
@@ -574,9 +554,9 @@ err_out:
}
/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
- if (ext4_journal_extend(handle, 1) != 0)
- ext4_journal_restart(handle, 1);
-
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto out_stop;
/*
* Mark the tmp_inode as of size zero
*/
@@ -594,6 +574,7 @@ err_out:
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
+out_stop:
ext4_journal_stop(handle);
out:
unlock_new_inode(tmp_inode);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 2305b4374fd3..87f7551c5132 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -120,10 +120,10 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
{
__ext4_warning(sb, function, line, "%s", msg);
__ext4_warning(sb, function, line,
- "MMP failure info: last update time: %llu, last update "
- "node: %s, last update device: %s",
- (long long unsigned int) le64_to_cpu(mmp->mmp_time),
- mmp->mmp_nodename, mmp->mmp_bdevname);
+ "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
+ (unsigned long long)le64_to_cpu(mmp->mmp_time),
+ (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
+ (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
}
/*
@@ -154,6 +154,7 @@ static int kmmpd(void *data)
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
@@ -173,8 +174,10 @@ static int kmmpd(void *data)
* (s_mmp_update_interval * 60) seconds.
*/
if (retval) {
- if ((failed_writes % 60) == 0)
+ if ((failed_writes % 60) == 0) {
+ ext4_set_errno(sb, -retval);
ext4_error(sb, "Error writing to MMP block");
+ }
failed_writes++;
}
@@ -205,6 +208,7 @@ static int kmmpd(void *data)
retval = read_mmp_block(sb, &bh_check, mmp_block);
if (retval) {
+ ext4_set_errno(sb, -retval);
ext4_error(sb, "error reading MMP data: %d",
retval);
goto exit_thread;
@@ -218,6 +222,7 @@ static int kmmpd(void *data)
"Error while updating MMP info. "
"The filesystem seems to have been"
" multiply mounted.");
+ ext4_set_errno(sb, EBUSY);
ext4_error(sb, "abort");
put_bh(bh_check);
retval = -EBUSY;
@@ -375,7 +380,8 @@ skip:
/*
* Start a kernel thread to update the MMP block periodically.
*/
- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+ (int)sizeof(mmp->mmp_bdevname),
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 129029534075..ceff4b4b1877 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
- bh = ext4_bread(NULL, inode, block, 0);
+ if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
+ bh = ERR_PTR(-EIO);
+ else
+ bh = ext4_bread(NULL, inode, block, 0);
if (IS_ERR(bh)) {
__ext4_warning(inode->i_sb, func, line,
"inode #%lu: lblock %lu: comm %s: "
@@ -153,9 +156,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
* caller is sure it should be an index block.
*/
if (is_dx_block && type == INDEX) {
- if (ext4_dx_csum_verify(inode, dirent))
+ if (ext4_dx_csum_verify(inode, dirent) &&
+ !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
+ ext4_set_errno(inode->i_sb, EFSBADCRC);
ext4_error_inode(inode, func, line, block,
"Directory index failed checksum");
brelse(bh);
@@ -163,9 +168,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
}
}
if (!is_dx_block) {
- if (ext4_dirblock_csum_verify(inode, bh))
+ if (ext4_dirblock_csum_verify(inode, bh) &&
+ !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
+ ext4_set_errno(inode->i_sb, EFSBADCRC);
ext4_error_inode(inode, func, line, block,
"Directory block failed checksum");
brelse(bh);
@@ -1002,7 +1009,6 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_FS_ENCRYPTION
/* Check if the directory is encrypted */
if (IS_ENCRYPTED(dir)) {
err = fscrypt_get_encryption_info(dir);
@@ -1017,7 +1023,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
return err;
}
}
-#endif
+
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size,
@@ -1065,9 +1071,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
}
errout:
brelse(bh);
-#ifdef CONFIG_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fname_crypto_str);
-#endif
return count;
}
@@ -1312,7 +1316,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
{
int len;
- if (!IS_CASEFOLDED(dir)) {
+ if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
cf_name->name = NULL;
return;
}
@@ -1527,6 +1531,7 @@ restart:
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
+ ext4_set_errno(sb, EIO);
EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
(unsigned long) block);
brelse(bh);
@@ -1537,6 +1542,7 @@ restart:
!is_dx_internal_node(dir, block,
(struct ext4_dir_entry *)bh->b_data) &&
!ext4_dirblock_csum_verify(dir, bh)) {
+ ext4_set_errno(sb, EFSBADCRC);
EXT4_ERROR_INODE(dir, "checksumming directory "
"block %lu", (unsigned long)block);
brelse(bh);
@@ -2164,7 +2170,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct buffer_head *bh = NULL;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
+#ifdef CONFIG_UNICODE
struct ext4_sb_info *sbi;
+#endif
struct ext4_filename fname;
int retval;
int dx_fallback=0;
@@ -2176,14 +2184,14 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
- sbi = EXT4_SB(sb);
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
#ifdef CONFIG_UNICODE
+ sbi = EXT4_SB(sb);
if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
- utf8_validate(sbi->s_encoding, &dentry->d_name))
+ sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
@@ -2205,6 +2213,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
retval = ext4_dx_add_entry(handle, &fname, dir, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
+ /* Can we just ignore htree data? */
+ if (ext4_has_metadata_csum(sb)) {
+ EXT4_ERROR_INODE(dir,
+ "Directory has corrupted htree index.");
+ retval = -EFSCORRUPTED;
+ goto out;
+ }
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
@@ -2547,18 +2562,29 @@ static void ext4_dec_count(handle_t *handle, struct inode *inode)
}
+/*
+ * Add non-directory inode to a directory. On success, the inode reference is
+ * consumed by dentry is instantiation. This is also indicated by clearing of
+ * *inodep pointer. On failure, the caller is responsible for dropping the
+ * inode reference in the safe context.
+ */
static int ext4_add_nondir(handle_t *handle,
- struct dentry *dentry, struct inode *inode)
+ struct dentry *dentry, struct inode **inodep)
{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *inode = *inodep;
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
d_instantiate_new(dentry, inode);
+ *inodep = NULL;
return 0;
}
drop_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
- iput(inode);
return err;
}
@@ -2592,12 +2618,12 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
+ err = ext4_add_nondir(handle, dentry, &inode);
}
if (handle)
ext4_journal_stop(handle);
+ if (!IS_ERR_OR_NULL(inode))
+ iput(inode);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2624,12 +2650,12 @@ retry:
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
+ err = ext4_add_nondir(handle, dentry, &inode);
}
if (handle)
ext4_journal_stop(handle);
+ if (!IS_ERR_OR_NULL(inode))
+ iput(inode);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2779,10 +2805,12 @@ retry:
if (err) {
out_clear_inode:
clear_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
iput(inode);
- goto out_stop;
+ goto out_retry;
}
ext4_inc_count(handle, dir);
ext4_update_dx_flag(dir);
@@ -2796,6 +2824,7 @@ out_clear_inode:
out_stop:
if (handle)
ext4_journal_stop(handle);
+out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2808,7 +2837,7 @@ bool ext4_empty_dir(struct inode *inode)
{
unsigned int offset;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de, *de1;
+ struct ext4_dir_entry_2 *de;
struct super_block *sb;
if (ext4_has_inline_data(inode)) {
@@ -2833,19 +2862,25 @@ bool ext4_empty_dir(struct inode *inode)
return true;
de = (struct ext4_dir_entry_2 *) bh->b_data;
- de1 = ext4_next_entry(de, sb->s_blocksize);
- if (le32_to_cpu(de->inode) != inode->i_ino ||
- le32_to_cpu(de1->inode) == 0 ||
- strcmp(".", de->name) || strcmp("..", de1->name)) {
- ext4_warning_inode(inode, "directory missing '.' and/or '..'");
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
+ 0) ||
+ le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
+ ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
return true;
}
- offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
- ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
- de = ext4_next_entry(de1, sb->s_blocksize);
+ offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de, sb->s_blocksize);
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
+ offset) ||
+ le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ ext4_warning_inode(inode, "directory missing '..'");
+ brelse(bh);
+ return true;
+ }
+ offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
- if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ if (!(offset & (sb->s_blocksize - 1))) {
unsigned int lblock;
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -2856,12 +2891,11 @@ bool ext4_empty_dir(struct inode *inode)
}
if (IS_ERR(bh))
return true;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
}
+ de = (struct ext4_dir_entry_2 *) (bh->b_data +
+ (offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
bh->b_data, bh->b_size, offset)) {
- de = (struct ext4_dir_entry_2 *)(bh->b_data +
- sb->s_blocksize);
offset = (offset | (sb->s_blocksize - 1)) + 1;
continue;
}
@@ -2870,7 +2904,6 @@ bool ext4_empty_dir(struct inode *inode)
return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
- de = ext4_next_entry(de, sb->s_blocksize);
}
brelse(bh);
return true;
@@ -3182,18 +3215,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- if (inode->i_nlink == 0) {
- ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
- set_nlink(inode, 1);
- }
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
- drop_nlink(inode);
+ if (inode->i_nlink == 0)
+ ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+ dentry->d_name.len, dentry->d_name.name);
+ else
+ drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
@@ -3328,12 +3360,11 @@ static int ext4_symlink(struct inode *dir,
inode->i_size = disk_link.len - 1;
}
EXT4_I(inode)->i_disksize = inode->i_size;
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
+ err = ext4_add_nondir(handle, dentry, &inode);
if (handle)
ext4_journal_stop(handle);
+ if (inode)
+ iput(inode);
goto out_free_encrypted_link;
err_drop_inode:
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 12ceadef32c5..68b39e75446a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -31,18 +31,56 @@
#include "acl.h"
static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
int __init ext4_init_pageio(void)
{
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL)
return -ENOMEM;
+
+ io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+ if (io_end_vec_cachep == NULL) {
+ kmem_cache_destroy(io_end_cachep);
+ return -ENOMEM;
+ }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec;
+
+ io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+ if (!io_end_vec)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&io_end_vec->list);
+ list_add_tail(&io_end_vec->list, &io_end->list_vec);
+ return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec, *tmp;
+
+ if (list_empty(&io_end->list_vec))
+ return;
+ list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+ list_del(&io_end_vec->list);
+ kmem_cache_free(io_end_vec_cachep, io_end_vec);
+ }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+ BUG_ON(list_empty(&io_end->list_vec));
+ return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}
/*
@@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
ext4_finish_bio(bio);
bio_put(bio);
}
+ ext4_free_io_end_vec(io_end);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
* cannot get to ext4_ext_truncate() before all IOs overlapping that range are
* completed (happens from ext4_free_ioend()).
*/
-static int ext4_end_io(ext4_io_end_t *io)
+static int ext4_end_io_end(ext4_io_end_t *io_end)
{
- struct inode *inode = io->inode;
- loff_t offset = io->offset;
- ssize_t size = io->size;
- handle_t *handle = io->handle;
+ struct inode *inode = io_end->inode;
+ handle_t *handle = io_end->handle;
int ret = 0;
- ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
- io, inode->i_ino, io->list.next, io->list.prev);
+ io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+ io_end->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
+ "(inode %lu, error %d)", inode->i_ino, ret);
}
- ext4_clear_io_unwritten_flag(io);
- ext4_release_io_end(io);
+ ext4_clear_io_unwritten_flag(io_end);
+ ext4_release_io_end(io_end);
return ret;
}
@@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
+ ext4_io_end_t *io_end, *io_end0, *io_end1;
if (list_empty(head))
return;
ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
- list_for_each_entry(io, head, list) {
- cur = &io->list;
+ list_for_each_entry(io_end, head, list) {
+ cur = &io_end->list;
before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
+ io_end0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
+ io_end1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
+ io_end, inode->i_ino, io_end0, io_end1);
}
#endif
}
@@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
static int ext4_do_flush_completed_IO(struct inode *inode,
struct list_head *head)
{
- ext4_io_end_t *io;
+ ext4_io_end_t *io_end;
struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
- io = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
- list_del_init(&io->list);
+ io_end = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io_end->list);
- err = ext4_end_io(io);
+ err = ext4_end_io_end(io_end);
if (unlikely(!ret && err))
ret = err;
}
@@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work)
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
- ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
- if (io) {
- io->inode = inode;
- INIT_LIST_HEAD(&io->list);
- atomic_set(&io->count, 1);
+ ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
+
+ if (io_end) {
+ io_end->inode = inode;
+ INIT_LIST_HEAD(&io_end->list);
+ INIT_LIST_HEAD(&io_end->list_vec);
+ atomic_set(&io_end->count, 1);
}
- return io;
+ return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+ list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
return;
}
@@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
if (atomic_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_extents(io_end->handle,
- io_end->inode, io_end->offset,
- io_end->size);
+ err = ext4_convert_unwritten_io_end_vec(io_end->handle,
+ io_end);
io_end->handle = NULL;
ext4_clear_io_unwritten_flag(io_end);
}
@@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio)
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
+ "starting block %llu)",
bio->bi_status, inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
mapping_set_error(inode->i_mapping,
@@ -358,14 +394,16 @@ void ext4_io_submit_init(struct ext4_io_submit *io,
io->io_end = NULL;
}
-static int io_submit_init_bio(struct ext4_io_submit *io,
- struct buffer_head *bh)
+static void io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
struct bio *bio;
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
+ */
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- if (!bio)
- return -ENOMEM;
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
@@ -373,13 +411,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
wbc_init_bio(io->io_wbc, bio);
- return 0;
}
-static int io_submit_add_bh(struct ext4_io_submit *io,
- struct inode *inode,
- struct page *page,
- struct buffer_head *bh)
+static void io_submit_add_bh(struct ext4_io_submit *io,
+ struct inode *inode,
+ struct page *page,
+ struct buffer_head *bh)
{
int ret;
@@ -388,9 +425,7 @@ submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init_bio(io, bh);
- if (ret)
- return ret;
+ io_submit_init_bio(io, bh);
io->io_bio->bi_write_hint = inode->i_write_hint;
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
@@ -398,7 +433,6 @@ submit_and_retry:
goto submit_and_retry;
wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
io->io_next_block++;
- return 0;
}
int ext4_bio_write_page(struct ext4_io_submit *io,
@@ -478,21 +512,36 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
gfp_t gfp_flags = GFP_NOFS;
unsigned int enc_bytes = round_up(len, i_blocksize(inode));
+ /*
+ * Since bounce page allocation uses a mempool, we can only use
+ * a waiting mask (i.e. request guaranteed allocation) on the
+ * first page of the bio. Otherwise it can deadlock.
+ */
+ if (io->io_bio)
+ gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
retry_encrypt:
bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
0, gfp_flags);
if (IS_ERR(bounce_page)) {
ret = PTR_ERR(bounce_page);
- if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
- if (io->io_bio) {
+ if (ret == -ENOMEM &&
+ (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
+ gfp_flags = GFP_NOFS;
+ if (io->io_bio)
ext4_io_submit(io);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- }
- gfp_flags |= __GFP_NOFAIL;
+ else
+ gfp_flags |= __GFP_NOFAIL;
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry_encrypt;
}
- bounce_page = NULL;
- goto out;
+
+ printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+ redirty_page_for_writepage(wbc, page);
+ do {
+ clear_buffer_async_write(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ goto unlock;
}
}
@@ -500,30 +549,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
- if (ret) {
- /*
- * We only get here on ENOMEM. Not much else
- * we can do but mark the page as dirty, and
- * better luck next time.
- */
- break;
- }
+ io_submit_add_bh(io, inode,
+ bounce_page ? bounce_page : page, bh);
nr_submitted++;
clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
- /* Error stopped previous loop? Clean up buffers... */
- if (ret) {
- out:
- fscrypt_free_bounce_page(bounce_page);
- printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
- redirty_page_for_writepage(wbc, page);
- do {
- clear_buffer_async_write(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
+unlock:
unlock_page(page);
/* Nothing submitted - we have to end page writeback */
if (!nr_submitted)
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index c916017db334..c1769afbf799 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -47,13 +47,116 @@
#include "ext4.h"
-static inline bool ext4_bio_encrypted(struct bio *bio)
+#define NUM_PREALLOC_POST_READ_CTXS 128
+
+static struct kmem_cache *bio_post_read_ctx_cache;
+static mempool_t *bio_post_read_ctx_pool;
+
+/* postprocessing steps for read bios */
+enum bio_post_read_step {
+ STEP_INITIAL = 0,
+ STEP_DECRYPT,
+ STEP_VERITY,
+ STEP_MAX,
+};
+
+struct bio_post_read_ctx {
+ struct bio *bio;
+ struct work_struct work;
+ unsigned int cur_step;
+ unsigned int enabled_steps;
+};
+
+static void __read_end_io(struct bio *bio)
+{
+ struct page *page;
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ page = bv->bv_page;
+
+ /* PG_error was set if any post_read step failed */
+ if (bio->bi_status || PageError(page)) {
+ ClearPageUptodate(page);
+ /* will re-read again later */
+ ClearPageError(page);
+ } else {
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ if (bio->bi_private)
+ mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ bio_put(bio);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
+
+static void decrypt_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fscrypt_decrypt_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
+static void verity_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
+
+ /*
+ * fsverity_verify_bio() may call readpages() again, and although verity
+ * will be disabled for that, decryption may still be needed, causing
+ * another bio_post_read_ctx to be allocated. So to guarantee that
+ * mempool_alloc() never deadlocks we must free the current ctx first.
+ * This is safe because verity is the last post-read step.
+ */
+ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX);
+ mempool_free(ctx, bio_post_read_ctx_pool);
+ bio->bi_private = NULL;
+
+ fsverity_verify_bio(bio);
+
+ __read_end_io(bio);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
+{
+ /*
+ * We use different work queues for decryption and for verity because
+ * verity may require reading metadata pages that need decryption, and
+ * we shouldn't recurse to the same workqueue.
+ */
+ switch (++ctx->cur_step) {
+ case STEP_DECRYPT:
+ if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
+ INIT_WORK(&ctx->work, decrypt_work);
+ fscrypt_enqueue_decrypt_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ case STEP_VERITY:
+ if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+ INIT_WORK(&ctx->work, verity_work);
+ fsverity_enqueue_verify_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ default:
+ __read_end_io(ctx->bio);
+ }
+}
+
+static bool bio_post_read_required(struct bio *bio)
{
-#ifdef CONFIG_FS_ENCRYPTION
- return unlikely(bio->bi_private != NULL);
-#else
- return false;
-#endif
+ return bio->bi_private && !bio->bi_status;
}
/*
@@ -70,30 +173,52 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
*/
static void mpage_end_io(struct bio *bio)
{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ if (bio_post_read_required(bio)) {
+ struct bio_post_read_ctx *ctx = bio->bi_private;
- if (ext4_bio_encrypted(bio)) {
- if (bio->bi_status) {
- fscrypt_release_ctx(bio->bi_private);
- } else {
- fscrypt_enqueue_decrypt_bio(bio->bi_private, bio);
- return;
- }
+ ctx->cur_step = STEP_INITIAL;
+ bio_post_read_processing(ctx);
+ return;
}
- bio_for_each_segment_all(bv, bio, iter_all) {
- struct page *page = bv->bv_page;
+ __read_end_io(bio);
+}
- if (!bio->bi_status) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
+static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
+{
+ return fsverity_active(inode) &&
+ idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
+static void ext4_set_bio_post_read_ctx(struct bio *bio,
+ const struct inode *inode,
+ pgoff_t first_idx)
+{
+ unsigned int post_read_steps = 0;
+
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ post_read_steps |= 1 << STEP_DECRYPT;
+
+ if (ext4_need_verity(inode, first_idx))
+ post_read_steps |= 1 << STEP_VERITY;
+
+ if (post_read_steps) {
+ /* Due to the mempool, this never fails. */
+ struct bio_post_read_ctx *ctx =
+ mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+
+ ctx->bio = bio;
+ ctx->enabled_steps = post_read_steps;
+ bio->bi_private = ctx;
}
+}
- bio_put(bio);
+static inline loff_t ext4_readpage_limit(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_VERITY) &&
+ (IS_VERITY(inode) || ext4_verity_in_progress(inode)))
+ return inode->i_sb->s_maxbytes;
+
+ return i_size_read(inode);
}
int ext4_mpage_readpages(struct address_space *mapping,
@@ -141,7 +266,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (ext4_readpage_limit(inode) +
+ blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
@@ -218,6 +344,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
zero_user_segment(page, first_hole << blkbits,
PAGE_SIZE);
if (first_hole == 0) {
+ if (ext4_need_verity(inode, page->index) &&
+ !fsverity_verify_page(page))
+ goto set_error_page;
SetPageUptodate(page);
unlock_page(page);
goto next_page;
@@ -241,24 +370,16 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio = NULL;
}
if (bio == NULL) {
- struct fscrypt_ctx *ctx = NULL;
-
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
- ctx = fscrypt_get_ctx(GFP_NOFS);
- if (IS_ERR(ctx))
- goto set_error_page;
- }
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
+ */
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio) {
- if (ctx)
- fscrypt_release_ctx(ctx);
- goto set_error_page;
- }
+ ext4_set_bio_post_read_ctx(bio, inode, page->index);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio->bi_private = ctx;
bio_set_op_attrs(bio, REQ_OP_READ,
is_readahead ? REQ_RAHEAD : 0);
}
@@ -293,3 +414,29 @@ int ext4_mpage_readpages(struct address_space *mapping,
submit_bio(bio);
return 0;
}
+
+int __init ext4_init_post_read_processing(void)
+{
+ bio_post_read_ctx_cache =
+ kmem_cache_create("ext4_bio_post_read_ctx",
+ sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+ if (!bio_post_read_ctx_cache)
+ goto fail;
+ bio_post_read_ctx_pool =
+ mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
+ bio_post_read_ctx_cache);
+ if (!bio_post_read_ctx_pool)
+ goto fail_free_cache;
+ return 0;
+
+fail_free_cache:
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+fail:
+ return -ENOMEM;
+}
+
+void ext4_exit_post_read_processing(void)
+{
+ mempool_destroy(bio_post_read_ctx_pool);
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c0e9aef376a7..86a2500ed292 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -388,28 +388,10 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
return bh;
}
-/*
- * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA.
- * If that fails, restart the transaction & regain write access for the
- * buffer head which is used for block_bitmap modifications.
- */
-static int extend_or_restart_transaction(handle_t *handle, int thresh)
+static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits)
{
- int err;
-
- if (ext4_handle_has_enough_credits(handle, thresh))
- return 0;
-
- err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
- if (err < 0)
- return err;
- if (err) {
- err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
- if (err)
- return err;
- }
-
- return 0;
+ return ext4_journal_ensure_credits_fn(handle, credits,
+ EXT4_MAX_TRANS_DATA, 0, 0);
}
/*
@@ -451,8 +433,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
continue;
}
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
return err;
bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
@@ -544,8 +526,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
struct buffer_head *gdb;
ext4_debug("update backup group %#04llx\n", block);
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
gdb = sb_getblk(sb, block);
@@ -602,8 +584,8 @@ handle_bb:
/* Initialize block bitmap of the @group */
block = group_data[i].block_bitmap;
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
bh = bclean(handle, sb, block);
@@ -631,8 +613,8 @@ handle_ib:
/* Initialize inode bitmap of the @group */
block = group_data[i].inode_bitmap;
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
/* Mark unused entries in inode bitmap used */
bh = bclean(handle, sb, block);
@@ -842,9 +824,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
if (unlikely(err))
goto errout;
- n_group_desc = ext4_kvmalloc((gdb_num + 1) *
- sizeof(struct buffer_head *),
- GFP_NOFS);
+ n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+ GFP_KERNEL);
if (!n_group_desc) {
err = -ENOMEM;
ext4_warning(sb, "not enough memory for %lu groups",
@@ -918,9 +899,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
gdb_bh = ext4_sb_bread(sb, gdblock, 0);
if (IS_ERR(gdb_bh))
return PTR_ERR(gdb_bh);
- n_group_desc = ext4_kvmalloc((gdb_num + 1) *
- sizeof(struct buffer_head *),
- GFP_NOFS);
+ n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+ GFP_KERNEL);
if (!n_group_desc) {
brelse(gdb_bh);
err = -ENOMEM;
@@ -1109,10 +1089,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
ext4_fsblk_t backup_block;
/* Out of journal space, and can't get more - abort - so sad */
- if (ext4_handle_valid(handle) &&
- handle->h_buffer_credits == 0 &&
- ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
- (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
break;
if (meta_bg == 0)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605d437a..f464dff09774 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -154,7 +154,7 @@ ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
if (bh == NULL)
return ERR_PTR(-ENOMEM);
- if (buffer_uptodate(bh))
+ if (ext4_buffer_uptodate(bh))
return bh;
ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
wait_on_buffer(bh);
@@ -204,26 +204,6 @@ void ext4_superblock_csum_set(struct super_block *sb)
es->s_checksum = ext4_superblock_csum(sb, es);
}
-void *ext4_kvmalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kmalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags, PAGE_KERNEL);
- return ret;
-}
-
-void *ext4_kvzalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kzalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
- return ret;
-}
-
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg)
{
@@ -367,6 +347,8 @@ static void __save_error_info(struct super_block *sb, const char *func,
ext4_update_tstamp(es, s_last_error_time);
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
es->s_last_error_line = cpu_to_le32(line);
+ if (es->s_last_error_errcode == 0)
+ es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED;
if (!es->s_first_error_time) {
es->s_first_error_time = es->s_last_error_time;
es->s_first_error_time_hi = es->s_last_error_time_hi;
@@ -375,6 +357,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
es->s_first_error_line = cpu_to_le32(line);
es->s_first_error_ino = es->s_last_error_ino;
es->s_first_error_block = es->s_last_error_block;
+ es->s_first_error_errcode = es->s_last_error_errcode;
}
/*
* Start the daily error reporting function if it hasn't been
@@ -631,6 +614,66 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
return errstr;
}
+void ext4_set_errno(struct super_block *sb, int err)
+{
+ if (err < 0)
+ err = -err;
+
+ switch (err) {
+ case EIO:
+ err = EXT4_ERR_EIO;
+ break;
+ case ENOMEM:
+ err = EXT4_ERR_ENOMEM;
+ break;
+ case EFSBADCRC:
+ err = EXT4_ERR_EFSBADCRC;
+ break;
+ case EFSCORRUPTED:
+ err = EXT4_ERR_EFSCORRUPTED;
+ break;
+ case ENOSPC:
+ err = EXT4_ERR_ENOSPC;
+ break;
+ case ENOKEY:
+ err = EXT4_ERR_ENOKEY;
+ break;
+ case EROFS:
+ err = EXT4_ERR_EROFS;
+ break;
+ case EFBIG:
+ err = EXT4_ERR_EFBIG;
+ break;
+ case EEXIST:
+ err = EXT4_ERR_EEXIST;
+ break;
+ case ERANGE:
+ err = EXT4_ERR_ERANGE;
+ break;
+ case EOVERFLOW:
+ err = EXT4_ERR_EOVERFLOW;
+ break;
+ case EBUSY:
+ err = EXT4_ERR_EBUSY;
+ break;
+ case ENOTDIR:
+ err = EXT4_ERR_ENOTDIR;
+ break;
+ case ENOTEMPTY:
+ err = EXT4_ERR_ENOTEMPTY;
+ break;
+ case ESHUTDOWN:
+ err = EXT4_ERR_ESHUTDOWN;
+ break;
+ case EFAULT:
+ err = EXT4_ERR_EFAULT;
+ break;
+ default:
+ err = EXT4_ERR_UNKNOWN;
+ }
+ EXT4_SB(sb)->s_es->s_last_error_errcode = err;
+}
+
/* __ext4_std_error decodes expected errors from journaling functions
* automatically and invokes the appropriate error response. */
@@ -655,6 +698,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
sb->s_id, function, line, errstr);
}
+ ext4_set_errno(sb, -errno);
save_error_info(sb, function, line);
ext4_handle_error(sb);
}
@@ -982,8 +1026,10 @@ static void ext4_put_super(struct super_block *sb)
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
- if ((err < 0) && !aborted)
+ if ((err < 0) && !aborted) {
+ ext4_set_errno(sb, -err);
ext4_abort(sb, "Couldn't clean up the journal");
+ }
}
ext4_unregister_sysfs(sb);
@@ -1085,8 +1131,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_es_shk_nr = 0;
ei->i_es_shrink_lblk = 0;
ei->i_reserved_data_blocks = 0;
- ei->i_da_metadata_calc_len = 0;
- ei->i_da_metadata_calc_last_lblock = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
@@ -1107,6 +1151,9 @@ static int ext4_drop_inode(struct inode *inode)
{
int drop = generic_drop_inode(inode);
+ if (!drop)
+ drop = fscrypt_drop_inode(inode);
+
trace_ext4_drop_inode(inode, drop);
return drop;
}
@@ -1169,9 +1216,9 @@ void ext4_clear_inode(struct inode *inode)
{
invalidate_inode_buffers(inode);
clear_inode(inode);
- dquot_drop(inode);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1179,6 +1226,7 @@ void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->jinode = NULL;
}
fscrypt_put_encryption_info(inode);
+ fsverity_cleanup_inode(inode);
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1341,6 +1389,18 @@ static bool ext4_dummy_context(struct inode *inode)
return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
}
+static bool ext4_has_stable_inodes(struct super_block *sb)
+{
+ return ext4_has_feature_stable_inodes(sb);
+}
+
+static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
+ int *ino_bits_ret, int *lblk_bits_ret)
+{
+ *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
+ *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
+}
+
static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
@@ -1348,6 +1408,8 @@ static const struct fscrypt_operations ext4_cryptops = {
.dummy_context = ext4_dummy_context,
.empty_dir = ext4_empty_dir,
.max_namelen = EXT4_NAME_LEN,
+ .has_stable_inodes = ext4_has_stable_inodes,
+ .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
#endif
@@ -1370,7 +1432,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
static struct dquot **ext4_get_dquots(struct inode *inode)
{
@@ -1388,7 +1449,7 @@ static const struct dquot_operations ext4_quota_operations = {
.destroy_dquot = dquot_destroy,
.get_projid = ext4_get_projid,
.get_inode_usage = ext4_get_inode_usage,
- .get_next_id = ext4_get_next_id,
+ .get_next_id = dquot_get_next_id,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -1531,6 +1592,7 @@ static const match_table_t tokens = {
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
{Opt_dioread_nolock, "dioread_nolock"},
+ {Opt_dioread_lock, "nodioread_nolock"},
{Opt_dioread_lock, "dioread_lock"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
@@ -1874,8 +1936,22 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
} else if (token == Opt_commit) {
if (arg == 0)
arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ else if (arg > INT_MAX / HZ) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid commit interval %d, "
+ "must be smaller than %d",
+ arg, INT_MAX / HZ);
+ return -1;
+ }
sbi->s_commit_interval = HZ * arg;
} else if (token == Opt_debug_want_extra_isize) {
+ if ((arg & 1) ||
+ (arg < 4) ||
+ (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid want_extra_isize %d", arg);
+ return -1;
+ }
sbi->s_want_extra_isize = arg;
} else if (token == Opt_max_batch_time) {
sbi->s_max_batch_time = arg;
@@ -2040,7 +2116,7 @@ static int parse_options(char *options, struct super_block *sb,
unsigned int *journal_ioprio,
int is_remount)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -2094,16 +2170,6 @@ static int parse_options(char *options, struct super_block *sb,
}
}
#endif
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- int blocksize =
- BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
-
- if (blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "dioread_nolock if block size != PAGE_SIZE");
- return 0;
- }
- }
return 1;
}
@@ -2943,17 +3009,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#ifndef CONFIG_QUOTA
- if (ext4_has_feature_quota(sb) && !readonly) {
+#if !defined(CONFIG_QUOTA) || !defined(CONFIG_QFMT_V2)
+ if (!readonly && (ext4_has_feature_quota(sb) ||
+ ext4_has_feature_project(sb))) {
ext4_msg(sb, KERN_ERR,
- "Filesystem with quota feature cannot be mounted RDWR "
- "without CONFIG_QUOTA");
- return 0;
- }
- if (ext4_has_feature_project(sb) && !readonly) {
- ext4_msg(sb, KERN_ERR,
- "Filesystem with project quota feature cannot be mounted RDWR "
- "without CONFIG_QUOTA");
+ "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
return 0;
}
#endif /* CONFIG_QUOTA */
@@ -3540,37 +3600,6 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
-static void ext4_clamp_want_extra_isize(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
-
- /* determine the minimum size of new large inodes, if present */
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
- sbi->s_want_extra_isize == 0) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
- if (ext4_has_feature_extra_isize(sb)) {
- if (sbi->s_want_extra_isize <
- le16_to_cpu(es->s_want_extra_isize))
- sbi->s_want_extra_isize =
- le16_to_cpu(es->s_want_extra_isize);
- if (sbi->s_want_extra_isize <
- le16_to_cpu(es->s_min_extra_isize))
- sbi->s_want_extra_isize =
- le16_to_cpu(es->s_min_extra_isize);
- }
- }
- /* Check if enough inode space is available */
- if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
- sbi->s_inode_size) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
- ext4_msg(sb, KERN_INFO,
- "required extra inode space not available");
- }
-}
-
static void ext4_set_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
@@ -3730,6 +3759,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sb, NO_UID32);
/* xattr user namespace & acls are now defaulted on */
set_opt(sb, XATTR_USER);
+ set_opt(sb, DIOREAD_NOLOCK);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
@@ -3778,6 +3808,78 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+ blocksize > EXT4_MAX_BLOCK_SIZE) {
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported filesystem blocksize %d (%d log_block_size)",
+ blocksize, le32_to_cpu(es->s_log_block_size));
+ goto failed_mount;
+ }
+
+ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
+ sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
+ sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
+ } else {
+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+ if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
+ ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
+ sbi->s_first_ino);
+ goto failed_mount;
+ }
+ if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
+ (!is_power_of_2(sbi->s_inode_size)) ||
+ (sbi->s_inode_size > blocksize)) {
+ ext4_msg(sb, KERN_ERR,
+ "unsupported inode size: %d",
+ sbi->s_inode_size);
+ ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
+ goto failed_mount;
+ }
+ /*
+ * i_atime_extra is the last extra field available for
+ * [acm]times in struct ext4_inode. Checking for that
+ * field should suffice to ensure we have extra space
+ * for all three.
+ */
+ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
+ sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
+ sb->s_time_gran = 1;
+ sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
+ } else {
+ sb->s_time_gran = NSEC_PER_SEC;
+ sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
+ }
+ sb->s_time_min = EXT4_TIMESTAMP_MIN;
+ }
+ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+ sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
+ if (ext4_has_feature_extra_isize(sb)) {
+ unsigned v, max = (sbi->s_inode_size -
+ EXT4_GOOD_OLD_INODE_SIZE);
+
+ v = le16_to_cpu(es->s_want_extra_isize);
+ if (v > max) {
+ ext4_msg(sb, KERN_ERR,
+ "bad s_want_extra_isize: %d", v);
+ goto failed_mount;
+ }
+ if (sbi->s_want_extra_isize < v)
+ sbi->s_want_extra_isize = v;
+
+ v = le16_to_cpu(es->s_min_extra_isize);
+ if (v > max) {
+ ext4_msg(sb, KERN_ERR,
+ "bad s_min_extra_isize: %d", v);
+ goto failed_mount;
+ }
+ if (sbi->s_want_extra_isize < v)
+ sbi->s_want_extra_isize = v;
+ }
+ }
+
if (sbi->s_es->s_mount_opts[0]) {
char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
sizeof(sbi->s_es->s_mount_opts),
@@ -3835,9 +3937,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#endif
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
- "with data=journal disables delayed "
- "allocation and O_DIRECT support!\n");
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n");
+ clear_opt(sb, DIOREAD_NOLOCK);
if (test_opt2(sb, EXPLICIT_DELALLOC)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and delalloc");
@@ -3936,14 +4037,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
goto failed_mount;
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
- if (blocksize < EXT4_MIN_BLOCK_SIZE ||
- blocksize > EXT4_MAX_BLOCK_SIZE) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported filesystem blocksize %d (%d log_block_size)",
- blocksize, le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
if (le32_to_cpu(es->s_log_block_size) >
(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
ext4_msg(sb, KERN_ERR,
@@ -4016,29 +4109,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
- if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
- sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
- sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
- } else {
- sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
- sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
- if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
- ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
- sbi->s_first_ino);
- goto failed_mount;
- }
- if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
- (!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
- ext4_msg(sb, KERN_ERR,
- "unsupported inode size: %d",
- sbi->s_inode_size);
- goto failed_mount;
- }
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
- sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
- }
-
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
if (ext4_has_feature_64bit(sb)) {
if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
@@ -4272,6 +4342,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
#endif
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &ext4_verityops;
+#endif
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (ext4_has_feature_quota(sb))
@@ -4412,10 +4485,8 @@ no_journal:
}
}
- if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
- (blocksize != PAGE_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported blocksize for fs encryption");
+ if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
goto failed_mount_wq;
}
@@ -4486,8 +4557,6 @@ no_journal:
} else if (ret)
goto failed_mount4a;
- ext4_clamp_want_extra_isize(sb);
-
ext4_set_resv_clusters(sb);
err = ext4_setup_system_zone(sb);
@@ -5275,8 +5344,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- ext4_clamp_want_extra_isize(sb);
-
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
@@ -5514,9 +5581,12 @@ static int ext4_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = (dquot->dq_dqb.dqb_bsoftlimit ?
- dquot->dq_dqb.dqb_bsoftlimit :
- dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ limit = dquot->dq_dqb.dqb_bsoftlimit;
+ if (dquot->dq_dqb.dqb_bhardlimit &&
+ (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
+ limit = dquot->dq_dqb.dqb_bhardlimit;
+ limit >>= sb->s_blocksize_bits;
+
if (limit && buf->f_blocks > limit) {
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
@@ -5526,9 +5596,11 @@ static int ext4_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = dquot->dq_dqb.dqb_isoftlimit ?
- dquot->dq_dqb.dqb_isoftlimit :
- dquot->dq_dqb.dqb_ihardlimit;
+ limit = dquot->dq_dqb.dqb_isoftlimit;
+ if (dquot->dq_dqb.dqb_ihardlimit &&
+ (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
+ limit = dquot->dq_dqb.dqb_ihardlimit;
+
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
@@ -5803,7 +5875,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
- err = dquot_enable(qf_inode, type, format_id, flags);
+ err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
if (err)
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
iput(qf_inode);
@@ -5961,7 +6033,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
bh = ext4_bread(handle, inode, blk,
EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
- } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
+ } while (PTR_ERR(bh) == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -5987,18 +6059,6 @@ out:
}
return len;
}
-
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
-{
- const struct quota_format_ops *ops;
-
- if (!sb_has_quota_loaded(sb, qid->type))
- return -ESRCH;
- ops = sb_dqopt(sb)->ops[qid->type];
- if (!ops || !ops->get_next_id)
- return -ENOSYS;
- return dquot_get_next_id(sb, qid);
-}
#endif
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
@@ -6095,6 +6155,10 @@ static int __init ext4_init_fs(void)
err = ext4_init_pending();
if (err)
+ goto out7;
+
+ err = ext4_init_post_read_processing();
+ if (err)
goto out6;
err = ext4_init_pageio();
@@ -6135,8 +6199,10 @@ out3:
out4:
ext4_exit_pageio();
out5:
- ext4_exit_pending();
+ ext4_exit_post_read_processing();
out6:
+ ext4_exit_pending();
+out7:
ext4_exit_es();
return err;
@@ -6153,6 +6219,7 @@ static void __exit ext4_exit_fs(void)
ext4_exit_sysfs();
ext4_exit_system_zone();
ext4_exit_pageio();
+ ext4_exit_post_read_processing();
ext4_exit_es();
ext4_exit_pending();
}
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index b3cd7655a6ff..d218ebdafa4a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -29,6 +29,10 @@ typedef enum {
attr_last_error_time,
attr_feature,
attr_pointer_ui,
+ attr_pointer_ul,
+ attr_pointer_u64,
+ attr_pointer_u8,
+ attr_pointer_string,
attr_pointer_atomic,
attr_journal_task,
} attr_id_t;
@@ -46,6 +50,7 @@ struct ext4_attr {
struct attribute attr;
short attr_id;
short attr_ptr;
+ unsigned short attr_size;
union {
int offset;
void *explicit_ptr;
@@ -154,12 +159,35 @@ static struct ext4_attr ext4_attr_##_name = { \
}, \
}
+#define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_pointer_string, \
+ .attr_size = _size, \
+ .attr_ptr = ptr_##_struct##_offset, \
+ .u = { \
+ .offset = offsetof(struct _struct, _elname),\
+ }, \
+}
+
#define EXT4_RO_ATTR_ES_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname)
+#define EXT4_RO_ATTR_ES_U8(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname)
+
+#define EXT4_RO_ATTR_ES_U64(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname)
+
+#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \
+ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
+
#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
+#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -194,7 +222,20 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+#ifdef CONFIG_EXT4_DEBUG
+EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
+#endif
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode);
+EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode);
+EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino);
+EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino);
+EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block);
+EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block);
+EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line);
+EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line);
+EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32);
+EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
@@ -225,9 +266,22 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
ATTR_LIST(errors_count),
+ ATTR_LIST(first_error_ino),
+ ATTR_LIST(last_error_ino),
+ ATTR_LIST(first_error_block),
+ ATTR_LIST(last_error_block),
+ ATTR_LIST(first_error_line),
+ ATTR_LIST(last_error_line),
+ ATTR_LIST(first_error_func),
+ ATTR_LIST(last_error_func),
+ ATTR_LIST(first_error_errcode),
+ ATTR_LIST(last_error_errcode),
ATTR_LIST(first_error_time),
ATTR_LIST(last_error_time),
ATTR_LIST(journal_task),
+#ifdef CONFIG_EXT4_DEBUG
+ ATTR_LIST(simulate_fail),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -242,6 +296,9 @@ EXT4_ATTR_FEATURE(encryption);
#ifdef CONFIG_UNICODE
EXT4_ATTR_FEATURE(casefold);
#endif
+#ifdef CONFIG_FS_VERITY
+EXT4_ATTR_FEATURE(verity);
+#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
static struct attribute *ext4_feat_attrs[] = {
@@ -254,6 +311,9 @@ static struct attribute *ext4_feat_attrs[] = {
#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
#endif
+#ifdef CONFIG_FS_VERITY
+ ATTR_LIST(verity),
+#endif
ATTR_LIST(metadata_csum_seed),
NULL,
};
@@ -274,7 +334,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
{
- return snprintf(buf, PAGE_SIZE, "%lld",
+ return snprintf(buf, PAGE_SIZE, "%lld\n",
((time64_t)hi << 32) + le32_to_cpu(lo));
}
@@ -312,6 +372,30 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
else
return snprintf(buf, PAGE_SIZE, "%u\n",
*((unsigned int *) ptr));
+ case attr_pointer_ul:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ *((unsigned long *) ptr));
+ case attr_pointer_u8:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ *((unsigned char *) ptr));
+ case attr_pointer_u64:
+ if (!ptr)
+ return 0;
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ le64_to_cpup(ptr));
+ else
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ *((unsigned long long *) ptr));
+ case attr_pointer_string:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size,
+ (char *) ptr);
case attr_pointer_atomic:
if (!ptr)
return 0;
@@ -355,6 +439,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
else
*((unsigned int *) ptr) = t;
return len;
+ case attr_pointer_ul:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ *((unsigned long *) ptr) = t;
+ return len;
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
new file mode 100644
index 000000000000..dc5ec724d889
--- /dev/null
+++ b/fs/ext4/verity.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/ext4/verity.c: fs-verity support for ext4
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for ext4.
+ *
+ * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past
+ * the end of the file, starting at the first 64K boundary beyond i_size. This
+ * approach works because (a) verity files are readonly, and (b) pages fully
+ * beyond i_size aren't visible to userspace but can be read/written internally
+ * by ext4 with only some relatively small changes to ext4. This approach
+ * avoids having to depend on the EA_INODE feature and on rearchitecturing
+ * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and
+ * to support encrypting xattrs. Note that the verity metadata *must* be
+ * encrypted when the file is, since it contains hashes of the plaintext data.
+ *
+ * Using a 64K boundary rather than a 4K one keeps things ready for
+ * architectures with 64K pages, and it doesn't necessarily waste space on-disk
+ * since there can be a hole between i_size and the start of the Merkle tree.
+ */
+
+#include <linux/quotaops.h>
+
+#include "ext4.h"
+#include "ext4_extents.h"
+#include "ext4_jbd2.h"
+
+static inline loff_t ext4_verity_metadata_pos(const struct inode *inode)
+{
+ return round_up(inode->i_size, 65536);
+}
+
+/*
+ * Read some verity metadata from the inode. __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+ loff_t pos)
+{
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *addr;
+
+ page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+ NULL);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ addr = kmap_atomic(page);
+ memcpy(buf, addr + offset_in_page(pos), n);
+ kunmap_atomic(addr);
+
+ put_page(page);
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+ loff_t pos)
+{
+ if (pos + count > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *fsdata;
+ void *addr;
+ int res;
+
+ res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+ &page, &fsdata);
+ if (res)
+ return res;
+
+ addr = kmap_atomic(page);
+ memcpy(addr + offset_in_page(pos), buf, n);
+ kunmap_atomic(addr);
+
+ res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+ page, fsdata);
+ if (res < 0)
+ return res;
+ if (res != n)
+ return -EIO;
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+static int ext4_begin_enable_verity(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ const int credits = 2; /* superblock and inode for ext4_orphan_add() */
+ handle_t *handle;
+ int err;
+
+ if (ext4_verity_in_progress(inode))
+ return -EBUSY;
+
+ /*
+ * Since the file was opened readonly, we have to initialize the jbd
+ * inode and quotas here and not rely on ->open() doing it. This must
+ * be done before evicting the inline data.
+ */
+
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
+ return err;
+
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
+ err = ext4_convert_inline_data(inode);
+ if (err)
+ return err;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ext4_warning_inode(inode,
+ "verity is only allowed on extent-based files");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * ext4 uses the last allocated block to find the verity descriptor, so
+ * we must remove any other blocks past EOF which might confuse things.
+ */
+ err = ext4_truncate(inode);
+ if (err)
+ return err;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_orphan_add(handle, inode);
+ if (err == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+ ext4_journal_stop(handle);
+ return err;
+}
+
+/*
+ * ext4 stores the verity descriptor beginning on the next filesystem block
+ * boundary after the Merkle tree. Then, the descriptor size is stored in the
+ * last 4 bytes of the last allocated filesystem block --- which is either the
+ * block in which the descriptor ends, or the next block after that if there
+ * weren't at least 4 bytes remaining.
+ *
+ * We can't simply store the descriptor in an xattr because it *must* be
+ * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt
+ * xattrs. Also, if the descriptor includes a large signature blob it may be
+ * too large to store in an xattr without the EA_INODE feature.
+ */
+static int ext4_write_verity_descriptor(struct inode *inode, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) +
+ merkle_tree_size, i_blocksize(inode));
+ const u64 desc_end = desc_pos + desc_size;
+ const __le32 desc_size_disk = cpu_to_le32(desc_size);
+ const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk),
+ i_blocksize(inode)) -
+ sizeof(desc_size_disk);
+ int err;
+
+ err = pagecache_write(inode, desc, desc_size, desc_pos);
+ if (err)
+ return err;
+
+ return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk),
+ desc_size_pos);
+}
+
+static int ext4_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct inode *inode = file_inode(filp);
+ const int credits = 2; /* superblock and inode for ext4_orphan_del() */
+ handle_t *handle;
+ int err = 0;
+ int err2;
+
+ if (desc != NULL) {
+ /* Succeeded; write the verity descriptor. */
+ err = ext4_write_verity_descriptor(inode, desc, desc_size,
+ merkle_tree_size);
+
+ /* Write all pages before clearing VERITY_IN_PROGRESS. */
+ if (!err)
+ err = filemap_write_and_wait(inode->i_mapping);
+ }
+
+ /* If we failed, truncate anything we wrote past i_size. */
+ if (desc == NULL || err)
+ ext4_truncate(inode);
+
+ /*
+ * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
+ * deleting the inode from the orphan list, even if something failed.
+ * If everything succeeded, we'll also set the verity bit in the same
+ * transaction.
+ */
+
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ return PTR_ERR(handle);
+ }
+
+ err2 = ext4_orphan_del(handle, inode);
+ if (err2)
+ goto out_stop;
+
+ if (desc != NULL && !err) {
+ struct ext4_iloc iloc;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_stop;
+ ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+ ext4_set_inode_flags(inode);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ }
+out_stop:
+ ext4_journal_stop(handle);
+ return err ?: err2;
+}
+
+static int ext4_get_verity_descriptor_location(struct inode *inode,
+ size_t *desc_size_ret,
+ u64 *desc_pos_ret)
+{
+ struct ext4_ext_path *path;
+ struct ext4_extent *last_extent;
+ u32 end_lblk;
+ u64 desc_size_pos;
+ __le32 desc_size_disk;
+ u32 desc_size;
+ u64 desc_pos;
+ int err;
+
+ /*
+ * Descriptor size is in last 4 bytes of last allocated block.
+ * See ext4_write_verity_descriptor().
+ */
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ EXT4_ERROR_INODE(inode, "verity file doesn't use extents");
+ return -EFSCORRUPTED;
+ }
+
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ last_extent = path[path->p_depth].p_ext;
+ if (!last_extent) {
+ EXT4_ERROR_INODE(inode, "verity file has no extents");
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return -EFSCORRUPTED;
+ }
+
+ end_lblk = le32_to_cpu(last_extent->ee_block) +
+ ext4_ext_get_actual_len(last_extent);
+ desc_size_pos = (u64)end_lblk << inode->i_blkbits;
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if (desc_size_pos < sizeof(desc_size_disk))
+ goto bad;
+ desc_size_pos -= sizeof(desc_size_disk);
+
+ err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk),
+ desc_size_pos);
+ if (err)
+ return err;
+ desc_size = le32_to_cpu(desc_size_disk);
+
+ /*
+ * The descriptor is stored just before the desc_size_disk, but starting
+ * on a filesystem block boundary.
+ */
+
+ if (desc_size > INT_MAX || desc_size > desc_size_pos)
+ goto bad;
+
+ desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode));
+ if (desc_pos < ext4_verity_metadata_pos(inode))
+ goto bad;
+
+ *desc_size_ret = desc_size;
+ *desc_pos_ret = desc_pos;
+ return 0;
+
+bad:
+ EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor");
+ return -EFSCORRUPTED;
+}
+
+static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ size_t desc_size = 0;
+ u64 desc_pos = 0;
+ int err;
+
+ err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos);
+ if (err)
+ return err;
+
+ if (buf_size) {
+ if (desc_size > buf_size)
+ return -ERANGE;
+ err = pagecache_read(inode, buf, desc_size, desc_pos);
+ if (err)
+ return err;
+ }
+ return desc_size;
+}
+
+/*
+ * Prefetch some pages from the file's Merkle tree.
+ *
+ * This is basically a stripped-down version of __do_page_cache_readahead()
+ * which works on pages past i_size.
+ */
+static void ext4_merkle_tree_readahead(struct address_space *mapping,
+ pgoff_t start_index, unsigned long count)
+{
+ LIST_HEAD(pages);
+ unsigned int nr_pages = 0;
+ struct page *page;
+ pgoff_t index;
+ struct blk_plug plug;
+
+ for (index = start_index; index < start_index + count; index++) {
+ page = xa_load(&mapping->i_pages, index);
+ if (!page || xa_is_value(page)) {
+ page = __page_cache_alloc(readahead_gfp_mask(mapping));
+ if (!page)
+ break;
+ page->index = index;
+ list_add(&page->lru, &pages);
+ nr_pages++;
+ }
+ }
+ blk_start_plug(&plug);
+ ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true);
+ blk_finish_plug(&plug);
+}
+
+static struct page *ext4_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index,
+ unsigned long num_ra_pages)
+{
+ struct page *page;
+
+ index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
+
+ page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+ if (!page || !PageUptodate(page)) {
+ if (page)
+ put_page(page);
+ else if (num_ra_pages > 1)
+ ext4_merkle_tree_readahead(inode->i_mapping, index,
+ num_ra_pages);
+ page = read_mapping_page(inode->i_mapping, index, NULL);
+ }
+ return page;
+}
+
+static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize);
+
+ return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations ext4_verityops = {
+ .begin_enable_verity = ext4_begin_enable_verity,
+ .end_enable_verity = ext4_end_enable_verity,
+ .get_verity_descriptor = ext4_get_verity_descriptor,
+ .read_merkle_tree_page = ext4_read_merkle_tree_page,
+ .write_merkle_tree_block = ext4_write_merkle_tree_block,
+};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 491f9ee4040e..8cac7d95c3ad 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -967,55 +967,6 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
return credits;
}
-static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
- int credits, struct buffer_head *bh,
- bool dirty, bool block_csum)
-{
- int error;
-
- if (!ext4_handle_valid(handle))
- return 0;
-
- if (handle->h_buffer_credits >= credits)
- return 0;
-
- error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
- if (!error)
- return 0;
- if (error < 0) {
- ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
- return error;
- }
-
- if (bh && dirty) {
- if (block_csum)
- ext4_xattr_block_csum_set(inode, bh);
- error = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (error) {
- ext4_warning(inode->i_sb, "Handle metadata (error %d)",
- error);
- return error;
- }
- }
-
- error = ext4_journal_restart(handle, credits);
- if (error) {
- ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
- return error;
- }
-
- if (bh) {
- error = ext4_journal_get_write_access(handle, bh);
- if (error) {
- ext4_warning(inode->i_sb,
- "Get write access failed (error %d)",
- error);
- return error;
- }
- }
- return 0;
-}
-
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
@@ -1149,6 +1100,24 @@ cleanup:
return saved_err;
}
+static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, bool block_csum, bool dirty)
+{
+ int error;
+
+ if (bh && dirty) {
+ if (block_csum)
+ ext4_xattr_block_csum_set(inode, bh);
+ error = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (error) {
+ ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+ error);
+ return error;
+ }
+ }
+ return 0;
+}
+
static void
ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
struct buffer_head *bh,
@@ -1185,13 +1154,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
continue;
}
- err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
- dirty, block_csum);
- if (err) {
+ err = ext4_journal_ensure_credits_fn(handle, credits, credits,
+ ext4_free_metadata_revoke_credits(parent->i_sb, 1),
+ ext4_xattr_restart_fn(handle, parent, bh, block_csum,
+ dirty));
+ if (err < 0) {
ext4_warning_inode(ea_inode, "Ensure credits err=%d",
err);
continue;
}
+ if (err > 0) {
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ ext4_warning_inode(ea_inode,
+ "Re-get write access err=%d",
+ err);
+ continue;
+ }
+ }
err = ext4_xattr_inode_dec_ref(handle, ea_inode);
if (err) {
@@ -1476,7 +1456,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
if (!ce)
return NULL;
- ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+ ea_data = kvmalloc(value_len, GFP_KERNEL);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
return NULL;
@@ -2335,7 +2315,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
flags & XATTR_CREATE);
brelse(bh);
- if (!ext4_handle_has_enough_credits(handle, credits)) {
+ if (jbd2_handle_buffer_credits(handle) < credits) {
error = -ENOSPC;
goto cleanup;
}
@@ -2862,11 +2842,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
struct inode *ea_inode;
int error;
- error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
- NULL /* bh */,
- false /* dirty */,
- false /* block_csum */);
- if (error) {
+ error = ext4_journal_ensure_credits(handle, extra_credits,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (error < 0) {
EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
goto cleanup;
}
@@ -2901,9 +2879,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
if (IS_ERR(bh)) {
error = PTR_ERR(bh);
- if (error == -EIO)
+ if (error == -EIO) {
+ ext4_set_errno(inode->i_sb, EIO);
EXT4_ERROR_INODE(inode, "block %llu read error",
EXT4_I(inode)->i_file_acl);
+ }
bh = NULL;
goto cleanup;
}
OpenPOWER on IntegriCloud