diff options
Diffstat (limited to 'fs')
38 files changed, 379 insertions, 137 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 82e8f6edfb48..b12e37f27530 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -749,7 +749,7 @@ static int autofs4_dir_mkdir(struct inode *dir, autofs4_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 41e04183e4ce..4ad6f669fe34 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,10 +377,10 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); - if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) - pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", - task_pid_nr(current), current->comm, - (void *)addr); + if ((type & MAP_FIXED_NOREPLACE) && + PTR_ERR((void *)map_addr) == -EEXIST) + pr_info("%d (%s): Uhuuh, elf segment at %px requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, (void *)addr); return(map_addr); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5474ef14d6e6..2771cc56a622 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -459,6 +459,25 @@ struct btrfs_block_rsv { unsigned short full; unsigned short type; unsigned short failfast; + + /* + * Qgroup equivalent for @size @reserved + * + * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care + * about things like csum size nor how many tree blocks it will need to + * reserve. + * + * Qgroup cares more about net change of the extent usage. + * + * So for one newly inserted file extent, in worst case it will cause + * leaf split and level increase, nodesize for each file extent is + * already too much. + * + * In short, qgroup_size/reserved is the upper limit of possible needed + * qgroup metadata reservation. + */ + u64 qgroup_rsv_size; + u64 qgroup_rsv_reserved; }; /* @@ -714,6 +733,12 @@ struct btrfs_delayed_root; */ #define BTRFS_FS_EXCL_OP 16 +/* + * To info transaction_kthread we need an immediate commit so it doesn't + * need to wait for commit_interval + */ +#define BTRFS_FS_NEED_ASYNC_COMMIT 17 + struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 06ec8ab6d9ba..a8d492dbd3e7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -556,6 +556,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, dst_rsv = &fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + + /* + * Here we migrate space rsv from transaction rsv, since have already + * reserved space when starting a transaction. So no need to reserve + * qgroup space here. + */ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", @@ -577,7 +583,10 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &fs_info->delayed_block_rsv; - btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved); + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we don't need + * to release/reserve qgroup space. + */ trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); @@ -602,9 +611,6 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); - if (ret < 0) - return ret; /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we @@ -616,6 +622,10 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { + ret = btrfs_qgroup_reserve_meta_prealloc(root, + fs_info->nodesize, true); + if (ret < 0) + return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); /* @@ -634,6 +644,8 @@ static int btrfs_delayed_inode_reserve_metadata( "delayed_inode", btrfs_ino(inode), num_bytes, 1); + } else { + btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize); } return ret; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9e98295de7ce..e1b0651686f7 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -540,8 +540,10 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int *qrecord_inserted_ret, + int action, int is_data, int is_system, + int *qrecord_inserted_ret, int *old_ref_mod, int *new_ref_mod) + { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; @@ -585,6 +587,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; + head_ref->is_system = is_system; head_ref->ref_tree = RB_ROOT; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); @@ -772,6 +775,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; + int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); @@ -800,8 +804,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, 0, 0, action, 0, - &qrecord_inserted, old_ref_mod, - new_ref_mod); + is_system, &qrecord_inserted, + old_ref_mod, new_ref_mod); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); @@ -868,7 +872,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, ref_root, reserved, - action, 1, &qrecord_inserted, + action, 1, 0, &qrecord_inserted, old_ref_mod, new_ref_mod); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -898,9 +902,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); + /* + * extent_ops just modify the flags of an extent and they don't result + * in ref count changes, hence it's safe to pass false/0 for is_system + * argument + */ add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, NULL, NULL, NULL); + extent_op->is_data, 0, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 741869dbc316..7f00db50bd24 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -127,6 +127,7 @@ struct btrfs_delayed_ref_head { */ unsigned int must_insert_reserved:1; unsigned int is_data:1; + unsigned int is_system:1; unsigned int processing:1; }; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ac8b1d21baf..60caa68c3618 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1824,6 +1824,7 @@ static int transaction_kthread(void *arg) now = get_seconds(); if (cur->state < TRANS_STATE_BLOCKED && + !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 75cfb80d2551..e2f16b68fcbf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2601,13 +2601,19 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, trace_run_delayed_ref_head(fs_info, head, 0); if (head->total_ref_mod < 0) { - struct btrfs_block_group_cache *cache; + struct btrfs_space_info *space_info; + u64 flags; - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - ASSERT(cache); - percpu_counter_add(&cache->space_info->total_bytes_pinned, + if (head->is_data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (head->is_system) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add(&space_info->total_bytes_pinned, -head->num_bytes); - btrfs_put_block_group(cache); if (head->is_data) { spin_lock(&delayed_refs->lock); @@ -5559,14 +5565,18 @@ again: static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes) + struct btrfs_block_rsv *dest, u64 num_bytes, + u64 *qgroup_to_release_ret) { struct btrfs_space_info *space_info = block_rsv->space_info; + u64 qgroup_to_release = 0; u64 ret; spin_lock(&block_rsv->lock); - if (num_bytes == (u64)-1) + if (num_bytes == (u64)-1) { num_bytes = block_rsv->size; + qgroup_to_release = block_rsv->qgroup_rsv_size; + } block_rsv->size -= num_bytes; if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; @@ -5575,6 +5585,13 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } else { num_bytes = 0; } + if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + qgroup_to_release = block_rsv->qgroup_rsv_reserved - + block_rsv->qgroup_rsv_size; + block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; + } else { + qgroup_to_release = 0; + } spin_unlock(&block_rsv->lock); ret = num_bytes; @@ -5597,6 +5614,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, space_info_add_old_bytes(fs_info, space_info, num_bytes); } + if (qgroup_to_release_ret) + *qgroup_to_release_ret = qgroup_to_release; return ret; } @@ -5738,17 +5757,21 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 num_bytes = 0; + u64 qgroup_num_bytes = 0; int ret = -ENOSPC; spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) num_bytes = block_rsv->size - block_rsv->reserved; + if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) + qgroup_num_bytes = block_rsv->qgroup_rsv_size - + block_rsv->qgroup_rsv_reserved; spin_unlock(&block_rsv->lock); if (num_bytes == 0) return 0; - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true); if (ret) return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); @@ -5756,7 +5779,13 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, block_rsv_add_bytes(block_rsv, num_bytes, 0); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), num_bytes, 1); - } + + /* Don't forget to increase qgroup_rsv_reserved */ + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&block_rsv->lock); + } else + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); return ret; } @@ -5777,20 +5806,23 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; + u64 qgroup_to_release = 0; /* * Since we statically set the block_rsv->size we just want to say we * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0); + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); if (qgroup_free) - btrfs_qgroup_free_meta_prealloc(inode->root, released); + btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); else - btrfs_qgroup_convert_reserved_meta(inode->root, released); + btrfs_qgroup_convert_reserved_meta(inode->root, + qgroup_to_release); } void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, @@ -5802,7 +5834,7 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes); + block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5882,7 +5914,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1); + (u64)-1, NULL); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); @@ -5906,7 +5938,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) WARN_ON_ONCE(!list_empty(&trans->new_bgs)); block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, - trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved, NULL); trans->chunk_bytes_reserved = 0; } @@ -6011,6 +6043,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, { struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; u64 csum_leaves; unsigned outstanding_extents; @@ -6023,9 +6056,17 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, inode->csum_bytes); reserve_size += btrfs_calc_trans_metadata_size(fs_info, csum_leaves); + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent + * + * This is overestimating in most cases. + */ + qgroup_rsv_size = outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; + block_rsv->qgroup_rsv_size = qgroup_rsv_size; spin_unlock(&block_rsv->lock); } @@ -8403,7 +8444,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); } /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0167a9c97c9c..f660ba1e5e58 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1748,7 +1748,7 @@ again: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, - (ret != 0)); + true); if (ret) { btrfs_drop_pages(pages, num_pages); break; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e064c49c9a9a..d241285a0d2a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -31,6 +31,7 @@ #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> +#include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -5905,11 +5906,13 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) struct dir_entry *entry = addr; char *name = (char *)(entry + 1); - ctx->pos = entry->offset; - if (!dir_emit(ctx, name, entry->name_len, entry->ino, - entry->type)) + ctx->pos = get_unaligned(&entry->offset); + if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), + get_unaligned(&entry->ino), + get_unaligned(&entry->type))) return 1; - addr += sizeof(struct dir_entry) + entry->name_len; + addr += sizeof(struct dir_entry) + + get_unaligned(&entry->name_len); ctx->pos++; } return 0; @@ -5999,14 +6002,15 @@ again: } entry = addr; - entry->name_len = name_len; + put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); - entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)], + &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); - entry->ino = location.objectid; - entry->offset = found_key.offset; + put_unaligned(location.objectid, &entry->ino); + put_unaligned(found_key.offset, &entry->offset); entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 124276bba8cf..21a831d3d087 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -189,9 +189,10 @@ void btrfs_print_leaf(struct extent_buffer *l) fs_info = l->fs_info; nr = btrfs_header_nritems(l); - btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d", - btrfs_header_bytenr(l), nr, - btrfs_leaf_free_space(fs_info, l)); + btrfs_info(fs_info, + "leaf %llu gen %llu total ptrs %d free space %d owner %llu", + btrfs_header_bytenr(l), btrfs_header_generation(l), nr, + btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l)); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); @@ -325,7 +326,7 @@ void btrfs_print_leaf(struct extent_buffer *l) } } -void btrfs_print_tree(struct extent_buffer *c) +void btrfs_print_tree(struct extent_buffer *c, bool follow) { struct btrfs_fs_info *fs_info; int i; u32 nr; @@ -342,15 +343,19 @@ void btrfs_print_tree(struct extent_buffer *c) return; } btrfs_info(fs_info, - "node %llu level %d total ptrs %d free spc %u", - btrfs_header_bytenr(c), level, nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr); + "node %llu level %d gen %llu total ptrs %d free spc %u owner %llu", + btrfs_header_bytenr(c), level, btrfs_header_generation(c), + nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr, + btrfs_header_owner(c)); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); - pr_info("\tkey %d (%llu %u %llu) block %llu\n", + pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", i, key.objectid, key.type, key.offset, - btrfs_node_blockptr(c, i)); + btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i)); } + if (!follow) + return; for (i = 0; i < nr; i++) { struct btrfs_key first_key; struct extent_buffer *next; @@ -372,7 +377,7 @@ void btrfs_print_tree(struct extent_buffer *c) if (btrfs_header_level(next) != level - 1) BUG(); - btrfs_print_tree(next); + btrfs_print_tree(next, follow); free_extent_buffer(next); } } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 4a98481688f4..e6bb38fd75ad 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -7,6 +7,6 @@ #define BTRFS_PRINT_TREE_H void btrfs_print_leaf(struct extent_buffer *l); -void btrfs_print_tree(struct extent_buffer *c); +void btrfs_print_tree(struct extent_buffer *c, bool follow); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 09c7e4fd550f..9fb758d5077a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/btrfs.h> +#include <linux/sizes.h> #include "ctree.h" #include "transaction.h" @@ -2375,8 +2376,21 @@ out: return ret; } -static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) +/* + * Two limits to commit transaction in advance. + * + * For RATIO, it will be 1/RATIO of the remaining limit + * (excluding data and prealloc meta) as threshold. + * For SIZE, it will be in byte unit as threshold. + */ +#define QGROUP_PERTRANS_RATIO 32 +#define QGROUP_PERTRANS_SIZE SZ_32M +static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, + const struct btrfs_qgroup *qg, u64 num_bytes) { + u64 limit; + u64 threshold; + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; @@ -2385,6 +2399,31 @@ static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; + /* + * Even if we passed the check, it's better to check if reservation + * for meta_pertrans is pushing us near limit. + * If there is too much pertrans reservation or it's near the limit, + * let's try commit transaction to free some, using transaction_kthread + */ + if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | + BTRFS_QGROUP_LIMIT_MAX_EXCL))) { + if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) + limit = qg->max_excl; + else + limit = qg->max_rfer; + threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - + qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / + QGROUP_PERTRANS_RATIO; + threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); + + /* + * Use transaction_kthread to commit transaction, so we no + * longer need to bother nested transaction nor lock context. + */ + if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) + btrfs_commit_transaction_locksafe(fs_info); + } + return true; } @@ -2434,7 +2473,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - if (enforce && !qgroup_check_limits(qg, num_bytes)) { + if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) { ret = -EDQUOT; goto out; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 63fdcab64b01..c944b4769e3c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2267,6 +2267,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); + clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index c88fccd80bc5..d8c0826bc2c7 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -199,6 +199,20 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); + +/* + * Try to commit transaction asynchronously, so this is safe to call + * even holding a spinlock. + * + * It's done by informing transaction_kthread to commit transaction without + * waiting for commit interval. + */ +static inline void btrfs_commit_transaction_locksafe( + struct btrfs_fs_info *fs_info) +{ + set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); +} int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 7e72348639e4..315f7e63e7cc 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -228,7 +228,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) { - return (ci->i_max_files || ci->i_max_bytes); + bool ret = false; + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + ci->i_vino.snap == CEPH_NOSNAP && + ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino) + ret = true; + spin_unlock(&ci->i_ceph_lock); + return ret; } static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, @@ -1008,14 +1016,19 @@ int __ceph_setxattr(struct inode *inode, const char *name, char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; int required_blob_size; + bool check_realm = false; bool lock_snap_rwsem = false; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; vxattr = ceph_match_vxattr(inode, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; + if (vxattr) { + if (vxattr->readonly) + return -EOPNOTSUPP; + if (value && !strncmp(vxattr->name, "ceph.quota", 10)) + check_realm = true; + } /* pass any unhandled ceph.* xattrs through to the MDS */ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) @@ -1109,6 +1122,15 @@ do_sync_unlocked: err = -EBUSY; } else { err = ceph_sync_setxattr(inode, name, value, size, flags); + if (err >= 0 && check_realm) { + /* check if snaprealm was created for quota inode */ + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + !(ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino)) + err = -EOPNOTSUPP; + spin_unlock(&ci->i_ceph_lock); + } } out: ceph_free_cap_flush(prealloc_cf); diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index fe5567655662..0e74690d11bc 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -54,7 +54,7 @@ do { \ pr_debug_ ## ratefunc("%s: " \ fmt, __FILE__, ##__VA_ARGS__); \ } else if ((type) & VFS) { \ - pr_err_ ## ratefunc("CuIFS VFS: " \ + pr_err_ ## ratefunc("CIFS VFS: " \ fmt, ##__VA_ARGS__); \ } else if ((type) & NOISY && (NOISY != 0)) { \ pr_debug_ ## ratefunc(fmt, ##__VA_ARGS__); \ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6d3e40d7029c..1529a088383d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -455,6 +455,9 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) server->sign = true; } + if (cifs_rdma_enabled(server) && server->sign) + cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled"); + return 0; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e8830f076a7f..a5aa158d535a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2959,6 +2959,22 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) } } + if (volume_info->seal) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "SMB3 or later required for encryption\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else if (tcon->ses->server->capabilities & + SMB2_GLOBAL_CAP_ENCRYPTION) + tcon->seal = true; + else { + cifs_dbg(VFS, "Encryption is not supported on share\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } + } + /* * BB Do we need to wrap session_mutex around this TCon call and Unix * SetFS as we do on SessSetup and reconnect? @@ -3007,22 +3023,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->use_resilient = true; } - if (volume_info->seal) { - if (ses->server->vals->protocol_id == 0) { - cifs_dbg(VFS, - "SMB3 or later required for encryption\n"); - rc = -EOPNOTSUPP; - goto out_fail; - } else if (tcon->ses->server->capabilities & - SMB2_GLOBAL_CAP_ENCRYPTION) - tcon->seal = true; - else { - cifs_dbg(VFS, "Encryption is not supported on share\n"); - rc = -EOPNOTSUPP; - goto out_fail; - } - } - /* * We can have only one retry value for a connection to a share so for * resources mounted more than once to the same server share the last diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 81ba6e0d88d8..925844343038 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -684,6 +684,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto mknod_out; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) goto mknod_out; @@ -692,10 +695,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { - kfree(full_path); rc = -ENOMEM; - free_xid(xid); - return rc; + goto mknod_out; } if (backup_cred(cifs_sb)) @@ -742,7 +743,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, pdev->minor = cpu_to_le64(MINOR(device_number)); rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, &bytes_written, iov, 1); - } /* else if (S_ISFIFO) */ + } tcon->ses->server->ops->close(xid, tcon, &fid); d_drop(direntry); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4bcd4e838b47..23fd430fe74a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3462,7 +3462,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * If the page is mmap'ed into a process' page tables, then we need to make * sure that it doesn't change while being written back. */ -static int +static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b4ae932ea134..b76b85881dcc 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -252,9 +252,14 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->rdma) - wsize = min_t(unsigned int, + if (server->rdma) { + if (server->sign) + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_fragmented_send_size); + else + wsize = min_t(unsigned int, wsize, server->smbd_conn->max_readwrite_size); + } #endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); @@ -272,9 +277,14 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->rdma) - rsize = min_t(unsigned int, + if (server->rdma) { + if (server->sign) + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_fragmented_recv_size); + else + rsize = min_t(unsigned int, rsize, server->smbd_conn->max_readwrite_size); + } #endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) @@ -1452,7 +1462,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct kvec err_iov = {NULL, 0}; - struct smb2_err_rsp *err_buf = NULL; + struct smb2_err_rsp *err_buf; struct smb2_symlink_err_rsp *symlink; unsigned int sub_len; unsigned int sub_offset; @@ -1476,7 +1486,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov); - if (!rc || !err_buf) { + if (!rc || !err_iov.iov_base) { kfree(utf16_path); return -ENOENT; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0f044c4a2dc9..60db51bae0e3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -383,10 +383,10 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; - pneg_ctxt->DataLength = cpu_to_le16(6); - pneg_ctxt->CipherCount = cpu_to_le16(2); - pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; - pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + le16 cipher */ + pneg_ctxt->CipherCount = cpu_to_le16(1); +/* pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;*/ /* not supported yet */ + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_CCM; } static void @@ -444,6 +444,7 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, return -EINVAL; } server->cipher_type = ctxt->Ciphers[0]; + server->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; return 0; } @@ -2590,7 +2591,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (server->rdma && rdata && + if (server->rdma && rdata && !server->sign && rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { struct smbd_buffer_descriptor_v1 *v1; @@ -2968,7 +2969,7 @@ smb2_async_writev(struct cifs_writedata *wdata, * If we want to do a server RDMA read, fill in and append * smbd_buffer_descriptor_v1 to the end of write request */ - if (server->rdma && wdata->bytes >= + if (server->rdma && !server->sign && wdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { struct smbd_buffer_descriptor_v1 *v1; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 6093e5142b2b..d28f358022c5 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -297,7 +297,7 @@ struct smb2_encryption_neg_context { __le16 DataLength; __le32 Reserved; __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ - __le16 Ciphers[2]; /* Ciphers[0] since only one used now */ + __le16 Ciphers[1]; /* Ciphers[0] since only one used now */ } __packed; struct smb2_negotiate_rsp { diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5008af546dd1..c62f7c95683c 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1028,7 +1028,7 @@ static int smbd_post_send(struct smbd_connection *info, for (i = 0; i < request->num_sge; i++) { log_rdma_send(INFO, "rdma_request sge[%d] addr=%llu length=%u\n", - i, request->sge[0].addr, request->sge[0].length); + i, request->sge[i].addr, request->sge[i].length); ib_dma_sync_single_for_device( info->id->device, request->sge[i].addr, @@ -2086,7 +2086,7 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); - struct kvec iov[SMBDIRECT_MAX_SGE]; + struct kvec *iov; int rc; info->smbd_send_pending++; @@ -2096,32 +2096,20 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) } /* - * This usually means a configuration error - * We use RDMA read/write for packet size > rdma_readwrite_threshold - * as long as it's properly configured we should never get into this - * situation - */ - if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) { - log_write(ERR, "maximum send segment %x exceeding %x\n", - rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE); - rc = -EINVAL; - goto done; - } - - /* - * Remove the RFC1002 length defined in MS-SMB2 section 2.1 - * It is used only for TCP transport + * Skip the RFC1002 length defined in MS-SMB2 section 2.1 + * It is used only for TCP transport in the iov[0] * In future we may want to add a transport layer under protocol * layer so this will only be issued to TCP transport */ - iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4; - iov[0].iov_len = rqst->rq_iov[0].iov_len - 4; - buflen += iov[0].iov_len; + + if (rqst->rq_iov[0].iov_len != 4) { + log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len); + return -EINVAL; + } + iov = &rqst->rq_iov[1]; /* total up iov array first */ - for (i = 1; i < rqst->rq_nvec; i++) { - iov[i].iov_base = rqst->rq_iov[i].iov_base; - iov[i].iov_len = rqst->rq_iov[i].iov_len; + for (i = 0; i < rqst->rq_nvec-1; i++) { buflen += iov[i].iov_len; } @@ -2139,6 +2127,10 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } + cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen); + for (i = 0; i < rqst->rq_nvec-1; i++) + dump_smb(iov[i].iov_base, iov[i].iov_len); + remaining_data_length = buflen; log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d " @@ -2194,12 +2186,14 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } i++; + if (i == rqst->rq_nvec-1) + break; } start = i; buflen = 0; } else { i++; - if (i == rqst->rq_nvec) { + if (i == rqst->rq_nvec-1) { /* send out all remaining vecs */ remaining_data_length -= buflen; log_write(INFO, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 8f6f25918229..927226a2122f 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -753,7 +753,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, goto out; #ifdef CONFIG_CIFS_SMB311 - if (ses->status == CifsNew) + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) smb311_update_preauth_hash(ses, rqst->rq_iov+1, rqst->rq_nvec-1); #endif @@ -798,7 +798,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, *resp_buf_type = CIFS_SMALL_BUFFER; #ifdef CONFIG_CIFS_SMB311 - if (ses->status == CifsNew) { + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { struct kvec iov = { .iov_base = buf + 4, .iov_len = get_rfc1002_length(buf) @@ -834,8 +834,11 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), GFP_KERNEL); - if (!new_iov) + if (!new_iov) { + /* otherwise cifs_send_recv below sets resp_buf_type */ + *resp_buf_type = CIFS_NO_BUFFER; return -ENOMEM; + } } else new_iov = s_iov; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a33d8fb1bf2a..508b905d744d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -321,6 +321,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; + ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; @@ -338,7 +339,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; @@ -346,7 +347,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; @@ -354,8 +355,8 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || - EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= sb->s_blocksize) + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group), diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0a7315961bac..c969275ce3ee 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5329,8 +5329,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, stop = le32_to_cpu(extent->ee_block); /* - * In case of left shift, Don't start shifting extents until we make - * sure the hole is big enough to accommodate the shift. + * For left shifts, make sure the hole on the left is big enough to + * accommodate the shift. For right shifts, make sure the last extent + * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, &path, @@ -5350,9 +5351,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { - ext4_ext_drop_refs(path); - kfree(path); - return -EINVAL; + ret = -EINVAL; + goto out; + } + } else { + if (shift > EXT_MAX_BLOCKS - + (stop + ext4_ext_get_actual_len(extent))) { + ret = -EINVAL; + goto out; } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 185f7e61f4cf..eb104e8476f0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5886,5 +5886,6 @@ static void __exit ext4_exit_fs(void) MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b12ba70a895..47d7c151fcba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ac311037d7a5..8aa453784402 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -532,6 +532,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, */ ret = start_this_handle(journal, handle, GFP_NOFS); if (ret < 0) { + handle->h_journal = journal; jbd2_journal_free_reserved(handle); return ret; } diff --git a/fs/proc/base.c b/fs/proc/base.c index eafa39a3a88c..1b2ede6abcdf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1693,6 +1693,12 @@ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t uid; kgid_t gid; + if (unlikely(task->flags & PF_KTHREAD)) { + *ruid = GLOBAL_ROOT_UID; + *rgid = GLOBAL_ROOT_GID; + return; + } + /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index a000d7547479..b572cc865b92 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - idr_get_cursor(&task_active_pid_ns(current)->idr)); + idr_get_cursor(&task_active_pid_ns(current)->idr) - 1); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 65ae54659833..c486ad4b43f0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1310,9 +1310,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned long offset = swp_offset(entry); + offset += (addr & ~PMD_MASK) >> PAGE_SHIFT; frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + (offset << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; @@ -1332,6 +1334,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, break; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); } spin_unlock(ptl); return err; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index ce4a34a2751d..35a124400d60 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -511,7 +511,14 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) if (args->flags & ATTR_CREATE) return retval; retval = xfs_attr_shortform_remove(args); - ASSERT(retval == 0); + if (retval) + return retval; + /* + * Since we have removed the old attr, clear ATTR_REPLACE so + * that the leaf format add routine won't trip over the attr + * not being around. + */ + args->flags &= ~ATTR_REPLACE; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6a7c2f03ea11..040eeda8426f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -725,12 +725,16 @@ xfs_bmap_extents_to_btree( *logflagsp = 0; if ((error = xfs_alloc_vextent(&args))) { xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return -ENOSPC; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index ef68b1de006a..1201107eabc6 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -466,6 +466,8 @@ xfs_dinode_verify( return __this_address; if (di_size > XFS_DFORK_DSIZE(dip, mp)) return __this_address; + if (dip->di_nextents) + return __this_address; /* fall through */ case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -484,12 +486,31 @@ xfs_dinode_verify( if (XFS_DFORK_Q(dip)) { switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: + if (dip->di_anextents) + return __this_address; + /* fall through */ case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: break; default: return __this_address; } + } else { + /* + * If there is no fork offset, this may be a freshly-made inode + * in a new disk cluster, in which case di_aformat is zeroed. + * Otherwise, such an inode must be in EXTENTS format; this goes + * for freed inodes as well. + */ + switch (dip->di_aformat) { + case 0: + case XFS_DINODE_FMT_EXTENTS: + break; + default: + return __this_address; + } + if (dip->di_anextents) + return __this_address; } /* only version 3 or greater inodes are extensively verified here */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 299aee4b7b0b..eed073cc4778 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -778,22 +778,26 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; + loff_t isize = i_size_read(inode); - new_size = i_size_read(inode) + len; if (offset & blksize_mask || len & blksize_mask) { error = -EINVAL; goto out_unlock; } - /* check the new inode size does not wrap through zero */ - if (new_size > inode->i_sb->s_maxbytes) { + /* + * New inode size must not exceed ->s_maxbytes, accounting for + * possible signed overflow. + */ + if (inode->i_sb->s_maxbytes - isize < len) { error = -EFBIG; goto out_unlock; } + new_size = isize + len; /* Offset should be less than i_size */ - if (offset >= i_size_read(inode)) { + if (offset >= isize) { error = -EINVAL; goto out_unlock; } |