Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/check-integrity.c   | 163
-rw-r--r--  fs/btrfs/compression.c       |  18
-rw-r--r--  fs/btrfs/ctree.c             |   2
-rw-r--r--  fs/btrfs/ctree.h             |  78
-rw-r--r--  fs/btrfs/dev-replace.c       |  23
-rw-r--r--  fs/btrfs/dir-item.c          |  10
-rw-r--r--  fs/btrfs/disk-io.c           |  49
-rw-r--r--  fs/btrfs/extent-tree.c       | 211
-rw-r--r--  fs/btrfs/extent_io.c         |  41
-rw-r--r--  fs/btrfs/extent_io.h         |   1
-rw-r--r--  fs/btrfs/extent_map.c        |   2
-rw-r--r--  fs/btrfs/file.c              |  51
-rw-r--r--  fs/btrfs/free-space-cache.c  | 117
-rw-r--r--  fs/btrfs/free-space-cache.h  |   2
-rw-r--r--  fs/btrfs/inode-map.c         |   4
-rw-r--r--  fs/btrfs/inode.c             | 152
-rw-r--r--  fs/btrfs/ioctl.c             |  36
-rw-r--r--  fs/btrfs/ordered-data.c      |  49
-rw-r--r--  fs/btrfs/ordered-data.h      |  12
-rw-r--r--  fs/btrfs/scrub.c             |  90
-rw-r--r--  fs/btrfs/send.c              |  49
-rw-r--r--  fs/btrfs/super.c             |  94
-rw-r--r--  fs/btrfs/sysfs.c             |  34
-rw-r--r--  fs/btrfs/transaction.c       | 166
-rw-r--r--  fs/btrfs/transaction.h       |   6
-rw-r--r--  fs/btrfs/tree-log.c          |  50
-rw-r--r--  fs/btrfs/volumes.c           |  38
-rw-r--r--  fs/btrfs/volumes.h           |  18
-rw-r--r--  fs/btrfs/xattr.c             | 150
29 files changed, 1208 insertions(+), 508 deletions(-)
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe9c9f6..d897ef803b3b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,6 +94,7 @@
#include <linux/mutex.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "hash.h"
@@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfsic_block_data_ctx *block_ctx_out,
int mirror_num);
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
- u32 len, struct block_device *bdev,
- struct btrfsic_block_data_ctx *block_ctx_out);
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block(
l = NULL;
next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
} else {
- if (next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr)) {
- printk(KERN_INFO
- "Referenced block @%llu (%s/%llu/%d)"
- " found in hash table, %c,"
- " bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state, next_block),
- next_block->logical_bytenr);
- } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- printk(KERN_INFO
- "Referenced block @%llu (%s/%llu/%d)"
- " found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state, next_block));
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr))
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block),
+ next_block->logical_bytenr);
+ else
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block));
+ }
next_block->logical_bytenr = next_bytenr;
next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data(
return -1;
}
if (!block_was_created) {
- if (next_block->logical_bytenr != next_bytenr &&
+ if ((state->print_mask &
+ BTRFSIC_PRINT_MASK_VERBOSE) &&
+ next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
printk(KERN_INFO
@@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
return ret;
}
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
- u32 len, struct block_device *bdev,
- struct btrfsic_block_data_ctx *block_ctx_out)
-{
- block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
- block_ctx_out->dev_bytenr = bytenr;
- block_ctx_out->start = bytenr;
- block_ctx_out->len = len;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
- if (NULL != block_ctx_out->dev) {
- return 0;
- } else {
- printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
- return -ENXIO;
- }
-}
-
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
{
if (block_ctx->mem_to_free) {
@@ -1901,25 +1883,26 @@ again:
dev_state,
dev_bytenr);
}
- if (block->logical_bytenr != bytenr &&
- !(!block->is_metadata &&
- block->logical_bytenr == 0))
- printk(KERN_INFO
- "Written block @%llu (%s/%llu/%d)"
- " found in hash table, %c,"
- " bytenr mismatch"
- " (!= stored %llu).\n",
- bytenr, dev_state->name, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr);
- else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- printk(KERN_INFO
- "Written block @%llu (%s/%llu/%d)"
- " found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block));
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (block->logical_bytenr != bytenr &&
+ !(!block->is_metadata &&
+ block->logical_bytenr == 0))
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+ bytenr, dev_state->name,
+ dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state,
+ block),
+ block->logical_bytenr);
+ else
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+ bytenr, dev_state->name,
+ dev_bytenr, block->mirror_num,
+ btrfsic_get_block_type(state,
+ block));
+ }
block->logical_bytenr = bytenr;
} else {
if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1985,13 @@ again:
}
}
- if (block->is_superblock)
- ret = btrfsic_map_superblock(state, bytenr,
- processed_len,
- bdev, &block_ctx);
- else
- ret = btrfsic_map_block(state, bytenr, processed_len,
- &block_ctx, 0);
- if (ret) {
- printk(KERN_INFO
- "btrfsic: btrfsic_map_block(root @%llu)"
- " failed!\n", bytenr);
- goto continue_loop;
- }
- block_ctx.datav = mapped_datav;
- /* the following is required in case of writes to mirrors,
- * use the same that was used for the lookup */
block_ctx.dev = dev_state;
block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
if (is_metadata || state->include_extent_data) {
block->never_written = 0;
@@ -2133,10 +2105,6 @@ again:
/* this is getting ugly for the
* include_extent_data case... */
bytenr = 0; /* unknown */
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.mem_to_free = NULL;
- block_ctx.pagev = NULL;
} else {
processed_len = state->metablock_size;
bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2117,15 @@ again:
"Written block @%llu (%s/%llu/?)"
" !found in hash table, M.\n",
bytenr, dev_state->name, dev_bytenr);
-
- ret = btrfsic_map_block(state, bytenr, processed_len,
- &block_ctx, 0);
- if (ret) {
- printk(KERN_INFO
- "btrfsic: btrfsic_map_block(root @%llu)"
- " failed!\n",
- dev_bytenr);
- goto continue_loop;
- }
}
- block_ctx.datav = mapped_datav;
- /* the following is required in case of writes to mirrors,
- * use the same that was used for the lookup */
+
block_ctx.dev = dev_state;
block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
block = btrfsic_block_alloc();
if (NULL == block) {
@@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root,
root->sectorsize, PAGE_CACHE_SIZE);
return -1;
}
- state = kzalloc(sizeof(*state), GFP_NOFS);
- if (NULL == state) {
- printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
- return -1;
+ state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!state) {
+ state = vzalloc(sizeof(*state));
+ if (!state) {
+ printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+ return -1;
+ }
}
if (!btrfsic_is_initialized) {
@@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root,
mutex_unlock(&btrfsic_mutex);
- kfree(state);
+ if (is_vmalloc_addr(state))
+ vfree(state);
+ else
+ kfree(state);
}
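
The btrfsic_mount() hunk above replaces a plain kzalloc() of the large btrfsic_state structure with a kmalloc-first, vmalloc-fallback allocation, freed with the matching allocator at unmount. A minimal, self-contained sketch of that pattern follows; the struct and function names are hypothetical, and later kernels wrap the same idea in kvzalloc()/kvfree().

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>		/* is_vmalloc_addr() */

/* Hypothetical stand-in for a large, zero-initialized state object. */
struct big_state {
	unsigned char table[1 << 20];
};

static struct big_state *big_state_alloc(void)
{
	struct big_state *s;

	/* Try the slab first, quietly; a large order may simply not be available. */
	s = kzalloc(sizeof(*s), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!s)
		/* Fall back to vmalloc: virtually contiguous is good enough here. */
		s = vzalloc(sizeof(*s));
	return s;
}

static void big_state_free(struct big_state *s)
{
	/* Free with whichever allocator actually provided the memory. */
	if (is_vmalloc_addr(s))
		vfree(s);
	else
		kfree(s);
}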
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d3220d31d3cb..1bf411bc28fd 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
-static noinline void end_compressed_writeback(struct inode *inode, u64 start,
- unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode,
+ const struct compressed_bio *cb)
{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
int i;
int ret;
+ if (cb->errors)
+ mapping_set_error(inode->i_mapping, -EIO);
+
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
continue;
}
for (i = 0; i < ret; i++) {
+ if (cb->errors)
+ SetPageError(pages[i]);
end_page_writeback(pages[i]);
page_cache_release(pages[i]);
}
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
cb->start,
cb->start + cb->len - 1,
- NULL, 1);
+ NULL,
+ err ? 0 : 1);
cb->compressed_pages[0]->mapping = NULL;
- end_compressed_writeback(inode, cb->start, cb->len);
+ end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
/*
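
The compression.c changes propagate a write error from the compressed bio to both the address_space (so a later fsync() can return -EIO) and the individual pages. A reduced sketch of that propagation step, with hypothetical names, assuming the caller already holds a reference on the page:

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Hypothetical helper: finish writeback on one page and, if the write
 * failed, record the error where both fsync() and PageError() readers
 * will find it.
 */
static void end_page_writeback_maybe_error(struct address_space *mapping,
					   struct page *page, int error)
{
	if (error) {
		/* A later fsync()/msync() on this mapping will now see -EIO. */
		mapping_set_error(mapping, error);
		/* Mark the page itself as failed, too. */
		SetPageError(page);
	}
	end_page_writeback(page);
}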
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 150822ee0a0b..14a72ed14ef7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2929,7 +2929,7 @@ done:
*/
if (!p->leave_spinning)
btrfs_set_path_blocking(p);
- if (ret < 0)
+ if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 470e3177a7e8..e6fbbd74b716 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
+ unsigned int skip_release_on_error:1;
};
/*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
struct percpu_counter total_bytes_pinned;
struct list_head list;
+ struct list_head ro_bgs;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
@@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache {
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
+ unsigned int has_caching_ctl:1;
+ unsigned int removed:1;
int disk_cache_state;
@@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache {
/* For delayed block group creation or deletion of empty block groups */
struct list_head bg_list;
+
+ /* For read-only block groups */
+ struct list_head ro_list;
+
+ atomic_t trimming;
};
/* delayed seq elem */
@@ -1402,6 +1411,11 @@ struct btrfs_fs_info {
*/
u64 last_trans_log_full_commit;
unsigned long mount_opt;
+ /*
+ * Track requests for actions that need to be done during transaction
+ * commit (like for some mount options).
+ */
+ unsigned long pending_changes;
unsigned long compress_type:4;
int commit_interval;
/*
@@ -1729,6 +1743,12 @@ struct btrfs_fs_info {
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
+
+ /*
+ * Chunks that can't be freed yet (under a trim/discard operation)
+ * and will be latter freed. Protected by fs_info->chunk_mutex.
+ */
+ struct list_head pinned_chunks;
};
struct btrfs_subvolume_writers {
@@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
-#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args {
#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
BTRFS_MOUNT_##opt)
+
#define btrfs_set_and_info(root, opt, fmt, args...) \
{ \
if (!btrfs_test_opt(root, opt)) \
@@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args {
}
/*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0)
+#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1)
+#define BTRFS_PENDING_COMMIT (2)
+
+#define btrfs_test_pending(info, opt) \
+ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt) \
+ set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt) \
+ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
+do { \
+ if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+ btrfs_info((info), fmt, ##args); \
+ btrfs_set_pending((info), SET_##opt); \
+ btrfs_clear_pending((info), CLEAR_##opt); \
+ } \
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
+do { \
+ if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+ btrfs_info((info), fmt, ##args); \
+ btrfs_set_pending((info), CLEAR_##opt); \
+ btrfs_clear_pending((info), SET_##opt); \
+ } \
+} while(0)
+
+/*
* Inode flags
*/
#define BTRFS_INODE_NODATASUM (1 << 0)
@@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start);
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);
-int btrfs_start_nocow_write(struct btrfs_root *root);
-void btrfs_end_nocow_write(struct btrfs_root *root);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
int verify_dir_item(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name,
+ int name_len);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
struct page **pages, size_t num_pages,
loff_t pos, size_t write_bytes,
struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
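
The pending_changes machinery added to ctree.h defers mount-option flips to transaction commit: callers only queue a SET_ or CLEAR_ request bit, and the commit path (or open_ctree(), see the disk-io.c hunk below) applies it to mount_opt. A stripped-down sketch of the same idea with hypothetical names; the real code uses the btrfs_*_pending macros shown above.

#include <linux/bitops.h>

/* Hypothetical request bits: one SET_/CLEAR_ pair per deferred option. */
#define PENDING_SET_FOO		0
#define PENDING_CLEAR_FOO	1

#define MOUNT_FOO		(1UL << 0)

struct fake_fs_info {
	unsigned long pending_changes;	/* queued requests, set atomically */
	unsigned long mount_opt;	/* the real options, changed at commit */
};

/* Remount path: only queue the request, never touch mount_opt directly. */
static void request_set_foo(struct fake_fs_info *info)
{
	set_bit(PENDING_SET_FOO, &info->pending_changes);
	clear_bit(PENDING_CLEAR_FOO, &info->pending_changes);
}

/* Commit path: apply and consume whatever was queued since the last commit. */
static void apply_pending(struct fake_fs_info *info)
{
	if (test_and_clear_bit(PENDING_SET_FOO, &info->pending_changes))
		info->mount_opt |= MOUNT_FOO;
	if (test_and_clear_bit(PENDING_CLEAR_FOO, &info->pending_changes))
		info->mount_opt &= ~MOUNT_FOO;
}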
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 0bf41f8b1e23..ca6a3a3b6b6c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -417,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(root->fs_info, ret);
- WARN_ON(ret);
+ /* don't warn if EINPROGRESS, someone else might be running scrub */
+ if (ret == -EINPROGRESS) {
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+ ret = 0;
+ } else {
+ WARN_ON(ret);
+ }
- return 0;
+ return ret;
leave:
dev_replace->srcdev = NULL;
@@ -537,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
+ return scrub_ret;
}
printk_in_rcu(KERN_INFO
@@ -566,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
- /* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info, src_device);
- btrfs_kobj_add_device(fs_info, tgt_device);
-
btrfs_dev_replace_unlock(dev_replace);
btrfs_rm_dev_replace_blocked(fs_info);
- btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
btrfs_rm_dev_replace_unblocked(fs_info);
@@ -589,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
+ /* replace the sysfs entry */
+ btrfs_kobj_rm_device(fs_info, src_device);
+ btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
if (!IS_ERR(trans))
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df866e919..1752625fb4dd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
#include "hash.h"
#include "transaction.h"
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len);
-
/*
* insert a name into a directory, doing overflow properly if there is a hash
* collision. data_size indicates how big the item inserted should be. On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len)
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bf9f897065d..30965120772b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
@@ -2830,9 +2832,11 @@ retry_root_backup:
btrfs_set_opt(fs_info->mount_opt, SSD);
}
- /* Set the real inode map cache flag */
- if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
- btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+ /*
+ * Mount does not set all options immediatelly, we can do it now and do
+ * not have to wait for transaction commit
+ */
+ btrfs_apply_pending_changes(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
@@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
+
+ lock_chunks(root);
+ while (!list_empty(&fs_info->pinned_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&fs_info->pinned_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
+ unlock_chunks(root);
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
*/
if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
- sb->root);
+ btrfs_super_root(sb));
if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
- printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
- sb->chunk_root);
+ printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+ btrfs_super_chunk_root(sb));
if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
- printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
btrfs_super_log_root(sb));
if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
@@ -4129,6 +4144,25 @@ again:
return 0;
}
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&cur_trans->pending_ordered)) {
+ ordered = list_first_entry(&cur_trans->pending_ordered,
+ struct btrfs_ordered_extent,
+ trans_list);
+ list_del_init(&ordered->trans_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_put_ordered_extent(ordered);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
@@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
+ btrfs_free_pending_ordered(cur_trans, root->fs_info);
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
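
The new btrfs_free_pending_ordered() above follows the usual pattern for draining a spinlock-protected list when the per-item cleanup must not run under the lock: pop one entry, drop the lock, release the item, retake the lock. A generic sketch with hypothetical names:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct pending_item {
	struct list_head list;
	/* payload ... */
};

static void drain_pending(spinlock_t *lock, struct list_head *head)
{
	struct pending_item *item;

	spin_lock(lock);
	while (!list_empty(head)) {
		item = list_first_entry(head, struct pending_item, list);
		list_del_init(&item->list);
		/*
		 * Drop the lock before releasing the item: the release
		 * (a refcount put in the btrfs case) may sleep or take
		 * other locks and must not run under *lock.
		 */
		spin_unlock(lock);

		kfree(item);		/* stands in for btrfs_put_ordered_extent() */

		spin_lock(lock);
	}
	spin_unlock(lock);
}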
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba141082..222d6aea4a8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
struct btrfs_caching_control *ctl;
spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_STARTED) {
- spin_unlock(&cache->lock);
- return NULL;
- }
-
- /* We're loading it the fast way, so we don't have a caching_ctl. */
if (!cache->caching_ctl) {
spin_unlock(&cache->lock);
return NULL;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
spin_unlock(&cache->lock);
if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+ mutex_lock(&caching_ctl->mutex);
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_FINISHED;
cache->last_byte_to_unpin = (u64)-1;
+ caching_ctl->progress = (u64)-1;
} else {
if (load_cache_only) {
cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_NO;
} else {
cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
}
}
spin_unlock(&cache->lock);
+ mutex_unlock(&caching_ctl->mutex);
+
wake_up(&caching_ctl->wait);
if (ret == 1) {
put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_NO;
} else {
cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
}
spin_unlock(&cache->lock);
wake_up(&caching_ctl->wait);
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct rb_node *node;
+
spin_lock(&root->fs_info->block_group_cache_lock);
+
+ /* If our block group was removed, we need a full search. */
+ if (RB_EMPTY_NODE(&cache->cache_node)) {
+ const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_first_block_group(root->fs_info,
+ next_bytenr);
+ return cache;
+ }
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
if (node) {
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->chunk_alloc = 0;
found->flush = 0;
init_waitqueue_head(&found->wait);
+ INIT_LIST_HEAD(&found->ro_bgs);
ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ cache->space_info->bytes_used -= num_bytes;
+ cache->space_info->disk_used -= num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ set_extent_dirty(info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ GFP_NOFS | __GFP_NOFAIL);
/*
* No longer have used bytes in this block group, queue
* it for deletion.
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
}
spin_unlock(&info->unused_bgs_lock);
}
- btrfs_set_block_group_used(&cache->item, old_val);
- cache->pinned += num_bytes;
- cache->space_info->bytes_pinned += num_bytes;
- cache->space_info->bytes_used -= num_bytes;
- cache->space_info->disk_used -= num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- set_extent_dirty(info->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- GFP_NOFS | __GFP_NOFAIL);
}
btrfs_put_block_group(cache);
total -= num_bytes;
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro = 1;
+ list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
out:
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
/*
* helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
*/
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
struct btrfs_block_group_cache *block_group;
u64 free_bytes = 0;
int factor;
- list_for_each_entry(block_group, groups_list, list) {
+ /* It's df, we don't care if it's racey */
+ if (list_empty(&sinfo->ro_bgs))
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
spin_lock(&block_group->lock);
if (!block_group->ro) {
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
spin_unlock(&block_group->lock);
}
-
- return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
- int i;
- u64 free_bytes = 0;
-
- spin_lock(&sinfo->lock);
-
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
- if (!list_empty(&sinfo->block_groups[i]))
- free_bytes += __btrfs_get_ro_block_group_free_space(
- &sinfo->block_groups[i]);
-
spin_unlock(&sinfo->lock);
return free_bytes;
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
cache->bytes_super - btrfs_block_group_used(&cache->item);
sinfo->bytes_readonly -= num_bytes;
cache->ro = 0;
+ list_del_init(&cache->ro_list);
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
+ INIT_LIST_HEAD(&cache->ro_list);
btrfs_init_free_space_ctl(cache);
+ atomic_set(&cache->trimming, 0);
return cache;
}
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
int ret = 0;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
- list_del_init(&block_group->bg_list);
if (ret)
- continue;
+ goto next;
spin_lock(&block_group->lock);
memcpy(&item, &block_group->item, sizeof(item));
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
key.objectid, key.offset);
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+next:
+ list_del_init(&block_group->bg_list);
}
}
@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start)
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em)
{
struct btrfs_path *path;
struct btrfs_block_group_cache *block_group;
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int ret;
int index;
int factor;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ bool remove_em;
root = root->fs_info->extent_root;
@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
if (root->fs_info->first_logical_byte == block_group->key.objectid)
root->fs_info->first_logical_byte = (u64)-1;
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ list_del_init(&block_group->ro_list);
if (list_empty(&block_group->space_info->block_groups[index])) {
kobj = block_group->space_info->block_group_kobjs[index];
block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
+ if (block_group->has_caching_ctl)
+ caching_ctl = get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
+ if (block_group->has_caching_ctl) {
+ down_write(&root->fs_info->commit_root_sem);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl,
+ &root->fs_info->caching_block_groups, list)
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ atomic_inc(&caching_ctl->count);
+ break;
+ }
+ }
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ up_write(&root->fs_info->commit_root_sem);
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ put_caching_control(caching_ctl);
+ put_caching_control(caching_ctl);
+ }
+ }
btrfs_remove_free_space_cache(block_group);
@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
+ lock_chunks(root);
+ if (!list_empty(&em->list)) {
+ /* We're in the transaction->pending_chunks list. */
+ free_extent_map(em);
+ }
+ spin_lock(&block_group->lock);
+ block_group->removed = 1;
+ /*
+ * At this point trimming can't start on this block group, because we
+ * removed the block group from the tree fs_info->block_group_cache_tree
+ * so no one can't find it anymore and even if someone already got this
+ * block group before we removed it from the rbtree, they have already
+ * incremented block_group->trimming - if they didn't, they won't find
+ * any free space entries because we already removed them all when we
+ * called btrfs_remove_free_space_cache().
+ *
+ * And we must not remove the extent map from the fs_info->mapping_tree
+ * to prevent the same logical address range and physical device space
+ * ranges from being reused for a new block group. This is because our
+ * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * completely transactionless, so while it is trimming a range the
+ * currently running transaction might finish and a new one start,
+ * allowing for new block groups to be created that can reuse the same
+ * physical device locations unless we take this special care.
+ */
+ remove_em = (atomic_read(&block_group->trimming) == 0);
+ /*
+ * Make sure a trimmer task always sees the em in the pinned_chunks list
+ * if it sees block_group->removed == 1 (needs to lock block_group->lock
+ * before checking block_group->removed).
+ */
+ if (!remove_em) {
+ /*
+ * Our em might be in trans->transaction->pending_chunks which
+ * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+ * and so is the fs_info->pinned_chunks list.
+ *
+ * So at this point we must be holding the chunk_mutex to avoid
+ * any races with chunk allocation (more specifically at
+ * volumes.c:contains_pending_extent()), to ensure it always
+ * sees the em, either in the pending_chunks list or in the
+ * pinned_chunks list.
+ */
+ list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+ }
+ spin_unlock(&block_group->lock);
+
+ if (remove_em) {
+ struct extent_map_tree *em_tree;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ /*
+ * The em might be in the pending_chunks list, so make sure the
+ * chunk mutex is locked, since remove_extent_mapping() will
+ * delete us from that list.
+ */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+
+ unlock_chunks(root);
+
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
start = block_group->key.objectid;
end = start + block_group->key.offset - 1;
- clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
EXTENT_DIRTY, GFP_NOFS);
- clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ if (ret) {
+ btrfs_set_block_group_rw(root, block_group);
+ goto end_trans;
+ }
+ ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
+ if (ret) {
+ btrfs_set_block_group_rw(root, block_group);
+ goto end_trans;
+ }
/* Reset pinned so btrfs_put_block_group doesn't complain */
block_group->pinned = 0;
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
ret = btrfs_remove_chunk(trans, root,
block_group->key.objectid);
+end_trans:
btrfs_end_transaction(trans, root);
next:
btrfs_put_block_group(block_group);
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
}
/*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data into disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing and that cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
*/
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
/*
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
wake_up(&root->subv_writers->wait);
}
-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
{
if (atomic_read(&root->will_be_snapshoted))
return 0;
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
*/
smp_mb();
if (atomic_read(&root->will_be_snapshoted)) {
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
return 0;
}
return 1;
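
The btrfs_remove_block_group() changes coordinate with FITRIM through block_group->removed and the new trimming counter: a trimmer may only raise the counter while the group is not yet removed, and whoever drops it to zero after removal performs the deferred extent-map cleanup. A reduced sketch of that handshake, with hypothetical names:

#include <linux/atomic.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct bg_like {
	spinlock_t lock;
	unsigned int removed:1;
	atomic_t trimming;
};

/* Trimmer entry: refuse to start once the group is marked removed. */
static bool trim_start(struct bg_like *bg)
{
	bool ok;

	spin_lock(&bg->lock);
	ok = !bg->removed;
	if (ok)
		atomic_inc(&bg->trimming);
	spin_unlock(&bg->lock);
	return ok;
}

/* Trimmer exit: the last trimmer out after removal does the deferred cleanup. */
static void trim_end(struct bg_like *bg)
{
	bool cleanup;

	spin_lock(&bg->lock);
	cleanup = atomic_dec_and_test(&bg->trimming) && bg->removed;
	spin_unlock(&bg->lock);

	if (cleanup) {
		/* e.g. drop the pinned extent_map, as btrfs_trim_block_group() does */
	}
}

/*
 * Remover side: mark the group removed under the lock; only remove the
 * extent map immediately when no trim is in flight (remove_em in the
 * real code), otherwise leave it pinned for the last trimmer.
 */
static bool mark_removed(struct bg_like *bg)
{
	bool remove_now;

	spin_lock(&bg->lock);
	bg->removed = 1;
	remove_now = (atomic_read(&bg->trimming) == 0);
	spin_unlock(&bg->lock);
	return remove_now;
}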
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424e0013..4ebabd237153 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
clear = 1;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
prealloc = alloc_extent_state(mask);
- if (!prealloc)
- return -ENOMEM;
}
spin_lock(&tree->lock);
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree,
state->state |= bits_to_set;
}
-static void cache_state(struct extent_state *state,
- struct extent_state **cached_ptr)
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ const u64 flags)
{
if (cached_ptr && !(*cached_ptr)) {
- if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+ if (!flags || (state->state & flags)) {
*cached_ptr = state;
atomic_inc(&state->refs);
}
}
}
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_IOBITS | EXTENT_BOUNDARY);
+}
+
/*
* set some bits on a range in the tree. This may require allocations or
* sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int err = 0;
u64 last_start;
u64 last_end;
+ bool first_iteration = true;
btrfs_debug_check_extent_io_range(tree, start, end);
again:
if (!prealloc && (mask & __GFP_WAIT)) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
prealloc = alloc_extent_state(mask);
- if (!prealloc)
+ if (!prealloc && !first_iteration)
return -ENOMEM;
}
@@ -1234,6 +1255,7 @@ search_again:
spin_unlock(&tree->lock);
if (mask & __GFP_WAIT)
cond_resched();
+ first_iteration = false;
goto again;
}
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
state = find_first_extent_bit_state(tree, start, bits);
got_it:
if (state) {
- cache_state(state, cached_state);
+ cache_state_if_flags(state, cached_state, 0);
*start_ret = state->start;
*end_ret = state->end;
ret = 0;
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
if (page_ops == 0)
return 0;
+ if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ mapping_set_error(inode->i_mapping, -EIO);
+
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
min_t(unsigned long,
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
clear_page_dirty_for_io(pages[i]);
if (page_ops & PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
+ if (page_ops & PAGE_SET_ERROR)
+ SetPageError(pages[i]);
if (page_ops & PAGE_END_WRITEBACK)
end_page_writeback(pages[i]);
if (page_ops & PAGE_UNLOCK)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d4b938be986..ece9ce87edff 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -49,6 +49,7 @@
#define PAGE_SET_WRITEBACK (1 << 2)
#define PAGE_END_WRITEBACK (1 << 3)
#define PAGE_SET_PRIVATE2 (1 << 4)
+#define PAGE_SET_ERROR (1 << 5)
/*
* page->private values. Every page that is controlled by the extent
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 225302b39afb..6a98bddd8f33 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
if (!em)
goto out;
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
- list_move(&em->list, &tree->modified_extents);
em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
em->mod_start = em->start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a18ceabd99a8..e4090259569b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
u64 num_bytes;
int ret;
- ret = btrfs_start_nocow_write(root);
+ ret = btrfs_start_write_no_snapshoting(root);
if (!ret)
return -ENOSPC;
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
btrfs_free_reserved_data_space(inode,
reserve_bytes);
else
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
break;
}
@@ -1632,7 +1632,7 @@ again:
release_bytes = 0;
if (only_release_metadata)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
if (only_release_metadata && copied > 0) {
u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
if (release_bytes) {
if (only_release_metadata) {
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
btrfs_delalloc_release_space(inode, release_bytes);
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
loff_t pos)
{
struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
err = written_buffered;
goto out;
}
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
endbyte = pos + written_buffered - 1;
- err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ err = btrfs_fdatawrite_range(inode, pos, endbyte);
+ if (err)
+ goto out;
+ err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
if (err)
goto out;
written += written_buffered;
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
int ret;
atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ ret = btrfs_fdatawrite_range(inode, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
return ret;
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
return 0;
}
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * So with compression we will find and lock a dirty page and clear the
+ * first one as dirty, setup an async extent, and immediately return
+ * with the entire range locked but with nobody actually marked with
+ * writeback. So we can't just filemap_write_and_wait_range() and
+ * expect it to work since it will just kick off a thread to do the
+ * actual work. So we need to call filemap_fdatawrite_range _again_
+ * since it will wait on the page lock, which won't be unlocked until
+ * after the pages have been marked as writeback and so we're good to go
+ * from there. We have to do this otherwise we'll miss the ordered
+ * extents and that results in badness. Please Josef, do not think you
+ * know better and pull this out at some point in the future, it is
+ * right and you are wrong.
+ */
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+
+ return ret;
+}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 33848196550e..030847bf7cec 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -27,10 +27,17 @@
#include "disk-io.h"
#include "extent_io.h"
#include "inode-map.h"
+#include "volumes.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+struct btrfs_trim_range {
+ u64 start;
+ u64 bytes;
+ struct list_head list;
+};
+
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
int ret;
struct btrfs_free_cluster *cluster = NULL;
struct rb_node *node = rb_first(&ctl->free_space_offset);
+ struct btrfs_trim_range *trim_entry;
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
cluster = NULL;
}
}
+
+ /*
+ * Make sure we don't miss any range that was removed from our rbtree
+ * because trimming is running. Otherwise after a umount+mount (or crash
+ * after committing the transaction) we would leak free space and get
+ * an inconsistent free space cache report from fsck.
+ */
+ list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
+ ret = io_ctl_add_entry(io_ctl, trim_entry->start,
+ trim_entry->bytes, NULL);
+ if (ret)
+ goto fail;
+ *entries += 1;
+ }
+
return 0;
fail:
return -ENOSPC;
@@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_set_generation(&io_ctl, trans->transid);
+ mutex_lock(&ctl->cache_writeout_mutex);
/* Write out the extent entries in the free space cache */
ret = write_cache_extent_entries(&io_ctl, ctl,
block_group, &entries, &bitmaps,
&bitmap_list);
- if (ret)
+ if (ret) {
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto out_nospc;
+ }
/*
* Some spaces that are freed in the current transaction are pinned,
@@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* committed, we shouldn't lose them.
*/
ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
- if (ret)
+ if (ret) {
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto out_nospc;
+ }
- /* At last, we write out all the bitmaps. */
+ /*
+ * At last, we write out all the bitmaps and keep cache_writeout_mutex
+ * locked while doing it because a concurrent trim can be manipulating
+ * or freeing the bitmap.
+ */
ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ mutex_unlock(&ctl->cache_writeout_mutex);
if (ret)
goto out_nospc;
@@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
ctl->start = block_group->key.objectid;
ctl->private = block_group;
ctl->op = &free_space_op;
+ INIT_LIST_HEAD(&ctl->trimming_ranges);
+ mutex_init(&ctl->cache_writeout_mutex);
/*
* we only want to have 32k of ram per block group for keeping
@@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
static int do_trimming(struct btrfs_block_group_cache *block_group,
u64 *total_trimmed, u64 start, u64 bytes,
- u64 reserved_start, u64 reserved_bytes)
+ u64 reserved_start, u64 reserved_bytes,
+ struct btrfs_trim_range *trim_entry)
{
struct btrfs_space_info *space_info = block_group->space_info;
struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
int update = 0;
u64 trimmed = 0;
@@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
if (!ret)
*total_trimmed += trimmed;
+ mutex_lock(&ctl->cache_writeout_mutex);
btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+ list_del(&trim_entry->list);
+ mutex_unlock(&ctl->cache_writeout_mutex);
if (update) {
spin_lock(&space_info->lock);
@@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
u64 bytes;
while (start < end) {
+ struct btrfs_trim_range trim_entry;
+
+ mutex_lock(&ctl->cache_writeout_mutex);
spin_lock(&ctl->tree_lock);
if (ctl->free_space < minlen) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
entry = tree_search_offset(ctl, start, 0, 1);
if (!entry) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
@@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
node = rb_next(&entry->offset_index);
if (!node) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto out;
}
entry = rb_entry(node, struct btrfs_free_space,
@@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
if (entry->offset >= end) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
@@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
bytes = min(extent_start + extent_bytes, end) - start;
if (bytes < minlen) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
@@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
kmem_cache_free(btrfs_free_space_cachep, entry);
spin_unlock(&ctl->tree_lock);
+ trim_entry.start = extent_start;
+ trim_entry.bytes = extent_bytes;
+ list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+ mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
- extent_start, extent_bytes);
+ extent_start, extent_bytes, &trim_entry);
if (ret)
break;
next:
@@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
while (offset < end) {
bool next_bitmap = false;
+ struct btrfs_trim_range trim_entry;
+ mutex_lock(&ctl->cache_writeout_mutex);
spin_lock(&ctl->tree_lock);
if (ctl->free_space < minlen) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
entry = tree_search_offset(ctl, offset, 1, 0);
if (!entry) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
@@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
ret2 = search_bitmap(ctl, entry, &start, &bytes);
if (ret2 || start >= end) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
@@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
bytes = min(bytes, end - start);
if (bytes < minlen) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
@@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
free_bitmap(ctl, entry);
spin_unlock(&ctl->tree_lock);
+ trim_entry.start = start;
+ trim_entry.bytes = bytes;
+ list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+ mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
- start, bytes);
+ start, bytes, &trim_entry);
if (ret)
break;
next:
@@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
*trimmed = 0;
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ atomic_inc(&block_group->trimming);
+ spin_unlock(&block_group->lock);
+
ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
if (ret)
- return ret;
+ goto out;
ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+ spin_lock(&block_group->lock);
+ if (atomic_dec_and_test(&block_group->trimming) &&
+ block_group->removed) {
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ spin_unlock(&block_group->lock);
+
+ em_tree = &block_group->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, block_group->key.objectid,
+ 1);
+ BUG_ON(!em); /* logic error, can't happen */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ lock_chunks(block_group->fs_info->chunk_root);
+ list_del_init(&em->list);
+ unlock_chunks(block_group->fs_info->chunk_root);
+
+ /* once for us and once for the tree */
+ free_extent_map(em);
+ free_extent_map(em);
+
+ /*
+ * We've left one free space entry and other tasks trimming
+ * this block group have left 1 entry each one. Free them.
+ */
+ __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
return ret;
}
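
The trimming changes in free-space-cache.c park each in-flight range in an on-stack btrfs_trim_range entry on ctl->trimming_ranges, under cache_writeout_mutex, so a concurrent cache writeout still accounts for space that is temporarily outside the rbtree. A simplified sketch of that registration pattern, with hypothetical names:

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/types.h>

struct range_entry {
	u64 start;
	u64 bytes;
	struct list_head list;
};

/*
 * Hypothetical discard step: while the range is out of the free space
 * tree and the discard is in flight, keep it visible on an "in flight"
 * list so a concurrent cache writeout can still record it.
 */
static int discard_one_range(struct mutex *writeout_mutex,
			     struct list_head *inflight,
			     u64 start, u64 bytes)
{
	struct range_entry entry;	/* on-stack, like btrfs_trim_range */
	int ret;

	mutex_lock(writeout_mutex);
	/* ... remove [start, start + bytes) from the free space tree ... */
	entry.start = start;
	entry.bytes = bytes;
	list_add_tail(&entry.list, inflight);
	mutex_unlock(writeout_mutex);

	ret = 0;			/* ... issue the discard, may block ... */

	mutex_lock(writeout_mutex);
	/* ... give the range back to the free space tree ... */
	list_del(&entry.list);
	mutex_unlock(writeout_mutex);

	return ret;
}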
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 0cf4977ef70d..88b2238a0aed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
u64 start;
struct btrfs_free_space_op *op;
void *private;
+ struct mutex cache_writeout_mutex;
+ struct list_head trimming_ranges;
};
struct btrfs_free_space_op {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 83d646bd2e4b..74faea3a516e 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
root->root_key.objectid);
if (IS_ERR(tsk)) {
btrfs_warn(root->fs_info, "failed to start inode caching task");
- btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
"disabling inode map caching");
}
}
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
ctl->start = 0;
ctl->private = NULL;
ctl->op = &free_ino_op;
+ INIT_LIST_HEAD(&ctl->trimming_ranges);
+ mutex_init(&ctl->cache_writeout_mutex);
/*
* Initially we allow to use 16K of ram to cache chunks of
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d23362f4464e..8de23355f6cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
* are written in the same order that the flusher thread sent them
* down.
*/
-static noinline int compress_file_range(struct inode *inode,
+static noinline void compress_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end,
struct async_cow *async_cow,
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
- /*
- * skip compression for a small file range(<=blocksize) that
- * isn't an inline extent, since it dosen't save disk space at all.
- */
- if ((end - start + 1) <= blocksize &&
- (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
- goto cleanup_and_bail_uncompressed;
-
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
@@ -440,6 +432,14 @@ again:
total_compressed = actual_end - start;
+ /*
+ * skip compression for a small file range(<=blocksize) that
+ * isn't an inline extent, since it dosen't save disk space at all.
+ */
+ if (total_compressed <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
+
/* we want to make sure that amount of ram required to uncompress
* an extent is reasonable, so we limit the total size in ram
* of a compressed extent to 128k. This is a crucial number
@@ -527,7 +527,10 @@ cont:
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
EXTENT_DEFRAG;
+ unsigned long page_error_op;
+
clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+ page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
/*
* inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
clear_flags, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK |
+ page_error_op |
PAGE_END_WRITEBACK);
goto free_pages_out;
}
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed:
*num_added += 1;
}
-out:
- return ret;
+ return;
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
@@ -629,8 +632,22 @@ free_pages_out:
page_cache_release(pages[i]);
}
kfree(pages);
+}
- goto out;
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+ int i;
+
+ if (!async_extent->pages)
+ return;
+
+ for (i = 0; i < async_extent->nr_pages; i++) {
+ WARN_ON(async_extent->pages[i]->mapping);
+ page_cache_release(async_extent->pages[i]);
+ }
+ kfree(async_extent->pages);
+ async_extent->nr_pages = 0;
+ async_extent->pages = NULL;
}
/*
@@ -639,7 +656,7 @@ free_pages_out:
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline int submit_compressed_extents(struct inode *inode,
+static noinline void submit_compressed_extents(struct inode *inode,
struct async_cow *async_cow)
{
struct async_extent *async_extent;
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
struct extent_io_tree *io_tree;
int ret = 0;
- if (list_empty(&async_cow->extents))
- return 0;
-
again:
while (!list_empty(&async_cow->extents)) {
async_extent = list_entry(async_cow->extents.next,
@@ -709,15 +723,7 @@ retry:
async_extent->compressed_size,
0, alloc_hint, &ins, 1, 1);
if (ret) {
- int i;
-
- for (i = 0; i < async_extent->nr_pages; i++) {
- WARN_ON(async_extent->pages[i]->mapping);
- page_cache_release(async_extent->pages[i]);
- }
- kfree(async_extent->pages);
- async_extent->nr_pages = 0;
- async_extent->pages = NULL;
+ free_async_extent_pages(async_extent);
if (ret == -ENOSPC) {
unlock_extent(io_tree, async_extent->start,
@@ -814,15 +820,26 @@ retry:
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages);
+ if (ret) {
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct page *p = async_extent->pages[0];
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ p->mapping = inode->i_mapping;
+ tree->ops->writepage_end_io_hook(p, start, end,
+ NULL, 0);
+ p->mapping = NULL;
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
- if (ret)
- goto out;
cond_resched();
}
- ret = 0;
-out:
- return ret;
+ return;
out_free_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
@@ -832,7 +849,9 @@ out_free:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
kfree(async_extent);
goto again;
}
@@ -1318,7 +1337,7 @@ next_slot:
* we fall into common COW way.
*/
if (!nolock) {
- err = btrfs_start_nocow_write(root);
+ err = btrfs_start_write_no_snapshoting(root);
if (!err)
goto out_check;
}
@@ -1342,7 +1361,7 @@ out_check:
if (extent_end <= start) {
path->slots[0]++;
if (!nolock && nocow)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
goto next_slot;
}
if (!nocow) {
@@ -1362,7 +1381,7 @@ out_check:
page_started, nr_written, 1);
if (ret) {
if (!nolock && nocow)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
goto error;
}
cow_start = (u64)-1;
@@ -1413,7 +1432,7 @@ out_check:
num_bytes);
if (ret) {
if (!nolock && nocow)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
goto error;
}
}
@@ -1424,7 +1443,7 @@ out_check:
EXTENT_DELALLOC, PAGE_UNLOCK |
PAGE_SET_PRIVATE2);
if (!nolock && nocow)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -4580,6 +4599,26 @@ next:
return err;
}
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+ schedule();
+ return 0;
+}
+
+static void wait_for_snapshot_creation(struct btrfs_root *root)
+{
+ while (true) {
+ int ret;
+
+ ret = btrfs_start_write_no_snapshoting(root);
+ if (ret)
+ break;
+ wait_on_atomic_t(&root->will_be_snapshoted,
+ wait_snapshoting_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+ }
+}
+
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (newsize > oldsize) {
truncate_pagecache(inode, newsize);
+ /*
+ * Don't do an expanding truncate while snapshoting is ongoing.
+ * This is to ensure the snapshot captures a fully consistent
+ * state of this file - if the snapshot captures this expanding
+ * truncation, it must capture all writes that happened before
+ * this truncation.
+ */
+ wait_for_snapshot_creation(root);
ret = btrfs_cont_expand(inode, oldsize, newsize);
- if (ret)
+ if (ret) {
+ btrfs_end_write_no_snapshoting(root);
return ret;
+ }
trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ btrfs_end_write_no_snapshoting(root);
return PTR_ERR(trans);
+ }
i_size_write(inode, newsize);
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
ret = btrfs_update_inode(trans, root, inode);
+ btrfs_end_write_no_snapshoting(root);
btrfs_end_transaction(trans, root);
} else {
@@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
btrfs_put_ordered_extent(ordered);
} else {
/* Screw you mmap */
- ret = filemap_write_and_wait_range(inode->i_mapping,
- lockstart,
- lockend);
+ ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
+ if (ret)
+ break;
+ ret = filemap_fdatawait_range(inode->i_mapping,
+ lockstart,
+ lockend);
if (ret)
break;
@@ -9442,6 +9497,21 @@ out_inode:
}
+/* Inspired by filemap_check_errors() */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+ int ret = 0;
+
+ if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
+ test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
+ ret = -ENOSPC;
+ if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
+ test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
+ ret = -EIO;
+
+ return ret;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4399f0c3a4ce..b590e23fa03e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -617,7 +617,7 @@ fail:
return ret;
}
-static void btrfs_wait_nocow_write(struct btrfs_root *root)
+static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
{
s64 writers;
DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
atomic_inc(&root->will_be_snapshoted);
smp_mb__after_atomic();
- btrfs_wait_nocow_write(root);
+ btrfs_wait_for_no_snapshoting_writes(root);
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;
- /*
- * If orphan cleanup did remove any orphans, it means the tree was
- * modified and therefore the commit root is not the same as the
- * current root anymore. This is a problem, because send uses the
- * commit root and therefore can see inode items that don't exist
- * in the current root anymore, and for example make calls to
- * btrfs_iget, which will do tree lookups based on the current root
- * and not on the commit root. Those lookups will fail, returning a
- * -ESTALE error, and making send fail with that error. So make sure
- * a send does not see any orphans we have just removed, and that it
- * will see the same inodes regardless of whether a transaction
- * commit happened before it started (meaning that the commit root
- * will be the same as the current root) or not.
- */
- if (readonly && pending_snapshot->snap->node !=
- pending_snapshot->snap->commit_root) {
- trans = btrfs_join_transaction(pending_snapshot->snap);
- if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
- ret = PTR_ERR(trans);
- goto fail;
- }
- if (!IS_ERR(trans)) {
- ret = btrfs_commit_transaction(trans,
- pending_snapshot->snap);
- if (ret)
- goto fail;
- }
- }
-
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -761,7 +732,8 @@ fail:
free:
kfree(pending_snapshot);
out:
- atomic_dec(&root->will_be_snapshoted);
+ if (atomic_dec_and_test(&root->will_be_snapshoted))
+ wake_up_atomic_t(&root->will_be_snapshoted);
return ret;
}
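The ioctl side pairs the will_be_snapshoted counter with a wakeup: create_snapshot() bumps it before draining in-flight writes, and the exit path above now drops it with atomic_dec_and_test() and wakes anyone sleeping on it, such as the expanding-truncate path added to inode.c. A sketch of that counter-plus-wake pattern with pthreads follows; the names and the condition-variable wait are stand-ins for atomic_t, wait_on_atomic_t() and wake_up_atomic_t().

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int will_be_snapshoted;

static void snapshot_begin(void)
{
	pthread_mutex_lock(&lock);
	will_be_snapshoted++;			/* atomic_inc() in the patch */
	pthread_mutex_unlock(&lock);
}

static void snapshot_end(void)
{
	pthread_mutex_lock(&lock);
	if (--will_be_snapshoted == 0)		/* atomic_dec_and_test() ... */
		pthread_cond_broadcast(&cond);	/* ... then wake the waiters */
	pthread_mutex_unlock(&lock);
}

/* roughly what wait_for_snapshot_creation() does for a blocked writer */
static void wait_for_no_snapshot(void)
{
	pthread_mutex_lock(&lock);
	while (will_be_snapshoted > 0)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	snapshot_begin();
	snapshot_end();
	wait_for_no_snapshot();	/* returns immediately, the counter is zero again */
	return 0;
}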
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec4cc20..534544e08f76 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
INIT_LIST_HEAD(&entry->log_list);
+ INIT_LIST_HEAD(&entry->trans_list);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -431,19 +432,31 @@ out:
/* Needs to either be called under a log transaction or the log_mutex */
void btrfs_get_logged_extents(struct inode *inode,
- struct list_head *logged_list)
+ struct list_head *logged_list,
+ const loff_t start,
+ const loff_t end)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_ordered_extent *ordered;
struct rb_node *n;
+ struct rb_node *prev;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
- for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ n = __tree_search(&tree->tree, end, &prev);
+ if (!n)
+ n = prev;
+ for (; n; n = rb_prev(n)) {
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ if (ordered->file_offset > end)
+ continue;
+ if (entry_end(ordered) <= start)
+ break;
if (!list_empty(&ordered->log_list))
continue;
- list_add_tail(&ordered->log_list, logged_list);
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+ list_add(&ordered->log_list, logged_list);
atomic_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
spin_unlock_irq(&log->log_extents_lock[index]);
}
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, u64 transid)
{
struct btrfs_ordered_extent *ordered;
int index = transid % 2;
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- btrfs_put_ordered_extent(ordered);
+ if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ list_add_tail(&ordered->trans_list, &trans->ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
/* start IO across the range first to instantiate any delalloc
* extents
*/
- ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+ ret = btrfs_fdatawrite_range(inode, start, orig_end);
if (ret)
return ret;
- /*
- * So with compression we will find and lock a dirty page and clear the
- * first one as dirty, setup an async extent, and immediately return
- * with the entire range locked but with nobody actually marked with
- * writeback. So we can't just filemap_write_and_wait_range() and
- * expect it to work since it will just kick off a thread to do the
- * actual work. So we need to call filemap_fdatawrite_range _again_
- * since it will wait on the page lock, which won't be unlocked until
- * after the pages have been marked as writeback and so we're good to go
- * from there. We have to do this otherwise we'll miss the ordered
- * extents and that results in badness. Please Josef, do not think you
- * know better and pull this out at some point in the future, it is
- * right and you are wrong.
- */
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags)) {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- orig_end);
- if (ret)
- return ret;
- }
+
ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
if (ret)
return ret;
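btrfs_get_logged_extents() now only collects ordered extents that intersect [start, end]: it positions itself at the last entry whose offset is not past end and walks backwards with rb_prev(), skipping entries that begin after the range and stopping once an entry ends at or before start. Here is a sketch of the same filter over a sorted array standing in for the per-inode rb-tree; the sample extents are arbitrary.

#include <stdint.h>
#include <stdio.h>

struct ordered { uint64_t file_offset; uint64_t len; };

static uint64_t entry_end(const struct ordered *o)
{
	return o->file_offset + o->len;
}

/* extents[] is sorted by file_offset, like the per-inode ordered tree */
static void collect_in_range(const struct ordered *extents, int n,
			     uint64_t start, uint64_t end)
{
	int i;

	for (i = n - 1; i >= 0; i--) {
		if (extents[i].file_offset > end)
			continue;		/* entirely after the range */
		if (entry_end(&extents[i]) <= start)
			break;			/* everything left is before it */
		printf("log extent [%llu, %llu)\n",
		       (unsigned long long)extents[i].file_offset,
		       (unsigned long long)entry_end(&extents[i]));
	}
}

int main(void)
{
	const struct ordered extents[] = {
		{ 0, 4096 }, { 4096, 4096 }, { 16384, 8192 }, { 65536, 4096 },
	};

	collect_in_range(extents, 4, 4096, 20000);	/* picks the middle two */
	return 0;
}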
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274d621e..e96cd4ccd805 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
ordered extent */
#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
+#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
+ * in the logging code. */
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
/* If we need to wait on this to be done */
struct list_head log_list;
+ /* If the transaction needs to wait on this ordered extent */
+ struct list_head trans_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
void btrfs_get_logged_extents(struct inode *inode,
- struct list_head *logged_list);
+ struct list_head *logged_list,
+ const loff_t start,
+ const loff_t end);
void btrfs_put_logged_extents(struct list_head *logged_list);
void btrfs_submit_logged_extents(struct list_head *logged_list,
struct btrfs_root *log);
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
void ordered_data_exit(void);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27f2e16cd259..f2bb13a23f86 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4049,6 +4049,50 @@ out:
scrub_pending_trans_workers_dec(sctx);
}
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+ u64 logical)
+{
+ struct extent_state *cached_state = NULL;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_io_tree *io_tree;
+ struct extent_map *em;
+ u64 lockstart = start, lockend = start + len - 1;
+ int ret = 0;
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+ lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = 1;
+ goto out_unlock;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock;
+ }
+
+ /*
+ * This extent does not actually cover the logical extent anymore,
+ * move on to the next inode.
+ */
+ if (em->block_start > logical ||
+ em->block_start + em->block_len < logical + len) {
+ free_extent_map(em);
+ ret = 1;
+ goto out_unlock;
+ }
+ free_extent_map(em);
+
+out_unlock:
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ return ret;
+}
+
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *nocow_ctx)
{
@@ -4057,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct inode *inode;
struct page *page;
struct btrfs_root *local_root;
- struct btrfs_ordered_extent *ordered;
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
struct extent_io_tree *io_tree;
u64 physical_for_dev_replace;
+ u64 nocow_ctx_logical;
u64 len = nocow_ctx->len;
- u64 lockstart = offset, lockend = offset + len - 1;
unsigned long index;
int srcu_index;
int ret = 0;
@@ -4095,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
io_tree = &BTRFS_I(inode)->io_tree;
+ nocow_ctx_logical = nocow_ctx->logical;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- goto out_unlock;
- }
-
- em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock;
- }
-
- /*
- * This extent does not actually cover the logical extent anymore,
- * move on to the next inode.
- */
- if (em->block_start > nocow_ctx->logical ||
- em->block_start + em->block_len < nocow_ctx->logical + len) {
- free_extent_map(em);
- goto out_unlock;
+ ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto out;
}
- free_extent_map(em);
while (len >= PAGE_CACHE_SIZE) {
index = offset >> PAGE_CACHE_SHIFT;
@@ -4135,7 +4159,7 @@ again:
goto next_page;
} else {
ClearPageError(page);
- err = extent_read_full_page_nolock(io_tree, page,
+ err = extent_read_full_page(io_tree, page,
btrfs_get_extent,
nocow_ctx->mirror_num);
if (err) {
@@ -4160,6 +4184,14 @@ again:
goto next_page;
}
}
+
+ ret = check_extent_to_block(inode, offset, len,
+ nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto next_page;
+ }
+
err = write_page_nocow(nocow_ctx->sctx,
physical_for_dev_replace, page);
if (err)
@@ -4173,12 +4205,10 @@ next_page:
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
+ nocow_ctx_logical += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
ret = COPY_COMPLETE;
-out_unlock:
- unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
- GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
iput(inode);
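The scrub change factors the extent validation into check_extent_to_block() and repeats it after every page read, because reading can block and the nocow extent may be reallocated while we slept. Below is a small sketch of that revalidate-after-sleeping idea, with a generation counter playing the role of the extent lookup; none of the names are kernel API.

#include <stdatomic.h>
#include <stdio.h>

static atomic_long extent_gen;		/* bumped whenever the mapping changes */

static int still_valid(long seen_gen)
{
	return atomic_load(&extent_gen) == seen_gen;
}

static int copy_one_page(long seen_gen)
{
	/* a real read_page() may sleep here; the mapping can change meanwhile */
	if (!still_valid(seen_gen))
		return 1;	/* > 0 means "skip", like check_extent_to_block() */
	/* write_page_nocow() would go here */
	return 0;
}

int main(void)
{
	long gen = atomic_load(&extent_gen);

	if (!still_valid(gen))			/* the pre-loop check */
		return 0;
	atomic_fetch_add(&extent_gen, 1);	/* simulate a concurrent change */
	printf("page skipped: %d\n", copy_one_page(gen));	/* prints 1 */
	return 0;
}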
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 874828dd0a86..804432dbc351 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5507,6 +5507,51 @@ out:
return ret;
}
+/*
+ * If orphan cleanup did remove any orphans from a root, it means the tree
+ * was modified and therefore the commit root is not the same as the current
+ * root anymore. This is a problem, because send uses the commit root and
+ * therefore can see inode items that don't exist in the current root anymore,
+ * and for example make calls to btrfs_iget, which will do tree lookups based
+ * on the current root and not on the commit root. Those lookups will fail,
+ * returning a -ESTALE error, and making send fail with that error. So make
+ * sure a send does not see any orphans we have just removed, and that it will
+ * see the same inodes regardless of whether a transaction commit happened
+ * before it started (meaning that the commit root will be the same as the
+ * current root) or not.
+ */
+static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
+{
+ int i;
+ struct btrfs_trans_handle *trans = NULL;
+
+again:
+ if (sctx->parent_root &&
+ sctx->parent_root->node != sctx->parent_root->commit_root)
+ goto commit_trans;
+
+ for (i = 0; i < sctx->clone_roots_cnt; i++)
+ if (sctx->clone_roots[i].root->node !=
+ sctx->clone_roots[i].root->commit_root)
+ goto commit_trans;
+
+ if (trans)
+ return btrfs_end_transaction(trans, sctx->send_root);
+
+ return 0;
+
+commit_trans:
+ /* Use any root, all fs roots will get their commit roots updated. */
+ if (!trans) {
+ trans = btrfs_join_transaction(sctx->send_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ goto again;
+ }
+
+ return btrfs_commit_transaction(trans, sctx->send_root);
+}
+
static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
{
spin_lock(&root->root_item_lock);
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
NULL);
sort_clone_roots = 1;
+ ret = ensure_commit_roots_uptodate(sctx);
+ if (ret)
+ goto out;
+
current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current->journal_info = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 54bd91ece35b..60f7cbe815e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
- if (!trans->blocks_used) {
+ if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
const char *errstr;
errstr = btrfs_decode_error(errno);
@@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
"disabling disk space caching");
break;
case Opt_inode_cache:
- btrfs_set_and_info(root, CHANGE_INODE_CACHE,
+ btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
"enabling inode map caching");
break;
case Opt_noinode_cache:
- btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
"disabling inode map caching");
break;
case Opt_clear_cache:
@@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
- if (PTR_ERR(trans) == -ENOENT)
- return 0;
- return PTR_ERR(trans);
+ if (PTR_ERR(trans) == -ENOENT) {
+ /*
+ * Exit unless we have some pending changes
+ * that need to go through commit
+ */
+ if (fs_info->pending_changes == 0)
+ return 0;
+ trans = btrfs_start_transaction(root, 0);
+ } else {
+ return PTR_ERR(trans);
+ }
}
return btrfs_commit_transaction(trans, root);
}
@@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
int i = 0, nr_devices;
int ret;
+ /*
+ * We aren't under the device list lock, so this is racey-ish, but good
+ * enough for our purposes.
+ */
nr_devices = fs_info->fs_devices->open_devices;
- BUG_ON(!nr_devices);
+ if (!nr_devices) {
+ smp_mb();
+ nr_devices = fs_info->fs_devices->open_devices;
+ ASSERT(nr_devices);
+ if (!nr_devices) {
+ *free_bytes = 0;
+ return 0;
+ }
+ }
devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
GFP_NOFS);
@@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
else
min_stripe_size = BTRFS_STRIPE_LEN;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (fs_info->alloc_start)
+ mutex_lock(&fs_devices->device_list_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (!device->in_fs_metadata || !device->bdev ||
device->is_tgtdev_for_dev_replace)
continue;
+ if (i >= nr_devices)
+ break;
+
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
@@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
skip_space = 1024 * 1024;
/* user can set the offset in fs_info->alloc_start. */
- if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
- device->total_bytes)
+ if (fs_info->alloc_start &&
+ fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+ device->total_bytes) {
+ rcu_read_unlock();
skip_space = max(fs_info->alloc_start, skip_space);
- /*
- * btrfs can not use the free space in [0, skip_space - 1],
- * we must subtract it from the total. In order to implement
- * it, we account the used space in this range first.
- */
- ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
- &used_space);
- if (ret) {
- kfree(devices_info);
- return ret;
- }
+ /*
+ * btrfs can not use the free space in
+ * [0, skip_space - 1], we must subtract it from the
+ * total. In order to implement it, we account the used
+ * space in this range first.
+ */
+ ret = btrfs_account_dev_extents_size(device, 0,
+ skip_space - 1,
+ &used_space);
+ if (ret) {
+ kfree(devices_info);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ret;
+ }
- /* calc the free space in [0, skip_space - 1] */
- skip_space -= used_space;
+ rcu_read_lock();
+
+ /* calc the free space in [0, skip_space - 1] */
+ skip_space -= used_space;
+ }
/*
* we can use the free space in [0, skip_space - 1], subtract
@@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
i++;
}
+ rcu_read_unlock();
+ if (fs_info->alloc_start)
+ mutex_unlock(&fs_devices->device_list_mutex);
nr_devices = i;
@@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
* holding chunk_muext to avoid allocating new chunks, holding
* device_list_mutex to avoid the device being removed
*/
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree -= block_rsv->size >> bits;
spin_unlock(&block_rsv->lock);
- buf->f_bavail = total_free_data;
+ buf->f_bavail = div_u64(total_free_data, factor);
ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
- if (ret) {
- mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (ret)
return ret;
- }
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
- mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b2e7bb4393f6..92db3f648df4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info;
struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
- struct btrfs_trans_handle *trans;
u64 features, set, clear;
unsigned long val;
int ret;
@@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
btrfs_info(fs_info, "%s %s feature flag",
val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
- trans = btrfs_start_transaction(fs_info->fs_root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
spin_lock(&fs_info->super_lock);
features = get_features(fs_info, fa->feature_set);
if (val)
@@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
set_features(fs_info, fa->feature_set, features);
spin_unlock(&fs_info->super_lock);
- ret = btrfs_commit_transaction(trans, fs_info->fs_root);
- if (ret)
- return ret;
+ /*
+ * We don't want to do full transaction commit from inside sysfs
+ */
+ btrfs_set_pending(fs_info, COMMIT);
+ wake_up_process(fs_info->transaction_kthread);
return count;
}
@@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
const char *buf, size_t len)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = fs_info->fs_root;
- int ret;
size_t p_len;
if (fs_info->sb->s_flags & MS_RDONLY)
@@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
if (p_len >= BTRFS_LABEL_SIZE)
return -EINVAL;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- spin_lock(&root->fs_info->super_lock);
+ spin_lock(&fs_info->super_lock);
memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
memcpy(fs_info->super_copy->label, buf, p_len);
- spin_unlock(&root->fs_info->super_lock);
- ret = btrfs_commit_transaction(trans, root);
+ spin_unlock(&fs_info->super_lock);
- if (!ret)
- return len;
+ /*
+ * We don't want to do full transaction commit from inside sysfs
+ */
+ btrfs_set_pending(fs_info, COMMIT);
+ wake_up_process(fs_info->transaction_kthread);
- return ret;
+ return len;
}
BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
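Both sysfs handlers above stop committing a transaction from the store path; they record a pending COMMIT bit and wake the transaction kthread, which performs the commit later. A userspace analogue of that hand-off follows, with a worker thread and a boolean flag in place of btrfs_set_pending() and wake_up_process(); the structure is illustrative only.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
static bool commit_pending;
static bool stop;

static void *transaction_kthread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		while (!commit_pending && !stop)
			pthread_cond_wait(&kick, &lock);
		if (commit_pending) {
			commit_pending = false;
			pthread_mutex_unlock(&lock);
			puts("committing transaction");	/* the expensive part */
			pthread_mutex_lock(&lock);
			continue;
		}
		break;		/* stop requested and nothing left to commit */
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* what a store handler now does instead of committing itself */
static void label_store(void)
{
	pthread_mutex_lock(&lock);
	commit_pending = true;		/* btrfs_set_pending(fs_info, COMMIT) */
	pthread_cond_signal(&kick);	/* wake_up_process(transaction_kthread) */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, transaction_kthread, NULL);
	label_store();
	pthread_mutex_lock(&lock);
	stop = true;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}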
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae3616728..a605d4e2f2bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
}
}
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+ if (need_resched()) {
+ spin_unlock(&tree->lock);
+ cond_resched();
+ spin_lock(&tree->lock);
+ }
+ }
+ spin_unlock(&tree->lock);
+}
+
static noinline void switch_commit_roots(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
root->commit_root = btrfs_root_node(root);
if (is_fstree(root->objectid))
btrfs_unpin_free_ino(root);
+ clear_btree_io_tree(&root->dirty_log_pages);
}
up_write(&fs_info->commit_root_sem);
}
@@ -220,6 +247,7 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
+ INIT_LIST_HEAD(&cur_trans->pending_ordered);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
h->sync = false;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
+ INIT_LIST_HEAD(&h->ordered);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
+ if (!list_empty(&trans->ordered)) {
+ spin_lock(&info->trans_lock);
+ list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ spin_unlock(&info->trans_lock);
+ }
+
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
- convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
- mark, &cached_state, GFP_NOFS);
- cached_state = NULL;
- err = filemap_fdatawrite_range(mapping, start, end);
+ bool wait_writeback = false;
+
+ err = convert_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ mark, &cached_state, GFP_NOFS);
+ /*
+ * convert_extent_bit can return -ENOMEM, which is most of the
+ * time a temporary error. So when it happens, ignore the error
+ * and wait for writeback of this range to finish - because we
+ * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+ * to btrfs_wait_marked_extents() would not know that writeback
+ * for this range started and therefore wouldn't wait for it to
+ * finish - we don't want to commit a superblock that points to
+ * btree nodes/leafs for which writeback hasn't finished yet
+ * (and without errors).
+ * We cleanup any entries left in the io tree when committing
+ * the transaction (through clear_btree_io_tree()).
+ */
+ if (err == -ENOMEM) {
+ err = 0;
+ wait_writeback = true;
+ }
+ if (!err)
+ err = filemap_fdatawrite_range(mapping, start, end);
if (err)
werr = err;
+ else if (wait_writeback)
+ werr = filemap_fdatawait_range(mapping, start, end);
+ free_extent_state(cached_state);
+ cached_state = NULL;
cond_resched();
start = end + 1;
}
- if (err)
- werr = err;
return werr;
}
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
- clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
- 0, 0, &cached_state, GFP_NOFS);
- err = filemap_fdatawait_range(mapping, start, end);
+ /*
+ * Ignore -ENOMEM errors returned by clear_extent_bit().
+ * When committing the transaction, we'll remove any entries
+ * left in the io tree. For a log commit, we don't remove them
+ * after committing the log because the tree can be accessed
+ * concurrently - we do it only at transaction commit time when
+ * it's safe to do it (through clear_btree_io_tree()).
+ */
+ err = clear_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ 0, 0, &cached_state, GFP_NOFS);
+ if (err == -ENOMEM)
+ err = 0;
+ if (!err)
+ err = filemap_fdatawait_range(mapping, start, end);
if (err)
werr = err;
+ free_extent_state(cached_state);
+ cached_state = NULL;
cond_resched();
start = end + 1;
}
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
return 0;
}
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (!trans || !trans->transaction) {
- struct inode *btree_inode;
- btree_inode = root->fs_info->btree_inode;
- return filemap_write_and_wait(btree_inode->i_mapping);
- }
- return btrfs_write_and_wait_marked_extents(root,
+ int ret;
+
+ ret = btrfs_write_and_wait_marked_extents(root,
&trans->transaction->dirty_pages,
EXTENT_DIRTY);
+ clear_btree_io_tree(&trans->transaction->dirty_pages);
+
+ return ret;
}
/*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, -1);
}
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&cur_trans->pending_ordered)) {
+ ordered = list_first_entry(&cur_trans->pending_ordered,
+ struct btrfs_ordered_extent,
+ trans_list);
+ list_del_init(&ordered->trans_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+ &ordered->flags));
+ btrfs_put_ordered_extent(ordered);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
+ list_splice(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_wait_delalloc_flush(root->fs_info);
+ btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
btrfs_scrub_pause(root);
/*
* Ok now we need to make sure to block out any other joins while we
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
/*
- * Since the transaction is done, we should set the inode map cache flag
- * before any other comming transaction.
+ * Since the transaction is done, we can apply the pending changes
+ * before the next transaction.
*/
- if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
- btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
- else
- btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
+ btrfs_apply_pending_changes(root->fs_info);
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
return (ret < 0) ? 0 : 1;
}
+
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+{
+ unsigned long prev;
+ unsigned long bit;
+
+ prev = cmpxchg(&fs_info->pending_changes, 0, 0);
+ if (!prev)
+ return;
+
+ bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
+ if (prev & bit)
+ btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+ prev &= ~bit;
+
+ bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
+ if (prev & bit)
+ btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+ prev &= ~bit;
+
+ bit = 1 << BTRFS_PENDING_COMMIT;
+ if (prev & bit)
+ btrfs_debug(fs_info, "pending commit done");
+ prev &= ~bit;
+
+ if (prev)
+ btrfs_warn(fs_info,
+ "unknown pending changes left 0x%lx, ignoring", prev);
+}
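btrfs_apply_pending_changes() reads the accumulated mask (the kernel uses cmpxchg(&fs_info->pending_changes, 0, 0) as an atomic read), handles each bit it knows about and warns about leftovers. Below is a compact userspace sketch of that "take the mask, peel off known bits, complain about the rest" shape; here the mask is consumed with an atomic exchange, which is a simplification, and the bit names are illustrative.

#include <stdatomic.h>
#include <stdio.h>

#define PENDING_SET_INODE_MAP_CACHE	(1ul << 0)
#define PENDING_CLEAR_INODE_MAP_CACHE	(1ul << 1)
#define PENDING_COMMIT			(1ul << 2)

static atomic_ulong pending_changes;

static void apply_pending_changes(void)
{
	/* take the whole mask; new requests start accumulating from zero */
	unsigned long prev = atomic_exchange(&pending_changes, 0);

	if (!prev)
		return;

	if (prev & PENDING_SET_INODE_MAP_CACHE)
		puts("enable inode map cache");
	prev &= ~PENDING_SET_INODE_MAP_CACHE;

	if (prev & PENDING_CLEAR_INODE_MAP_CACHE)
		puts("disable inode map cache");
	prev &= ~PENDING_CLEAR_INODE_MAP_CACHE;

	if (prev & PENDING_COMMIT)
		puts("pending commit done");
	prev &= ~PENDING_COMMIT;

	if (prev)
		printf("unknown pending changes left 0x%lx, ignoring\n", prev);
}

int main(void)
{
	atomic_fetch_or(&pending_changes, PENDING_COMMIT | (1ul << 7));
	apply_pending_changes();	/* prints the commit line and the warning */
	return 0;
}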
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1a5d2d..00ed29c4b3f9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
+ struct list_head pending_ordered;
struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
*/
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
+ struct list_head ordered;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
@@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
+
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 286213cec861..9a02da16f2be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
+ mark);
+ btrfs_wait_logged_extents(trans, log, log_transid);
wait_log_commit(trans, log_root_tree,
root_log_ctx.log_transid);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = root_log_ctx.log_ret;
+ if (!ret)
+ ret = root_log_ctx.log_ret;
goto out;
}
ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- btrfs_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_NEW | EXTENT_DIRTY);
- btrfs_wait_logged_extents(log, log_transid);
+ ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ if (!ret)
+ ret = btrfs_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_NEW | EXTENT_DIRTY);
+ if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
+ btrfs_wait_logged_extents(trans, log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+ /*
+ * Clear the AS_EIO/AS_ENOSPC flags from the inode's
+ * i_mapping flags, so that the next fsync won't get
+ * an outdated io error too.
+ */
+ btrfs_inode_check_errors(inode);
*ordered_io_error = true;
break;
}
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+ btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
&token);
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
btrfs_set_token_file_extent_type(leaf, fi,
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(inode, &logged_list);
+ btrfs_get_logged_extents(inode, &logged_list, start, end);
/*
* a brute force approach to making sure we get the most uptodate
@@ -4089,6 +4104,21 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (fast_search) {
+ /*
+ * Some ordered extents started by fsync might have completed
+ * before we collected the ordered extents in logged_list, which
+ * means they're gone, not in our logged_list nor in the inode's
+ * ordered tree. We want the application/user space to know an
+ * error happened while attempting to persist file data so that
+ * it can take proper action. If such error happened, we leave
+ * without writing to the log tree and the fsync must report the
+ * file data write error and not commit the current transaction.
+ */
+ err = btrfs_inode_check_errors(inode);
+ if (err) {
+ ctx->io_err = err;
+ goto out_unlock;
+ }
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
&logged_list, ctx);
if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cbb766577f31..0144790e296e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
-static void lock_chunks(struct btrfs_root *root)
-{
- mutex_lock(&root->fs_info->chunk_mutex);
-}
-
-static void unlock_chunks(struct btrfs_root *root)
-{
- mutex_unlock(&root->fs_info->chunk_mutex);
-}
-
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
u64 *start, u64 len)
{
struct extent_map *em;
+ struct list_head *search_list = &trans->transaction->pending_chunks;
int ret = 0;
- list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+again:
+ list_for_each_entry(em, search_list, list) {
struct map_lookup *map;
int i;
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
ret = 1;
}
}
+ if (search_list == &trans->transaction->pending_chunks) {
+ search_list = &trans->root->fs_info->pinned_chunks;
+ goto again;
+ }
return ret;
}
@@ -1800,8 +1796,8 @@ error_undo:
goto error_brelse;
}
-void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev)
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev)
{
struct btrfs_fs_devices *fs_devices;
@@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->bdev)
fs_devices->open_devices--;
+}
+
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev)
+{
+ struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
call_rcu(&srcdev->rcu, free_device);
@@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+ ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
}
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for the tree */
- free_extent_map(em);
out:
/* once for us */
free_extent_map(em);
@@ -4505,6 +4501,8 @@ error_del_extent:
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
+ /* One for the pending_chunks list reference */
+ free_extent_map(em);
error:
kfree(devices_info);
return ret;
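contains_pending_extent() now makes two passes: one over the transaction's pending chunks and, by swapping the list pointer and jumping back, another over the fs-wide pinned chunks. A sketch of that two-list overlap check over plain arrays follows; the ranges and helper are invented for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start; uint64_t len; };

static bool overlaps(const struct range *r, uint64_t start, uint64_t len)
{
	return start < r->start + r->len && r->start < start + len;
}

static bool contains_pending_extent(uint64_t start, uint64_t len)
{
	static const struct range pending[] = { { 1048576, 8388608 } };
	static const struct range pinned[]  = { { 134217728, 1073741824 } };
	const struct range *search = pending;
	size_t n = sizeof(pending) / sizeof(pending[0]);

again:
	for (size_t i = 0; i < n; i++)
		if (overlaps(&search[i], start, len))
			return true;
	if (search == pending) {	/* second pass over the pinned chunks */
		search = pinned;
		n = sizeof(pinned) / sizeof(pinned[0]);
		goto again;
	}
	return false;
}

int main(void)
{
	printf("%d\n", contains_pending_extent(4194304, 65536));	/* 1 */
	printf("%d\n", contains_pending_extent(0, 4096));		/* 0 */
	return 0;
}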
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 70be2571cedf..d6fe73c0f4a2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -456,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
@@ -521,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
struct btrfs_transaction *transaction);
+
+static inline void lock_chunks(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static inline void unlock_chunks(struct btrfs_root *root)
+{
+ mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index dcf20131fbe4..47b19465f0dc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,6 +29,7 @@
#include "xattr.h"
#include "disk-io.h"
#include "props.h"
+#include "locking.h"
ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
- struct btrfs_dir_item *di;
+ struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->skip_release_on_error = 1;
+
+ if (!value) {
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+ name, name_len, -1);
+ if (!di && (flags & XATTR_REPLACE))
+ ret = -ENODATA;
+ else if (di)
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ goto out;
+ }
+ /*
+ * For a replace we can't just do the insert blindly.
+ * Do a lookup first (read-only btrfs_search_slot), and return if xattr
+ * doesn't exist. If it exists, fall down below to the insert/replace
+ * path - we can't race with a concurrent xattr delete, because the VFS
+ * locks the inode's i_mutex before calling setxattr or removexattr.
+ */
if (flags & XATTR_REPLACE) {
- di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
- name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- } else if (!di) {
+ ASSERT(mutex_is_locked(&inode->i_mutex));
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+ name, name_len, 0);
+ if (!di) {
ret = -ENODATA;
goto out;
}
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret)
- goto out;
btrfs_release_path(path);
+ di = NULL;
+ }
+ ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+ name, name_len, value, size);
+ if (ret == -EOVERFLOW) {
/*
- * remove the attribute
+ * We have an existing item in a leaf, split_leaf couldn't
+ * expand it. That item might have or not a dir_item that
+ * matches our target xattr, so lets check.
*/
- if (!value)
- goto out;
- } else {
- di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
- name, name_len, 0);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ ret = 0;
+ btrfs_assert_tree_locked(path->nodes[0]);
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (!di && !(flags & XATTR_REPLACE)) {
+ ret = -ENOSPC;
goto out;
}
- if (!di && !value)
- goto out;
- btrfs_release_path(path);
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ ASSERT(di); /* logic error */
+ } else if (ret) {
+ goto out;
}
-again:
- ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
- name, name_len, value, size);
- /*
- * If we're setting an xattr to a new value but the new value is say
- * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
- * back from split_leaf. This is because it thinks we'll be extending
- * the existing item size, but we're asking for enough space to add the
- * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
- * the rest of the function figure it out.
- */
- if (ret == -EOVERFLOW)
+ if (di && (flags & XATTR_CREATE)) {
ret = -EEXIST;
+ goto out;
+ }
- if (ret == -EEXIST) {
- if (flags & XATTR_CREATE)
- goto out;
+ if (di) {
/*
- * We can't use the path we already have since we won't have the
- * proper locking for a delete, so release the path and
- * re-lookup to delete the thing.
+ * We're doing a replace, and it must be atomic, that is, at
+ * any point in time we have either the old or the new xattr
+ * value in the tree. We don't want readers (getxattr and
+ * listxattrs) to miss a value; this is especially important
+ * for ACLs.
*/
- btrfs_release_path(path);
- di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
- name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- } else if (!di) {
- /* Shouldn't happen but just in case... */
- btrfs_release_path(path);
- goto again;
+ const int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+ const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 data_size = sizeof(*di) + name_len + size;
+ struct btrfs_item *item;
+ unsigned long data_ptr;
+ char *ptr;
+
+ if (size > old_data_len) {
+ if (btrfs_leaf_free_space(root, leaf) <
+ (size - old_data_len)) {
+ ret = -ENOSPC;
+ goto out;
+ }
}
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret)
- goto out;
+ if (old_data_len + name_len + sizeof(*di) == item_size) {
+ /* No other xattrs packed in the same leaf item. */
+ if (size > old_data_len)
+ btrfs_extend_item(root, path,
+ size - old_data_len);
+ else if (size < old_data_len)
+ btrfs_truncate_item(root, path, data_size, 1);
+ } else {
+ /* There are other xattrs packed in the same item. */
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto out;
+ btrfs_extend_item(root, path, data_size);
+ }
+ item = btrfs_item_nr(slot);
+ ptr = btrfs_item_ptr(leaf, slot, char);
+ ptr += btrfs_item_size(leaf, item) - data_size;
+ di = (struct btrfs_dir_item *)ptr;
+ btrfs_set_dir_data_len(leaf, di, size);
+ data_ptr = ((unsigned long)(di + 1)) + name_len;
+ write_extent_buffer(leaf, value, data_ptr, size);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
/*
- * We have a value to set, so go back and try to insert it now.
+ * Insert, and we had space for the xattr, so path->slots[0] is
+ * where our xattr dir_item is and btrfs_insert_xattr_item()
+ * filled it.
*/
- if (value) {
- btrfs_release_path(path);
- goto again;
- }
}
out:
btrfs_free_path(path);
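The rewritten do_setxattr() replaces an existing xattr value inside the leaf item in place, so a concurrent getxattr or listxattr always sees either the old or the new value, and it refuses a grow when the leaf lacks room for the size difference. Below is a userspace sketch of just that size bookkeeping, on a packed [value_len][name][value] record in a fixed-capacity buffer; the layout and helpers are simplified stand-ins for the btrfs leaf item code.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LEAF_CAPACITY 64

struct leaf {
	uint8_t data[LEAF_CAPACITY];
	uint16_t used;		/* bytes taken by the single packed record */
};

/* record layout: [u16 value_len][name bytes][value bytes] */
static int replace_value(struct leaf *leaf, size_t name_len,
			 const void *value, uint16_t size)
{
	uint16_t old_len;

	memcpy(&old_len, leaf->data, sizeof(old_len));
	if (size > old_len && LEAF_CAPACITY - leaf->used < size - old_len)
		return -ENOSPC;		/* the grow does not fit in the leaf */

	memcpy(leaf->data, &size, sizeof(size));	/* new value length */
	memcpy(leaf->data + sizeof(size) + name_len, value, size);
	leaf->used = sizeof(size) + name_len + size;
	return 0;
}

int main(void)
{
	struct leaf leaf = { .used = 0 };
	const char name[] = "user.test";
	const size_t name_len = sizeof(name) - 1;
	uint16_t len = 5;

	/* initial insert: value_len, then name, then value */
	memcpy(leaf.data, &len, sizeof(len));
	memcpy(leaf.data + sizeof(len), name, name_len);
	memcpy(leaf.data + sizeof(len) + name_len, "hello", 5);
	leaf.used = sizeof(len) + name_len + 5;

	printf("%d\n", replace_value(&leaf, name_len, "hi", 2));	/* 0 */
	printf("%d\n", replace_value(&leaf, name_len, "xxxx", 60));	/* -ENOSPC */
	return 0;
}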