diff options
Diffstat (limited to 'fs')
60 files changed, 810 insertions, 360 deletions
@@ -830,16 +830,20 @@ void exit_aio(struct mm_struct *mm) static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } + local_irq_restore(flags); preempt_enable(); } @@ -847,10 +851,12 @@ static bool get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -869,6 +875,7 @@ static bool get_reqs_available(struct kioctx *ctx) ret = true; kcpu->reqs_available--; out: + local_irq_restore(flags); preempt_enable(); return ret; } @@ -1021,6 +1028,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* everything turned out well, dispose of the aiocb. 
*/ kiocb_free(iocb); + put_reqs_available(ctx, 1); /* * We have to order our ring_info tail store above and test @@ -1062,6 +1070,9 @@ static long aio_read_events_ring(struct kioctx *ctx, if (head == tail) goto out; + head %= ctx->nr_events; + tail %= ctx->nr_events; + while (ret < nr) { long avail; struct io_event *ev; @@ -1100,8 +1111,6 @@ static long aio_read_events_ring(struct kioctx *ctx, flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); - - put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d7bd395ab586..1c55388ae633 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -210,7 +210,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; - int pgrp; + int pgrp = 0; bool pgrp_set = false; int ret = -EINVAL; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 92371c414228..1daea0b47187 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace) spin_lock(workspace_lock); if (*num_workspace < num_online_cpus()) { - list_add_tail(workspace, idle_workspace); + list_add(workspace, idle_workspace); (*num_workspace)++; spin_unlock(workspace_lock); goto wake; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2af6e66fe788..eea26e1b2fda 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "sysfs.h" static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); @@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + /* replace the sysfs entry */ + 
btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8bb4aa19898f..08e65e9cf2aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -369,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); - btrfs_tree_read_unlock_blocking(eb); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); return ret; } @@ -2904,7 +2905,9 @@ retry_root_backup: if (ret) goto fail_qgroup; + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING "BTRFS: failed to recover relocation\n"); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99c253918208..813537f362f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5678,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; down_write(&fs_info->commit_root_sem); @@ -5701,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->commit_root_sem); - list_for_each_entry_rcu(space_info, &fs_info->space_info, list) - percpu_counter_set(&space_info->total_bytes_pinned, 0); - update_global_block_rsv(fs_info); } @@ -5741,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 
0d321c23069a..47aceb494d1d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* @@ -3139,7 +3142,6 @@ out: static void clone_update_extent_map(struct inode *inode, const struct btrfs_trans_handle *trans, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, const u64 hole_offset, const u64 hole_len) { @@ -3154,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode, return; } - if (fi) { + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); btrfs_extent_item_to_extent_map(inode, path, fi, false, em); em->generation = -1; if (btrfs_file_extent_type(path->nodes[0], fi) == @@ -3508,18 +3514,15 @@ process_slot: btrfs_item_ptr_offset(leaf, slot), size); inode_add_bytes(inode, datal); - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); } /* If we have an implicit hole (NO_HOLES feature). 
*/ if (drop_start < new_key.offset) clone_update_extent_map(inode, trans, - path, NULL, drop_start, + NULL, drop_start, new_key.offset - drop_start); - clone_update_extent_map(inode, trans, path, - extent, 0, 0); + clone_update_extent_map(inode, trans, path, 0, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -3562,12 +3565,10 @@ process_slot: btrfs_end_transaction(trans, root); goto out; } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen); - if (ret) - goto out; - clone_update_extent_map(inode, trans, path, NULL, last_dest_end, - destoff + len - last_dest_end); } out: diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6efd70d3b64f..9626b4ad3b9a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot) +static void print_extent_item(struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - int type; u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; @@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot) btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), flags); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); @@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - 
print_extent_item(l, i); + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); break; case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref\n"); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4055291a523e..4a88f073fdd7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * pages are going to be uptodate. */ for (stripe = 0; stripe < bbio->num_stripes; stripe++) { - if (rbio->faila == stripe || - rbio->failb == stripe) + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->bbio->error); continue; + } for (pagenr = 0; pagenr < nr_pages; pagenr++) { struct page *p; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4662d92a4b73..8e16bca69c56 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_ssd_spread: btrfs_set_and_info(root, SSD_SPREAD, "use spread ssd allocation scheme"); + btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_clear_and_info(root, NOSSD, + btrfs_set_and_info(root, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; @@ -1467,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret) goto restore; @@ -1808,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; + if (!dev->name) + continue; if (!first_dev || dev->devid < first_dev->devid) first_dev = dev; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index df39458f1487..78699364f537 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -605,14 +605,37 @@ static void init_feature_attrs(void) } } -static int 
add_device_membership(struct btrfs_fs_info *fs_info) +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } + + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) { int error = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *dev; - fs_info->device_dir_kobj = kobject_create_and_add("devices", + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) return -ENOMEM; @@ -623,6 +646,9 @@ static int add_device_membership(struct btrfs_fs_info *fs_info) if (!dev->bdev) continue; + if (one_device && one_device != dev) + continue; + disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; @@ -666,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; - error = add_device_membership(fs_info); + error = btrfs_kobj_add_device(fs_info, NULL); if (error) goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9ab576318a84..ac46df37504c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -66,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 511839c04f11..5f379affdf23 100644 --- 
a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -386,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, bool reloc_reserved = false; int ret; + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info && - current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { + if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; @@ -491,6 +493,7 @@ again: smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && may_wait_transaction(root, type)) { + current->journal_info = h; btrfs_commit_transaction(h, root); goto again; } @@ -1615,11 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_items(trans, root); - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ if (ret) return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c83b24251e53..6104676857f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,7 @@ #include "rcu-string.h" #include "math.h" #include "dev-replace.h" +#include "sysfs.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -554,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * This is ok to do without rcu read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. 
*/ - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); - if (!name) { - kfree(device); - goto error; + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); } - rcu_assign_pointer(device->name, name); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -1680,6 +1683,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + call_rcu(&device->rcu, free_device); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; @@ -2143,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = init_first_rw_device(trans, root, device); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2156,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_abort_transaction(trans, root, ret); goto error_trans; } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); if (ret) { @@ -2205,6 +2224,7 @@ error_trans: unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); diff --git 
a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 4f196314c0c1..b67d8fc81277 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -EIO; + ret = -E2BIG; goto out; } /* we need another page for writing out. Test this diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 0227b45ef00a..15e9505aa35f 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -290,7 +290,8 @@ int cifsConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapChars) { - int i, j, charlen; + int i, charlen; + int j = 0; char src_char; __le16 dst_char; wchar_t tmp; @@ -298,12 +299,11 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, if (!mapChars) return cifs_strtoUTF16(target, source, PATH_MAX, cp); - for (i = 0, j = 0; i < srclen; j++) { + for (i = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; switch (src_char) { case 0: - put_unaligned(0, &target[j]); goto ctoUTF16_out; case ':': dst_char = cpu_to_le16(UNI_COLON); @@ -350,6 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, } ctoUTF16_out: + put_unaligned(0, &target[j]); /* Null terminate target unicode string */ return j; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2c90d07c0b3a..888398067420 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -725,6 +725,19 @@ out_nls: goto out; } +static ssize_t +cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t rc; + struct inode *inode = file_inode(iocb->ki_filp); + + rc = cifs_revalidate_mapping(inode); + if (rc) + return rc; + + return generic_file_read_iter(iocb, iter); +} + static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -881,7 +894,7 @@ const struct inode_operations 
cifs_symlink_inode_ops = { const struct file_operations cifs_file_ops = { .read = new_sync_read, .write = new_sync_write, - .read_iter = generic_file_read_iter, + .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, .open = cifs_open, .release = cifs_close, @@ -939,7 +952,7 @@ const struct file_operations cifs_file_direct_ops = { const struct file_operations cifs_file_nobrl_ops = { .read = new_sync_read, .write = new_sync_write, - .read_iter = generic_file_read_iter, + .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, .open = cifs_open, .release = cifs_close, diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 264ece71bdb2..68559fd557fb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -374,7 +374,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; oparms.create_options = create_options; - oparms.disposition = FILE_OPEN; + oparms.disposition = FILE_CREATE; oparms.path = path; oparms.fid = &fid; oparms.reconnect = false; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0762d143e252..fca382037ddd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -194,7 +194,16 @@ static void ext4_init_block_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -359,6 +368,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, 
block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return; @@ -369,6 +379,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -376,6 +389,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3f5c188953a4..0b7e28e7eaa4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -966,10 +966,10 @@ retry: continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei) + if (ei->i_es_lru_nr == 0 || ei == locked_ei || + !write_trylock(&ei->i_es_lock)) continue; - write_lock(&ei->i_es_lock); shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0ee59a6644e2..5b87fc36aab8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, struct ext4_group_desc *gdp) { struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent @@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if 
(!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +196,12 @@ verify: ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } @@ -321,6 +338,12 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); } @@ -851,6 +874,13 @@ got: goto out; } + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -887,13 +917,6 @@ got: } } - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) { - ext4_std_error(sb, err); - 
goto out; - } - /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 8a57e9fcd1b9..fd69da194826 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return 0; failed: for (; i >= 0; i--) { - if (i != indirect_blks && branch[i].bh) + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) ext4_forget(handle, 1, inode, branch[i].bh, branch[i].bh->b_blocknr); ext4_free_blocks(handle, inode, NULL, new_blocks[i], @@ -1310,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, blk = *i_data; if (level > 0) { ext4_lblk_t first2; + ext4_lblk_t count2; + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } - first2 = (first > offset) ? 
first - offset : 0; + if (first > offset) { + first2 = first - offset; + count2 = count; + } else { + first2 = 0; + count2 = count - (offset - first); + } ret = free_hole_blocks(handle, inode, bh, (__le32 *)bh->b_data, level - 1, - first2, count - offset, + first2, count2, inode->i_sb->s_blocksize >> 2); if (ret) { brelse(bh); @@ -1329,8 +1343,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, if (level == 0 || (bh && all_zeroes((__le32 *)bh->b_data, (__le32 *)bh->b_data + addr_per_block))) { - ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); - *i_data = 0; + ext4_free_data(handle, inode, parent_bh, + i_data, i_data + 1); } brelse(bh); bh = NULL; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 59e31622cc6e..2dcb936be90e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; @@ -751,14 +752,17 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd; " - "block bitmap corrupt.", + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -1431,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = 
EXT4_SB(sb); ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1441,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); /* Mark the block group as corrupt. */ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &e4b->bd_info->bb_state); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b9b9aabfb4d2..6df7bc611dbd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1525,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, arg = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * arg; } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; sbi->s_max_batch_time = arg; } else if (token == Opt_min_batch_time) { sbi->s_min_batch_time = arg; @@ -2809,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg) es = sbi->s_es; if (es->s_error_count) - ext4_msg(sb, KERN_NOTICE, "error count: %u", + /* fsck newer than v1.41.13 is needed to clean this condition. 
*/ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, @@ -2826,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg) printk("\n"); } if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, @@ -3880,38 +3879,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - - /* - * set up enough so that it can read an inode, - * and create new inode for buddy allocator - */ - sbi->s_gdb_count = db_count; - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); - goto failed_mount2; - } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2a; + goto failed_mount2; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2a; + goto failed_mount2; } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3946,6 +3926,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; + /* + 
* set up enough so that it can read an inode + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4135,13 +4123,21 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount5; + goto failed_mount4a; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); + goto failed_mount4a; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); goto failed_mount5; } @@ -4218,8 +4214,11 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_release_system_zone(sb); + ext4_mb_release(sb); failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4243,14 +4242,11 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); -failed_mount2a: - ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: - ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0924521306b4..f8cf619edb5f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -608,8 +608,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) * b. do not use extent cache for better performance * c. 
give the block addresses to blockdev */ -static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, bool fiemap) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; @@ -637,7 +637,7 @@ static int get_data_block(struct inode *inode, sector_t iblock, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; if (dn.data_blkaddr != NULL_ADDR) { @@ -671,7 +671,7 @@ get_next: err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); @@ -708,10 +708,23 @@ out: return err; } +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, false); +} + +static int get_data_block_fiemap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, true); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, get_data_block); + return generic_block_fiemap(inode, fieinfo, + start, len, get_data_block_fiemap); } static int f2fs_read_data_page(struct file *file, struct page *page) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 966acb039e3b..a4addd72ebbd 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -376,11 +376,11 @@ static struct page *init_inode_metadata(struct inode *inode, put_error: f2fs_put_page(page, 1); +error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0); remove_dirty_dir_inode(inode); 
-error: remove_inode_page(inode); return ERR_PTR(err); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e51c732b0dd9..58df97e174d0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -342,9 +342,6 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct list_head wblist_head; /* list of under-writeback pages */ - spinlock_t wblist_lock; /* lock for checkpoint */ - block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ @@ -644,7 +641,8 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) */ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - WARN_ON((nid >= NM_I(sbi)->max_nid)); + if (unlikely(nid < F2FS_ROOT_INO(sbi))) + return -EINVAL; if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c58e33075719..7d8b96275092 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -659,16 +659,19 @@ static int expand_inode_data(struct inode *inode, loff_t offset, off_start = offset & (PAGE_CACHE_SIZE - 1); off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + f2fs_lock_op(sbi); + for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; - f2fs_lock_op(sbi); + if (index == pg_end && !off_end) + goto noalloc; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_reserve_block(&dn, index); - f2fs_unlock_op(sbi); if (ret) break; - +noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) @@ -683,8 +686,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); - f2fs_write_inode(inode, NULL); + update_inode_page(inode); } + f2fs_unlock_op(sbi); return ret; } diff --git a/fs/f2fs/inode.c 
b/fs/f2fs/inode.c index adc622c6bdce..2cf6962f6cc8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -78,6 +78,7 @@ static int do_read_inode(struct inode *inode) if (check_nid_range(sbi, inode->i_ino)) { f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", (unsigned long) inode->i_ino); + WARN_ON(1); return -EINVAL; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 9138c32aa698..a6bdddc33ce2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -417,9 +417,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); @@ -448,6 +445,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); @@ -457,9 +458,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); @@ -474,7 +472,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; put_out_dir: - f2fs_put_page(new_page, 1); + kunmap(new_page); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) { kunmap(old_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9dfb9a042fd2..4b697ccc9b0c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -42,6 +42,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12; res = mem_size < 
((val.totalram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f25f0e07e26f..d04613df710a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,14 +272,15 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; spin_lock_init(&fcc->issue_lock); init_waitqueue_head(&fcc->flush_wait_queue); + sbi->sm_info->cmd_control_info = fcc; fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; return err; } - sbi->sm_info->cmd_control_info = fcc; return err; } @@ -1885,8 +1886,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) /* init sm info */ sbi->sm_info = sm_info; - INIT_LIST_HEAD(&sm_info->wblist_head); - spin_lock_init(&sm_info->wblist_lock); sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b2b18637cb9e..8f96d9372ade 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -689,9 +689,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (unlikely(ino < F2FS_ROOT_INO(sbi))) - return ERR_PTR(-ESTALE); - if (unlikely(ino >= NM_I(sbi)->max_nid)) + if (check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 098f97bdcf1b..ca887314aba9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -643,9 +643,8 @@ struct fuse_copy_state { unsigned long seglen; unsigned long addr; struct page *pg; - void *mapaddr; - void *buf; 
unsigned len; + unsigned offset; unsigned move_pages:1; }; @@ -666,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; - if (!cs->write) { - kunmap_atomic(cs->mapaddr); - } else { - kunmap_atomic(cs->mapaddr); + if (cs->write) buf->len = PAGE_SIZE - cs->len; - } cs->currbuf = NULL; - cs->mapaddr = NULL; - } else if (cs->mapaddr) { - kunmap_atomic(cs->mapaddr); + } else if (cs->pg) { if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); } put_page(cs->pg); - cs->mapaddr = NULL; } + cs->pg = NULL; } /* @@ -691,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) */ static int fuse_copy_fill(struct fuse_copy_state *cs) { - unsigned long offset; + struct page *page; int err; unlock_request(cs->fc, cs->req); @@ -706,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = kmap_atomic(buf->page); + cs->pg = buf->page; + cs->offset = buf->offset; cs->len = buf->len; - cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; cs->nr_segs--; } else { - struct page *page; - if (cs->nr_segs == cs->pipe->buffers) return -EIO; @@ -726,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap_atomic(page); - cs->buf = cs->mapaddr; + cs->pg = page; + cs->offset = 0; cs->len = PAGE_SIZE; cs->pipebufs++; cs->nr_segs++; @@ -740,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->iov++; cs->nr_segs--; } - err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + err = get_user_pages_fast(cs->addr, 1, cs->write, &page); if (err < 0) return err; BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->pg = page; + cs->offset = cs->addr % PAGE_SIZE; + cs->len = min(PAGE_SIZE - cs->offset, cs->seglen); cs->seglen 
-= cs->len; cs->addr += cs->len; } @@ -760,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { + void *pgaddr = kmap_atomic(cs->pg); + void *buf = pgaddr + cs->offset; + if (cs->write) - memcpy(cs->buf, *val, ncpy); + memcpy(buf, *val, ncpy); else - memcpy(*val, cs->buf, ncpy); + memcpy(*val, buf, ncpy); + + kunmap_atomic(pgaddr); *val += ncpy; } *size -= ncpy; cs->len -= ncpy; - cs->buf += ncpy; + cs->offset += ncpy; return ncpy; } @@ -874,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = kmap_atomic(buf->page); - cs->buf = cs->mapaddr + buf->offset; + cs->pg = buf->page; + cs->offset = buf->offset; err = lock_request(cs->fc, cs->req); if (err) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 42198359fa1b..0c6048247a34 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -198,7 +198,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) goto invalid; - else if (fuse_dentry_time(entry) < get_jiffies_64()) { + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + (flags & LOOKUP_REVAL)) { int err; struct fuse_entry_out outarg; struct fuse_req *req; @@ -814,13 +815,6 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) -{ - return fuse_rename_common(olddir, oldent, newdir, newent, 0, - FUSE_RENAME, sizeof(struct fuse_rename_in)); -} - static int fuse_rename2(struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) @@ -831,17 +825,30 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent, if (flags & ~(RENAME_NOREPLACE | 
RENAME_EXCHANGE)) return -EINVAL; - if (fc->no_rename2 || fc->minor < 23) - return -EINVAL; + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; - err = fuse_rename_common(olddir, oldent, newdir, newent, flags, - FUSE_RENAME2, sizeof(struct fuse_rename2_in)); - if (err == -ENOSYS) { - fc->no_rename2 = 1; - err = -EINVAL; + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); } + return err; +} +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename2(olddir, oldent, newdir, newent, 0); } static int fuse_link(struct dentry *entry, struct inode *newdir, @@ -985,7 +992,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, int err; bool r; - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { r = true; err = fuse_do_getattr(inode, stat, file); } else { @@ -1171,7 +1178,7 @@ static int fuse_permission(struct inode *inode, int mask) ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6e16dad13e9b..40ac2628ddcf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1687,7 +1687,7 @@ static int fuse_writepage_locked(struct page *page) error = -EIO; req->ff = fuse_write_file_get(fc, fi); if (!req->ff) - goto err_free; + goto err_nofile; fuse_write_fill(req, req->ff, page_offset(page), 0); @@ -1715,6 +1715,8 @@ static int fuse_writepage_locked(struct page *page) return 0; +err_nofile: + __free_page(tmp_page); err_free: 
fuse_request_free(req); err: @@ -1955,8 +1957,8 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kzalloc(sizeof(struct page *) * - FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) goto out; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 754dcf23de8a..8474028d7848 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -478,6 +478,17 @@ static const match_table_t tokens = { {OPT_ERR, NULL} }; +static int fuse_match_uint(substring_t *s, unsigned int *res) +{ + int err = -ENOMEM; + char *buf = match_strdup(s); + if (buf) { + err = kstrtouint(buf, 10, res); + kfree(buf); + } + return err; +} + static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) { char *p; @@ -488,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) while ((p = strsep(&opt, ",")) != NULL) { int token; int value; + unsigned uv; substring_t args[MAX_OPT_ARGS]; if (!*p) continue; @@ -511,18 +523,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_USER_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), value); + d->user_id = make_kuid(current_user_ns(), uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; break; case OPT_GROUP_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), value); + d->group_id = make_kgid(current_user_ns(), uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; @@ -1006,7 +1018,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); - if (!parse_fuse_opt((char *) data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev)) goto err; if (is_bdev) { diff --git 
a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 38cfcf5f6fce..6f0f590cc5a3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1588,9 +1588,12 @@ int jbd2_journal_stop(handle_t *handle) * to perform a synchronous write. We do this to detect the * case where a single process is doing a stream of sync * writes. No point in waiting for joiners in that case. + * + * Setting max_batch_time to 0 disables this completely. */ pid = current->pid; - if (handle->h_sync && journal->j_last_sync_writer != pid) { + if (handle->h_sync && journal->j_last_sync_writer != pid && + journal->j_max_batch_time) { u64 commit_time, trans_time; journal->j_last_sync_writer = pid; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e3d37f607f97..d895b4b7b661 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -39,6 +39,19 @@ struct kernfs_open_node { struct list_head files; /* goes through kernfs_open_file.list */ }; +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for NULL. + */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; @@ -783,24 +796,25 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) return DEFAULT_POLLMASK|POLLERR|POLLPRI; } -/** - * kernfs_notify - notify a kernfs file - * @kn: file to notify - * - * Notify @kn such that poll(2) on @kn wakes up. 
- */ -void kernfs_notify(struct kernfs_node *kn) +static void kernfs_notify_workfn(struct work_struct *work) { - struct kernfs_root *root = kernfs_root(kn); + struct kernfs_node *kn; struct kernfs_open_node *on; struct kernfs_super_info *info; - unsigned long flags; - - if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); /* kick poll */ - spin_lock_irqsave(&kernfs_open_node_lock, flags); + spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) { @@ -808,12 +822,12 @@ void kernfs_notify(struct kernfs_node *kn) wake_up_interruptible(&on->poll); } - spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + spin_unlock_irq(&kernfs_open_node_lock); /* kick fsnotify */ mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct inode *inode; struct dentry *dentry; @@ -833,6 +847,33 @@ void kernfs_notify(struct kernfs_node *kn) } mutex_unlock(&kernfs_mutex); + kernfs_put(kn); + goto repeat; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. May be called from any + * context. 
+ */ +void kernfs_notify(struct kernfs_node *kn) +{ + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); + unsigned long flags; + + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); + } + spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d171b98a6cdd..f973ae9b05f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb) kernfs_put(root_kn); } +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so the superblock won't be destroyed in subsequent + * operations. This can be used to block ->kill_sb() which may be useful + * for kernfs users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated with this kernfs_root, or + * ERR_PTR(-EINVAL) if the superblock is being freed. 
+ */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", diff --git a/fs/mbcache.c b/fs/mbcache.c index bf166e388f0d..187477ded6b3 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -73,6 +73,7 @@ #include <linux/mbcache.h> #include <linux/init.h> #include <linux/blockgroup_lock.h> +#include <linux/log2.h> #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -93,7 +94,7 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) -#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) #define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c496f8a74639..9927913c97c2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -147,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping) return ret; } +static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; + nfsi->cache_validity |= flags; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); +} + /* * Invalidate the local caches */ @@ -162,17 +173,16 @@ static void nfs_zap_caches_locked(struct inode *inode) memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfs_fscache_invalidate(inode); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | 
NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_REVAL_PAGECACHE); } else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_REVAL_PAGECACHE); nfs_zap_label_cache_locked(nfsi); } @@ -187,8 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; - nfs_fscache_invalidate(inode); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); spin_unlock(&inode->i_lock); } } @@ -209,7 +218,7 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_invalidate_atime); @@ -369,7 +378,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. 
*/ @@ -415,36 +424,36 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) 
inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -550,6 +559,9 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) spin_lock(&inode->i_lock); i_size_write(inode, offset); + /* Optimisation */ + if (offset == 0) + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); @@ -578,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL); spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { @@ -1101,7 +1114,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr && inode->i_version == fattr->pre_change_attr) { inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ @@ -1117,7 +1130,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) @@ -1128,9 +1141,6 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr ret |= NFS_INO_INVALID_ATTR; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); - return ret; } @@ -1189,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= 
NFS_INO_INVALID_ATIME; if (invalid != 0) - nfsi->cache_validity |= invalid; + nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -1402,13 +1412,11 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) { - nfsi->cache_validity |= NFS_INO_INVALID_DATA; - nfs_fscache_invalidate(inode); - } + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, invalid); if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; return nfs_refresh_inode_locked(inode, fattr); @@ -1601,6 +1609,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((nfsi->npages == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid &= ~NFS_INO_REVAL_PAGECACHE; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1702,10 +1711,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid &= ~NFS_INO_INVALID_DATA; if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || (save_cache_validity & NFS_INO_REVAL_FORCED)) - nfsi->cache_validity |= invalid; - - if (invalid & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); + nfs_set_cache_invalid(inode, invalid); return 0; out_err: diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index f63cb87cd730..ba2affa51941 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -230,7 +230,7 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); +struct rpc_clnt 
*nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int nfs4_replace_transport(struct nfs_server *server, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3d5dbf80d46a..3d83cb1fdc70 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -139,16 +139,22 @@ static size_t nfs_parse_server_name(char *string, size_t len, * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * - * Return the pseudoflavor of the first security mechanism in - * "flavors" that is locally supported. Return RPC_AUTH_UNIX if - * no matching flavor is found in the array. The "flavors" array + * Return an rpc client that uses the first security mechanism in + * "flavors" that is locally supported. The "flavors" array * is searched in the order returned from the server, per RFC 3530 - * recommendation. + * recommendation and each flavor is checked for membership in the + * sec= mount option list if it exists. + * + * Return -EPERM if no matching flavor is found in the array. + * + * Please call rpc_shutdown_client() when you are done with this rpc client. 
+ * */ -static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, +static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, + struct nfs_server *server, struct nfs4_secinfo_flavors *flavors) { - rpc_authflavor_t pseudoflavor; + rpc_authflavor_t pflavor; struct nfs4_secinfo4 *secinfo; unsigned int i; @@ -159,62 +165,73 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, case RPC_AUTH_NULL: case RPC_AUTH_UNIX: case RPC_AUTH_GSS: - pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + pflavor = rpcauth_get_pseudoflavor(secinfo->flavor, &secinfo->flavor_info); - /* make sure pseudoflavor matches sec= mount opt */ - if (pseudoflavor != RPC_AUTH_MAXFLAVOR && - nfs_auth_info_match(&server->auth_info, - pseudoflavor)) - return pseudoflavor; - break; + /* does the pseudoflavor match a sec= mount opt? */ + if (pflavor != RPC_AUTH_MAXFLAVOR && + nfs_auth_info_match(&server->auth_info, pflavor)) { + struct rpc_clnt *new; + struct rpc_cred *cred; + + /* Cloning creates an rpc_auth for the flavor */ + new = rpc_clone_client_set_auth(clnt, pflavor); + if (IS_ERR(new)) + continue; + /** + * Check that the user actually can use the + * flavor. This is mostly for RPC_AUTH_GSS + * where cr_init obtains a gss context + */ + cred = rpcauth_lookupcred(new->cl_auth, 0); + if (IS_ERR(cred)) { + rpc_shutdown_client(new); + continue; + } + put_rpccred(cred); + return new; + } } } - - /* if there were any sec= options then nothing matched */ - if (server->auth_info.flavor_len > 0) - return -EPERM; - - return RPC_AUTH_UNIX; + return ERR_PTR(-EPERM); } -static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) +/** + * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup, + * return an rpc_clnt that uses the best available security flavor with + * respect to the secinfo flavor list and the sec= mount options. 
+ * + * @clnt: RPC client to clone + * @inode: directory inode + * @name: lookup name + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + */ +struct rpc_clnt * +nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, + struct qstr *name) { struct page *page; struct nfs4_secinfo_flavors *flavors; - rpc_authflavor_t flavor; + struct rpc_clnt *new; int err; page = alloc_page(GFP_KERNEL); if (!page) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + flavors = page_address(page); err = nfs4_proc_secinfo(inode, name, flavors); if (err < 0) { - flavor = err; + new = ERR_PTR(err); goto out; } - flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors); + new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors); out: put_page(page); - return flavor; -} - -/* - * Please call rpc_shutdown_client() when you are done with this client. - */ -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, - struct qstr *name) -{ - rpc_authflavor_t flavor; - - flavor = nfs4_negotiate_security(inode, name); - if ((int)flavor < 0) - return ERR_PTR((int)flavor); - - return rpc_clone_client_set_auth(clnt, flavor); + return new; } static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, @@ -397,11 +414,6 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, if (client->cl_auth->au_flavor != flavor) flavor = client->cl_auth->au_flavor; - else { - rpc_authflavor_t new = nfs4_negotiate_security(dir, name); - if ((int)new >= 0) - flavor = new; - } mnt = nfs_do_submount(dentry, fh, fattr, flavor); out: rpc_shutdown_client(client); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 285ad5334018..4bf3d97cc5a0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3247,7 +3247,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, err = -EPERM; if (client != *clnt) goto out; - client = nfs4_create_sec_client(client, dir, name); + client = 
nfs4_negotiate_security(client, dir, name); if (IS_ERR(client)) return PTR_ERR(client); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3ee5af4e738e..98ff061ccaf3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -934,12 +934,14 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) if (nfs_have_delegated_attributes(inode)) goto out; - if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) return false; smp_rmb(); if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) return false; out: + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return false; return PageUptodate(page) != 0; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6851b003f2a4..8f029db5d271 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -617,15 +617,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (create->cr_type) { case NF4LNK: - /* ugh! we have to null-terminate the linktext, or - * vfs_symlink() will choke. it is always safe to - * null-terminate by brute force, since at worst we - * will overwrite the first byte of the create namelen - * in the XDR buffer, which has already been extracted - * during XDR decode. 
- */ - create->cr_linkname[create->cr_linklen] = 0; - status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, create->cr_linkname, create->cr_linklen, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 83baf2bfe9e9..b56b1cc02718 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -600,7 +600,18 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create READ_BUF(4); create->cr_linklen = be32_to_cpup(p++); READ_BUF(create->cr_linklen); - SAVEMEM(create->cr_linkname, create->cr_linklen); + /* + * The VFS will want a null-terminated string, and + * null-terminating in place isn't safe since this might + * end on a page boundary: + */ + create->cr_linkname = + kmalloc(create->cr_linklen + 1, GFP_KERNEL); + if (!create->cr_linkname) + return nfserr_jukebox; + memcpy(create->cr_linkname, p, create->cr_linklen); + create->cr_linkname[create->cr_linklen] = '\0'; + defer_free(argp, kfree, create->cr_linkname); break; case NF4BLK: case NF4CHR: @@ -2630,7 +2641,7 @@ nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { __be32 *p; - p = xdr_reserve_space(xdr, 6); + p = xdr_reserve_space(xdr, 20); if (!p) return NULL; *p++ = htonl(2); @@ -3267,7 +3278,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd wire_count = htonl(maxcount); write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); - xdr_truncate_encode(xdr, length_offset + 4 + maxcount); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); if (maxcount & 3) write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero, 4 - (maxcount&3)); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index a106b3f2b22a..fae17c640df3 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -331,6 +331,7 @@ struct dlm_lock_resource u16 state; char lvb[DLM_LVB_LEN]; unsigned int inflight_locks; + unsigned int inflight_assert_workers; unsigned long 
refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; @@ -910,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3087a21d32f9..82abf0cc9a12 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; res->inflight_locks = 0; + res->inflight_assert_workers = 0; res->dlm = dlm; @@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, wake_up(&res->wq); } +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + res->inflight_assert_workers++; + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + BUG_ON(res->inflight_assert_workers == 0); + res->inflight_assert_workers--; + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + 
spin_lock(&res->spinlock); + __dlm_lockres_drop_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -1603,7 +1641,8 @@ send_response: mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); - } + } else + dlm_lockres_grab_inflight_worker(dlm, res); } else { if (res) dlm_lockres_put(res); @@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) dlm_lockres_release_ast(dlm, res); put: + dlm_lockres_drop_inflight_worker(dlm, res); + dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -3088,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it so that only one mle will be found */ __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); - ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; - mlog(0, "%s:%.*s: master=%u, newmaster=%u, " - "telling master to get ref for cleared out mle " - "during migration\n", dlm->name, namelen, name, - master, new_master); + if (tmp->type == DLM_MLE_MASTER) { + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref " + "for cleared out mle during " + "migration\n", dlm->name, + namelen, name, master, + new_master); + } } spin_unlock(&tmp->spinlock); } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 5de019437ea5..45067faf5695 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, mlog_errno(-ENOMEM); /* retry!? */ BUG(); - } + } else + __dlm_lockres_grab_inflight_worker(dlm, res); } else /* put.. 
incase we are not the master */ dlm_lockres_put(res); spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 9db869de829d..69aac6f088ad 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * refs on it. */ unused = __dlm_lockres_unused(lockres); if (!unused || - (lockres->state & DLM_LOCK_RES_MIGRATING)) { + (lockres->state & DLM_LOCK_RES_MIGRATING) || + (lockres->inflight_assert_workers != 0)) { mlog(0, "%s: res %.*s is in use or being remastered, " - "used %d, state %d\n", dlm->name, - lockres->lockname.len, lockres->lockname.name, - !unused, lockres->state); - list_move_tail(&dlm->purge_list, &lockres->purge); + "used %d, state %d, assert master workers %u\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, + !unused, lockres->state, + lockres->inflight_assert_workers); + list_move_tail(&lockres->purge, &dlm->purge_list); spin_unlock(&lockres->spinlock); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 5698b52cf5c9..2e3c9dbab68c 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR + ) { /* must clear the actions because this unlock * is about to be retried. cannot free or do * any list manipulation. */ @@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, res->lockname.name, status==DLM_RECOVERING?"recovering": (status==DLM_MIGRATING?"migrating": - "forward")); + (status == DLM_FORWARD ? 
"forward" : + "nolockmanager"))); actions = 0; } if (flags & LKM_CANCEL) @@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * updated state to the recovery master. this thread * just needs to finish out the operation and call * the unlockast. */ - ret = DLM_NORMAL; + if (dlm_is_node_dead(dlm, owner)) + ret = DLM_NORMAL; + else + ret = DLM_NOLOCKMGR; } else { /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); @@ -638,7 +644,9 @@ retry: if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR) { + /* We want to go away for a tiny bit to allow recovery * / migration to complete on this resource. I don't * know of any wait queue we could sleep on as this @@ -650,7 +658,7 @@ retry: msleep(50); mlog(0, "retrying unlock due to pending recovery/" - "migration/in-progress\n"); + "migration/in-progress/reconnect\n"); goto retry; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2060fc398445..8add6f1030d7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) return inode; } +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, + struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); +} + static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir, sigset_t oldset; int did_block_signals = 0; struct posix_acl *default_acl = NULL, *acl = NULL; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, 
dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, &lookup); @@ -469,6 +487,9 @@ leave: * ocfs2_delete_inode will mutex_lock again. */ if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -991,6 +1012,65 @@ leave: return status; } +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, + u64 src_inode_no, u64 dest_inode_no) +{ + int ret = 0, i = 0; + u64 parent_inode_no = 0; + u64 child_inode_no = src_inode_no; + struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 + while (1) { + child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); + if (IS_ERR(child_inode)) { + ret = PTR_ERR(child_inode); + break; + } + + ret = ocfs2_inode_lock(child_inode, NULL, 0); + if (ret < 0) { + iput(child_inode); + if (ret != -ENOENT) + mlog_errno(ret); + break; + } + + ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, + &parent_inode_no); + ocfs2_inode_unlock(child_inode, 0); + iput(child_inode); + if (ret < 0) { + ret = -ENOENT; + break; + } + + if (parent_inode_no == dest_inode_no) { + ret = 1; + break; + } + + if (parent_inode_no == osb->root_inode->i_ino) { + ret = 0; + break; + } + + child_inode_no = parent_inode_no; + + if (++i >= MAX_LOOKUP_TIMES) { + mlog(ML_NOTICE, "max lookup times reached, filesystem " + "may have nested directories, " + "src inode: %llu, dest inode: %llu.\n", + (unsigned long long)src_inode_no, + (unsigned long long)dest_inode_no); + ret = 0; + break; + } + } + + return ret; +} + /* * The only place this should be used is rename! * if they have the same id, then the 1st one is the only one locked. 
@@ -1002,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, struct inode *inode2) { int status; + int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); struct buffer_head **tmpbh; @@ -1015,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, if (*bh2) *bh2 = NULL; - /* we always want to lock the one with the lower lockid first. */ + /* we always want to lock the one with the lower lockid first. + * and if they are nested, we lock ancestor first */ if (oi1->ip_blkno != oi2->ip_blkno) { - if (oi1->ip_blkno < oi2->ip_blkno) { + inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, + oi1->ip_blkno); + if (inode1_is_ancestor < 0) { + status = inode1_is_ancestor; + goto bail; + } + + inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, + oi2->ip_blkno); + if (inode2_is_ancestor < 0) { + status = inode2_is_ancestor; + goto bail; + } + + if ((inode1_is_ancestor == 1) || + (oi1->ip_blkno < oi2->ip_blkno && + inode2_is_ancestor == 0)) { /* switch id1 and id2 around */ tmpbh = bh2; bh2 = bh1; @@ -1098,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir, struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; struct ocfs2_dir_lookup_result target_insert = { NULL, }; + bool should_add_orphan = false; /* At some point it might be nice to break this function up a * bit. 
*/ @@ -1134,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } rename_lock = 1; + + /* here we cannot guarantee the inodes haven't just been + * changed, so check if they are nested again */ + status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, + old_inode->i_ino); + if (status < 0) { + mlog_errno(status); + goto bail; + } else if (status == 1) { + status = -EPERM; + trace_ocfs2_rename_not_permitted( + (unsigned long long)old_inode->i_ino, + (unsigned long long)new_dir->i_ino); + goto bail; + } } /* if old and new are the same, this'll just do one lock. */ @@ -1304,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } + should_add_orphan = true; } } else { BUG_ON(new_dentry->d_parent->d_inode != new_dir); @@ -1348,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(new_inode->i_mode) || - (ocfs2_read_links_count(newfe) == 1)) { - status = ocfs2_orphan_add(osb, handle, new_inode, - newfe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - /* change the dirent to point to the correct inode */ status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, old_inode); @@ -1373,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir, else ocfs2_add_links_count(newfe, -1); ocfs2_journal_dirty(handle, newfe_bh); + if (should_add_orphan) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, @@ -1642,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); 
@@ -1830,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, &lookup); @@ -1864,6 +1980,9 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 1b60c62aa9d6..6cb019b7c6a8 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename, __entry->new_len, __get_str(new_name)) ); +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); + TRACE_EVENT(ocfs2_rename_target_exists, TP_PROTO(int new_len, const char *new_name), TP_ARGS(new_len, new_name), diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 714e53b9cc66..636aab69ead5 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4288,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + error = ocfs2_rw_lock(inode, 1); + if (error) { + mlog_errno(error); + goto out; + } + error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { mlog_errno(error); + ocfs2_rw_unlock(inode, 1); goto out; } @@ -4302,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); brelse(old_bh); if (error) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c7a89cea5c5d..ddb662b32447 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); + ocfs2_truncate_log_shutdown(osb); + /* This will disable recovery and flush any recovery work. 
*/ ocfs2_recovery_exit(osb); - /* - * During dismount, when it recovers another node it will call - * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. - */ - ocfs2_truncate_log_shutdown(osb); - ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9d231e9e5f0e..bf2d03f8fd3e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_possible_cpus(); - char *buf; - struct seq_file *m; - int res; + size_t size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9cd5f63715c0..7f30bdc57d13 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -702,6 +702,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct dquot *dquot; unsigned long freed = 0; + spin_lock(&dq_list_lock); head = free_dquots.prev; while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); @@ -713,6 +714,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed++; head = free_dquots.prev; } + spin_unlock(&dq_list_lock); return freed; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 1d641bb108d2..3857b720cb1b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,8 +8,10 @@ #include <linux/fs.h> #include <linux/export.h> 
#include <linux/seq_file.h> +#include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> +#include <linux/mm.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } @@ -135,9 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? 
-ENOMEM : -EAGAIN; } @@ -192,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } @@ -232,9 +244,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; m->version = 0; @@ -350,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kfree(m->buf); + kvfree(m->buf); kfree(m); return 0; } @@ -605,13 +617,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = kmalloc(size, GFP_KERNEL); + char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kfree(buf); + kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; |