diff options
Diffstat (limited to 'fs')
240 files changed, 7556 insertions, 6222 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index fc06fd27065e..dd6f7ee1e312 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", page, (unsigned long)filp->private_data); + /* Update file times before taking page lock */ + file_update_time(filp); + v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ v9fs_fscache_wait_on_page_write(inode, page); diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 6e0be43ef6ef..a32246b8359e 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -10,30 +10,6 @@ #include <linux/slab.h> #include "affs.h" -/* This is, of course, shamelessly stolen from fs/minix */ - -static const int nibblemap[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 }; - -static u32 -affs_count_free_bits(u32 blocksize, const void *data) -{ - const u32 *map; - u32 free; - u32 tmp; - - map = data; - free = 0; - for (blocksize /= 4; blocksize > 0; blocksize--) { - tmp = *map++; - while (tmp) { - free += nibblemap[tmp & 0xf]; - tmp >>= 4; - } - } - - return free; -} - u32 affs_count_free_blocks(struct super_block *sb) { @@ -317,7 +293,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) goto out; } pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key); - bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4); + bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); /* Don't try read the extension if this is the last block, * but we also need the right bm pointer below @@ -367,7 +343,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) /* recalculate bitmap count for last block */ bm--; - bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4); + bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); out: affs_brelse(bh); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 1feb68ecef95..842d00048a65 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -94,25 +94,21 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev, { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); struct list_head *next; - struct dentry *p, *q; + struct dentry *q; spin_lock(&sbi->lookup_lock); + spin_lock(&root->d_lock); - if (prev == NULL) { - spin_lock(&root->d_lock); + if (prev) + next = prev->d_u.d_child.next; + else { prev = dget_dlock(root); next = prev->d_subdirs.next; - p = prev; - goto start; } - p = prev; - spin_lock(&p->d_lock); -again: - next = p->d_u.d_child.next; -start: +cont: if (next == &root->d_subdirs) { - spin_unlock(&p->d_lock); + spin_unlock(&root->d_lock); spin_unlock(&sbi->lookup_lock); dput(prev); return NULL; @@ -121,16 +117,15 @@ start: q = list_entry(next, struct dentry, d_u.d_child); spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); - /* Negative dentry - try next */ - if (!simple_positive(q)) { - spin_unlock(&p->d_lock); - lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_); - p = q; - goto again; + /* Already gone or negative dentry (under construction) - try next */ + if (q->d_count == 0 || !simple_positive(q)) { + spin_unlock(&q->d_lock); + next = q->d_u.d_child.next; + goto cont; } dget_dlock(q); spin_unlock(&q->d_lock); - spin_unlock(&p->d_lock); + spin_unlock(&root->d_lock); spin_unlock(&sbi->lookup_lock); dput(prev); @@ -404,11 +399,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, DPRINTK("checking mountpoint %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); - /* Path walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 2; - if (dentry->d_count > ino_count) - goto next; - /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) goto next; @@ -1312,7 +1312,7 @@ EXPORT_SYMBOL(bio_copy_kern); * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. - * But other code (eg, pdflush) could clean the pages if they are mapped + * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index adb1cd7ceb9b..4bab807227ad 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3342,10 +3342,22 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); + +#ifdef CONFIG_PRINTK +__printf(2, 3) void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); +#else +static inline __printf(2, 3) +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} +#endif + +__printf(5, 6) void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); + void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); @@ -3386,6 +3398,7 @@ do { \ (errno), fmt, ##args); \ } while (0) +__printf(5, 6) void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 502b20c56e84..62e0cafd6e25 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1114,7 +1114,7 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, spin_unlock(&root->fs_info->delalloc_lock); btrfs_panic(root->fs_info, -EOVERFLOW, "Can't clear %lu bytes from " - " dirty_mdatadata_bytes (%lu)", + " dirty_mdatadata_bytes (%llu)", buf->len, root->fs_info->dirty_metadata_bytes); } @@ -1614,8 +1614,6 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && mutex_trylock(&root->fs_info->cleaner_mutex)) { btrfs_run_delayed_iputs(root); @@ -1647,7 +1645,6 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; delay = HZ * 30; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9aa01ec2138d..5caf285c6e4d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t err = 0; size_t count, ocount; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); @@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 48bdfd2591c2..6e8f416773d4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -324,7 +324,8 @@ static noinline int add_async_extent(struct async_cow *cow, * If this code finds it can't get good compression, it puts an * entry onto the work queue to write the uncompressed bytes. This * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that pdflush sent them down. + * are written in the same order that the flusher thread sent them + * down. */ static noinline int compress_file_range(struct inode *inode, struct page *locked_page, @@ -6629,6 +6630,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; + sb_start_pagefault(inode->i_sb); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); @@ -6718,12 +6720,15 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - if (!ret) + if (!ret) { + sb_end_pagefault(inode->i_sb); return VM_FAULT_LOCKED; + } unlock_page(page); out: btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out_noreserve: + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 43f0012016e3..7bb755677a22 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -195,6 +195,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(file); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); ip_oldflags = ip->flags; @@ -209,10 +213,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } - ret = mnt_want_write_file(file); - if (ret) - goto out_unlock; - if (flags & FS_SYNC_FL) ip->flags |= BTRFS_INODE_SYNC; else @@ -275,9 +275,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode->i_flags = i_oldflags; } - mnt_drop_write_file(file); out_unlock: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 643335a4fe3c..051c7fe551dd 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -596,7 +596,7 @@ void btrfs_start_ordered_extent(struct inode *inode, /* * pages in the range can be dirty, clean or writeback. We * start IO on any dirty ones so the wait doesn't stall waiting - * for pdflush to find them + * for the flusher thread to find them */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->i_mapping, start, end); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c5dbd9149679..4da08652004d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1241,7 +1241,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (rb_node) { btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " "for start=%llu while inserting into relocation " - "tree\n"); + "tree\n", node->bytenr); kfree(node); return -EEXIST; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fa61ef59cd61..f2eb24c477a3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -100,10 +100,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; } -/* NOTE: - * We move write_super stuff at umount in order to avoid deadlock - * for umount hold all lock. - */ static void save_error_info(struct btrfs_fs_info *fs_info) { __save_error_info(fs_info); @@ -125,6 +121,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } } +#ifdef CONFIG_PRINTK /* * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. @@ -167,7 +164,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, va_end(args); } -const char *logtypes[] = { +static const char * const logtypes[] = { "emergency", "alert", "critical", @@ -185,21 +182,49 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) struct va_format vaf; va_list args; const char *type = logtypes[4]; + int kern_level; va_start(args, fmt); - if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { - memcpy(lvl, fmt, 3); - lvl[3] = '\0'; - fmt += 3; - type = logtypes[fmt[1] - '0']; + kern_level = printk_get_level(fmt); + if (kern_level) { + size_t size = printk_skip_level(fmt) - fmt; + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + fmt += size; + type = logtypes[kern_level - '0']; } else *lvl = '\0'; vaf.fmt = fmt; vaf.va = &args; + printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); + + va_end(args); +} + +#else + +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + /* Don't go through full error handling during mount */ + if (sb->s_flags & MS_BORN) { + save_error_info(fs_info); + btrfs_handle_error(fs_info); + } } +#endif /* * We only mark the transaction aborted and then set the file system read-only. diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7ac7cdcc294e..17be3dedacba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -335,6 +335,8 @@ again: if (!h) return ERR_PTR(-ENOMEM); + sb_start_intwrite(root->fs_info->sb); + if (may_wait_transaction(root, type)) wait_current_trans(root); @@ -345,6 +347,7 @@ again: } while (ret == -EBUSY); if (ret < 0) { + sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); return ERR_PTR(ret); } @@ -548,6 +551,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + sb_end_intwrite(root->fs_info->sb); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -1578,6 +1583,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); + sb_end_intwrite(root->fs_info->sb); + trace_btrfs_transaction_commit(root); btrfs_scrub_continue(root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b8708f994e67..e86ae04abe6a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1744,10 +1744,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->fs_devices = root->fs_info->fs_devices; - /* - * we don't want write_supers to jump in here with our device - * half setup - */ mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, diff --git a/fs/buffer.c b/fs/buffer.c index c7062c896d7c..9f6d2e41281d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2306,8 +2306,8 @@ EXPORT_SYMBOL(block_commit_write); * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. * - * Direct callers of this function should call vfs_check_frozen() so that page - * fault does not busyloop until the fs is thawed. + * Direct callers of this function should protect against filesystem freezing + * using sb_start_write() - sb_end_write() functions. */ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) @@ -2318,6 +2318,12 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, loff_t size; int ret; + /* + * Update file times before taking page lock. We may end up failing the + * fault so this update may be superfluous but who really cares... + */ + file_update_time(vma->vm_file); + lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || @@ -2339,18 +2345,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret < 0)) goto out_unlock; - /* - * Freezing in progress? We check after the page is marked dirty and - * with page lock held so if the test here fails, we are sure freezing - * code will wait during syncing until the page fault is done - at that - * point page will be dirty and unlocked so freezing code will write it - * and writeprotect it again. - */ set_page_dirty(page); - if (inode->i_sb->s_frozen != SB_UNFROZEN) { - ret = -EAGAIN; - goto out_unlock; - } wait_on_page_writeback(page); return 0; out_unlock: @@ -2365,12 +2360,9 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, int ret; struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; - /* - * This check is racy but catches the common case. The check in - * __block_page_mkwrite() is reliable. - */ - vfs_check_frozen(sb, SB_FREEZE_WRITE); + sb_start_pagefault(sb); ret = __block_page_mkwrite(vma, vmf, get_block); + sb_end_pagefault(sb); return block_page_mkwrite_return(ret); } EXPORT_SYMBOL(block_page_mkwrite); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index c0353dfac51f..c994691d9445 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -919,7 +919,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) * own time */ path.mnt = cache->mnt; path.dentry = object->backer; - file = dentry_open(&path, O_RDWR, cache->cache_cred); + file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred); if (IS_ERR(file)) { ret = PTR_ERR(file); } else { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b67304e4b80..452e71a1b753 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1184,6 +1184,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size, len; int ret; + /* Update time before taking page lock */ + file_update_time(vma->vm_file); + size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 00894ff9246c..e5b77319c97b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -51,8 +51,7 @@ int ceph_init_dentry(struct dentry *dentry) goto out_unlock; } - if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ - ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) d_set_d_op(dentry, &ceph_dentry_ops); else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) d_set_d_op(dentry, &ceph_snapdir_dentry_ops); @@ -79,7 +78,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) return NULL; spin_lock(&dentry->d_lock); - if (dentry->d_parent) { + if (!IS_ROOT(dentry)) { inode = dentry->d_parent->d_inode; ihold(inode); } @@ -634,44 +633,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, return dentry; } -int ceph_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t mode, - int *opened) -{ - int err; - struct dentry *res = NULL; - - if (!(flags & O_CREAT)) { - if (dentry->d_name.len > NAME_MAX) - return -ENAMETOOLONG; - - err = ceph_init_dentry(dentry); - if (err < 0) - return err; - - return ceph_lookup_open(dir, dentry, file, flags, mode, opened); - } - - if (d_unhashed(dentry)) { - res = ceph_lookup(dir, dentry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - if (res) - dentry = res; - } - - /* We don't deal with positive dentries here */ - if (dentry->d_inode) - return finish_no_open(file, res); - - *opened |= FILE_CREATED; - err = ceph_lookup_open(dir, dentry, file, flags, mode, opened); - dput(res); - - return err; -} - /* * If we do a create but get no trace back from the MDS, follow up with * a lookup (the VFS expects us to link up the provided dentry). @@ -1154,7 +1115,7 @@ static void ceph_d_prune(struct dentry *dentry) dout("ceph_d_prune %p\n", dentry); /* do we have a valid parent? */ - if (!dentry->d_parent || IS_ROOT(dentry)) + if (IS_ROOT(dentry)) return; /* if we are not hashed, we don't affect D_COMPLETE */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 1b81d6c31878..ecebbc09bfc7 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> @@ -106,9 +107,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) } /* - * If the filp already has private_data, that means the file was - * already opened by intent during lookup, and we do nothing. - * * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS). We do, however, need to inform the MDS (asynchronously) @@ -207,24 +205,29 @@ out: /* - * Do a lookup + open with a single request. - * - * If this succeeds, but some subsequent check in the vfs - * may_open() fails, the struct *file gets cleaned up (i.e. - * ceph_release gets called). So fear not! + * Do a lookup + open with a single request. If we get a non-existent + * file or symlink, return 1 so the VFS can retry. */ -int ceph_lookup_open(struct inode *dir, struct dentry *dentry, +int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, int *opened) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; - struct dentry *ret; + struct dentry *dn; int err; - dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", - dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); + dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name, + d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); + + if (dentry->d_name.len > NAME_MAX) + return -ENAMETOOLONG; + + err = ceph_init_dentry(dentry); + if (err < 0) + return err; /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); @@ -241,22 +244,31 @@ int ceph_lookup_open(struct inode *dir, struct dentry *dentry, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); err = ceph_handle_snapdir(req, dentry, err); - if (err) - goto out; - if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (err) - goto out; - err = finish_open(file, req->r_dentry, ceph_open, opened); -out: - ret = ceph_finish_lookup(req, dentry, err); - ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", ret); - if (IS_ERR(ret)) - return PTR_ERR(ret); + if (d_unhashed(dentry)) { + dn = ceph_finish_lookup(req, dentry, err); + if (IS_ERR(dn)) + err = PTR_ERR(dn); + } else { + /* we were given a hashed negative dentry */ + dn = NULL; + } + if (err) + goto out_err; + if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { + /* make vfs retry on splice, ENOENT, or symlink */ + dout("atomic_open finish_no_open on dn %p\n", dn); + err = finish_no_open(file, dn); + } else { + dout("atomic_open finish_open on dn %p\n", dn); + err = finish_open(file, dentry, ceph_open, opened); + } - dput(ret); +out_err: + ceph_mdsc_put_request(req); + dout("atomic_open result=%d\n", err); return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 200bc87eceb1..a5a735422aa7 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -10,6 +10,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/ceph/ceph_features.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> @@ -394,11 +395,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_seq = 0; mutex_init(&s->s_mutex); - ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); - s->s_con.private = s; - s->s_con.ops = &mds_con_ops; - s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; - s->s_con.peer_name.num = cpu_to_le64(mds); + ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; @@ -440,7 +437,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->sessions[mds] = s; atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ - ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); return s; @@ -1472,11 +1470,6 @@ retry: else len += 1 + temp->d_name.len; temp = temp->d_parent; - if (temp == NULL) { - rcu_read_unlock(); - pr_err("build_path corrupt dentry %p\n", dentry); - return ERR_PTR(-EINVAL); - } } rcu_read_unlock(); if (len) @@ -1513,12 +1506,6 @@ retry: if (pos) path[--pos] = '/'; temp = temp->d_parent; - if (temp == NULL) { - rcu_read_unlock(); - pr_err("build_path corrupt dentry\n"); - kfree(path); - return ERR_PTR(-EINVAL); - } } rcu_read_unlock(); if (pos != 0 || read_seqretry(&rename_lock, seq)) { @@ -2531,7 +2518,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; + ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, + CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); /* replay unsafe requests */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e5206fc76562..cbb2f54a3019 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -296,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; int err = 0; - int i; - int num = realm->num_prior_parent_snaps + realm->num_snaps; + u32 num = realm->num_prior_parent_snaps + realm->num_snaps; /* * build parent context, if it hasn't been built. @@ -321,11 +320,11 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" " (unchanged)\n", realm->ino, realm, realm->cached_context, realm->cached_context->seq, - realm->cached_context->num_snaps); + (unsigned int) realm->cached_context->num_snaps); return 0; } @@ -342,6 +341,8 @@ static int build_snap_context(struct ceph_snap_realm *realm) num = 0; snapc->seq = realm->seq; if (parent) { + u32 i; + /* include any of parent's snaps occurring _after_ my parent became my parent */ for (i = 0; i < parent->cached_context->num_snaps; i++) @@ -361,8 +362,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", - realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", + realm->ino, realm, snapc, snapc->seq, + (unsigned int) snapc->num_snaps); if (realm->cached_context) ceph_put_snap_context(realm->cached_context); @@ -402,9 +404,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm) * helper to allocate and decode an array of snapids. free prior * instance, if any. */ -static int dup_array(u64 **dst, __le64 *src, int num) +static int dup_array(u64 **dst, __le64 *src, u32 num) { - int i; + u32 i; kfree(*dst); if (num) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7076109f014d..b982239f38f9 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -18,6 +18,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> #include <linux/ceph/mon_client.h> #include <linux/ceph/auth.h> diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f4d5522cb619..66ebe720e40d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -612,9 +612,9 @@ struct ceph_snap_realm { u64 parent_since; /* snapid when our current parent became so */ u64 *prior_parent_snaps; /* snaps inherited from any parents we */ - int num_prior_parent_snaps; /* had prior to parent_since */ + u32 num_prior_parent_snaps; /* had prior to parent_since */ u64 *snaps; /* snaps specific to this realm */ - int num_snaps; + u32 num_snaps; struct ceph_snap_realm *parent; struct list_head children; /* list of child realms */ @@ -806,9 +806,9 @@ extern int ceph_copy_from_page_vector(struct page **pages, loff_t off, size_t len); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_open(struct inode *inode, struct file *file); -extern int ceph_lookup_open(struct inode *dir, struct dentry *dentry, - struct file *od, unsigned flags, - umode_t mode, int *opened); +extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened); extern int ceph_release(struct inode *inode, struct file *filp); /* dir.c */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 785cb3057c95..2c2ae5be9902 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -457,6 +457,7 @@ start: for (i = 0; i < numattr; i++) kfree(xattrs[i]); kfree(xattrs); + xattrs = NULL; goto start; } err = -EIO; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 497da5ce704c..977dc0e85ccb 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -246,6 +246,16 @@ struct smb_version_operations { bool (*can_echo)(struct TCP_Server_Info *); /* send echo request */ int (*echo)(struct TCP_Server_Info *); + /* create directory */ + int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, + struct cifs_sb_info *); + /* set info on created directory */ + void (*mkdir_setinfo)(struct inode *, const char *, + struct cifs_sb_info *, struct cifs_tcon *, + const unsigned int); + /* remove directory */ + int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *, + struct cifs_sb_info *); }; struct smb_version_values { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index cf7fb185103c..f1bbf8305d3a 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -289,18 +289,15 @@ extern int CIFSSMBUnixSetFileInfo(const unsigned int xid, u16 fid, u32 pid_of_opener); extern int CIFSSMBUnixSetPathInfo(const unsigned int xid, - struct cifs_tcon *tcon, char *file_name, + struct cifs_tcon *tcon, const char *file_name, const struct cifs_unix_set_info_args *args, const struct nls_table *nls_codepage, - int remap_special_chars); + int remap); extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, - const char *newName, - const struct nls_table *nls_codepage, - int remap_special_chars); + const char *name, struct cifs_sb_info *cifs_sb); extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, - const char *name, const struct nls_table *nls_codepage, - int remap_special_chars); + const char *name, struct cifs_sb_info *cifs_sb); extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name, __u16 type, const struct nls_table *nls_codepage, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index cabc7a01f5df..074923ce593d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -948,15 +948,15 @@ DelFileRetry: } int -CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, - const char *dirName, const struct nls_table *nls_codepage, - int remap) +CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) { DELETE_DIRECTORY_REQ *pSMB = NULL; DELETE_DIRECTORY_RSP *pSMBr = NULL; int rc = 0; int bytes_returned; int name_len; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; cFYI(1, "In CIFSSMBRmDir"); RmDirRetry: @@ -966,14 +966,15 @@ RmDirRetry: return rc; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName, - PATH_MAX, nls_codepage, remap); + name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name, + PATH_MAX, cifs_sb->local_nls, + remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve check for buffer overruns BB */ - name_len = strnlen(dirName, PATH_MAX); + name_len = strnlen(name, PATH_MAX); name_len++; /* trailing null */ - strncpy(pSMB->DirName, dirName, name_len); + strncpy(pSMB->DirName, name, name_len); } pSMB->BufferFormat = 0x04; @@ -992,14 +993,15 @@ RmDirRetry: } int -CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, - const char *name, const struct nls_table *nls_codepage, int remap) +CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) { int rc = 0; CREATE_DIRECTORY_REQ *pSMB = NULL; CREATE_DIRECTORY_RSP *pSMBr = NULL; int bytes_returned; int name_len; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; cFYI(1, "In CIFSSMBMkDir"); MkDirRetry: @@ -1010,7 +1012,8 @@ MkDirRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name, - PATH_MAX, nls_codepage, remap); + PATH_MAX, cifs_sb->local_nls, + remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve check for buffer overruns BB */ @@ -5943,7 +5946,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, int CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, - char *fileName, + const char *file_name, const struct cifs_unix_set_info_args *args, const struct nls_table *nls_codepage, int remap) { @@ -5964,14 +5967,14 @@ setPermsRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name, PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fileName, PATH_MAX); + name_len = strnlen(file_name, PATH_MAX); name_len++; /* trailing null */ - strncpy(pSMB->FileName, fileName, name_len); + strncpy(pSMB->FileName, file_name, name_len); } params = 6 + name_len; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 35cb6a374a45..7354877fa3bd 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1219,16 +1219,153 @@ unlink_out: return rc; } +static int +cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode, + const char *full_path, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, const unsigned int xid) +{ + int rc = 0; + struct inode *newinode = NULL; + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, + xid); + else + rc = cifs_get_inode_info(&newinode, full_path, NULL, + inode->i_sb, xid, NULL); + if (rc) + return rc; + + d_instantiate(dentry, newinode); + /* + * setting nlink not necessary except in cases where we failed to get it + * from the server or was set bogus + */ + if ((dentry->d_inode) && (dentry->d_inode->i_nlink < 2)) + set_nlink(dentry->d_inode, 2); + + mode &= ~current_umask(); + /* must turn on setgid bit if parent dir has it */ + if (inode->i_mode & S_ISGID) + mode |= S_ISGID; + + if (tcon->unix_ext) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64)current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = (__u64)inode->i_gid; + else + args.gid = (__u64)current_fsgid(); + } else { + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; + } + CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + } else { + struct TCP_Server_Info *server = tcon->ses->server; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && + (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo) + server->ops->mkdir_setinfo(newinode, full_path, cifs_sb, + tcon, xid); + if (dentry->d_inode) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + dentry->d_inode->i_mode = (mode | S_IFDIR); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + dentry->d_inode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + dentry->d_inode->i_gid = inode->i_gid; + else + dentry->d_inode->i_gid = + current_fsgid(); + } + } + } + return rc; +} + +static int +cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, + const char *full_path, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, const unsigned int xid) +{ + int rc = 0; + u32 oplock = 0; + FILE_UNIX_BASIC_INFO *info = NULL; + struct inode *newinode = NULL; + struct cifs_fattr fattr; + + info = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (info == NULL) { + rc = -ENOMEM; + goto posix_mkdir_out; + } + + mode &= ~current_umask(); + rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, + NULL /* netfid */, info, &oplock, full_path, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == -EOPNOTSUPP) + goto posix_mkdir_out; + else if (rc) { + cFYI(1, "posix mkdir returned 0x%x", rc); + d_drop(dentry); + goto posix_mkdir_out; + } + + if (info->Type == cpu_to_le32(-1)) + /* no return info, go query for it */ + goto posix_mkdir_get_info; + /* + * BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if + * need to set uid/gid. + */ + + cifs_unix_basic_to_fattr(&fattr, info, cifs_sb); + cifs_fill_uniqueid(inode->i_sb, &fattr); + newinode = cifs_iget(inode->i_sb, &fattr); + if (!newinode) + goto posix_mkdir_get_info; + + d_instantiate(dentry, newinode); + +#ifdef CONFIG_CIFS_DEBUG2 + cFYI(1, "instantiated dentry %p %s to inode %p", dentry, + dentry->d_name.name, newinode); + + if (newinode->i_nlink != 2) + cFYI(1, "unexpected number of links %d", newinode->i_nlink); +#endif + +posix_mkdir_out: + kfree(info); + return rc; +posix_mkdir_get_info: + rc = cifs_mkdir_qinfo(inode, dentry, mode, full_path, cifs_sb, tcon, + xid); + goto posix_mkdir_out; +} + int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) { - int rc = 0, tmprc; + int rc = 0; unsigned int xid; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - char *full_path = NULL; - struct inode *newinode = NULL; - struct cifs_fattr fattr; + struct TCP_Server_Info *server; + char *full_path; cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode); @@ -1248,145 +1385,29 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { - u32 oplock = 0; - FILE_UNIX_BASIC_INFO *pInfo = - kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); - if (pInfo == NULL) { - rc = -ENOMEM; + rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb, + tcon, xid); + if (rc != -EOPNOTSUPP) goto mkdir_out; - } - - mode &= ~current_umask(); - rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, - mode, NULL /* netfid */, pInfo, &oplock, - full_path, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == -EOPNOTSUPP) { - kfree(pInfo); - goto mkdir_retry_old; - } else if (rc) { - cFYI(1, "posix mkdir returned 0x%x", rc); - d_drop(direntry); - } else { - if (pInfo->Type == cpu_to_le32(-1)) { - /* no return info, go query for it */ - kfree(pInfo); - goto mkdir_get_info; - } -/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need - to set uid/gid */ - - cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); - cifs_fill_uniqueid(inode->i_sb, &fattr); - newinode = cifs_iget(inode->i_sb, &fattr); - if (!newinode) { - kfree(pInfo); - goto mkdir_get_info; - } - - d_instantiate(direntry, newinode); + } -#ifdef CONFIG_CIFS_DEBUG2 - cFYI(1, "instantiated dentry %p %s to inode %p", - direntry, direntry->d_name.name, newinode); + server = tcon->ses->server; - if (newinode->i_nlink != 2) - cFYI(1, "unexpected number of links %d", - newinode->i_nlink); -#endif - } - kfree(pInfo); + if (!server->ops->mkdir) { + rc = -ENOSYS; goto mkdir_out; } -mkdir_retry_old: + /* BB add setting the equivalent of mode via CreateX w/ACLs */ - rc = CIFSSMBMkDir(xid, tcon, full_path, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); if (rc) { cFYI(1, "cifs_mkdir returned 0x%x", rc); d_drop(direntry); - } else { -mkdir_get_info: - if (tcon->unix_ext) - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); - else - rc = cifs_get_inode_info(&newinode, full_path, NULL, - inode->i_sb, xid, NULL); - - d_instantiate(direntry, newinode); - /* setting nlink not necessary except in cases where we - * failed to get it from the server or was set bogus */ - if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) - set_nlink(direntry->d_inode, 2); - - mode &= ~current_umask(); - /* must turn on setgid bit if parent dir has it */ - if (inode->i_mode & S_ISGID) - mode |= S_ISGID; - - if (tcon->unix_ext) { - struct cifs_unix_set_info_args args = { - .mode = mode, - .ctime = NO_CHANGE_64, - .atime = NO_CHANGE_64, - .mtime = NO_CHANGE_64, - .device = 0, - }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64)current_fsuid(); - if (inode->i_mode & S_ISGID) - args.gid = (__u64)inode->i_gid; - else - args.gid = (__u64)current_fsgid(); - } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; - } - CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - } else { - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && - (mode & S_IWUGO) == 0) { - FILE_BASIC_INFO pInfo; - struct cifsInodeInfo *cifsInode; - u32 dosattrs; - - memset(&pInfo, 0, sizeof(pInfo)); - cifsInode = CIFS_I(newinode); - dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; - pInfo.Attributes = cpu_to_le32(dosattrs); - tmprc = CIFSSMBSetPathInfo(xid, tcon, - full_path, &pInfo, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (tmprc == 0) - cifsInode->cifsAttrs = dosattrs; - } - if (direntry->d_inode) { - if (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_DYNPERM) - direntry->d_inode->i_mode = - (mode | S_IFDIR); - - if (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_SET_UID) { - direntry->d_inode->i_uid = - current_fsuid(); - if (inode->i_mode & S_ISGID) - direntry->d_inode->i_gid = - inode->i_gid; - else - direntry->d_inode->i_gid = - current_fsgid(); - } - } - } + goto mkdir_out; } + + rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon, + xid); mkdir_out: /* * Force revalidate to get parent dir info when needed since cached @@ -1405,7 +1426,8 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) unsigned int xid; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; char *full_path = NULL; struct cifsInodeInfo *cifsInode; @@ -1425,10 +1447,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) rc = PTR_ERR(tlink); goto rmdir_exit; } - pTcon = tlink_tcon(tlink); + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + if (!server->ops->rmdir) { + rc = -ENOSYS; + cifs_put_tlink(tlink); + goto rmdir_exit; + } - rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb); cifs_put_tlink(tlink); if (!rc) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index c40356d24c5c..3129ac74b819 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -586,6 +586,27 @@ cifs_print_stats(struct seq_file *m, struct cifs_tcon *tcon) #endif } +static void +cifs_mkdir_setinfo(struct inode *inode, const char *full_path, + struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, + const unsigned int xid) +{ + FILE_BASIC_INFO info; + struct cifsInodeInfo *cifsInode; + u32 dosattrs; + int rc; + + memset(&info, 0, sizeof(info)); + cifsInode = CIFS_I(inode); + dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; + info.Attributes = cpu_to_le32(dosattrs); + rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) + cifsInode->cifsAttrs = dosattrs; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -620,6 +641,9 @@ struct smb_version_operations smb1_operations = { .get_srv_inum = cifs_get_srv_inum, .build_path_to_root = cifs_build_path_to_root, .echo = CIFSSMBEcho, + .mkdir = CIFSSMBMkDir, + .mkdir_setinfo = cifs_mkdir_setinfo, + .rmdir = CIFSSMBRmDir, }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1ba5c405315c..2aa5cb08c526 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -122,3 +122,42 @@ out: kfree(smb2_data); return rc; } + +int +smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_open_op_close(xid, tcon, cifs_sb, name, + FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0, + CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR); +} + +void +smb2_mkdir_setinfo(struct inode *inode, const char *name, + struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, + const unsigned int xid) +{ + FILE_BASIC_INFO data; + struct cifsInodeInfo *cifs_i; + u32 dosattrs; + int tmprc; + + memset(&data, 0, sizeof(data)); + cifs_i = CIFS_I(inode); + dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; + data.Attributes = cpu_to_le32(dosattrs); + tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name, + FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0, + CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO); + if (tmprc == 0) + cifs_i->cifsAttrs = dosattrs; +} + +int +smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, + 0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE, + NULL, SMB2_OP_DELETE); +} diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 410cf925ea26..826209bf3684 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -318,6 +318,9 @@ struct smb_version_operations smb21_operations = { .query_path_info = smb2_query_path_info, .get_srv_inum = smb2_get_srv_inum, .build_path_to_root = smb2_build_path_to_root, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, }; struct smb_version_values smb21_values = { diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 902bbe2b5ad3..bfaa7b148afd 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -52,6 +52,14 @@ extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, FILE_ALL_INFO *data, bool *adjust_tz); +extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); +extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, + struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, const unsigned int xid); +extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); + /* * SMB2 Worker functions - most of protocol specific implementation details * are contained within these calls. diff --git a/fs/compat.c b/fs/compat.c index 6161255fac45..1bdb350ea5d3 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1155,11 +1155,14 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, struct file *file; int fput_needed; ssize_t ret; + loff_t pos; file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_readv(file, vec, vlen, &file->f_pos); + pos = file->f_pos; + ret = compat_readv(file, vec, vlen, &pos); + file->f_pos = pos; fput_light(file, fput_needed); return ret; } @@ -1221,11 +1224,14 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, struct file *file; int fput_needed; ssize_t ret; + loff_t pos; file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_writev(file, vec, vlen, &file->f_pos); + pos = file->f_pos; + ret = compat_writev(file, vec, vlen, &pos); + file->f_pos = pos; fput_light(file, fput_needed); return ret; } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 989e034f02bd..cfb4b9fed520 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -385,8 +385,6 @@ struct ecryptfs_msg_ctx { struct mutex mux; }; -struct ecryptfs_daemon; - struct ecryptfs_daemon { #define ECRYPTFS_DAEMON_IN_READ 0x00000001 #define ECRYPTFS_DAEMON_IN_POLL 0x00000002 @@ -394,10 +392,7 @@ struct ecryptfs_daemon { #define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008 u32 flags; u32 num_queued_msg_ctx; - struct pid *pid; - uid_t euid; - struct user_namespace *user_ns; - struct task_struct *task; + struct file *file; struct mutex mux; struct list_head msg_ctx_out_queue; wait_queue_head_t wait; @@ -554,6 +549,8 @@ extern struct kmem_cache *ecryptfs_key_tfm_cache; struct inode *ecryptfs_get_inode(struct inode *lower_inode, struct super_block *sb); void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); +int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, size_t *decrypted_name_size, struct dentry *ecryptfs_dentry, @@ -607,13 +604,8 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); -int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, - struct pid *pid); -int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, - struct pid *pid); -int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, - struct user_namespace *user_ns, struct pid *pid, - u32 seq); +int ecryptfs_process_response(struct ecryptfs_daemon *daemon, + struct ecryptfs_message *msg, u32 seq); int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx); int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, @@ -658,8 +650,7 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, struct inode *ecryptfs_inode); struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); -int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, - struct user_namespace *user_ns); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, size_t *length_size); int ecryptfs_write_packet_length(char *dest, size_t size, @@ -671,8 +662,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, u16 msg_flags, struct ecryptfs_daemon *daemon); void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); int -ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, - struct user_namespace *user_ns, struct pid *pid); +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file); int ecryptfs_init_kthread(void); void ecryptfs_destroy_kthread(void); int ecryptfs_privileged_open(struct file **lower_file, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2b17f2f9b121..44ce5c6a541d 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -138,29 +138,50 @@ out: return rc; } -static void ecryptfs_vma_close(struct vm_area_struct *vma) -{ - filemap_write_and_wait(vma->vm_file->f_mapping); -} - -static const struct vm_operations_struct ecryptfs_file_vm_ops = { - .close = ecryptfs_vma_close, - .fault = filemap_fault, -}; +struct kmem_cache *ecryptfs_file_info_cache; -static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int read_or_initialize_metadata(struct dentry *dentry) { + struct inode *inode = dentry->d_inode; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; int rc; - rc = generic_file_mmap(file, vma); + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mount_crypt_stat = &ecryptfs_superblock_to_private( + inode->i_sb)->mount_crypt_stat; + mutex_lock(&crypt_stat->cs_mutex); + + if (crypt_stat->flags & ECRYPTFS_POLICY_APPLIED && + crypt_stat->flags & ECRYPTFS_KEY_VALID) { + rc = 0; + goto out; + } + + rc = ecryptfs_read_metadata(dentry); if (!rc) - vma->vm_ops = &ecryptfs_file_vm_ops; + goto out; + + if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) { + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); + rc = 0; + goto out; + } + if (!(mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) && + !i_size_read(ecryptfs_inode_to_lower(inode))) { + rc = ecryptfs_initialize_file(dentry, inode); + if (!rc) + goto out; + } + + rc = -EIO; +out: + mutex_unlock(&crypt_stat->cs_mutex); return rc; } -struct kmem_cache *ecryptfs_file_info_cache; - /** * ecryptfs_open * @inode: inode speciying file to open @@ -236,32 +257,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file) rc = 0; goto out; } - mutex_lock(&crypt_stat->cs_mutex); - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) - || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { - rc = ecryptfs_read_metadata(ecryptfs_dentry); - if (rc) { - ecryptfs_printk(KERN_DEBUG, - "Valid headers not found\n"); - if (!(mount_crypt_stat->flags - & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { - rc = -EIO; - printk(KERN_WARNING "Either the lower file " - "is not in a valid eCryptfs format, " - "or the key could not be retrieved. " - "Plaintext passthrough mode is not " - "enabled; returning -EIO\n"); - mutex_unlock(&crypt_stat->cs_mutex); - goto out_put; - } - rc = 0; - crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED - | ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - goto out; - } - } - mutex_unlock(&crypt_stat->cs_mutex); + rc = read_or_initialize_metadata(ecryptfs_dentry); + if (rc) + goto out_put; ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, (unsigned long long)i_size_read(inode)); @@ -292,15 +290,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - int rc = 0; - - rc = generic_file_fsync(file, start, end, datasync); - if (rc) - goto out; - rc = vfs_fsync_range(ecryptfs_file_to_lower(file), start, end, - datasync); -out: - return rc; + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } static int ecryptfs_fasync(int fd, struct file *file, int flag) @@ -369,7 +359,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = ecryptfs_file_mmap, + .mmap = generic_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ffa2be57804d..534b129ea676 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -143,6 +143,31 @@ static int ecryptfs_interpose(struct dentry *lower_dentry, return 0; } +static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); + struct dentry *lower_dir_dentry; + int rc; + + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_unlink(lower_dir_inode, lower_dentry); + if (rc) { + printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); + goto out_unlock; + } + fsstack_copy_attr_times(dir, lower_dir_inode); + set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); + inode->i_ctime = dir->i_ctime; + d_drop(dentry); +out_unlock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + return rc; +} + /** * ecryptfs_do_create * @directory_inode: inode of the new file's dentry's parent in ecryptfs @@ -182,8 +207,10 @@ ecryptfs_do_create(struct inode *directory_inode, } inode = __ecryptfs_get_inode(lower_dentry->d_inode, directory_inode->i_sb); - if (IS_ERR(inode)) + if (IS_ERR(inode)) { + vfs_unlink(lower_dir_dentry->d_inode, lower_dentry); goto out_lock; + } fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: @@ -200,8 +227,8 @@ out: * * Returns zero on success */ -static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, - struct inode *ecryptfs_inode) +int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; @@ -264,7 +291,9 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, * that this on disk file is prepared to be an ecryptfs file */ rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); if (rc) { - drop_nlink(ecryptfs_inode); + ecryptfs_do_unlink(directory_inode, ecryptfs_dentry, + ecryptfs_inode); + make_bad_inode(ecryptfs_inode); unlock_new_inode(ecryptfs_inode); iput(ecryptfs_inode); goto out; @@ -318,21 +347,20 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, struct vfsmount *lower_mnt; int rc = 0; - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); - BUG_ON(!lower_dentry->d_count); - dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - ecryptfs_set_dentry_private(dentry, dentry_info); if (!dentry_info) { printk(KERN_ERR "%s: Out of memory whilst attempting " "to allocate ecryptfs_dentry_info struct\n", __func__); dput(lower_dentry); - mntput(lower_mnt); - d_drop(dentry); return -ENOMEM; } + + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); + fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); + BUG_ON(!lower_dentry->d_count); + + ecryptfs_set_dentry_private(dentry, dentry_info); ecryptfs_set_dentry_lower(dentry, lower_dentry); ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); @@ -381,12 +409,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, struct dentry *lower_dir_dentry, *lower_dentry; int rc = 0; - if ((ecryptfs_dentry->d_name.len == 1 - && !strcmp(ecryptfs_dentry->d_name.name, ".")) - || (ecryptfs_dentry->d_name.len == 2 - && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { - goto out_d_drop; - } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, @@ -397,8 +419,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, - encrypted_and_encoded_name); - goto out_d_drop; + ecryptfs_dentry->d_name.name); + goto out; } if (lower_dentry->d_inode) goto interpose; @@ -415,7 +437,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " "filename; rc = [%d]\n", __func__, rc); - goto out_d_drop; + goto out; } mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, @@ -427,14 +449,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, encrypted_and_encoded_name); - goto out_d_drop; + goto out; } interpose: rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, ecryptfs_dir_inode); - goto out; -out_d_drop: - d_drop(ecryptfs_dentry); out: kfree(encrypted_and_encoded_name); return ERR_PTR(rc); @@ -476,27 +495,7 @@ out_lock: static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) { - int rc = 0; - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); - struct dentry *lower_dir_dentry; - - dget(lower_dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_unlink(lower_dir_inode, lower_dentry); - if (rc) { - printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); - goto out_unlock; - } - fsstack_copy_attr_times(dir, lower_dir_inode); - set_nlink(dentry->d_inode, - ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink); - dentry->d_inode->i_ctime = dir->i_ctime; - d_drop(dentry); -out_unlock: - unlock_dir(lower_dir_dentry); - dput(lower_dentry); - return rc; + return ecryptfs_do_unlink(dir, dentry, dentry->d_inode); } static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, @@ -971,12 +970,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) goto out; } - if (S_ISREG(inode->i_mode)) { - rc = filemap_write_and_wait(inode->i_mapping); - if (rc) - goto out; - fsstack_copy_attr_all(inode, lower_inode); - } memcpy(&lower_ia, ia, sizeof(lower_ia)); if (ia->ia_valid & ATTR_FILE) lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1c0b3b6b75c6..2768138eefee 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -279,6 +279,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, char *fnek_src; char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; + u8 cipher_code; *check_ruid = 0; @@ -420,6 +421,18 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, && !fn_cipher_key_bytes_set) mount_crypt_stat->global_default_fn_cipher_key_bytes = mount_crypt_stat->global_default_cipher_key_size; + + cipher_code = ecryptfs_code_for_cipher_string( + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size); + if (!cipher_code) { + ecryptfs_printk(KERN_ERR, + "eCryptfs doesn't support cipher: %s", + mount_crypt_stat->global_default_cipher_name); + rc = -EINVAL; + goto out; + } + mutex_lock(&key_tfm_list_mutex); if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, NULL)) { @@ -540,6 +553,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags } ecryptfs_set_superblock_lower(s, path.dentry->d_sb); + + /** + * Set the POSIX ACL flag based on whether they're enabled in the lower + * mount. Force a read-only eCryptfs mount if the lower mount is ro. + * Allow a ro eCryptfs mount even when the lower mount is rw. + */ + s->s_flags = flags & ~MS_POSIXACL; + s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL); + s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index a750f957b145..b29bb8bfa8d9 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -32,8 +32,8 @@ static struct mutex ecryptfs_msg_ctx_lists_mux; static struct hlist_head *ecryptfs_daemon_hash; struct mutex ecryptfs_daemon_hash_mux; static int ecryptfs_hash_bits; -#define ecryptfs_uid_hash(uid) \ - hash_long((unsigned long)uid, ecryptfs_hash_bits) +#define ecryptfs_current_euid_hash(uid) \ + hash_long((unsigned long)current_euid(), ecryptfs_hash_bits) static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; @@ -105,26 +105,24 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) /** * ecryptfs_find_daemon_by_euid - * @euid: The effective user id which maps to the desired daemon id - * @user_ns: The namespace in which @euid applies * @daemon: If return value is zero, points to the desired daemon pointer * * Must be called with ecryptfs_daemon_hash_mux held. * - * Search the hash list for the given user id. + * Search the hash list for the current effective user id. * * Returns zero if the user id exists in the list; non-zero otherwise. */ -int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, - struct user_namespace *user_ns) +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon) { struct hlist_node *elem; int rc; hlist_for_each_entry(*daemon, elem, - &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)], - euid_chain) { - if ((*daemon)->euid == euid && (*daemon)->user_ns == user_ns) { + &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()], + euid_chain) { + if ((*daemon)->file->f_cred->euid == current_euid() && + (*daemon)->file->f_cred->user_ns == current_user_ns()) { rc = 0; goto out; } @@ -137,9 +135,7 @@ out: /** * ecryptfs_spawn_daemon - Create and initialize a new daemon struct * @daemon: Pointer to set to newly allocated daemon struct - * @euid: Effective user id for the daemon - * @user_ns: The namespace in which @euid applies - * @pid: Process id for the daemon + * @file: File used when opening /dev/ecryptfs * * Must be called ceremoniously while in possession of * ecryptfs_sacred_daemon_hash_mux @@ -147,8 +143,7 @@ out: * Returns zero on success; non-zero otherwise */ int -ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, - struct user_namespace *user_ns, struct pid *pid) +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file) { int rc = 0; @@ -159,16 +154,13 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); goto out; } - (*daemon)->euid = euid; - (*daemon)->user_ns = get_user_ns(user_ns); - (*daemon)->pid = get_pid(pid); - (*daemon)->task = current; + (*daemon)->file = file; mutex_init(&(*daemon)->mux); INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue); init_waitqueue_head(&(*daemon)->wait); (*daemon)->num_queued_msg_ctx = 0; hlist_add_head(&(*daemon)->euid_chain, - &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)]); + &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()]); out: return rc; } @@ -188,9 +180,6 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ) || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) { rc = -EBUSY; - printk(KERN_WARNING "%s: Attempt to destroy daemon with pid " - "[0x%p], but it is in the midst of a read or a poll\n", - __func__, daemon->pid); mutex_unlock(&daemon->mux); goto out; } @@ -203,12 +192,6 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) ecryptfs_msg_ctx_alloc_to_free(msg_ctx); } hlist_del(&daemon->euid_chain); - if (daemon->task) - wake_up_process(daemon->task); - if (daemon->pid) - put_pid(daemon->pid); - if (daemon->user_ns) - put_user_ns(daemon->user_ns); mutex_unlock(&daemon->mux); kzfree(daemon); out: @@ -216,42 +199,9 @@ out: } /** - * ecryptfs_process_quit - * @euid: The user ID owner of the message - * @user_ns: The namespace in which @euid applies - * @pid: The process ID for the userspace program that sent the - * message - * - * Deletes the corresponding daemon for the given euid and pid, if - * it is the registered that is requesting the deletion. Returns zero - * after deleting the desired daemon; non-zero otherwise. - */ -int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - struct ecryptfs_daemon *daemon; - int rc; - - mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, user_ns); - if (rc || !daemon) { - rc = -EINVAL; - printk(KERN_ERR "Received request from user [%d] to " - "unregister unrecognized daemon [0x%p]\n", euid, pid); - goto out_unlock; - } - rc = ecryptfs_exorcise_daemon(daemon); -out_unlock: - mutex_unlock(&ecryptfs_daemon_hash_mux); - return rc; -} - -/** * ecryptfs_process_reponse * @msg: The ecryptfs message received; the caller should sanity check * msg->data_len and free the memory - * @pid: The process ID of the userspace application that sent the - * message * @seq: The sequence number of the message; must match the sequence * number for the existing message context waiting for this * response @@ -270,16 +220,11 @@ out_unlock: * * Returns zero on success; non-zero otherwise */ -int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, - struct user_namespace *user_ns, struct pid *pid, - u32 seq) +int ecryptfs_process_response(struct ecryptfs_daemon *daemon, + struct ecryptfs_message *msg, u32 seq) { - struct ecryptfs_daemon *uninitialized_var(daemon); struct ecryptfs_msg_ctx *msg_ctx; size_t msg_size; - struct nsproxy *nsproxy; - struct user_namespace *tsk_user_ns; - uid_t ctx_euid; int rc; if (msg->index >= ecryptfs_message_buf_len) { @@ -292,51 +237,6 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, } msg_ctx = &ecryptfs_msg_ctx_arr[msg->index]; mutex_lock(&msg_ctx->mux); - mutex_lock(&ecryptfs_daemon_hash_mux); - rcu_read_lock(); - nsproxy = task_nsproxy(msg_ctx->task); - if (nsproxy == NULL) { - rc = -EBADMSG; - printk(KERN_ERR "%s: Receiving process is a zombie. Dropping " - "message.\n", __func__); - rcu_read_unlock(); - mutex_unlock(&ecryptfs_daemon_hash_mux); - goto wake_up; - } - tsk_user_ns = __task_cred(msg_ctx->task)->user_ns; - ctx_euid = task_euid(msg_ctx->task); - rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns); - rcu_read_unlock(); - mutex_unlock(&ecryptfs_daemon_hash_mux); - if (rc) { - rc = -EBADMSG; - printk(KERN_WARNING "%s: User [%d] received a " - "message response from process [0x%p] but does " - "not have a registered daemon\n", __func__, - ctx_euid, pid); - goto wake_up; - } - if (ctx_euid != euid) { - rc = -EBADMSG; - printk(KERN_WARNING "%s: Received message from user " - "[%d]; expected message from user [%d]\n", __func__, - euid, ctx_euid); - goto unlock; - } - if (tsk_user_ns != user_ns) { - rc = -EBADMSG; - printk(KERN_WARNING "%s: Received message from user_ns " - "[0x%p]; expected message from user_ns [0x%p]\n", - __func__, user_ns, tsk_user_ns); - goto unlock; - } - if (daemon->pid != pid) { - rc = -EBADMSG; - printk(KERN_ERR "%s: User [%d] sent a message response " - "from an unrecognized process [0x%p]\n", - __func__, ctx_euid, pid); - goto unlock; - } if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { rc = -EINVAL; printk(KERN_WARNING "%s: Desired context element is not " @@ -359,9 +259,8 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, } memcpy(msg_ctx->msg, msg, msg_size); msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; - rc = 0; -wake_up: wake_up_process(msg_ctx->task); + rc = 0; unlock: mutex_unlock(&msg_ctx->mux); out: @@ -383,14 +282,11 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx) { struct ecryptfs_daemon *daemon; - uid_t euid = current_euid(); int rc; - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + rc = ecryptfs_find_daemon_by_euid(&daemon); if (rc || !daemon) { rc = -ENOTCONN; - printk(KERN_ERR "%s: User [%d] does not have a daemon " - "registered\n", __func__, euid); goto out; } mutex_lock(&ecryptfs_msg_ctx_lists_mux); diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index c0038f6566d4..412e6eda25f8 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -33,7 +33,7 @@ static atomic_t ecryptfs_num_miscdev_opens; /** * ecryptfs_miscdev_poll - * @file: dev file (ignored) + * @file: dev file * @pt: dev poll table (ignored) * * Returns the poll mask @@ -41,20 +41,10 @@ static atomic_t ecryptfs_num_miscdev_opens; static unsigned int ecryptfs_miscdev_poll(struct file *file, poll_table *pt) { - struct ecryptfs_daemon *daemon; + struct ecryptfs_daemon *daemon = file->private_data; unsigned int mask = 0; - uid_t euid = current_euid(); - int rc; - mutex_lock(&ecryptfs_daemon_hash_mux); - /* TODO: Just use file->private_data? */ - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); - if (rc || !daemon) { - mutex_unlock(&ecryptfs_daemon_hash_mux); - return -EINVAL; - } mutex_lock(&daemon->mux); - mutex_unlock(&ecryptfs_daemon_hash_mux); if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { printk(KERN_WARNING "%s: Attempt to poll on zombified " "daemon\n", __func__); @@ -79,7 +69,7 @@ out_unlock_daemon: /** * ecryptfs_miscdev_open * @inode: inode of miscdev handle (ignored) - * @file: file for miscdev handle (ignored) + * @file: file for miscdev handle * * Returns zero on success; non-zero otherwise */ @@ -87,7 +77,6 @@ static int ecryptfs_miscdev_open(struct inode *inode, struct file *file) { struct ecryptfs_daemon *daemon = NULL; - uid_t euid = current_euid(); int rc; mutex_lock(&ecryptfs_daemon_hash_mux); @@ -98,30 +87,20 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) "count; rc = [%d]\n", __func__, rc); goto out_unlock_daemon_list; } - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); - if (rc || !daemon) { - rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(), - task_pid(current)); - if (rc) { - printk(KERN_ERR "%s: Error attempting to spawn daemon; " - "rc = [%d]\n", __func__, rc); - goto out_module_put_unlock_daemon_list; - } - } - mutex_lock(&daemon->mux); - if (daemon->pid != task_pid(current)) { + rc = ecryptfs_find_daemon_by_euid(&daemon); + if (!rc) { rc = -EINVAL; - printk(KERN_ERR "%s: pid [0x%p] has registered with euid [%d], " - "but pid [0x%p] has attempted to open the handle " - "instead\n", __func__, daemon->pid, daemon->euid, - task_pid(current)); - goto out_unlock_daemon; + goto out_unlock_daemon_list; + } + rc = ecryptfs_spawn_daemon(&daemon, file); + if (rc) { + printk(KERN_ERR "%s: Error attempting to spawn daemon; " + "rc = [%d]\n", __func__, rc); + goto out_module_put_unlock_daemon_list; } + mutex_lock(&daemon->mux); if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { rc = -EBUSY; - printk(KERN_ERR "%s: Miscellaneous device handle may only be " - "opened once per daemon; pid [0x%p] already has this " - "handle open\n", __func__, daemon->pid); goto out_unlock_daemon; } daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; @@ -140,7 +119,7 @@ out_unlock_daemon_list: /** * ecryptfs_miscdev_release * @inode: inode of fs/ecryptfs/euid handle (ignored) - * @file: file for fs/ecryptfs/euid handle (ignored) + * @file: file for fs/ecryptfs/euid handle * * This keeps the daemon registered until the daemon sends another * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters. @@ -150,20 +129,18 @@ out_unlock_daemon_list: static int ecryptfs_miscdev_release(struct inode *inode, struct file *file) { - struct ecryptfs_daemon *daemon = NULL; - uid_t euid = current_euid(); + struct ecryptfs_daemon *daemon = file->private_data; int rc; - mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); - if (rc || !daemon) - daemon = file->private_data; mutex_lock(&daemon->mux); BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; atomic_dec(&ecryptfs_num_miscdev_opens); mutex_unlock(&daemon->mux); + + mutex_lock(&ecryptfs_daemon_hash_mux); rc = ecryptfs_exorcise_daemon(daemon); + mutex_unlock(&ecryptfs_daemon_hash_mux); if (rc) { printk(KERN_CRIT "%s: Fatal error whilst attempting to " "shut down daemon; rc = [%d]. Please report this " @@ -171,7 +148,6 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file) BUG(); } module_put(THIS_MODULE); - mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } @@ -248,7 +224,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, /** * ecryptfs_miscdev_read - format and send message from queue - * @file: fs/ecryptfs/euid miscdevfs handle (ignored) + * @file: miscdevfs handle * @buf: User buffer into which to copy the next message on the daemon queue * @count: Amount of space available in @buf * @ppos: Offset in file (ignored) @@ -262,43 +238,27 @@ static ssize_t ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct ecryptfs_daemon *daemon; + struct ecryptfs_daemon *daemon = file->private_data; struct ecryptfs_msg_ctx *msg_ctx; size_t packet_length_size; char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE]; size_t i; size_t total_length; - uid_t euid = current_euid(); int rc; - mutex_lock(&ecryptfs_daemon_hash_mux); - /* TODO: Just use file->private_data? */ - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); - if (rc || !daemon) { - mutex_unlock(&ecryptfs_daemon_hash_mux); - return -EINVAL; - } mutex_lock(&daemon->mux); - if (task_pid(current) != daemon->pid) { - mutex_unlock(&daemon->mux); - mutex_unlock(&ecryptfs_daemon_hash_mux); - return -EPERM; - } if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { rc = 0; - mutex_unlock(&ecryptfs_daemon_hash_mux); printk(KERN_WARNING "%s: Attempt to read from zombified " "daemon\n", __func__); goto out_unlock_daemon; } if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) { rc = 0; - mutex_unlock(&ecryptfs_daemon_hash_mux); goto out_unlock_daemon; } /* This daemon will not go away so long as this flag is set */ daemon->flags |= ECRYPTFS_DAEMON_IN_READ; - mutex_unlock(&ecryptfs_daemon_hash_mux); check_list: if (list_empty(&daemon->msg_ctx_out_queue)) { mutex_unlock(&daemon->mux); @@ -382,16 +342,12 @@ out_unlock_daemon: * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon * @data: Bytes comprising struct ecryptfs_message * @data_size: sizeof(struct ecryptfs_message) + data len - * @euid: Effective user id of miscdevess sending the miscdev response - * @user_ns: The namespace in which @euid applies - * @pid: Miscdevess id of miscdevess sending the miscdev response * @seq: Sequence number for miscdev response packet * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_miscdev_response(char *data, size_t data_size, - uid_t euid, struct user_namespace *user_ns, - struct pid *pid, u32 seq) +static int ecryptfs_miscdev_response(struct ecryptfs_daemon *daemon, char *data, + size_t data_size, u32 seq) { struct ecryptfs_message *msg = (struct ecryptfs_message *)data; int rc; @@ -403,7 +359,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size, rc = -EINVAL; goto out; } - rc = ecryptfs_process_response(msg, euid, user_ns, pid, seq); + rc = ecryptfs_process_response(daemon, msg, seq); if (rc) printk(KERN_ERR "Error processing response message; rc = [%d]\n", rc); @@ -413,7 +369,7 @@ out: /** * ecryptfs_miscdev_write - handle write to daemon miscdev handle - * @file: File for misc dev handle (ignored) + * @file: File for misc dev handle * @buf: Buffer containing user data * @count: Amount of data in @buf * @ppos: Pointer to offset in file (ignored) @@ -428,7 +384,6 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, u32 seq; size_t packet_size, packet_size_length; char *data; - uid_t euid = current_euid(); unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE]; ssize_t rc; @@ -488,10 +443,9 @@ memdup: } memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE); seq = be32_to_cpu(counter_nbo); - rc = ecryptfs_miscdev_response( + rc = ecryptfs_miscdev_response(file->private_data, &data[PKT_LEN_OFFSET + packet_size_length], - packet_size, euid, current_user_ns(), - task_pid(current), seq); + packet_size, seq); if (rc) { printk(KERN_WARNING "%s: Failed to deliver miscdev " "response to requesting operation; rc = [%zd]\n", diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index a46b3a8fee1e..bd1d57f98f74 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -66,18 +66,6 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) { int rc; - /* - * Refuse to write the page out if we are called from reclaim context - * since our writepage() path may potentially allocate memory when - * calling into the lower fs vfs_write() which may in turn invoke - * us again. - */ - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - rc = 0; - goto out; - } - rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting " @@ -498,7 +486,6 @@ static int ecryptfs_write_end(struct file *file, struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc; - int need_unlock_page = 1; ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); @@ -519,26 +506,26 @@ static int ecryptfs_write_end(struct file *file, "zeros in page with index = [0x%.16lx]\n", index); goto out; } - set_page_dirty(page); - unlock_page(page); - need_unlock_page = 0; + rc = ecryptfs_encrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " + "index [0x%.16lx])\n", index); + goto out; + } if (pos + copied > i_size_read(ecryptfs_inode)) { i_size_write(ecryptfs_inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " "[0x%.16llx]\n", (unsigned long long)i_size_read(ecryptfs_inode)); - balance_dirty_pages_ratelimited(mapping); - rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); - if (rc) { - printk(KERN_ERR "Error writing inode size to metadata; " - "rc = [%d]\n", rc); - goto out; - } } - rc = copied; + rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); + if (rc) + printk(KERN_ERR "Error writing inode size to metadata; " + "rc = [%d]\n", rc); + else + rc = copied; out: - if (need_unlock_page) - unlock_page(page); + unlock_page(page); page_cache_release(page); return rc; } diff --git a/fs/exec.c b/fs/exec.c index e95aeeddd25c..574cf4de4ec3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2002,17 +2002,17 @@ static void coredump_finish(struct mm_struct *mm) void set_dumpable(struct mm_struct *mm, int value) { switch (value) { - case 0: + case SUID_DUMPABLE_DISABLED: clear_bit(MMF_DUMPABLE, &mm->flags); smp_wmb(); clear_bit(MMF_DUMP_SECURELY, &mm->flags); break; - case 1: + case SUID_DUMPABLE_ENABLED: set_bit(MMF_DUMPABLE, &mm->flags); smp_wmb(); clear_bit(MMF_DUMP_SECURELY, &mm->flags); break; - case 2: + case SUID_DUMPABLE_SAFE: set_bit(MMF_DUMP_SECURELY, &mm->flags); smp_wmb(); set_bit(MMF_DUMPABLE, &mm->flags); @@ -2025,7 +2025,7 @@ static int __get_dumpable(unsigned long mm_flags) int ret; ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret >= 2) ? 2 : ret; + return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret; } int get_dumpable(struct mm_struct *mm) @@ -2069,25 +2069,18 @@ static void wait_for_dump_helpers(struct file *file) */ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { - struct file *rp, *wp; + struct file *files[2]; struct fdtable *fdt; struct coredump_params *cp = (struct coredump_params *)info->data; struct files_struct *cf = current->files; + int err = create_pipe_files(files, 0); + if (err) + return err; - wp = create_write_pipe(0); - if (IS_ERR(wp)) - return PTR_ERR(wp); - - rp = create_read_pipe(wp, 0); - if (IS_ERR(rp)) { - free_write_pipe(wp); - return PTR_ERR(rp); - } - - cp->file = wp; + cp->file = files[1]; sys_close(0); - fd_install(0, rp); + fd_install(0, files[0]); spin_lock(&cf->file_lock); fdt = files_fdtable(cf); __set_open_fd(0, fdt); @@ -2111,6 +2104,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) int retval = 0; int flag = 0; int ispipe; + bool need_nonrelative = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .signr = signr, @@ -2136,14 +2130,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) if (!cred) goto fail; /* - * We cannot trust fsuid as being the "true" uid of the - * process nor do we know its entire history. We only know it - * was tainted so we dump it as root in mode 2. + * We cannot trust fsuid as being the "true" uid of the process + * nor do we know its entire history. We only know it was tainted + * so we dump it as root in mode 2, and only into a controlled + * environment (pipe handler or fully qualified path). */ - if (__get_dumpable(cprm.mm_flags) == 2) { + if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ + need_nonrelative = true; } retval = coredump_wait(exit_code, &core_state); @@ -2171,15 +2167,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) } if (cprm.limit == 1) { - /* + /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. + * * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use - * cprm.limit of 1 here as a speacial value. Any - * non-1 limit gets set to RLIM_INFINITY below, but - * a limit of 0 skips the dump. This is a consistent - * way to catch recursive crashes. We can still crash - * if the core_pattern binary sets RLIM_CORE = !1 - * but it runs as root, and can do lots of stupid things + * cprm.limit of 1 here as a speacial value, this is a + * consistent way to catch recursive crashes. + * We can still crash if the core_pattern binary sets + * RLIM_CORE = !1, but it runs as root, and can do + * lots of stupid things. + * * Note that we use task_tgid_vnr here to grab the pid * of the process group leader. That way we get the * right pid if a thread in a multi-threaded @@ -2223,6 +2220,14 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) if (cprm.limit < binfmt->min_coredump) goto fail_unlock; + if (need_nonrelative && cn.corename[0] != '/') { + printk(KERN_WARNING "Pid %d(%s) can only dump core "\ + "to fully qualified path!\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Skipping core dump\n"); + goto fail_unlock; + } + cprm.file = filp_open(cn.corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 5badb0c039de..1562c27a2fab 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,15 +37,12 @@ #define EXOFS_DBGMSG2(M...) do {} while (0) -enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; - unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) { - unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); + unsigned pages = min_t(unsigned, expected_pages, + layout->max_io_length / PAGE_SIZE); - /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } @@ -101,7 +98,8 @@ static void _pcol_reset(struct page_collect *pcol) * it might not end here. don't be left with nothing */ if (!pcol->expected_pages) - pcol->expected_pages = MAX_PAGES_KMALLOC; + pcol->expected_pages = + exofs_max_io_pages(&pcol->sbi->layout, ~0); } static int pcol_try_alloc(struct page_collect *pcol) @@ -389,6 +387,8 @@ static int readpage_strip(void *data, struct page *page) size_t len; int ret; + BUG_ON(!PageLocked(page)); + /* FIXME: Just for debugging, will be removed */ if (PageUptodate(page)) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, @@ -572,8 +572,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) if (!pcol->that_locked_page || (pcol->that_locked_page->index != index)) { - struct page *page = find_get_page(pcol->inode->i_mapping, index); + struct page *page; + loff_t i_size = i_size_read(pcol->inode); + + if (offset >= i_size) { + *uptodate = true; + EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index); + return ZERO_PAGE(0); + } + page = find_get_page(pcol->inode->i_mapping, index); if (!page) { page = find_or_create_page(pcol->inode->i_mapping, index, GFP_NOFS); @@ -602,12 +610,13 @@ static void __r4w_put_page(void *priv, struct page *page) { struct page_collect *pcol = priv; - if (pcol->that_locked_page != page) { + if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { EXOFS_DBGMSG("index=0x%lx\n", page->index); page_cache_release(page); return; } - EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); + EXOFS_DBGMSG("that_locked_page index=0x%lx\n", + ZERO_PAGE(0) == page ? -1 : page->index); } static const struct _ore_r4w_op _r4w_op = { diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 24a49d47e935..1585db1aa365 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -837,11 +837,11 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) bio->bi_rw |= REQ_WRITE; } - osd_req_write(or, _ios_obj(ios, dev), per_dev->offset, - bio, per_dev->length); + osd_req_write(or, _ios_obj(ios, cur_comp), + per_dev->offset, bio, per_dev->length); ORE_DBGMSG("write(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(_ios_obj(ios, dev)->id), + _LLU(_ios_obj(ios, cur_comp)->id), _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { @@ -853,20 +853,20 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) (ios->si.unit_off + ios->length > ios->layout->stripe_unit)); - ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), + ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp), per_dev->offset, ios->kern_buff, ios->length); if (unlikely(ret)) goto out; ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(_ios_obj(ios, dev)->id), + _LLU(_ios_obj(ios, cur_comp)->id), _LLU(per_dev->offset), _LLU(ios->length), per_dev->dev); } else { - osd_req_set_attributes(or, _ios_obj(ios, dev)); + osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(_ios_obj(ios, dev)->id), + _LLU(_ios_obj(ios, cur_comp)->id), ios->out_attr_len, dev); } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 433783624d10..dde41a75c7c8 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -400,8 +400,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait) ret = ore_write(ios); if (unlikely(ret)) EXOFS_ERR("%s: ore_write failed.\n", __func__); - else - sb->s_dirt = 0; unlock_super(sb); @@ -412,14 +410,6 @@ out: return ret; } -static void exofs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - exofs_sync_fs(sb, 1); - else - sb->s_dirt = 0; -} - static void _exofs_print_device(const char *msg, const char *dev_path, struct osd_dev *od, u64 pid) { @@ -952,7 +942,6 @@ static const struct super_operations exofs_sops = { .write_inode = exofs_write_inode, .evict_inode = exofs_evict_inode, .put_super = exofs_put_super, - .write_super = exofs_write_super, .sync_fs = exofs_sync_fs, .statfs = exofs_statfs, }; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 1c3613998862..376aa77f3ca7 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -1444,19 +1444,9 @@ ext2_fsblk_t ext2_new_block(struct inode *inode, unsigned long goal, int *errp) #ifdef EXT2FS_DEBUG -static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; - -unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) +unsigned long ext2_count_free(struct buffer_head *map, unsigned int numchars) { - unsigned int i; - unsigned long sum = 0; - - if (!map) - return (0); - for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; - return (sum); + return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars); } #endif /* EXT2FS_DEBUG */ diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index c13eb7b91a11..8f370e012e61 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -644,6 +644,7 @@ unsigned long ext2_count_free_inodes (struct super_block * sb) } brelse(bitmap_bh); printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n", + (unsigned long) percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter), desc_count, bitmap_count); return desc_count; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 264d315f6c47..6363ac66fafa 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -79,6 +79,7 @@ void ext2_evict_inode(struct inode * inode) truncate_inode_pages(&inode->i_data, 0); if (want_delete) { + sb_start_intwrite(inode->i_sb); /* set dtime */ EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); @@ -98,8 +99,10 @@ void ext2_evict_inode(struct inode * inode) if (unlikely(rsv)) kfree(rsv); - if (want_delete) + if (want_delete) { ext2_free_inode(inode); + sb_end_intwrite(inode->i_sb); + } } typedef struct { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9f311d27b16f..af74d9e27b71 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -42,6 +42,8 @@ static void ext2_sync_super(struct super_block *sb, static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); +static int ext2_freeze(struct super_block *sb); +static int ext2_unfreeze(struct super_block *sb); void ext2_error(struct super_block *sb, const char *function, const char *fmt, ...) @@ -305,6 +307,8 @@ static const struct super_operations ext2_sops = { .evict_inode = ext2_evict_inode, .put_super = ext2_put_super, .sync_fs = ext2_sync_fs, + .freeze_fs = ext2_freeze, + .unfreeze_fs = ext2_unfreeze, .statfs = ext2_statfs, .remount_fs = ext2_remount, .show_options = ext2_show_options, @@ -1200,6 +1204,35 @@ static int ext2_sync_fs(struct super_block *sb, int wait) return 0; } +static int ext2_freeze(struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + /* + * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared + * because we have unattached inodes and thus filesystem is not fully + * consistent. + */ + if (atomic_long_read(&sb->s_remove_count)) { + ext2_sync_fs(sb, 1); + return 0; + } + /* Set EXT2_FS_VALID flag */ + spin_lock(&sbi->s_lock); + sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, sbi->s_es, 1); + + return 0; +} + +static int ext2_unfreeze(struct super_block *sb) +{ + /* Just write sb to clear EXT2_VALID_FS flag */ + ext2_write_super(sb); + + return 0; +} void ext2_write_super(struct super_block *sb) { diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 25cd60892116..90d901f0486b 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1813,7 +1813,7 @@ ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) brelse(bitmap_bh); printk("ext3_count_free_blocks: stored = "E3FSBLK ", computed = "E3FSBLK", "E3FSBLK"\n", - le32_to_cpu(es->s_free_blocks_count), + (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); return bitmap_count; #else diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c index 909d13e26560..ef9c643e8e9d 100644 --- a/fs/ext3/bitmap.c +++ b/fs/ext3/bitmap.c @@ -11,19 +11,9 @@ #ifdef EXT3FS_DEBUG -static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; - unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) { - unsigned int i; - unsigned long sum = 0; - - if (!map) - return (0); - for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; - return (sum); + return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars); } #endif /* EXT3FS_DEBUG */ diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 9a4a5c48b1c9..a07597307fd1 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3459,14 +3459,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode, * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. - * - * Is this efficient/effective? Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O. But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out. One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory. It has the desired - * effect. */ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index ff9bcdc5b0d5..8c892e93d8e7 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -64,11 +64,6 @@ static int ext3_freeze(struct super_block *sb); /* * Wrappers for journal_start/end. - * - * The only special thing we need to do here is to make sure that all - * journal_end calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. */ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) { @@ -90,12 +85,6 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) return journal_start(journal, nblocks); } -/* - * The only special thing we need to do here is to make sure that all - * journal_stop calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - */ int __ext3_journal_stop(const char *where, handle_t *handle) { struct super_block *sb; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d23b31ca9d7a..1b5089067d01 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -280,14 +280,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, return desc; } -static int ext4_valid_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - unsigned int block_group, - struct buffer_head *bh) +/* + * Return the block number which was discovered to be invalid, or 0 if + * the block bitmap is valid. + */ +static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) { ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; - ext4_fsblk_t bitmap_blk; + ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { @@ -297,37 +301,33 @@ static int ext4_valid_block_bitmap(struct super_block *sb, * or it has to also read the block group where the bitmaps * are located to verify they are set. */ - return 1; + return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ - bitmap_blk = ext4_block_bitmap(sb, desc); - offset = bitmap_blk - group_first_block; + blk = ext4_block_bitmap(sb, desc); + offset = blk - group_first_block; if (!ext4_test_bit(offset, bh->b_data)) /* bad block bitmap */ - goto err_out; + return blk; /* check whether the inode bitmap block number is set */ - bitmap_blk = ext4_inode_bitmap(sb, desc); - offset = bitmap_blk - group_first_block; + blk = ext4_inode_bitmap(sb, desc); + offset = blk - group_first_block; if (!ext4_test_bit(offset, bh->b_data)) /* bad block bitmap */ - goto err_out; + return blk; /* check whether the inode table block number is set */ - bitmap_blk = ext4_inode_table(sb, desc); - offset = bitmap_blk - group_first_block; + blk = ext4_inode_table(sb, desc); + offset = blk - group_first_block; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, offset + EXT4_SB(sb)->s_itb_per_group, offset); - if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) - /* good bitmap for inode tables */ - return 1; - -err_out: - ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu", - block_group, bitmap_blk); + if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group) + /* bad bitmap for inode tables */ + return blk; return 0; } @@ -336,14 +336,26 @@ void ext4_validate_block_bitmap(struct super_block *sb, unsigned int block_group, struct buffer_head *bh) { + ext4_fsblk_t blk; + if (buffer_verified(bh)) return; ext4_lock_group(sb, block_group); - if (ext4_valid_block_bitmap(sb, desc, block_group, bh) && - ext4_block_bitmap_csum_verify(sb, block_group, desc, bh, - EXT4_BLOCKS_PER_GROUP(sb) / 8)) - set_buffer_verified(bh); + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); + return; + } + if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, + desc, bh, EXT4_BLOCKS_PER_GROUP(sb) / 8))) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + return; + } + set_buffer_verified(bh); ext4_unlock_group(sb, block_group); } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index a94b9c63ee5c..5c2d1813ebe9 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -11,16 +11,9 @@ #include <linux/jbd2.h> #include "ext4.h" -static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; - unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { - unsigned int i, sum = 0; - - for (i = 0; i < numchars; i++) - sum += nibblemap[bitmap[i] & 0xf] + - nibblemap[(bitmap[i] >> 4) & 0xf]; - return sum; + return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, @@ -86,7 +79,6 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, if (provided == calculated) return 1; - ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group); return 0; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cd0c7ed06772..aabbb3f53683 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2662,6 +2662,7 @@ cont: } path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); + i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89b59cb7f9b8..dff171c3a123 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; + /* + * Protect us against freezing - iput() caller didn't have to have any + * protection against it + */ + sb_start_intwrite(inode->i_sb); handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); @@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode) * cleaned up. */ ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } @@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode) stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } } @@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode) else ext4_free_inode(handle, inode); ext4_journal_stop(handle); + sb_end_intwrite(inode->i_sb); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -1962,7 +1970,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); * This function can get called via... * - ext4_da_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) - * - shrink_page_list via pdflush (no journal handle) + * - shrink_page_list via the kswapd/direct reclaim (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) * * We don't do any block allocation in this function. If we have page with @@ -4581,14 +4589,6 @@ static int ext4_expand_extra_isize(struct inode *inode, * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. - * - * Is this efficient/effective? Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O. But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out. One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory. It has the desired - * effect. */ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) { @@ -4779,11 +4779,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) get_block_t *get_block; int retries = 0; - /* - * This check is racy but catches the common case. We rely on - * __block_page_mkwrite() to do a reliable check. - */ - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_pagefault(inode->i_sb); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -4851,5 +4847,6 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f99a1311e847..fe7c63f4717e 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -44,6 +44,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); mark_buffer_dirty(bh); lock_buffer(bh); @@ -51,6 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) get_bh(bh); submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); + sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) return 1; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2d51cd9af225..c6e0cb3d1f4a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -326,38 +326,17 @@ static void ext4_put_nojournal(handle_t *handle) /* * Wrappers for jbd2_journal_start/end. - * - * The only special thing we need to do here is to make sure that all - * journal_end calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - * - * To avoid j_barrier hold in userspace when a user calls freeze(), - * ext4 prevents a new handle from being started by s_frozen, which - * is in an upper layer. */ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) { journal_t *journal; - handle_t *handle; trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; - handle = ext4_journal_current_handle(); - - /* - * If a handle has been started, it should be allowed to - * finish, otherwise deadlock could happen between freeze - * and others(e.g. truncate) due to the restart of the - * journal handle if the filesystem is forzen and active - * handles are not stopped. - */ - if (!handle) - vfs_check_frozen(sb, SB_FREEZE_TRANS); - if (!journal) return ext4_get_nojournal(); /* @@ -372,12 +351,6 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) return jbd2_journal_start(journal, nblocks); } -/* - * The only special thing we need to do here is to make sure that all - * jbd2_journal_stop calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - */ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; @@ -975,6 +948,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; + ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -2747,6 +2721,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; + sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2773,6 +2748,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } + sb_end_write(sb); return ret; } @@ -3133,6 +3109,10 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + sbi->s_itb_per_group + 2); + first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; @@ -4444,6 +4424,7 @@ static void ext4_clear_journal_err(struct super_block *sb, ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); + jbd2_journal_update_sb_errno(journal); } } @@ -4460,10 +4441,8 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) { - vfs_check_frozen(sb, SB_FREEZE_TRANS); + if (journal) ret = ext4_journal_force_commit(journal); - } return ret; } @@ -4493,9 +4472,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean - * state independently, because ext4 prevents a new handle from being started - * by @sb->s_frozen, which stays in an upper layer. It thus needs help from - * the upper layer. + * state independently. It relies on upper layer to stop all data & metadata + * modifications. */ static int ext4_freeze(struct super_block *sb) { @@ -4522,7 +4500,7 @@ static int ext4_freeze(struct super_block *sb) EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: - /* we rely on s_frozen to stop further updates */ + /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return error; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 6eaa28c98ad1..dc49ed2cbffa 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -35,6 +35,11 @@ #define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) #define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) +static inline unsigned char fat_tolower(unsigned char c) +{ + return ((c >= 'A') && (c <= 'Z')) ? c+32 : c; +} + static inline loff_t fat_make_i_pos(struct super_block *sb, struct buffer_head *bh, struct msdos_dir_entry *de) @@ -333,6 +338,124 @@ parse_long: return 0; } +/** + * fat_parse_short - Parse MS-DOS (short) directory entry. + * @sb: superblock + * @de: directory entry to parse + * @name: FAT_MAX_SHORT_SIZE array in which to place extracted name + * @dot_hidden: Nonzero == prepend '.' to names with ATTR_HIDDEN + * + * Returns the number of characters extracted into 'name'. + */ +static int fat_parse_short(struct super_block *sb, + const struct msdos_dir_entry *de, + unsigned char *name, int dot_hidden) +{ + const struct msdos_sb_info *sbi = MSDOS_SB(sb); + int isvfat = sbi->options.isvfat; + int nocase = sbi->options.nocase; + unsigned short opt_shortname = sbi->options.shortname; + struct nls_table *nls_disk = sbi->nls_disk; + wchar_t uni_name[14]; + unsigned char c, work[MSDOS_NAME]; + unsigned char *ptname = name; + int chi, chl, i, j, k; + int dotoffset = 0; + int name_len = 0, uni_len = 0; + + if (!isvfat && dot_hidden && (de->attr & ATTR_HIDDEN)) { + *ptname++ = '.'; + dotoffset = 1; + } + + memcpy(work, de->name, sizeof(work)); + /* see namei.c, msdos_format_name */ + if (work[0] == 0x05) + work[0] = 0xE5; + + /* Filename */ + for (i = 0, j = 0; i < 8;) { + c = work[i]; + if (!c) + break; + chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, + &uni_name[j++], opt_shortname, + de->lcase & CASE_LOWER_BASE); + if (chl <= 1) { + if (!isvfat) + ptname[i] = nocase ? c : fat_tolower(c); + i++; + if (c != ' ') { + name_len = i; + uni_len = j; + } + } else { + uni_len = j; + if (isvfat) + i += min(chl, 8-i); + else { + for (chi = 0; chi < chl && i < 8; chi++, i++) + ptname[i] = work[i]; + } + if (chl) + name_len = i; + } + } + + i = name_len; + j = uni_len; + fat_short2uni(nls_disk, ".", 1, &uni_name[j++]); + if (!isvfat) + ptname[i] = '.'; + i++; + + /* Extension */ + for (k = 8; k < MSDOS_NAME;) { + c = work[k]; + if (!c) + break; + chl = fat_shortname2uni(nls_disk, &work[k], MSDOS_NAME - k, + &uni_name[j++], opt_shortname, + de->lcase & CASE_LOWER_EXT); + if (chl <= 1) { + k++; + if (!isvfat) + ptname[i] = nocase ? c : fat_tolower(c); + i++; + if (c != ' ') { + name_len = i; + uni_len = j; + } + } else { + uni_len = j; + if (isvfat) { + int offset = min(chl, MSDOS_NAME-k); + k += offset; + i += offset; + } else { + for (chi = 0; chi < chl && k < MSDOS_NAME; + chi++, i++, k++) { + ptname[i] = work[k]; + } + } + if (chl) + name_len = i; + } + } + + if (name_len > 0) { + name_len += dotoffset; + + if (sbi->options.isvfat) { + uni_name[uni_len] = 0x0000; + name_len = fat_uni_to_x8(sb, uni_name, name, + FAT_MAX_SHORT_SIZE); + } + } + + return name_len; +} + /* * Return values: negative -> error, 0 -> not found, positive -> found, * value is the total amount of slots, including the shortname entry. @@ -344,15 +467,11 @@ int fat_search_long(struct inode *inode, const unsigned char *name, struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh = NULL; struct msdos_dir_entry *de; - struct nls_table *nls_disk = sbi->nls_disk; unsigned char nr_slots; - wchar_t bufuname[14]; wchar_t *unicode = NULL; - unsigned char work[MSDOS_NAME]; unsigned char bufname[FAT_MAX_SHORT_SIZE]; - unsigned short opt_shortname = sbi->options.shortname; loff_t cpos = 0; - int chl, i, j, last_u, err, len; + int err, len; err = -ENOENT; while (1) { @@ -380,47 +499,16 @@ parse_record: goto end_of_dir; } - memcpy(work, de->name, sizeof(de->name)); - /* see namei.c, msdos_format_name */ - if (work[0] == 0x05) - work[0] = 0xE5; - for (i = 0, j = 0, last_u = 0; i < 8;) { - if (!work[i]) - break; - chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, - &bufuname[j++], opt_shortname, - de->lcase & CASE_LOWER_BASE); - if (chl <= 1) { - if (work[i] != ' ') - last_u = j; - } else { - last_u = j; - } - i += chl; - } - j = last_u; - fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); - for (i = 8; i < MSDOS_NAME;) { - if (!work[i]) - break; - chl = fat_shortname2uni(nls_disk, &work[i], - MSDOS_NAME - i, - &bufuname[j++], opt_shortname, - de->lcase & CASE_LOWER_EXT); - if (chl <= 1) { - if (work[i] != ' ') - last_u = j; - } else { - last_u = j; - } - i += chl; - } - if (!last_u) + /* Never prepend '.' to hidden files here. + * That is done only for msdos mounts (and only when + * 'dotsOK=yes'); if we are executing here, it is in the + * context of a vfat mount. + */ + len = fat_parse_short(sb, de, bufname, 0); + if (len == 0) continue; /* Compare shortname */ - bufuname[last_u] = 0x0000; - len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); if (fat_name_match(sbi, name, name_len, bufname, len)) goto found; @@ -469,20 +557,15 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *de; - struct nls_table *nls_disk = sbi->nls_disk; unsigned char nr_slots; - wchar_t bufuname[14]; wchar_t *unicode = NULL; - unsigned char c, work[MSDOS_NAME]; - unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname; - unsigned short opt_shortname = sbi->options.shortname; + unsigned char bufname[FAT_MAX_SHORT_SIZE]; int isvfat = sbi->options.isvfat; - int nocase = sbi->options.nocase; const char *fill_name = NULL; unsigned long inum; unsigned long lpos, dummy, *furrfu = &lpos; loff_t cpos; - int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0; + int short_len = 0, fill_len = 0; int ret = 0; lock_super(sb); @@ -556,74 +639,10 @@ parse_record: } } - if (sbi->options.dotsOK) { - ptname = bufname; - dotoffset = 0; - if (de->attr & ATTR_HIDDEN) { - *ptname++ = '.'; - dotoffset = 1; - } - } - - memcpy(work, de->name, sizeof(de->name)); - /* see namei.c, msdos_format_name */ - if (work[0] == 0x05) - work[0] = 0xE5; - for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) { - if (!(c = work[i])) - break; - chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, - &bufuname[j++], opt_shortname, - de->lcase & CASE_LOWER_BASE); - if (chl <= 1) { - ptname[i++] = (!nocase && c>='A' && c<='Z') ? c+32 : c; - if (c != ' ') { - last = i; - last_u = j; - } - } else { - last_u = j; - for (chi = 0; chi < chl && i < 8; chi++) { - ptname[i] = work[i]; - i++; last = i; - } - } - } - i = last; - j = last_u; - fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); - ptname[i++] = '.'; - for (i2 = 8; i2 < MSDOS_NAME;) { - if (!(c = work[i2])) - break; - chl = fat_shortname2uni(nls_disk, &work[i2], MSDOS_NAME - i2, - &bufuname[j++], opt_shortname, - de->lcase & CASE_LOWER_EXT); - if (chl <= 1) { - i2++; - ptname[i++] = (!nocase && c>='A' && c<='Z') ? c+32 : c; - if (c != ' ') { - last = i; - last_u = j; - } - } else { - last_u = j; - for (chi = 0; chi < chl && i2 < MSDOS_NAME; chi++) { - ptname[i++] = work[i2++]; - last = i; - } - } - } - if (!last) + short_len = fat_parse_short(sb, de, bufname, sbi->options.dotsOK); + if (short_len == 0) goto record_end; - i = last + dotoffset; - j = last_u; - - if (isvfat) { - bufuname[j] = 0x0000; - i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); - } if (nr_slots) { /* hack for fat_ioctl_filldir() */ struct fat_ioctl_filldir_callback *p = dirent; @@ -631,12 +650,12 @@ parse_record: p->longname = fill_name; p->long_len = fill_len; p->shortname = bufname; - p->short_len = i; + p->short_len = short_len; fill_name = NULL; fill_len = 0; } else { fill_name = bufname; - fill_len = i; + fill_len = short_len; } start_filldir: diff --git a/fs/fat/fat.h b/fs/fat/fat.h index fc35c5c69136..2deeeb86f331 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -217,6 +217,21 @@ static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) #endif } +static inline int fat_get_start(const struct msdos_sb_info *sbi, + const struct msdos_dir_entry *de) +{ + int cluster = le16_to_cpu(de->start); + if (sbi->fat_bits == 32) + cluster |= (le16_to_cpu(de->starthi) << 16); + return cluster; +} + +static inline void fat_set_start(struct msdos_dir_entry *de, int cluster) +{ + de->start = cpu_to_le16(cluster); + de->starthi = cpu_to_le16(cluster >> 16); +} + static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) { #ifdef __BIG_ENDIAN diff --git a/fs/fat/file.c b/fs/fat/file.c index a71fe3715ee8..e007b8bd8e5e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -43,10 +43,10 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) if (err) goto out; - mutex_lock(&inode->i_mutex); err = mnt_want_write_file(file); if (err) - goto out_unlock_inode; + goto out; + mutex_lock(&inode->i_mutex); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -73,14 +73,14 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) /* The root directory has no attributes */ if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { err = -EINVAL; - goto out_drop_write; + goto out_unlock_inode; } if (sbi->options.sys_immutable && ((attr | oldattr) & ATTR_SYS) && !capable(CAP_LINUX_IMMUTABLE)) { err = -EPERM; - goto out_drop_write; + goto out_unlock_inode; } /* @@ -90,12 +90,12 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) */ err = security_inode_setattr(file->f_path.dentry, &ia); if (err) - goto out_drop_write; + goto out_unlock_inode; /* This MUST be done before doing anything irreversible... */ err = fat_setattr(file->f_path.dentry, &ia); if (err) - goto out_drop_write; + goto out_unlock_inode; fsnotify_change(file->f_path.dentry, ia.ia_valid); if (sbi->options.sys_immutable) { @@ -107,10 +107,9 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); -out_drop_write: - mnt_drop_write_file(file); out_unlock_inode: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); out: return err; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 0038b32cb362..05e897fe9866 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -369,10 +369,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_op = sbi->dir_ops; inode->i_fop = &fat_dir_operations; - MSDOS_I(inode)->i_start = le16_to_cpu(de->start); - if (sbi->fat_bits == 32) - MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16); - + MSDOS_I(inode)->i_start = fat_get_start(sbi, de); MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; error = fat_calc_dir_size(inode); if (error < 0) @@ -385,9 +382,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_mode = fat_make_mode(sbi, de->attr, ((sbi->options.showexec && !is_exec(de->name + 8)) ? S_IRUGO|S_IWUGO : S_IRWXUGO)); - MSDOS_I(inode)->i_start = le16_to_cpu(de->start); - if (sbi->fat_bits == 32) - MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16); + MSDOS_I(inode)->i_start = fat_get_start(sbi, de); MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; inode->i_size = le32_to_cpu(de->size); @@ -613,8 +608,7 @@ retry: else raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); - raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart); - raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16); + fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 70d993a93805..b0e12bf9f4a1 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -246,8 +246,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, de.ctime_cs = 0; de.time = time; de.date = date; - de.start = cpu_to_le16(cluster); - de.starthi = cpu_to_le16(cluster >> 16); + fat_set_start(&de, cluster); de.size = 0; err = fat_add_entries(dir, &de, 1, sinfo); @@ -530,9 +529,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, mark_inode_dirty(old_inode); if (update_dotdot) { - int start = MSDOS_I(new_dir)->i_logstart; - dotdot_de->start = cpu_to_le16(start); - dotdot_de->starthi = cpu_to_le16(start >> 16); + fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart); mark_buffer_dirty_inode(dotdot_bh, old_inode); if (IS_DIRSYNC(new_dir)) { err = sync_dirty_buffer(dotdot_bh); @@ -572,9 +569,7 @@ error_dotdot: corrupt = 1; if (update_dotdot) { - int start = MSDOS_I(old_dir)->i_logstart; - dotdot_de->start = cpu_to_le16(start); - dotdot_de->starthi = cpu_to_le16(start >> 16); + fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart); mark_buffer_dirty_inode(dotdot_bh, old_inode); corrupt |= sync_dirty_buffer(dotdot_bh); } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6cc480652433..6a6d8c0715a1 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -651,8 +651,7 @@ shortname: de->time = de->ctime = time; de->date = de->cdate = de->adate = date; de->ctime_cs = time_cs; - de->start = cpu_to_le16(cluster); - de->starthi = cpu_to_le16(cluster >> 16); + fat_set_start(de, cluster); de->size = 0; out_free: __putname(uname); @@ -965,9 +964,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, mark_inode_dirty(old_inode); if (update_dotdot) { - int start = MSDOS_I(new_dir)->i_logstart; - dotdot_de->start = cpu_to_le16(start); - dotdot_de->starthi = cpu_to_le16(start >> 16); + fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart); mark_buffer_dirty_inode(dotdot_bh, old_inode); if (IS_DIRSYNC(new_dir)) { err = sync_dirty_buffer(dotdot_bh); @@ -1009,9 +1006,7 @@ error_dotdot: corrupt = 1; if (update_dotdot) { - int start = MSDOS_I(old_dir)->i_logstart; - dotdot_de->start = cpu_to_le16(start); - dotdot_de->starthi = cpu_to_le16(start >> 16); + fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart); mark_buffer_dirty_inode(dotdot_bh, old_inode); corrupt |= sync_dirty_buffer(dotdot_bh); } diff --git a/fs/fcntl.c b/fs/fcntl.c index 81b70e665bf0..887b5ba8c9b5 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -20,6 +20,7 @@ #include <linux/signal.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -340,6 +341,31 @@ static int f_getown_ex(struct file *filp, unsigned long arg) return ret; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int f_getowner_uids(struct file *filp, unsigned long arg) +{ + struct user_namespace *user_ns = current_user_ns(); + uid_t * __user dst = (void * __user)arg; + uid_t src[2]; + int err; + + read_lock(&filp->f_owner.lock); + src[0] = from_kuid(user_ns, filp->f_owner.uid); + src[1] = from_kuid(user_ns, filp->f_owner.euid); + read_unlock(&filp->f_owner.lock); + + err = put_user(src[0], &dst[0]); + err |= put_user(src[1], &dst[1]); + + return err; +} +#else +static int f_getowner_uids(struct file *filp, unsigned long arg) +{ + return -EINVAL; +} +#endif + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -396,6 +422,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SETOWN_EX: err = f_setown_ex(filp, arg); break; + case F_GETOWNER_UIDS: + err = f_getowner_uids(filp, arg); + break; case F_GETSIG: err = filp->f_owner.signum; break; diff --git a/fs/file_table.c b/fs/file_table.c index b3fc4d67a26b..701985e4ccda 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -43,7 +43,7 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static inline void file_free_rcu(struct rcu_head *head) +static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); @@ -217,7 +217,7 @@ static void drop_file_write_access(struct file *file) return; if (file_check_writeable(file) != 0) return; - mnt_drop_write(mnt); + __mnt_drop_write(mnt); file_release_write(file); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8f660dd6137a..be3efc4f64f4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -52,11 +52,6 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; -/* - * We don't actually have pdflush, but this one is exported though /proc... - */ -int nr_pdflush_threads; - /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -628,8 +623,8 @@ static long writeback_sb_inodes(struct super_block *sb, } /* - * Don't bother with new inodes or inodes beeing freed, first - * kind does not need peridic writeout yet, and for the latter + * Don't bother with new inodes or inodes being freed, first + * kind does not need periodic writeout yet, and for the latter * kind writeout is handled by the freer. */ spin_lock(&inode->i_lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8964cf3999b2..324bc0850534 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -383,6 +383,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_entry_out outentry; struct fuse_file *ff; + /* Userspace expects S_IFREG in create mode */ + BUG_ON((mode & S_IFMT) != S_IFREG); + forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b321a688cde7..aba15f1b7ad2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -703,13 +703,16 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = iocb->ki_filp->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); - if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) { + /* + * In auto invalidate mode, always update attributes on read. + * Otherwise, only update if we attempt to read past EOF (to ensure + * i_size is up to date). + */ + if (fc->auto_inval_data || + (pos + iov_length(iov, nr_segs) > i_size_read(inode))) { int err; - /* - * If trying to read past EOF, make sure the i_size - * attribute is up-to-date. - */ err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); if (err) return err; @@ -944,9 +947,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -1004,6 +1006,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, out: current->backing_dev_info = NULL; mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); return written ? written : err; } @@ -1700,7 +1703,7 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) size_t n; u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; - for (n = 0; n < count; n++) { + for (n = 0; n < count; n++, iov++) { if (iov->iov_len > (size_t) max) return -ENOMEM; max -= iov->iov_len; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 771fb6322c07..e24dd74e3068 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -484,6 +484,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Use enhanced/automatic page cache invalidation. */ + unsigned auto_inval_data:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1cd61652018c..ce0a2838ccd0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -197,6 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t oldsize; + struct timespec old_mtime; spin_lock(&fc->lock); if (attr_version != 0 && fi->attr_version > attr_version) { @@ -204,15 +205,35 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, return; } + old_mtime = inode->i_mtime; fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; i_size_write(inode, attr->size); spin_unlock(&fc->lock); - if (S_ISREG(inode->i_mode) && oldsize != attr->size) { - truncate_pagecache(inode, oldsize, attr->size); - invalidate_inode_pages2(inode->i_mapping); + if (S_ISREG(inode->i_mode)) { + bool inval = false; + + if (oldsize != attr->size) { + truncate_pagecache(inode, oldsize, attr->size); + inval = true; + } else if (fc->auto_inval_data) { + struct timespec new_mtime = { + .tv_sec = attr->mtime, + .tv_nsec = attr->mtimensec, + }; + + /* + * Auto inval mode also checks and invalidates if mtime + * has changed. + */ + if (!timespec_equal(&old_mtime, &new_mtime)) + inval = true; + } + + if (inval) + invalidate_inode_pages2(inode->i_mapping); } } @@ -834,6 +855,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->big_writes = 1; if (arg->flags & FUSE_DONT_MASK) fc->dont_mask = 1; + if (arg->flags & FUSE_AUTO_INVAL_DATA) + fc->auto_inval_data = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -859,7 +882,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | - FUSE_FLOCK_LOCKS; + FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | + FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 9aa6af13823c..d1d791ef38de 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -373,11 +373,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size; int ret; - /* Wait if fs is frozen. This is racy so we check again later on - * and retry if the fs has been frozen after the page lock has - * been acquired - */ - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_pagefault(inode->i_sb); + + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); ret = gfs2_rs_alloc(ip); if (ret) @@ -462,14 +461,9 @@ out: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); - /* This check must be post dropping of transaction lock */ - if (inode->i_sb->s_frozen == SB_UNFROZEN) { - wait_on_page_writeback(page); - } else { - ret = -EAGAIN; - unlock_page(page); - } + wait_on_page_writeback(page); } + sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 3a56c8d94de0..22255d96b27e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -52,7 +52,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb /* * If it's a fully non-blocking write attempt and we cannot * lock the buffer then redirty the page. Note that this can - * potentially cause a busy-wait loop from pdflush and kswapd + * potentially cause a busy-wait loop from flusher thread and kswapd * activity, but those code paths have their own higher-level * throttling. */ diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ad3e2fb763d7..adbd27875ef9 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -50,6 +50,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (revokes) tr->tr_reserved += gfs2_struct2blk(sdp, revokes, sizeof(u64)); + sb_start_intwrite(sdp->sd_vfs); gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); error = gfs2_glock_nq(&tr->tr_t_gh); @@ -68,6 +69,7 @@ fail_gunlock: gfs2_glock_dq(&tr->tr_t_gh); fail_holder_uninit: + sb_end_intwrite(sdp->sd_vfs); gfs2_holder_uninit(&tr->tr_t_gh); kfree(tr); @@ -116,6 +118,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_holder_uninit(&tr->tr_t_gh); kfree(tr); } + sb_end_intwrite(sdp->sd_vfs); return; } @@ -136,6 +139,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) gfs2_log_flush(sdp, NULL); + sb_end_intwrite(sdp->sd_vfs); } /** diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 5fd51a5833ff..b7ec224910c5 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -236,10 +236,10 @@ out: * hfs_mdb_commit() * * Description: - * This updates the MDB on disk (look also at hfs_write_super()). + * This updates the MDB on disk. * It does not check, if the superblock has been modified, or * if the filesystem has been mounted read-only. It is mainly - * called by hfs_write_super() and hfs_btree_extend(). + * called by hfs_sync_fs() and flush_mdb(). * Input Variable(s): * struct hfs_mdb *mdb: Pointer to the hfs MDB * int backup; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 473332098013..fdafb2d71654 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -365,7 +365,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) u64 last_fs_block, last_fs_page; int err; - err = -EINVAL; + err = -ENOMEM; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) goto out; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e13e9bdb0bf5..8349a899912e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -416,8 +416,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) else v_offset = 0; - __unmap_hugepage_range(vma, - vma->vm_start + v_offset, vma->vm_end, NULL); + unmap_hugepage_range(vma, vma->vm_start + v_offset, + vma->vm_end, NULL); } } diff --git a/fs/inode.c b/fs/inode.c index 3cc504320467..ac8d904b3f16 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1542,9 +1542,11 @@ void touch_atime(struct path *path) if (timespec_equal(&inode->i_atime, &now)) return; - if (mnt_want_write(mnt)) + if (!sb_start_write_trylock(inode->i_sb)) return; + if (__mnt_want_write(mnt)) + goto skip_update; /* * File systems can error out when updating inodes if they need to * allocate new space to modify an inode (such is the case for @@ -1555,7 +1557,9 @@ void touch_atime(struct path *path) * of the fs read only, e.g. subvolumes in Btrfs. */ update_time(inode, &now, S_ATIME); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); +skip_update: + sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); @@ -1662,11 +1666,11 @@ int file_update_time(struct file *file) return 0; /* Finally allowed to write? Takes lock. */ - if (mnt_want_write_file(file)) + if (__mnt_want_write_file(file)) return 0; ret = update_time(inode, &now, sync_it); - mnt_drop_write_file(file); + __mnt_drop_write_file(file); return ret; } diff --git a/fs/internal.h b/fs/internal.h index a6fd56c68b11..371bcc4b1697 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -61,6 +61,10 @@ extern void __init mnt_init(void); extern struct lglock vfsmount_lock; +extern int __mnt_want_write(struct vfsmount *); +extern int __mnt_want_write_file(struct file *); +extern void __mnt_drop_write(struct vfsmount *); +extern void __mnt_drop_write_file(struct file *); /* * fs_struct.c diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 425c2f2cf170..09357508ec9a 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -534,8 +534,8 @@ int journal_start_commit(journal_t *journal, tid_t *ptid) ret = 1; } else if (journal->j_committing_transaction) { /* - * If ext3_write_super() recently started a commit, then we - * have to wait for completion of that transaction + * If commit has been started, then we have to wait for + * completion of that transaction. */ if (ptid) *ptid = journal->j_committing_transaction->t_tid; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e9a3c4c85594..e149b99a7ffb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -612,8 +612,8 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) ret = 1; } else if (journal->j_committing_transaction) { /* - * If ext3_write_super() recently started a commit, then we - * have to wait for completion of that transaction + * If commit has been started, then we have to wait for + * completion of that transaction. */ if (ptid) *ptid = journal->j_committing_transaction->t_tid; @@ -1377,7 +1377,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) * Update a journal's errno. Write updated superblock to disk waiting for IO * to complete. */ -static void jbd2_journal_update_sb_errno(journal_t *journal) +void jbd2_journal_update_sb_errno(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; @@ -1390,6 +1390,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal) jbd2_write_superblock(journal, WRITE_SYNC); } +EXPORT_SYMBOL(jbd2_journal_update_sb_errno); /* * Read the superblock for a given journal, performing initial diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 8392cb85bd54..05d29124c6ab 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -156,12 +156,16 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) struct nlm_rqst *call; int status; - nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; nlmclnt_locks_init_private(fl, host); + if (!fl->fl_u.nfs_fl.owner) { + /* lockowner allocation has failed */ + nlmclnt_release_call(call); + return -ENOMEM; + } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); @@ -185,9 +189,6 @@ EXPORT_SYMBOL_GPL(nlmclnt_proc); /* * Allocate an NLM RPC call struct - * - * Note: the caller must hold a reference to host. In case of failure, - * this reference will be released. */ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) { @@ -199,7 +200,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) atomic_set(&call->a_count, 1); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); - call->a_host = host; + call->a_host = nlm_get_host(host); return call; } if (signalled()) @@ -207,7 +208,6 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) printk("nlm_alloc_call: failed, waiting for memory\n"); schedule_timeout_interruptible(5*HZ); } - nlmclnt_release_host(host); return NULL; } @@ -750,7 +750,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" " Attempting to cancel lock.\n"); - req = nlm_alloc_call(nlm_get_host(host)); + req = nlm_alloc_call(host); if (!req) return -ENOMEM; req->a_flags = RPC_TASK_ASYNC; diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c index 183cc1f0af1c..6d1ee7204c88 100644 --- a/fs/lockd/grace.c +++ b/fs/lockd/grace.c @@ -4,8 +4,10 @@ #include <linux/module.h> #include <linux/lockd/bind.h> +#include <net/net_namespace.h> + +#include "netns.h" -static LIST_HEAD(grace_list); static DEFINE_SPINLOCK(grace_lock); /** @@ -19,10 +21,12 @@ static DEFINE_SPINLOCK(grace_lock); * * This function is called to start a grace period. */ -void locks_start_grace(struct lock_manager *lm) +void locks_start_grace(struct net *net, struct lock_manager *lm) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + spin_lock(&grace_lock); - list_add(&lm->list, &grace_list); + list_add(&lm->list, &ln->grace_list); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); @@ -52,8 +56,10 @@ EXPORT_SYMBOL_GPL(locks_end_grace); * to answer ordinary lock requests, and when they should accept only * lock reclaims. */ -int locks_in_grace(void) +int locks_in_grace(struct net *net) { - return !list_empty(&grace_list); + struct lockd_net *ln = net_generic(net, lockd_net_id); + + return !list_empty(&ln->grace_list); } EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index eb75ca7c2d6e..f9b22e58f78f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -21,6 +21,8 @@ #include <net/ipv6.h> +#include "netns.h" + #define NLMDBG_FACILITY NLMDBG_HOSTCACHE #define NLM_HOST_NRHASH 32 #define NLM_HOST_REBIND (60 * HZ) @@ -41,11 +43,10 @@ static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; hlist_for_each_entry_safe((host), (pos), (next), \ (chain), h_hash) -static unsigned long next_gc; static unsigned long nrhosts; static DEFINE_MUTEX(nlm_host_mutex); -static void nlm_gc_hosts(void); +static void nlm_gc_hosts(struct net *net); struct nlm_lookup_host_info { const int server; /* search for server|client */ @@ -172,6 +173,7 @@ out: static void nlm_destroy_host_locked(struct nlm_host *host) { struct rpc_clnt *clnt; + struct lockd_net *ln = net_generic(host->net, lockd_net_id); dprintk("lockd: destroy host %s\n", host->h_name); @@ -188,6 +190,7 @@ static void nlm_destroy_host_locked(struct nlm_host *host) rpc_shutdown_client(clnt); kfree(host); + ln->nrhosts--; nrhosts--; } @@ -228,6 +231,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; + struct lockd_net *ln = net_generic(net, lockd_net_id); dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, (hostname ? hostname : "<none>"), version, @@ -262,6 +266,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, goto out; hlist_add_head(&host->h_hash, chain); + ln->nrhosts++; nrhosts++; dprintk("lockd: %s created host %s (%s)\n", __func__, @@ -326,7 +331,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, struct nsm_handle *nsm = NULL; struct sockaddr *src_sap = svc_daddr(rqstp); size_t src_len = rqstp->rq_daddrlen; - struct net *net = rqstp->rq_xprt->xpt_net; + struct net *net = SVC_NET(rqstp); struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -337,6 +342,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, .hostname_len = hostname_len, .net = net, }; + struct lockd_net *ln = net_generic(net, lockd_net_id); dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, rqstp->rq_vers, @@ -344,8 +350,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, mutex_lock(&nlm_host_mutex); - if (time_after_eq(jiffies, next_gc)) - nlm_gc_hosts(); + if (time_after_eq(jiffies, ln->next_gc)) + nlm_gc_hosts(net); chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { @@ -382,6 +388,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, memcpy(nlm_srcaddr(host), src_sap, src_len); host->h_srcaddrlen = src_len; hlist_add_head(&host->h_hash, chain); + ln->nrhosts++; nrhosts++; dprintk("lockd: %s created host %s (%s)\n", @@ -565,6 +572,35 @@ void nlm_host_rebooted(const struct nlm_reboot *info) nsm_release(nsm); } +static void nlm_complain_hosts(struct net *net) +{ + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + + if (net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + if (ln->nrhosts == 0) + return; + printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net); + dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net); + } else { + if (nrhosts == 0) + return; + printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); + dprintk("lockd: %lu hosts left:\n", nrhosts); + } + + for_each_host(host, pos, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; + dprintk(" %s (cnt %d use %d exp %ld net %p)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires, host->net); + } +} + void nlm_shutdown_hosts_net(struct net *net) { @@ -572,11 +608,10 @@ nlm_shutdown_hosts_net(struct net *net) struct hlist_node *pos; struct nlm_host *host; - dprintk("lockd: shutting down host module\n"); mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ - dprintk("lockd: nuking all hosts...\n"); + dprintk("lockd: nuking all hosts in net %p...\n", net); for_each_host(host, pos, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -588,8 +623,10 @@ nlm_shutdown_hosts_net(struct net *net) } /* Then, perform a garbage collection pass */ - nlm_gc_hosts(); + nlm_gc_hosts(net); mutex_unlock(&nlm_host_mutex); + + nlm_complain_hosts(net); } /* @@ -599,22 +636,8 @@ nlm_shutdown_hosts_net(struct net *net) void nlm_shutdown_hosts(void) { - struct hlist_head *chain; - struct hlist_node *pos; - struct nlm_host *host; - + dprintk("lockd: shutting down host module\n"); nlm_shutdown_hosts_net(NULL); - - /* complain if any hosts are left */ - if (nrhosts != 0) { - printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); - dprintk("lockd: %lu hosts left:\n", nrhosts); - for_each_host(host, pos, chain, nlm_server_hosts) { - dprintk(" %s (cnt %d use %d exp %ld net %p)\n", - host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); - } - } } /* @@ -623,30 +646,39 @@ nlm_shutdown_hosts(void) * mark & sweep for resources held by remote clients. */ static void -nlm_gc_hosts(void) +nlm_gc_hosts(struct net *net) { struct hlist_head *chain; struct hlist_node *pos, *next; struct nlm_host *host; - dprintk("lockd: host garbage collection\n"); - for_each_host(host, pos, chain, nlm_server_hosts) + dprintk("lockd: host garbage collection for net %p\n", net); + for_each_host(host, pos, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; host->h_inuse = 0; + } /* Mark all hosts that hold locks, blocks or shares */ - nlmsvc_mark_resources(); + nlmsvc_mark_resources(net); for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; if (atomic_read(&host->h_count) || host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " - "(cnt %d use %d exp %ld)\n", + "(cnt %d use %d exp %ld net %p)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires); + host->h_inuse, host->h_expires, host->net); continue; } nlm_destroy_host_locked(host); } - next_gc = jiffies + NLM_HOST_COLLECT; + if (net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + ln->next_gc = jiffies + NLM_HOST_COLLECT; + } } diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index ce227e0fbc5c..4eee248ba96e 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -1,10 +1,17 @@ #ifndef __LOCKD_NETNS_H__ #define __LOCKD_NETNS_H__ +#include <linux/fs.h> #include <net/netns/generic.h> struct lockd_net { unsigned int nlmsvc_users; + unsigned long next_gc; + unsigned long nrhosts; + + struct delayed_work grace_period_end; + struct lock_manager lockd_manager; + struct list_head grace_list; }; extern int lockd_net_id; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 80938fda67e0..31a63f87b806 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -87,32 +87,36 @@ static unsigned long get_lockd_grace_period(void) return nlm_timeout * 5 * HZ; } -static struct lock_manager lockd_manager = { -}; - -static void grace_ender(struct work_struct *not_used) +static void grace_ender(struct work_struct *grace) { - locks_end_grace(&lockd_manager); -} + struct delayed_work *dwork = container_of(grace, struct delayed_work, + work); + struct lockd_net *ln = container_of(dwork, struct lockd_net, + grace_period_end); -static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + locks_end_grace(&ln->lockd_manager); +} -static void set_grace_period(void) +static void set_grace_period(struct net *net) { unsigned long grace_period = get_lockd_grace_period(); + struct lockd_net *ln = net_generic(net, lockd_net_id); - locks_start_grace(&lockd_manager); - cancel_delayed_work_sync(&grace_period_end); - schedule_delayed_work(&grace_period_end, grace_period); + locks_start_grace(net, &ln->lockd_manager); + cancel_delayed_work_sync(&ln->grace_period_end); + schedule_delayed_work(&ln->grace_period_end, grace_period); } static void restart_grace(void) { if (nlmsvc_ops) { - cancel_delayed_work_sync(&grace_period_end); - locks_end_grace(&lockd_manager); + struct net *net = &init_net; + struct lockd_net *ln = net_generic(net, lockd_net_id); + + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); nlmsvc_invalidate_all(); - set_grace_period(); + set_grace_period(net); } } @@ -137,8 +141,6 @@ lockd(void *vrqstp) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - set_grace_period(); - /* * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. @@ -184,8 +186,6 @@ lockd(void *vrqstp) svc_process(rqstp); } flush_signals(current); - cancel_delayed_work_sync(&grace_period_end); - locks_end_grace(&lockd_manager); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); @@ -266,6 +266,7 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) error = make_socks(serv, net); if (error < 0) goto err_socks; + set_grace_period(net); dprintk("lockd_up_net: per-net data created; net=%p\n", net); return 0; @@ -283,6 +284,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); } @@ -589,6 +592,10 @@ module_param(nlm_max_connections, uint, 0644); static int lockd_init_net(struct net *net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); + INIT_LIST_HEAD(&ln->grace_list); return 0; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 9a41fdc19511..b147d1ae71fd 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -11,6 +11,7 @@ #include <linux/time.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> +#include <linux/sunrpc/svc_xprt.h> #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -151,7 +152,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -161,7 +162,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Try to cancel request. */ - resp->status = nlmsvc_cancel_blocked(file, &argp->lock); + resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); nlmsvc_release_host(host); @@ -184,7 +185,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -194,7 +195,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now try to remove the lock */ - resp->status = nlmsvc_unlock(file, &argp->lock); + resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); nlmsvc_release_host(host); @@ -256,6 +257,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args return rpc_system_err; call = nlm_alloc_call(host); + nlmsvc_release_host(host); if (call == NULL) return rpc_system_err; @@ -321,7 +323,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (locks_in_grace() && !argp->reclaim) { + if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -354,7 +356,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e46353f41a42..fb1a2bedbe97 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -26,7 +26,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> #include <linux/kthread.h> @@ -219,7 +219,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, struct nlm_block *block; struct nlm_rqst *call = NULL; - nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return NULL; @@ -447,11 +446,11 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - if (locks_in_grace() && !reclaim) { + if (locks_in_grace(SVC_NET(rqstp)) && !reclaim) { ret = nlm_lck_denied_grace_period; goto out; } - if (reclaim && !locks_in_grace()) { + if (reclaim && !locks_in_grace(SVC_NET(rqstp))) { ret = nlm_lck_denied_grace_period; goto out; } @@ -559,7 +558,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { ret = nlm_lck_denied_grace_period; goto out; } @@ -603,7 +602,7 @@ out: * must be removed. */ __be32 -nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) +nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { int error; @@ -615,7 +614,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_end); /* First, cancel any lock that might be there */ - nlmsvc_cancel_blocked(file, lock); + nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); @@ -631,7 +630,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) * The calling procedure must check whether the file can be closed. */ __be32 -nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) +nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { struct nlm_block *block; int status = 0; @@ -643,7 +642,7 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); - if (locks_in_grace()) + if (locks_in_grace(net)) return nlm_lck_denied_grace_period; mutex_lock(&file->f_mutex); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index d27aab11f324..3009a365e082 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -11,6 +11,7 @@ #include <linux/time.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> +#include <linux/sunrpc/svc_xprt.h> #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -175,13 +176,14 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; + struct net *net = SVC_NET(rqstp); dprintk("lockd: CANCEL called\n"); resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(net)) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -191,7 +193,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Try to cancel request. */ - resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); + resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock)); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); nlmsvc_release_host(host); @@ -208,13 +210,14 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; + struct net *net = SVC_NET(rqstp); dprintk("lockd: UNLOCK called\n"); resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(net)) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -224,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now try to remove the lock */ - resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); + resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock)); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); nlmsvc_release_host(host); @@ -294,6 +297,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args return rpc_system_err; call = nlm_alloc_call(host); + nlmsvc_release_host(host); if (call == NULL) return rpc_system_err; @@ -361,7 +365,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (locks_in_grace() && !argp->reclaim) { + if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -394,7 +398,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 2240d384d787..0deb5f6c9dd4 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -309,7 +309,8 @@ nlm_release_file(struct nlm_file *file) * Helpers function for resource traversal * * nlmsvc_mark_host: - * used by the garbage collector; simply sets h_inuse. + * used by the garbage collector; simply sets h_inuse only for those + * hosts, which passed network check. * Always returns 0. * * nlmsvc_same_host: @@ -320,12 +321,15 @@ nlm_release_file(struct nlm_file *file) * returns 1 iff the host is a client. * Used by nlmsvc_invalidate_all */ + static int -nlmsvc_mark_host(void *data, struct nlm_host *dummy) +nlmsvc_mark_host(void *data, struct nlm_host *hint) { struct nlm_host *host = data; - host->h_inuse = 1; + if ((hint->net == NULL) || + (host->net == hint->net)) + host->h_inuse = 1; return 0; } @@ -358,10 +362,13 @@ nlmsvc_is_client(void *data, struct nlm_host *dummy) * Mark all hosts that still hold resources */ void -nlmsvc_mark_resources(void) +nlmsvc_mark_resources(struct net *net) { - dprintk("lockd: nlmsvc_mark_resources\n"); - nlm_traverse_files(NULL, nlmsvc_mark_host, NULL); + struct nlm_host hint; + + dprintk("lockd: nlmsvc_mark_resources for net %p\n", net); + hint.net = net; + nlm_traverse_files(&hint, nlmsvc_mark_host, NULL); } /* diff --git a/fs/locks.c b/fs/locks.c index 82c353304f9e..7e81bfc75164 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -200,11 +200,7 @@ void locks_release_private(struct file_lock *fl) fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; } - if (fl->fl_lmops) { - if (fl->fl_lmops->lm_release_private) - fl->fl_lmops->lm_release_private(fl); - fl->fl_lmops = NULL; - } + fl->fl_lmops = NULL; } EXPORT_SYMBOL_GPL(locks_release_private); @@ -427,18 +423,8 @@ static void lease_break_callback(struct file_lock *fl) kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); } -static void lease_release_private_callback(struct file_lock *fl) -{ - if (!fl->fl_file) - return; - - f_delown(fl->fl_file); - fl->fl_file->f_owner.signum = 0; -} - static const struct lock_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, - .lm_release_private = lease_release_private_callback, .lm_change = lease_modify, }; @@ -580,12 +566,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p) fl->fl_next = NULL; list_del_init(&fl->fl_link); - fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); - if (fl->fl_fasync != NULL) { - printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); - fl->fl_fasync = NULL; - } - if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; @@ -1155,8 +1135,18 @@ int lease_modify(struct file_lock **before, int arg) return error; lease_clear_pending(fl, arg); locks_wake_up_blocks(fl); - if (arg == F_UNLCK) + if (arg == F_UNLCK) { + struct file *filp = fl->fl_file; + + f_delown(filp); + filp->f_owner.signum = 0; + fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); + if (fl->fl_fasync != NULL) { + printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); + fl->fl_fasync = NULL; + } locks_delete_lock(before); + } return 0; } diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index 13487ad16894..78e2d93e5c83 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -32,7 +32,8 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) if (block < 0) { printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", block, bdevname(sb->s_bdev, b)); - } else if (block >= (minix_sb(inode->i_sb)->s_max_size/sb->s_blocksize)) { + } else if ((u64)block * (u64)sb->s_blocksize >= + minix_sb(sb)->s_max_size) { if (printk_ratelimit()) printk("MINIX-fs: block_to_path: " "block %ld too big on dev %s\n", diff --git a/fs/namei.c b/fs/namei.c index 2ccc35c4dc24..db76b866a097 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -650,6 +650,121 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki path_put(link); } +int sysctl_protected_symlinks __read_mostly = 1; +int sysctl_protected_hardlinks __read_mostly = 1; + +/** + * may_follow_link - Check symlink following for unsafe situations + * @link: The path of the symlink + * + * In the case of the sysctl_protected_symlinks sysctl being enabled, + * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is + * in a sticky world-writable directory. This is to protect privileged + * processes from failing races against path names that may change out + * from under them by way of other users creating malicious symlinks. + * It will permit symlinks to be followed only when outside a sticky + * world-writable directory, or when the uid of the symlink and follower + * match, or when the directory owner matches the symlink's owner. + * + * Returns 0 if following the symlink is allowed, -ve on error. + */ +static inline int may_follow_link(struct path *link, struct nameidata *nd) +{ + const struct inode *inode; + const struct inode *parent; + + if (!sysctl_protected_symlinks) + return 0; + + /* Allowed if owner and follower match. */ + inode = link->dentry->d_inode; + if (current_cred()->fsuid == inode->i_uid) + return 0; + + /* Allowed if parent directory not sticky and world-writable. */ + parent = nd->path.dentry->d_inode; + if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) + return 0; + + /* Allowed if parent directory and link owner match. */ + if (parent->i_uid == inode->i_uid) + return 0; + + path_put_conditional(link, nd); + path_put(&nd->path); + audit_log_link_denied("follow_link", link); + return -EACCES; +} + +/** + * safe_hardlink_source - Check for safe hardlink conditions + * @inode: the source inode to hardlink from + * + * Return false if at least one of the following conditions: + * - inode is not a regular file + * - inode is setuid + * - inode is setgid and group-exec + * - access failure for read and write + * + * Otherwise returns true. + */ +static bool safe_hardlink_source(struct inode *inode) +{ + umode_t mode = inode->i_mode; + + /* Special files should not get pinned to the filesystem. */ + if (!S_ISREG(mode)) + return false; + + /* Setuid files should not get pinned to the filesystem. */ + if (mode & S_ISUID) + return false; + + /* Executable setgid files should not get pinned to the filesystem. */ + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) + return false; + + /* Hardlinking to unreadable or unwritable sources is dangerous. */ + if (inode_permission(inode, MAY_READ | MAY_WRITE)) + return false; + + return true; +} + +/** + * may_linkat - Check permissions for creating a hardlink + * @link: the source to hardlink from + * + * Block hardlink when all of: + * - sysctl_protected_hardlinks enabled + * - fsuid does not match inode + * - hardlink source is unsafe (see safe_hardlink_source() above) + * - not CAP_FOWNER + * + * Returns 0 if successful, -ve on error. + */ +static int may_linkat(struct path *link) +{ + const struct cred *cred; + struct inode *inode; + + if (!sysctl_protected_hardlinks) + return 0; + + cred = current_cred(); + inode = link->dentry->d_inode; + + /* Source inode owner (or CAP_FOWNER) can hardlink all they like, + * otherwise, it must be a safe source. + */ + if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || + capable(CAP_FOWNER)) + return 0; + + audit_log_link_denied("linkat", link); + return -EPERM; +} + static __always_inline int follow_link(struct path *link, struct nameidata *nd, void **p) { @@ -1818,6 +1933,9 @@ static int path_lookupat(int dfd, const char *name, while (err > 0) { void *cookie; struct path link = path; + err = may_follow_link(&link, nd); + if (unlikely(err)) + break; nd->flags |= LOOKUP_PARENT; err = follow_link(&link, nd, &cookie); if (err) @@ -2277,7 +2395,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) static int atomic_open(struct nameidata *nd, struct dentry *dentry, struct path *path, struct file *file, const struct open_flags *op, - bool *want_write, bool need_lookup, + bool got_write, bool need_lookup, int *opened) { struct inode *dir = nd->path.dentry->d_inode; @@ -2296,11 +2414,11 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, goto out; } - mode = op->mode & S_IALLUGO; + mode = op->mode; if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) mode &= ~current_umask(); - if (open_flag & O_EXCL) { + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { open_flag &= ~O_TRUNC; *opened |= FILE_CREATED; } @@ -2314,12 +2432,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ - if ((open_flag & (O_CREAT | O_TRUNC)) || - (open_flag & O_ACCMODE) != O_RDONLY) { - error = mnt_want_write(nd->path.mnt); - if (!error) { - *want_write = true; - } else if (!(open_flag & O_CREAT)) { + if (((open_flag & (O_CREAT | O_TRUNC)) || + (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { + if (!(open_flag & O_CREAT)) { /* * No O_CREATE -> atomicity not a requirement -> fall * back to lookup + open @@ -2327,17 +2442,17 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, goto no_open; } else if (open_flag & (O_EXCL | O_TRUNC)) { /* Fall back and fail with the right error */ - create_error = error; + create_error = -EROFS; goto no_open; } else { /* No side effects, safe to clear O_CREAT */ - create_error = error; + create_error = -EROFS; open_flag &= ~O_CREAT; } } if (open_flag & O_CREAT) { - error = may_o_create(&nd->path, dentry, op->mode); + error = may_o_create(&nd->path, dentry, mode); if (error) { create_error = error; if (open_flag & O_EXCL) @@ -2374,6 +2489,10 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, dput(dentry); dentry = file->f_path.dentry; } + if (create_error && dentry->d_inode == NULL) { + error = create_error; + goto out; + } goto looked_up; } @@ -2438,7 +2557,7 @@ looked_up: static int lookup_open(struct nameidata *nd, struct path *path, struct file *file, const struct open_flags *op, - bool *want_write, int *opened) + bool got_write, int *opened) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; @@ -2456,7 +2575,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, goto out_no_open; if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { - return atomic_open(nd, dentry, path, file, op, want_write, + return atomic_open(nd, dentry, path, file, op, got_write, need_lookup, opened); } @@ -2480,10 +2599,10 @@ static int lookup_open(struct nameidata *nd, struct path *path, * a permanent write count is taken through * the 'struct file' in finish_open(). */ - error = mnt_want_write(nd->path.mnt); - if (error) + if (!got_write) { + error = -EROFS; goto out_dput; - *want_write = true; + } *opened |= FILE_CREATED; error = security_path_mknod(&nd->path, dentry, mode, 0); if (error) @@ -2513,7 +2632,7 @@ static int do_last(struct nameidata *nd, struct path *path, struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; - bool want_write = false; + bool got_write = false; int acc_mode = op->acc_mode; struct inode *inode; bool symlink_ok = false; @@ -2582,8 +2701,18 @@ static int do_last(struct nameidata *nd, struct path *path, } retry_lookup: + if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { + error = mnt_want_write(nd->path.mnt); + if (!error) + got_write = true; + /* + * do _not_ fail yet - we might not need that or fail with + * a different error; let lookup_open() decide; we'll be + * dropping this one anyway. + */ + } mutex_lock(&dir->d_inode->i_mutex); - error = lookup_open(nd, path, file, op, &want_write, opened); + error = lookup_open(nd, path, file, op, got_write, opened); mutex_unlock(&dir->d_inode->i_mutex); if (error <= 0) { @@ -2608,22 +2737,23 @@ retry_lookup: } /* - * It already exists. + * create/update audit record if it already exists. */ - audit_inode(pathname, path->dentry); + if (path->dentry->d_inode) + audit_inode(pathname, path->dentry); /* * If atomic_open() acquired write access it is dropped now due to * possible mount and symlink following (this might be optimized away if * necessary...) */ - if (want_write) { + if (got_write) { mnt_drop_write(nd->path.mnt); - want_write = false; + got_write = false; } error = -EEXIST; - if (open_flag & O_EXCL) + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) goto exit_dput; error = follow_managed(path, nd->flags); @@ -2684,7 +2814,7 @@ finish_open: error = mnt_want_write(nd->path.mnt); if (error) goto out; - want_write = true; + got_write = true; } finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); @@ -2711,7 +2841,7 @@ opened: goto exit_fput; } out: - if (want_write) + if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); terminate_walk(nd); @@ -2735,9 +2865,9 @@ stale_open: nd->inode = dir->d_inode; save_parent.mnt = NULL; save_parent.dentry = NULL; - if (want_write) { + if (got_write) { mnt_drop_write(nd->path.mnt); - want_write = false; + got_write = false; } retried = true; goto retry_lookup; @@ -2777,6 +2907,9 @@ static struct file *path_openat(int dfd, const char *pathname, error = -ELOOP; break; } + error = may_follow_link(&link, nd); + if (unlikely(error)) + break; nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); error = follow_link(&link, nd, &cookie); @@ -2846,6 +2979,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path { struct dentry *dentry = ERR_PTR(-EEXIST); struct nameidata nd; + int err2; int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); @@ -2859,16 +2993,19 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path nd.flags &= ~LOOKUP_PARENT; nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; + /* don't fail immediately if it's r/o, at least try to report other errors */ + err2 = mnt_want_write(nd.path.mnt); /* * Do the final lookup. */ mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); if (IS_ERR(dentry)) - goto fail; + goto unlock; + error = -EEXIST; if (dentry->d_inode) - goto eexist; + goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - @@ -2876,23 +3013,37 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!is_dir && nd.last.name[nd.last.len])) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); + error = -ENOENT; + goto fail; + } + if (unlikely(err2)) { + error = err2; goto fail; } *path = nd.path; return dentry; -eexist: - dput(dentry); - dentry = ERR_PTR(-EEXIST); fail: + dput(dentry); + dentry = ERR_PTR(error); +unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + if (!err2) + mnt_drop_write(nd.path.mnt); out: path_put(&nd.path); return dentry; } EXPORT_SYMBOL(kern_path_create); +void done_path_create(struct path *path, struct dentry *dentry) +{ + dput(dentry); + mutex_unlock(&path->dentry->d_inode->i_mutex); + mnt_drop_write(path->mnt); + path_put(path); +} +EXPORT_SYMBOL(done_path_create); + struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) { char *tmp = getname(pathname); @@ -2956,8 +3107,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, struct path path; int error; - if (S_ISDIR(mode)) - return -EPERM; + error = may_mknod(mode); + if (error) + return error; dentry = user_path_create(dfd, filename, &path, 0); if (IS_ERR(dentry)) @@ -2965,15 +3117,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = may_mknod(mode); - if (error) - goto out_dput; - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_mknod(&path, dentry, mode, dev); if (error) - goto out_drop_write; + goto out; switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(path.dentry->d_inode,dentry,mode,true); @@ -2986,13 +3132,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); break; } -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); - +out: + done_path_create(&path, dentry); return error; } @@ -3038,19 +3179,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_mkdir(&path, dentry, mode); - if (error) - goto out_drop_write; - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + if (!error) + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + done_path_create(&path, dentry); return error; } @@ -3144,6 +3276,9 @@ static long do_rmdir(int dfd, const char __user *pathname) } nd.flags &= ~LOOKUP_PARENT; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit1; mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); @@ -3154,19 +3289,15 @@ static long do_rmdir(int dfd, const char __user *pathname) error = -ENOENT; goto exit3; } - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit3; error = security_path_rmdir(&nd.path, dentry); if (error) - goto exit4; + goto exit3; error = vfs_rmdir(nd.path.dentry->d_inode, dentry); -exit4: - mnt_drop_write(nd.path.mnt); exit3: dput(dentry); exit2: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); putname(name); @@ -3233,6 +3364,9 @@ static long do_unlinkat(int dfd, const char __user *pathname) goto exit1; nd.flags &= ~LOOKUP_PARENT; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit1; mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); @@ -3245,21 +3379,17 @@ static long do_unlinkat(int dfd, const char __user *pathname) if (!inode) goto slashes; ihold(inode); - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit2; error = security_path_unlink(&nd.path, dentry); if (error) - goto exit3; + goto exit2; error = vfs_unlink(nd.path.dentry->d_inode, dentry); -exit3: - mnt_drop_write(nd.path.mnt); - exit2: +exit2: dput(dentry); } mutex_unlock(&nd.path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ + mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); putname(name); @@ -3324,19 +3454,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, if (IS_ERR(dentry)) goto out_putname; - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_symlink(&path, dentry, from); - if (error) - goto out_drop_write; - error = vfs_symlink(path.dentry->d_inode, dentry, from); -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + if (!error) + error = vfs_symlink(path.dentry->d_inode, dentry, from); + done_path_create(&path, dentry); out_putname: putname(from); return error; @@ -3436,19 +3557,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = mnt_want_write(new_path.mnt); - if (error) + error = may_linkat(&old_path); + if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) - goto out_drop_write; + goto out_dput; error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); -out_drop_write: - mnt_drop_write(new_path.mnt); out_dput: - dput(new_dentry); - mutex_unlock(&new_path.dentry->d_inode->i_mutex); - path_put(&new_path); + done_path_create(&new_path, new_dentry); out: path_put(&old_path); @@ -3644,6 +3761,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, if (newnd.last_type != LAST_NORM) goto exit2; + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit2; + oldnd.flags &= ~LOOKUP_PARENT; newnd.flags &= ~LOOKUP_PARENT; newnd.flags |= LOOKUP_RENAME_TARGET; @@ -3679,23 +3800,19 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, if (new_dentry == trap) goto exit5; - error = mnt_want_write(oldnd.path.mnt); - if (error) - goto exit5; error = security_path_rename(&oldnd.path, old_dentry, &newnd.path, new_dentry); if (error) - goto exit6; + goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); -exit6: - mnt_drop_write(oldnd.path.mnt); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_dir, old_dir); + mnt_drop_write(oldnd.path.mnt); exit2: path_put(&newnd.path); putname(to); diff --git a/fs/namespace.c b/fs/namespace.c index c53d3381b0d0..4d31f73e2561 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -283,24 +283,22 @@ static int mnt_is_readonly(struct vfsmount *mnt) } /* - * Most r/o checks on a fs are for operations that take - * discrete amounts of time, like a write() or unlink(). - * We must keep track of when those operations start - * (for permission checks) and when they end, so that - * we can determine when writes are able to occur to - * a filesystem. + * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink(). We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem. */ /** - * mnt_want_write - get write access to a mount + * __mnt_want_write - get write access to a mount without freeze protection * @m: the mount on which to take a write * - * This tells the low-level filesystem that a write is - * about to be performed to it, and makes sure that - * writes are allowed before returning success. When - * the write operation is finished, mnt_drop_write() - * must be called. This is effectively a refcount. + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount. */ -int mnt_want_write(struct vfsmount *m) +int __mnt_want_write(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; @@ -326,6 +324,27 @@ int mnt_want_write(struct vfsmount *m) ret = -EROFS; } preempt_enable(); + + return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success. When the write operation is + * finished, mnt_drop_write() must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *m) +{ + int ret; + + sb_start_write(m->mnt_sb); + ret = __mnt_want_write(m); + if (ret) + sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); @@ -355,38 +374,76 @@ int mnt_clone_write(struct vfsmount *mnt) EXPORT_SYMBOL_GPL(mnt_clone_write); /** - * mnt_want_write_file - get write access to a file's mount + * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like mnt_want_write, but it takes a file and can + * This is like __mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already */ -int mnt_want_write_file(struct file *file) +int __mnt_want_write_file(struct file *file) { struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) - return mnt_want_write(file->f_path.mnt); + return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); } + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + sb_start_write(file->f_path.mnt->mnt_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file->f_path.mnt->mnt_sb); + return ret; +} EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * mnt_drop_write - give up write access to a mount + * __mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * mnt_want_write() call above. + * __mnt_want_write() call above. */ -void mnt_drop_write(struct vfsmount *mnt) +void __mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ + __mnt_drop_write(mnt); + sb_end_write(mnt->mnt_sb); +} EXPORT_SYMBOL_GPL(mnt_drop_write); +void __mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); +} + void mnt_drop_write_file(struct file *file) { mnt_drop_write(file->f_path.mnt); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f90f4f5cd421..db7ad719628a 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -30,7 +30,7 @@ config NFS_FS If unsure, say N. config NFS_V2 - bool "NFS client support for NFS version 2" + tristate "NFS client support for NFS version 2" depends on NFS_FS default y help @@ -40,7 +40,7 @@ config NFS_V2 If unsure, say Y. config NFS_V3 - bool "NFS client support for NFS version 3" + tristate "NFS client support for NFS version 3" depends on NFS_FS default y help @@ -72,7 +72,7 @@ config NFS_V3_ACL If unsure, say N. config NFS_V4 - bool "NFS client support for NFS version 4" + tristate "NFS client support for NFS version 4" depends on NFS_FS select SUNRPC_GSS select KEYS @@ -86,11 +86,18 @@ config NFS_V4 If unsure, say Y. +config NFS_SWAP + bool "Provide swap over NFS support" + default n + depends on NFS_FS + select SUNRPC_SWAP + help + This option enables swapon to work on files located on NFS mounts. + config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" - depends on NFS_FS && NFS_V4 && EXPERIMENTAL + depends on NFS_V4 && EXPERIMENTAL select SUNRPC_BACKCHANNEL - select PNFS_FILE_LAYOUT help This option enables support for minor version 1 of the NFSv4 protocol (RFC 5661) in the kernel's NFS client. @@ -99,15 +106,17 @@ config NFS_V4_1 config PNFS_FILE_LAYOUT tristate + depends on NFS_V4_1 + default m config PNFS_BLOCK tristate - depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM + depends on NFS_V4_1 && BLK_DEV_DM default m config PNFS_OBJLAYOUT tristate - depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD + depends on NFS_V4_1 && SCSI_OSD_ULD default m config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 7ddd45d9f170..8bf3a3f6925a 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -9,17 +9,23 @@ nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ write.o namespace.o mount_clnt.o \ dns_resolve.o cache_lib.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o -nfs-$(CONFIG_NFS_V2) += proc.o nfs2xdr.o -nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o -nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o -nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ - delegation.o idmap.o \ - callback.o callback_xdr.o callback_proc.o \ - nfs4namespace.o -nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o -nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o +obj-$(CONFIG_NFS_V2) += nfs2.o +nfs2-y := nfs2super.o proc.o nfs2xdr.o + +obj-$(CONFIG_NFS_V3) += nfs3.o +nfs3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o +nfs3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o + +obj-$(CONFIG_NFS_V4) += nfs4.o +nfs4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ + delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o nfs4getroot.o nfs4client.o +nfs4-$(CONFIG_SYSCTL) += nfs4sysctl.o +nfs4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o + obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 7ae8a608956f..dd392ed5f2e2 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -228,6 +228,14 @@ bl_end_par_io_read(void *data, int unused) schedule_work(&rdata->task.u.tk_work); } +static bool +bl_check_alignment(u64 offset, u32 len, unsigned long blkmask) +{ + if ((offset & blkmask) || (len & blkmask)) + return false; + return true; +} + static enum pnfs_try_status bl_read_pagelist(struct nfs_read_data *rdata) { @@ -244,6 +252,9 @@ bl_read_pagelist(struct nfs_read_data *rdata) dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); + if (!bl_check_alignment(f_offset, rdata->args.count, PAGE_CACHE_MASK)) + goto use_mds; + par = alloc_parallel(rdata); if (!par) goto use_mds; @@ -552,7 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) struct bio *bio = NULL; struct pnfs_block_extent *be = NULL, *cow_read = NULL; sector_t isect, last_isect = 0, extent_length = 0; - struct parallel_io *par; + struct parallel_io *par = NULL; loff_t offset = wdata->args.offset; size_t count = wdata->args.count; struct page **pages = wdata->args.pages; @@ -563,6 +574,10 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + /* Check for alignment first */ + if (!bl_check_alignment(offset, count, PAGE_CACHE_MASK)) + goto out_mds; + /* At this point, wdata->pages is a (sequential) list of nfs_pages. * We want to write each, and if there is an error set pnfs_error * to have it redone using nfs. @@ -996,14 +1011,32 @@ bl_clear_layoutdriver(struct nfs_server *server) return 0; } +static void +bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK)) + nfs_pageio_reset_read_mds(pgio); + else + pnfs_generic_pg_init_read(pgio, req); +} + +static void +bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK)) + nfs_pageio_reset_write_mds(pgio); + else + pnfs_generic_pg_init_write(pgio, req); +} + static const struct nfs_pageio_ops bl_pg_read_ops = { - .pg_init = pnfs_generic_pg_init_read, + .pg_init = bl_pg_init_read, .pg_test = pnfs_generic_pg_test, .pg_doio = pnfs_generic_pg_readpages, }; static const struct nfs_pageio_ops bl_pg_write_ops = { - .pg_init = pnfs_generic_pg_init_write, + .pg_init = bl_pg_init_write, .pg_test = pnfs_generic_pg_test, .pg_doio = pnfs_generic_pg_writepages, }; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 23ff18fe080a..4c8459e5bdee 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -37,31 +37,7 @@ static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; static DEFINE_MUTEX(nfs_callback_mutex); static struct svc_program nfs4_callback_program; -unsigned int nfs_callback_set_tcpport; -unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; -#define NFS_CALLBACK_MAXPORTNR (65535U) - -static int param_set_portnr(const char *val, const struct kernel_param *kp) -{ - unsigned long num; - int ret; - - if (!val) - return -EINVAL; - ret = strict_strtoul(val, 0, &num); - if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) - return -EINVAL; - *((unsigned int *)kp->arg) = num; - return 0; -} -static struct kernel_param_ops param_ops_portnr = { - .set = param_set_portnr, - .get = param_get_uint, -}; -#define param_check_portnr(name, p) __param_check(name, p, unsigned int); - -module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); /* * This is the NFSv4 callback kernel thread. @@ -265,6 +241,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) ret = -ENOMEM; goto out_err; } + /* As there is only one thread we need to over-ride the + * default maximum of 80 connections + */ + serv->sv_maxconn = 1024; ret = svc_bind(serv, net); if (ret < 0) { diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index a5527c90a5aa..b44d7b128b71 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -192,7 +192,7 @@ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_process_state *cps); extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, struct cb_process_state *cps); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); extern void nfs_callback_down(int minorversion); extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index e64b01d2a338..742ff4ffced7 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -863,7 +863,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r .drc_status = 0, .clp = NULL, .slotid = NFS4_NO_SLOT, - .net = rqstp->rq_xprt->xpt_net, + .net = SVC_NET(rqstp), }; unsigned int nops = 0; @@ -879,7 +879,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r return rpc_garbage_args; if (hdr_arg.minorversion == 0) { - cps.clp = nfs4_find_client_ident(rqstp->rq_xprt->xpt_net, hdr_arg.cb_ident); + cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) return rpc_drop_reply; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f005b5bebdc7..9fc0d9dfc91b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -51,54 +51,23 @@ #include "internal.h" #include "fscache.h" #include "pnfs.h" +#include "nfs.h" #include "netns.h" #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); -#ifdef CONFIG_NFS_V4 - -/* - * Get a unique NFSv4.0 callback identifier which will be used - * by the V4.0 callback service to lookup the nfs_client struct - */ -static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) -{ - int ret = 0; - struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); - - if (clp->rpc_ops->version != 4 || minorversion != 0) - return ret; -retry: - if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL)) - return -ENOMEM; - spin_lock(&nn->nfs_client_lock); - ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident); - spin_unlock(&nn->nfs_client_lock); - if (ret == -EAGAIN) - goto retry; - return ret; -} -#endif /* CONFIG_NFS_V4 */ - -/* - * Turn off NFSv4 uid/gid mapping when using AUTH_SYS - */ -static bool nfs4_disable_idmapping = true; +static DEFINE_SPINLOCK(nfs_version_lock); +static DEFINE_MUTEX(nfs_version_mutex); +static LIST_HEAD(nfs_versions); /* * RPC cruft for NFS */ static const struct rpc_version *nfs_version[5] = { -#ifdef CONFIG_NFS_V2 - [2] = &nfs_version2, -#endif -#ifdef CONFIG_NFS_V3 - [3] = &nfs_version3, -#endif -#ifdef CONFIG_NFS_V4 - [4] = &nfs_version4, -#endif + [2] = NULL, + [3] = NULL, + [4] = NULL, }; const struct rpc_program nfs_program = { @@ -114,32 +83,64 @@ struct rpc_stat nfs_rpcstat = { .program = &nfs_program }; +static struct nfs_subversion *find_nfs_version(unsigned int version) +{ + struct nfs_subversion *nfs; + spin_lock(&nfs_version_lock); -#ifdef CONFIG_NFS_V3_ACL -static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; -static const struct rpc_version *nfsacl_version[] = { - [3] = &nfsacl_version3, -}; + list_for_each_entry(nfs, &nfs_versions, list) { + if (nfs->rpc_ops->version == version) { + spin_unlock(&nfs_version_lock); + return nfs; + } + }; -const struct rpc_program nfsacl_program = { - .name = "nfsacl", - .number = NFS_ACL_PROGRAM, - .nrvers = ARRAY_SIZE(nfsacl_version), - .version = nfsacl_version, - .stats = &nfsacl_rpcstat, -}; -#endif /* CONFIG_NFS_V3_ACL */ - -struct nfs_client_initdata { - unsigned long init_flags; - const char *hostname; - const struct sockaddr *addr; - size_t addrlen; - const struct nfs_rpc_ops *rpc_ops; - int proto; - u32 minorversion; - struct net *net; -}; + spin_unlock(&nfs_version_lock); + return ERR_PTR(-EPROTONOSUPPORT);; +} + +struct nfs_subversion *get_nfs_version(unsigned int version) +{ + struct nfs_subversion *nfs = find_nfs_version(version); + + if (IS_ERR(nfs)) { + mutex_lock(&nfs_version_mutex); + request_module("nfs%d", version); + nfs = find_nfs_version(version); + mutex_unlock(&nfs_version_mutex); + } + + if (!IS_ERR(nfs)) + try_module_get(nfs->owner); + return nfs; +} + +void put_nfs_version(struct nfs_subversion *nfs) +{ + module_put(nfs->owner); +} + +void register_nfs_version(struct nfs_subversion *nfs) +{ + spin_lock(&nfs_version_lock); + + list_add(&nfs->list, &nfs_versions); + nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; + + spin_unlock(&nfs_version_lock); +} +EXPORT_SYMBOL_GPL(register_nfs_version); + +void unregister_nfs_version(struct nfs_subversion *nfs) +{ + spin_lock(&nfs_version_lock); + + nfs_version[nfs->rpc_ops->version] = NULL; + list_del(&nfs->list); + + spin_unlock(&nfs_version_lock); +} +EXPORT_SYMBOL_GPL(unregister_nfs_version); /* * Allocate a shared client record @@ -147,7 +148,7 @@ struct nfs_client_initdata { * Since these are allocated/deallocated very rarely, we don't * bother putting them in a slab cache... */ -static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) +struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; struct rpc_cred *cred; @@ -156,7 +157,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; - clp->rpc_ops = cl_init->rpc_ops; + clp->cl_nfs_mod = cl_init->nfs_mod; + try_module_get(clp->cl_nfs_mod->owner); + + clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; atomic_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; @@ -177,18 +181,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_proto = cl_init->proto; clp->cl_net = get_net(cl_init->net); -#ifdef CONFIG_NFS_V4 - err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); - if (err) - goto error_cleanup; - - spin_lock_init(&clp->cl_lock); - INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); - rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); - clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; - clp->cl_minorversion = cl_init->minorversion; - clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; -#endif cred = rpc_lookup_machine_cred("*"); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; @@ -197,51 +189,14 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ return clp; error_cleanup: + put_nfs_version(clp->cl_nfs_mod); kfree(clp); error_0: return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(nfs_alloc_client); -#ifdef CONFIG_NFS_V4 -#ifdef CONFIG_NFS_V4_1 -static void nfs4_shutdown_session(struct nfs_client *clp) -{ - if (nfs4_has_session(clp)) { - nfs4_destroy_session(clp->cl_session); - nfs4_destroy_clientid(clp); - } - -} -#else /* CONFIG_NFS_V4_1 */ -static void nfs4_shutdown_session(struct nfs_client *clp) -{ -} -#endif /* CONFIG_NFS_V4_1 */ - -/* - * Destroy the NFS4 callback service - */ -static void nfs4_destroy_callback(struct nfs_client *clp) -{ - if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_mvops->minor_version); -} - -static void nfs4_shutdown_client(struct nfs_client *clp) -{ - if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) - nfs4_kill_renewd(clp); - nfs4_shutdown_session(clp); - nfs4_destroy_callback(clp); - if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) - nfs_idmap_delete(clp); - - rpc_destroy_wait_queue(&clp->cl_rpcwaitq); - kfree(clp->cl_serverowner); - kfree(clp->cl_serverscope); - kfree(clp->cl_implid); -} - +#if IS_ENABLED(CONFIG_NFS_V4) /* idr_remove_all is not needed as all id's are removed by nfs_put_client */ void nfs_cleanup_cb_ident_idr(struct net *net) { @@ -264,16 +219,7 @@ static void pnfs_init_server(struct nfs_server *server) rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } -static void nfs4_destroy_server(struct nfs_server *server) -{ - nfs4_purge_state_owners(server); -} - #else -static void nfs4_shutdown_client(struct nfs_client *clp) -{ -} - void nfs_cleanup_cb_ident_idr(struct net *net) { } @@ -291,12 +237,10 @@ static void pnfs_init_server(struct nfs_server *server) /* * Destroy a shared client record */ -static void nfs_free_client(struct nfs_client *clp) +void nfs_free_client(struct nfs_client *clp) { dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); - nfs4_shutdown_client(clp); - nfs_fscache_release_client_cookie(clp); /* -EIO all pending I/O */ @@ -307,11 +251,13 @@ static void nfs_free_client(struct nfs_client *clp) put_rpccred(clp->cl_machine_cred); put_net(clp->cl_net); + put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp); dprintk("<-- nfs_free_client()\n"); } +EXPORT_SYMBOL_GPL(nfs_free_client); /* * Release a reference to a shared client record @@ -333,7 +279,7 @@ void nfs_put_client(struct nfs_client *clp) BUG_ON(!list_empty(&clp->cl_superblocks)); - nfs_free_client(clp); + clp->rpc_ops->free_client(clp); } } EXPORT_SYMBOL_GPL(nfs_put_client); @@ -412,8 +358,8 @@ static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, * Test if two socket addresses represent the same actual socket, * by comparing (only) relevant fields, excluding the port number. */ -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) +int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) { if (sa1->sa_family != sa2->sa_family) return 0; @@ -426,6 +372,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } return 0; } +EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr); #endif /* CONFIG_NFS_V4_1 */ /* @@ -447,33 +394,6 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, return 0; } -#if defined(CONFIG_NFS_V4_1) -/* Common match routine for v4.0 and v4.1 callback services */ -static bool nfs4_cb_match_client(const struct sockaddr *addr, - struct nfs_client *clp, u32 minorversion) -{ - struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - - /* Don't match clients that failed to initialise */ - if (!(clp->cl_cons_state == NFS_CS_READY || - clp->cl_cons_state == NFS_CS_SESSION_INITING)) - return false; - - smp_rmb(); - - /* Match the version and minorversion */ - if (clp->rpc_ops->version != 4 || - clp->cl_minorversion != minorversion) - return false; - - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(addr, clap)) - return false; - - return true; -} -#endif /* CONFIG_NFS_V4_1 */ - /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. @@ -491,7 +411,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops != data->rpc_ops) + if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; if (clp->cl_proto != data->proto) @@ -519,6 +439,7 @@ int nfs_wait_client_init_complete(const struct nfs_client *clp) return wait_event_killable(nfs_client_active_wq, nfs_client_init_is_complete(clp)); } +EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete); /* * Found an existing client. Make sure it's ready before returning. @@ -552,7 +473,7 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ -static struct nfs_client * +struct nfs_client * nfs_get_client(const struct nfs_client_initdata *cl_init, const struct rpc_timeout *timeparms, const char *ip_addr, @@ -560,9 +481,10 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, { struct nfs_client *clp, *new = NULL; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); + const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; dprintk("--> nfs_get_client(%s,v%u)\n", - cl_init->hostname ?: "", cl_init->rpc_ops->version); + cl_init->hostname ?: "", rpc_ops->version); /* see if the client already exists */ do { @@ -572,27 +494,27 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, if (clp) { spin_unlock(&nn->nfs_client_lock); if (new) - nfs_free_client(new); + new->rpc_ops->free_client(new); return nfs_found_client(cl_init, clp); } if (new) { list_add(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; - return cl_init->rpc_ops->init_client(new, - timeparms, ip_addr, - authflavour); + return rpc_ops->init_client(new, timeparms, ip_addr, + authflavour); } spin_unlock(&nn->nfs_client_lock); - new = nfs_alloc_client(cl_init); + new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", cl_init->hostname ?: "", PTR_ERR(new)); return new; } +EXPORT_SYMBOL_GPL(nfs_get_client); /* * Mark a server as ready or failed @@ -603,11 +525,12 @@ void nfs_mark_client_ready(struct nfs_client *clp, int state) clp->cl_cons_state = state; wake_up_all(&nfs_client_active_wq); } +EXPORT_SYMBOL_GPL(nfs_mark_client_ready); /* * Initialise the timeout values for a connection */ -static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, +void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans) { to->to_initval = timeo * HZ / 10; @@ -644,13 +567,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, BUG(); } } +EXPORT_SYMBOL_GPL(nfs_init_timeout_values); /* * Create an RPC client handle */ -static int nfs_create_rpc_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - rpc_authflavor_t flavor) +int nfs_create_rpc_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + rpc_authflavor_t flavor) { struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { @@ -683,6 +607,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, clp->cl_rpcclient = clnt; return 0; } +EXPORT_SYMBOL_GPL(nfs_create_rpc_client); /* * Version 2 or 3 client destruction @@ -735,39 +660,9 @@ static int nfs_start_lockd(struct nfs_server *server) } /* - * Initialise an NFSv3 ACL client connection - */ -#ifdef CONFIG_NFS_V3_ACL -static void nfs_init_server_aclclient(struct nfs_server *server) -{ - if (server->nfs_client->rpc_ops->version != 3) - goto out_noacl; - if (server->flags & NFS_MOUNT_NOACL) - goto out_noacl; - - server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); - if (IS_ERR(server->client_acl)) - goto out_noacl; - - /* No errors! Assume that Sun nfsacls are supported */ - server->caps |= NFS_CAP_ACLS; - return; - -out_noacl: - server->caps &= ~NFS_CAP_ACLS; -} -#else -static inline void nfs_init_server_aclclient(struct nfs_server *server) -{ - server->flags &= ~NFS_MOUNT_NOACL; - server->caps &= ~NFS_CAP_ACLS; -} -#endif - -/* * Create a general RPC client */ -static int nfs_init_server_rpcclient(struct nfs_server *server, +int nfs_init_server_rpcclient(struct nfs_server *server, const struct rpc_timeout *timeo, rpc_authflavor_t pseudoflavour) { @@ -799,6 +694,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, return 0; } +EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); /** * nfs_init_client - Initialise an NFS2 or NFS3 client @@ -838,18 +734,20 @@ error: dprintk("<-- nfs_init_client() = xerror %d\n", error); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(nfs_init_client); /* * Create a version 2 or 3 client */ static int nfs_init_server(struct nfs_server *server, - const struct nfs_parsed_mount_data *data) + const struct nfs_parsed_mount_data *data, + struct nfs_subversion *nfs_mod) { struct nfs_client_initdata cl_init = { .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, .addrlen = data->nfs_server.addrlen, - .rpc_ops = NULL, + .nfs_mod = nfs_mod, .proto = data->nfs_server.protocol, .net = data->net, }; @@ -859,21 +757,6 @@ static int nfs_init_server(struct nfs_server *server, dprintk("--> nfs_init_server()\n"); - switch (data->version) { -#ifdef CONFIG_NFS_V2 - case 2: - cl_init.rpc_ops = &nfs_v2_clientops; - break; -#endif -#ifdef CONFIG_NFS_V3 - case 3: - cl_init.rpc_ops = &nfs_v3_clientops; - break; -#endif - default: - return -EPROTONOSUPPORT; - } - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) @@ -927,8 +810,6 @@ static int nfs_init_server(struct nfs_server *server, server->mountd_protocol = data->mount_server.protocol; server->namelen = data->namlen; - /* Create a client RPC handle for the NFSv3 ACL management interface */ - nfs_init_server_aclclient(server); dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); return 0; @@ -975,7 +856,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; server->pnfs_blksize = fsinfo->blksize; - set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); @@ -1001,7 +881,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, /* * Probe filesystem information, including the FSID on v2/v3 */ -static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) { struct nfs_fsinfo fsinfo; struct nfs_client *clp = server->nfs_client; @@ -1041,11 +921,12 @@ out_error: dprintk("nfs_probe_fsinfo: error = %d\n", -error); return error; } +EXPORT_SYMBOL_GPL(nfs_probe_fsinfo); /* * Copy useful information when duplicating a server record */ -static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) +void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; target->rsize = source->rsize; @@ -1057,8 +938,9 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve target->caps = source->caps; target->options = source->options; } +EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); -static void nfs_server_insert_lists(struct nfs_server *server) +void nfs_server_insert_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); @@ -1070,6 +952,7 @@ static void nfs_server_insert_lists(struct nfs_server *server) spin_unlock(&nn->nfs_client_lock); } +EXPORT_SYMBOL_GPL(nfs_server_insert_lists); static void nfs_server_remove_lists(struct nfs_server *server) { @@ -1092,7 +975,7 @@ static void nfs_server_remove_lists(struct nfs_server *server) /* * Allocate and initialise a server record */ -static struct nfs_server *nfs_alloc_server(void) +struct nfs_server *nfs_alloc_server(void) { struct nfs_server *server; @@ -1129,6 +1012,7 @@ static struct nfs_server *nfs_alloc_server(void) return server; } +EXPORT_SYMBOL_GPL(nfs_alloc_server); /* * Free up a server record @@ -1138,7 +1022,6 @@ void nfs_free_server(struct nfs_server *server) dprintk("--> nfs_free_server()\n"); nfs_server_remove_lists(server); - unset_pnfs_layoutdriver(server); if (server->destroy != NULL) server->destroy(server); @@ -1158,13 +1041,14 @@ void nfs_free_server(struct nfs_server *server) nfs_release_automount_timer(); dprintk("<-- nfs_free_server()\n"); } +EXPORT_SYMBOL_GPL(nfs_free_server); /* * Create a version 2 or 3 volume record * - keyed on server and FSID */ -struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, - struct nfs_fh *mntfh) +struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) { struct nfs_server *server; struct nfs_fattr *fattr; @@ -1180,7 +1064,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, goto error; /* Get a client representation */ - error = nfs_init_server(server, data); + error = nfs_init_server(server, mount_info->parsed, nfs_mod); if (error < 0) goto error; @@ -1189,13 +1073,13 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); /* Probe the root fh to retrieve its FSID */ - error = nfs_probe_fsinfo(server, mntfh, fattr); + error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; - if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + if (!(mount_info->parsed->flags & NFS_MOUNT_NORDIRPLUS)) server->caps |= NFS_CAP_READDIRPLUS; } else { if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) @@ -1203,7 +1087,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, } if (!(fattr->valid & NFS_ATTR_FATTR)) { - error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; @@ -1225,522 +1109,7 @@ error: nfs_free_server(server); return ERR_PTR(error); } - -#ifdef CONFIG_NFS_V4 -/* - * NFSv4.0 callback thread helper - * - * Find a client by callback identifier - */ -struct nfs_client * -nfs4_find_client_ident(struct net *net, int cb_ident) -{ - struct nfs_client *clp; - struct nfs_net *nn = net_generic(net, nfs_net_id); - - spin_lock(&nn->nfs_client_lock); - clp = idr_find(&nn->cb_ident_idr, cb_ident); - if (clp) - atomic_inc(&clp->cl_count); - spin_unlock(&nn->nfs_client_lock); - return clp; -} - -#if defined(CONFIG_NFS_V4_1) -/* - * NFSv4.1 callback thread helper - * For CB_COMPOUND calls, find a client by IP address, protocol version, - * minorversion, and sessionID - * - * Returns NULL if no such client - */ -struct nfs_client * -nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, - struct nfs4_sessionid *sid) -{ - struct nfs_client *clp; - struct nfs_net *nn = net_generic(net, nfs_net_id); - - spin_lock(&nn->nfs_client_lock); - list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { - if (nfs4_cb_match_client(addr, clp, 1) == false) - continue; - - if (!nfs4_has_session(clp)) - continue; - - /* Match sessionid*/ - if (memcmp(clp->cl_session->sess_id.data, - sid->data, NFS4_MAX_SESSIONID_LEN) != 0) - continue; - - atomic_inc(&clp->cl_count); - spin_unlock(&nn->nfs_client_lock); - return clp; - } - spin_unlock(&nn->nfs_client_lock); - return NULL; -} - -#else /* CONFIG_NFS_V4_1 */ - -struct nfs_client * -nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, - struct nfs4_sessionid *sid) -{ - return NULL; -} -#endif /* CONFIG_NFS_V4_1 */ - -/* - * Initialize the NFS4 callback service - */ -static int nfs4_init_callback(struct nfs_client *clp) -{ - int error; - - if (clp->rpc_ops->version == 4) { - struct rpc_xprt *xprt; - - xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); - - if (nfs4_has_session(clp)) { - error = xprt_setup_backchannel(xprt, - NFS41_BC_MIN_CALLBACKS); - if (error < 0) - return error; - } - - error = nfs_callback_up(clp->cl_mvops->minor_version, xprt); - if (error < 0) { - dprintk("%s: failed to start callback. Error = %d\n", - __func__, error); - return error; - } - __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); - } - return 0; -} - -/* - * Initialize the minor version specific parts of an NFS4 client record - */ -static int nfs4_init_client_minor_version(struct nfs_client *clp) -{ -#if defined(CONFIG_NFS_V4_1) - if (clp->cl_mvops->minor_version) { - struct nfs4_session *session = NULL; - /* - * Create the session and mark it expired. - * When a SEQUENCE operation encounters the expired session - * it will do session recovery to initialize it. - */ - session = nfs4_alloc_session(clp); - if (!session) - return -ENOMEM; - - clp->cl_session = session; - /* - * The create session reply races with the server back - * channel probe. Mark the client NFS_CS_SESSION_INITING - * so that the client back channel can find the - * nfs_client struct - */ - nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING); - } -#endif /* CONFIG_NFS_V4_1 */ - - return nfs4_init_callback(clp); -} - -/** - * nfs4_init_client - Initialise an NFS4 client record - * - * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: callback IP address in presentation format - * @authflavor: authentication flavor for underlying RPC transport - * - * Returns pointer to an NFS client, or an ERR_PTR value. - */ -struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr, - rpc_authflavor_t authflavour) -{ - char buf[INET6_ADDRSTRLEN + 1]; - int error; - - if (clp->cl_cons_state == NFS_CS_READY) { - /* the client is initialised already */ - dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); - return clp; - } - - /* Check NFS protocol revision and initialize RPC op vector */ - clp->rpc_ops = &nfs_v4_clientops; - - __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, authflavour); - if (error < 0) - goto error; - - /* If no clientaddr= option was specified, find a usable cb address */ - if (ip_addr == NULL) { - struct sockaddr_storage cb_addr; - struct sockaddr *sap = (struct sockaddr *)&cb_addr; - - error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); - if (error < 0) - goto error; - error = rpc_ntop(sap, buf, sizeof(buf)); - if (error < 0) - goto error; - ip_addr = (const char *)buf; - } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); - - error = nfs_idmap_new(clp); - if (error < 0) { - dprintk("%s: failed to create idmapper. Error = %d\n", - __func__, error); - goto error; - } - __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); - - error = nfs4_init_client_minor_version(clp); - if (error < 0) - goto error; - - if (!nfs4_has_session(clp)) - nfs_mark_client_ready(clp, NFS_CS_READY); - return clp; - -error: - nfs_mark_client_ready(clp, error); - nfs_put_client(clp); - dprintk("<-- nfs4_init_client() = xerror %d\n", error); - return ERR_PTR(error); -} - -/* - * Set up an NFS4 client - */ -static int nfs4_set_client(struct nfs_server *server, - const char *hostname, - const struct sockaddr *addr, - const size_t addrlen, - const char *ip_addr, - rpc_authflavor_t authflavour, - int proto, const struct rpc_timeout *timeparms, - u32 minorversion, struct net *net) -{ - struct nfs_client_initdata cl_init = { - .hostname = hostname, - .addr = addr, - .addrlen = addrlen, - .rpc_ops = &nfs_v4_clientops, - .proto = proto, - .minorversion = minorversion, - .net = net, - }; - struct nfs_client *clp; - int error; - - dprintk("--> nfs4_set_client()\n"); - - if (server->flags & NFS_MOUNT_NORESVPORT) - set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); - - /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); - if (IS_ERR(clp)) { - error = PTR_ERR(clp); - goto error; - } - - /* - * Query for the lease time on clientid setup or renewal - * - * Note that this will be set on nfs_clients that were created - * only for the DS role and did not set this bit, but now will - * serve a dual role. - */ - set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); - - server->nfs_client = clp; - dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); - return 0; -error: - dprintk("<-- nfs4_set_client() = xerror %d\n", error); - return error; -} - -/* - * Set up a pNFS Data Server client. - * - * Return any existing nfs_client that matches server address,port,version - * and minorversion. - * - * For a new nfs_client, use a soft mount (default), a low retrans and a - * low timeout interval so that if a connection is lost, we retry through - * the MDS. - */ -struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, - const struct sockaddr *ds_addr, int ds_addrlen, - int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) -{ - struct nfs_client_initdata cl_init = { - .addr = ds_addr, - .addrlen = ds_addrlen, - .rpc_ops = &nfs_v4_clientops, - .proto = ds_proto, - .minorversion = mds_clp->cl_minorversion, - .net = mds_clp->cl_net, - }; - struct rpc_timeout ds_timeout; - struct nfs_client *clp; - - /* - * Set an authflavor equual to the MDS value. Use the MDS nfs_client - * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS - * (section 13.1 RFC 5661). - */ - nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - mds_clp->cl_rpcclient->cl_auth->au_flavor); - - dprintk("<-- %s %p\n", __func__, clp); - return clp; -} -EXPORT_SYMBOL_GPL(nfs4_set_ds_client); - -/* - * Session has been established, and the client marked ready. - * Set the mount rsize and wsize with negotiated fore channel - * attributes which will be bound checked in nfs_server_set_fsinfo. - */ -static void nfs4_session_set_rwsize(struct nfs_server *server) -{ -#ifdef CONFIG_NFS_V4_1 - struct nfs4_session *sess; - u32 server_resp_sz; - u32 server_rqst_sz; - - if (!nfs4_has_session(server->nfs_client)) - return; - sess = server->nfs_client->cl_session; - server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; - server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - - if (server->rsize > server_resp_sz) - server->rsize = server_resp_sz; - if (server->wsize > server_rqst_sz) - server->wsize = server_rqst_sz; -#endif /* CONFIG_NFS_V4_1 */ -} - -static int nfs4_server_common_setup(struct nfs_server *server, - struct nfs_fh *mntfh) -{ - struct nfs_fattr *fattr; - int error; - - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - /* data servers support only a subset of NFSv4.1 */ - if (is_ds_only_client(server->nfs_client)) - return -EPROTONOSUPPORT; - - fattr = nfs_alloc_fattr(); - if (fattr == NULL) - return -ENOMEM; - - /* We must ensure the session is initialised first */ - error = nfs4_init_session(server); - if (error < 0) - goto out; - - /* Probe the root fh to retrieve its FSID and filehandle */ - error = nfs4_get_rootfh(server, mntfh); - if (error < 0) - goto out; - - dprintk("Server FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - dprintk("Mount FH: %d\n", mntfh->size); - - nfs4_session_set_rwsize(server); - - error = nfs_probe_fsinfo(server, mntfh, fattr); - if (error < 0) - goto out; - - if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) - server->namelen = NFS4_MAXNAMLEN; - - nfs_server_insert_lists(server); - server->mount_time = jiffies; - server->destroy = nfs4_destroy_server; -out: - nfs_free_fattr(fattr); - return error; -} - -/* - * Create a version 4 volume record - */ -static int nfs4_init_server(struct nfs_server *server, - const struct nfs_parsed_mount_data *data) -{ - struct rpc_timeout timeparms; - int error; - - dprintk("--> nfs4_init_server()\n"); - - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, - data->timeo, data->retrans); - - /* Initialise the client representation from the mount data */ - server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; - if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) - server->caps |= NFS_CAP_READDIRPLUS; - server->options = data->options; - - /* Get a client record */ - error = nfs4_set_client(server, - data->nfs_server.hostname, - (const struct sockaddr *)&data->nfs_server.address, - data->nfs_server.addrlen, - data->client_address, - data->auth_flavors[0], - data->nfs_server.protocol, - &timeparms, - data->minorversion, - data->net); - if (error < 0) - goto error; - - /* - * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower - * authentication. - */ - if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) - server->caps |= NFS_CAP_UIDGID_NOMAP; - - if (data->rsize) - server->rsize = nfs_block_size(data->rsize, NULL); - if (data->wsize) - server->wsize = nfs_block_size(data->wsize, NULL); - - server->acregmin = data->acregmin * HZ; - server->acregmax = data->acregmax * HZ; - server->acdirmin = data->acdirmin * HZ; - server->acdirmax = data->acdirmax * HZ; - - server->port = data->nfs_server.port; - - error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); - -error: - /* Done */ - dprintk("<-- nfs4_init_server() = %d\n", error); - return error; -} - -/* - * Create a version 4 volume record - * - keyed on server and FSID - */ -struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, - struct nfs_fh *mntfh) -{ - struct nfs_server *server; - int error; - - dprintk("--> nfs4_create_server()\n"); - - server = nfs_alloc_server(); - if (!server) - return ERR_PTR(-ENOMEM); - - /* set up the general RPC client */ - error = nfs4_init_server(server, data); - if (error < 0) - goto error; - - error = nfs4_server_common_setup(server, mntfh); - if (error < 0) - goto error; - - dprintk("<-- nfs4_create_server() = %p\n", server); - return server; - -error: - nfs_free_server(server); - dprintk("<-- nfs4_create_server() = error %d\n", error); - return ERR_PTR(error); -} - -/* - * Create an NFS4 referral server record - */ -struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, - struct nfs_fh *mntfh) -{ - struct nfs_client *parent_client; - struct nfs_server *server, *parent_server; - int error; - - dprintk("--> nfs4_create_referral_server()\n"); - - server = nfs_alloc_server(); - if (!server) - return ERR_PTR(-ENOMEM); - - parent_server = NFS_SB(data->sb); - parent_client = parent_server->nfs_client; - - /* Initialise the client representation from the parent server */ - nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; - - /* Get a client representation. - * Note: NFSv4 always uses TCP, */ - error = nfs4_set_client(server, data->hostname, - data->addr, - data->addrlen, - parent_client->cl_ipaddr, - data->authflavor, - rpc_protocol(parent_server->client), - parent_server->client->cl_timeout, - parent_client->cl_mvops->minor_version, - parent_client->cl_net); - if (error < 0) - goto error; - - error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); - if (error < 0) - goto error; - - error = nfs4_server_common_setup(server, mntfh); - if (error < 0) - goto error; - - dprintk("<-- nfs_create_referral_server() = %p\n", server); - return server; - -error: - nfs_free_server(server); - dprintk("<-- nfs4_create_referral_server() = error %d\n", error); - return ERR_PTR(error); -} - -#endif /* CONFIG_NFS_V4 */ +EXPORT_SYMBOL_GPL(nfs_create_server); /* * Clone an NFS2, NFS3 or NFS4 server record @@ -1780,8 +1149,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, flavor); if (error < 0) goto out_free_server; - if (!IS_ERR(source->client_acl)) - nfs_init_server_aclclient(server); /* probe the filesystem info for this server filesystem */ error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); @@ -1812,6 +1179,7 @@ out_free_server: dprintk("<-- nfs_clone_server() = error %d\n", error); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(nfs_clone_server); void nfs_clients_init(struct net *net) { @@ -1819,7 +1187,7 @@ void nfs_clients_init(struct net *net) INIT_LIST_HEAD(&nn->nfs_client_list); INIT_LIST_HEAD(&nn->nfs_volume_list); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); @@ -2091,7 +1459,3 @@ void nfs_fs_proc_exit(void) } #endif /* CONFIG_PROC_FS */ - -module_param(nfs4_disable_idmapping, bool, 0644); -MODULE_PARM_DESC(nfs4_disable_idmapping, - "Turn off NFSv4 idmapping when using 'sec=sys'"); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index bd3a9601d32d..81c5eec3cf38 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -47,7 +47,7 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) * * Returns one if inode has the indicated delegation, otherwise zero. */ -int nfs_have_delegation(struct inode *inode, fmode_t flags) +int nfs4_have_delegation(struct inode *inode, fmode_t flags) { struct nfs_delegation *delegation; int ret = 0; @@ -388,7 +388,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) * * Returns zero on success, or a negative errno value. */ -int nfs_inode_return_delegation(struct inode *inode) +int nfs4_inode_return_delegation(struct inode *inode) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); @@ -417,9 +417,8 @@ static void nfs_mark_return_delegation(struct nfs_server *server, * @sb: sb to process * */ -void nfs_super_return_all_delegations(struct super_block *sb) +void nfs_server_return_all_delegations(struct nfs_server *server) { - struct nfs_server *server = NFS_SB(sb); struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 72709c4193fa..bbc6a4dba0d8 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -8,7 +8,7 @@ #ifndef FS_NFS_DELEGATION_H #define FS_NFS_DELEGATION_H -#if defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V4) /* * NFSv4 delegation */ @@ -33,12 +33,12 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -int nfs_inode_return_delegation(struct inode *inode); +int nfs4_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); -void nfs_super_return_all_delegations(struct super_block *sb); +void nfs_server_return_all_delegations(struct nfs_server *); void nfs_expire_all_delegations(struct nfs_client *clp); void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); @@ -56,24 +56,13 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); -int nfs_have_delegation(struct inode *inode, fmode_t flags); +int nfs4_have_delegation(struct inode *inode, fmode_t flags); -#else -static inline int nfs_have_delegation(struct inode *inode, fmode_t flags) -{ - return 0; -} - -static inline int nfs_inode_return_delegation(struct inode *inode) -{ - nfs_wb_all(inode); - return 0; -} #endif static inline int nfs_have_delegated_attributes(struct inode *inode) { - return nfs_have_delegation(inode, FMODE_READ) && + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) && !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a6b1c7fb8232..627f108ede23 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -17,6 +17,7 @@ * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM */ +#include <linux/module.h> #include <linux/time.h> #include <linux/errno.h> #include <linux/stat.h> @@ -46,16 +47,6 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); -static struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); -static int nfs_create(struct inode *, struct dentry *, umode_t, bool); -static int nfs_mkdir(struct inode *, struct dentry *, umode_t); -static int nfs_rmdir(struct inode *, struct dentry *); -static int nfs_unlink(struct inode *, struct dentry *); -static int nfs_symlink(struct inode *, struct dentry *, const char *); -static int nfs_link(struct dentry *, struct inode *, struct dentry *); -static int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); -static int nfs_rename(struct inode *, struct dentry *, - struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); static void nfs_readdir_clear_array(struct page*); @@ -69,73 +60,10 @@ const struct file_operations nfs_dir_operations = { .fsync = nfs_fsync_dir, }; -const struct inode_operations nfs_dir_inode_operations = { - .create = nfs_create, - .lookup = nfs_lookup, - .link = nfs_link, - .unlink = nfs_unlink, - .symlink = nfs_symlink, - .mkdir = nfs_mkdir, - .rmdir = nfs_rmdir, - .mknod = nfs_mknod, - .rename = nfs_rename, - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, -}; - const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -#ifdef CONFIG_NFS_V3 -const struct inode_operations nfs3_dir_inode_operations = { - .create = nfs_create, - .lookup = nfs_lookup, - .link = nfs_link, - .unlink = nfs_unlink, - .symlink = nfs_symlink, - .mkdir = nfs_mkdir, - .rmdir = nfs_rmdir, - .mknod = nfs_mknod, - .rename = nfs_rename, - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, -}; -#endif /* CONFIG_NFS_V3 */ - -#ifdef CONFIG_NFS_V4 - -static int nfs_atomic_open(struct inode *, struct dentry *, - struct file *, unsigned, umode_t, - int *); -const struct inode_operations nfs4_dir_inode_operations = { - .create = nfs_create, - .lookup = nfs_lookup, - .atomic_open = nfs_atomic_open, - .link = nfs_link, - .unlink = nfs_unlink, - .symlink = nfs_symlink, - .mkdir = nfs_mkdir, - .rmdir = nfs_rmdir, - .mknod = nfs_mknod, - .rename = nfs_rename, - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, - .getxattr = generic_getxattr, - .setxattr = generic_setxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -}; - -#endif /* CONFIG_NFS_V4 */ - static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) { struct nfs_open_dir_context *ctx; @@ -1008,6 +936,7 @@ void nfs_force_lookup_revalidate(struct inode *dir) { NFS_I(dir)->cache_change_attribute++; } +EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); /* * A check for whether or not the parent directory has changed. @@ -1128,7 +1057,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; } - if (nfs_have_delegation(inode, FMODE_READ)) + if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) goto out_set_verifier; /* Force a full look up iff the parent directory has changed */ @@ -1269,8 +1198,9 @@ const struct dentry_operations nfs_dentry_operations = { .d_automount = nfs_d_automount, .d_release = nfs_d_release, }; +EXPORT_SYMBOL_GPL(nfs_dentry_operations); -static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; struct dentry *parent; @@ -1336,8 +1266,9 @@ out: nfs_free_fhandle(fhandle); return res; } +EXPORT_SYMBOL_GPL(nfs_lookup); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) static int nfs4_lookup_revalidate(struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { @@ -1347,6 +1278,7 @@ const struct dentry_operations nfs4_dentry_operations = { .d_automount = nfs_d_automount, .d_release = nfs_d_release, }; +EXPORT_SYMBOL_GPL(nfs4_dentry_operations); static fmode_t flags_to_mode(int flags) { @@ -1398,9 +1330,9 @@ out: return err; } -static int nfs_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned open_flags, - umode_t mode, int *opened) +int nfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode, int *opened) { struct nfs_open_context *ctx; struct dentry *res; @@ -1489,6 +1421,7 @@ no_open: return finish_no_open(file, res); } +EXPORT_SYMBOL_GPL(nfs_atomic_open); static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) { @@ -1581,6 +1514,7 @@ out_error: dput(parent); return error; } +EXPORT_SYMBOL_GPL(nfs_instantiate); /* * Following a failed create operation, we drop the dentry rather @@ -1588,7 +1522,7 @@ out_error: * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -static int nfs_create(struct inode *dir, struct dentry *dentry, +int nfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct iattr attr; @@ -1609,11 +1543,12 @@ out_err: d_drop(dentry); return error; } +EXPORT_SYMBOL_GPL(nfs_create); /* * See comments for nfs_proc_create regarding failed operations. */ -static int +int nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; @@ -1636,11 +1571,12 @@ out_err: d_drop(dentry); return status; } +EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ -static int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; int error; @@ -1659,6 +1595,7 @@ out_err: d_drop(dentry); return error; } +EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { @@ -1666,7 +1603,7 @@ static void nfs_dentry_handle_enoent(struct dentry *dentry) d_delete(dentry); } -static int nfs_rmdir(struct inode *dir, struct dentry *dentry) +int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; @@ -1682,6 +1619,7 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) return error; } +EXPORT_SYMBOL_GPL(nfs_rmdir); /* * Remove a file after making sure there are no pending writes, @@ -1706,7 +1644,7 @@ static int nfs_safe_remove(struct dentry *dentry) } if (inode != NULL) { - nfs_inode_return_delegation(inode); + NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); /* The VFS may want to delete this inode */ if (error == 0) @@ -1725,7 +1663,7 @@ out: * * If sillyrename() returns 0, we do nothing, otherwise we unlink. */ -static int nfs_unlink(struct inode *dir, struct dentry *dentry) +int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; int need_rehash = 0; @@ -1753,6 +1691,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) d_rehash(dentry); return error; } +EXPORT_SYMBOL_GPL(nfs_unlink); /* * To create a symbolic link, most file systems instantiate a new inode, @@ -1769,7 +1708,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. */ -static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { struct pagevec lru_pvec; struct page *page; @@ -1823,8 +1762,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym return 0; } +EXPORT_SYMBOL_GPL(nfs_symlink); -static int +int nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; @@ -1834,7 +1774,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); - nfs_inode_return_delegation(inode); + NFS_PROTO(inode)->return_delegation(inode); d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); @@ -1844,6 +1784,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } return error; } +EXPORT_SYMBOL_GPL(nfs_link); /* * RENAME @@ -1869,7 +1810,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) * If these conditions are met, we can drop the dentries before doing * the rename. */ -static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, +int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *old_inode = old_dentry->d_inode; @@ -1918,9 +1859,9 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - nfs_inode_return_delegation(old_inode); + NFS_PROTO(old_inode)->return_delegation(old_inode); if (new_inode != NULL) - nfs_inode_return_delegation(new_inode); + NFS_PROTO(new_inode)->return_delegation(new_inode); error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); @@ -1942,6 +1883,7 @@ out: dput(dentry); return error; } +EXPORT_SYMBOL_GPL(nfs_rename); static DEFINE_SPINLOCK(nfs_access_lru_lock); static LIST_HEAD(nfs_access_lru_list); @@ -2042,6 +1984,7 @@ void nfs_access_zap_cache(struct inode *inode) spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); } +EXPORT_SYMBOL_GPL(nfs_access_zap_cache); static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) { @@ -2202,6 +2145,7 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) { return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } +EXPORT_SYMBOL_GPL(nfs_may_open); int nfs_permission(struct inode *inode, int mask) { @@ -2261,6 +2205,7 @@ out_notsup: res = generic_permission(inode, mask); goto out; } +EXPORT_SYMBOL_GPL(nfs_permission); /* * Local variables: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 48253372ab1d..1ba385b7c90d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -115,17 +115,28 @@ static inline int put_dreq(struct nfs_direct_req *dreq) * @nr_segs: size of iovec array * * The presence of this routine in the address space ops vector means - * the NFS client supports direct I/O. However, we shunt off direct - * read and write requests before the VFS gets them, so this method - * should never be called. + * the NFS client supports direct I/O. However, for most direct IO, we + * shunt off direct read and write requests before the VFS gets them, + * so this method is only ever called for swap. */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { +#ifndef CONFIG_NFS_SWAP dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", iocb->ki_filp->f_path.dentry->d_name.name, (long long) pos, nr_segs); return -EINVAL; +#else + VM_BUG_ON(iocb->ki_left != PAGE_SIZE); + VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); + + if (rw == READ || rw == KERNEL_READ) + return nfs_file_direct_read(iocb, iov, nr_segs, pos, + rw == READ ? true : false); + return nfs_file_direct_write(iocb, iov, nr_segs, pos, + rw == WRITE ? true : false); +#endif /* CONFIG_NFS_SWAP */ } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -303,7 +314,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { */ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, const struct iovec *iov, - loff_t pos) + loff_t pos, bool uio) { struct nfs_direct_req *dreq = desc->pg_dreq; struct nfs_open_context *ctx = dreq->ctx; @@ -331,12 +342,20 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de GFP_KERNEL); if (!pagevec) break; - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, + if (uio) { + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, npages, 1, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; + up_read(¤t->mm->mmap_sem); + if (result < 0) + break; + } else { + WARN_ON(npages != 1); + result = get_kernel_page(user_addr, 1, pagevec); + if (WARN_ON(result != 1)) + break; + } + if ((unsigned)result < npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { @@ -386,21 +405,21 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, const struct iovec *iov, unsigned long nr_segs, - loff_t pos) + loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; ssize_t result = -EINVAL; size_t requested_bytes = 0; unsigned long seg; - nfs_pageio_init_read(&desc, dreq->inode, + NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; - result = nfs_direct_read_schedule_segment(&desc, vec, pos); + result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); if (result < 0) break; requested_bytes += result; @@ -426,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + unsigned long nr_segs, loff_t pos, bool uio) { ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; @@ -444,7 +463,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); + result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); if (!result) result = nfs_direct_wait(dreq); NFS_I(inode)->read_io += result; @@ -460,7 +479,7 @@ static void nfs_inode_dio_write_done(struct inode *inode) inode_dio_done(inode); } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; @@ -478,7 +497,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) dreq->count = 0; get_dreq(dreq); - nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, + NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; @@ -610,7 +629,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode */ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, const struct iovec *iov, - loff_t pos) + loff_t pos, bool uio) { struct nfs_direct_req *dreq = desc->pg_dreq; struct nfs_open_context *ctx = dreq->ctx; @@ -638,12 +657,19 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d if (!pagevec) break; - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - npages, 0, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; + if (uio) { + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + npages, 0, 0, pagevec, NULL); + up_read(¤t->mm->mmap_sem); + if (result < 0) + break; + } else { + WARN_ON(npages != 1); + result = get_kernel_page(user_addr, 0, pagevec); + if (WARN_ON(result != 1)) + break; + } if ((unsigned)result < npages) { bytes = result * PAGE_SIZE; @@ -774,7 +800,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, const struct iovec *iov, unsigned long nr_segs, - loff_t pos) + loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; @@ -782,7 +808,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; unsigned long seg; - nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, + NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); @@ -790,7 +816,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; - result = nfs_direct_write_schedule_segment(&desc, vec, pos); + result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); if (result < 0) break; requested_bytes += result; @@ -818,7 +844,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, - size_t count) + size_t count, bool uio) { ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; @@ -836,7 +862,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos); + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); if (!result) result = nfs_direct_wait(dreq); out_release: @@ -867,7 +893,7 @@ out: * cache. */ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + unsigned long nr_segs, loff_t pos, bool uio) { ssize_t retval = -EINVAL; struct file *file = iocb->ki_filp; @@ -892,7 +918,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, task_io_account_read(count); - retval = nfs_direct_read(iocb, iov, nr_segs, pos); + retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); if (retval > 0) iocb->ki_pos = pos + retval; @@ -923,7 +949,7 @@ out: * is no atomic O_APPEND write facility in the NFS protocol. */ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + unsigned long nr_segs, loff_t pos, bool uio) { ssize_t retval = -EINVAL; struct file *file = iocb->ki_filp; @@ -955,7 +981,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, task_io_account_write(count); - retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); if (retval > 0) { struct inode *inode = mapping->host; diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index b3924b8a6000..31c26c4dcc23 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -8,6 +8,7 @@ #ifdef CONFIG_NFS_USE_KERNEL_DNS +#include <linux/module.h> #include <linux/sunrpc/clnt.h> #include <linux/dns_resolver.h> #include "dns_resolve.h" @@ -27,9 +28,11 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, kfree(ip_addr); return ret; } +EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); #else +#include <linux/module.h> #include <linux/hash.h> #include <linux/string.h> #include <linux/kmod.h> @@ -345,6 +348,7 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, ret = -ESRCH; return ret; } +EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); int nfs_dns_resolver_cache_init(struct net *net) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a6708e6b438d..75d6d0a3d32e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -16,6 +16,7 @@ * nfs regular file handling functions */ +#include <linux/module.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -35,42 +36,24 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" -#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_FILE static const struct vm_operations_struct nfs_file_vm_ops; -const struct inode_operations nfs_file_inode_operations = { - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, -}; - -#ifdef CONFIG_NFS_V3 -const struct inode_operations nfs3_file_inode_operations = { - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, -}; -#endif /* CONFIG_NFS_v3 */ - /* Hack for future NFS swap support */ #ifndef IS_SWAPFILE # define IS_SWAPFILE(inode) (0) #endif -static int nfs_check_flags(int flags) +int nfs_check_flags(int flags) { if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(nfs_check_flags); /* * Open file @@ -93,7 +76,7 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; } -static int +int nfs_file_release(struct inode *inode, struct file *filp) { dprintk("NFS: release(%s/%s)\n", @@ -103,6 +86,7 @@ nfs_file_release(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); } +EXPORT_SYMBOL_GPL(nfs_file_release); /** * nfs_revalidate_size - Revalidate the file size @@ -135,7 +119,7 @@ force_reval: return __nfs_revalidate_inode(server, inode); } -static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) +loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", filp->f_path.dentry->d_parent->d_name.name, @@ -156,11 +140,12 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) return generic_file_llseek(filp, offset, origin); } +EXPORT_SYMBOL_GPL(nfs_file_llseek); /* * Flush all dirty pages, and check for write errors. */ -static int +int nfs_file_flush(struct file *file, fl_owner_t id) { struct dentry *dentry = file->f_path.dentry; @@ -178,14 +163,15 @@ nfs_file_flush(struct file *file, fl_owner_t id) * If we're holding a write delegation, then just start the i/o * but don't wait for completion (or send a commit). */ - if (nfs_have_delegation(inode, FMODE_WRITE)) + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return filemap_fdatawrite(file->f_mapping); /* Flush writes to the server and return any errors */ return vfs_fsync(file, 0); } +EXPORT_SYMBOL_GPL(nfs_file_flush); -static ssize_t +ssize_t nfs_file_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -194,7 +180,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result; if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_read(iocb, iov, nr_segs, pos); + return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -208,8 +194,9 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, } return result; } +EXPORT_SYMBOL_GPL(nfs_file_read); -static ssize_t +ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, unsigned int flags) @@ -230,8 +217,9 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, } return res; } +EXPORT_SYMBOL_GPL(nfs_file_splice_read); -static int +int nfs_file_mmap(struct file * file, struct vm_area_struct * vma) { struct dentry *dentry = file->f_path.dentry; @@ -251,6 +239,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) } return status; } +EXPORT_SYMBOL_GPL(nfs_file_mmap); /* * Flush any dirty pages for this process, and check for write errors. @@ -264,8 +253,8 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * nfs_file_write() that a write error occurred, and hence cause it to * fall back to doing a synchronous write. */ -static int -nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +int +nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); @@ -277,9 +266,6 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) dentry->d_parent->d_name.name, dentry->d_name.name, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - mutex_lock(&inode->i_mutex); - nfs_inc_stats(inode, NFSIOS_VFSFSYNC); have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); @@ -290,10 +276,21 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = xchg(&ctx->error, 0); if (!ret && status < 0) ret = status; - if (!ret && !datasync) - /* application has asked for meta-data sync */ - ret = pnfs_layoutcommit_inode(inode, true); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); + +static int +nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + int ret; + struct inode *inode = file->f_path.dentry->d_inode; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); mutex_unlock(&inode->i_mutex); + return ret; } @@ -442,7 +439,7 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) if (offset != 0) return; /* Cancel any unstarted writes on this page */ - nfs_wb_page_cancel(page->mapping->host, page); + nfs_wb_page_cancel(page_file_mapping(page)->host, page); nfs_fscache_invalidate_page(page, page->mapping->host); } @@ -459,8 +456,11 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Only do I/O if gfp is a superset of GFP_KERNEL */ - if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not + * doing this memory reclaim for a fs-related allocation. + */ + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && + !(current->flags & PF_FSTRANS)) { int how = FLUSH_SYNC; /* Don't let kswapd deadlock waiting for OOM RPC calls */ @@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp) */ static int nfs_launder_page(struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", @@ -494,6 +494,20 @@ static int nfs_launder_page(struct page *page) return nfs_wb_page(inode, page); } +#ifdef CONFIG_NFS_SWAP +static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + *span = sis->pages; + return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); +} + +static void nfs_swap_deactivate(struct file *file) +{ + xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); +} +#endif + const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, @@ -508,6 +522,10 @@ const struct address_space_operations nfs_file_aops = { .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, .error_remove_page = generic_error_remove_page, +#ifdef CONFIG_NFS_SWAP + .swap_activate = nfs_swap_activate, + .swap_deactivate = nfs_swap_deactivate, +#endif }; /* @@ -533,7 +551,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); lock_page(page); - mapping = page->mapping; + mapping = page_file_mapping(page); if (mapping != dentry->d_inode->i_mapping) goto out_unlock; @@ -572,8 +590,8 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode) return 0; } -static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; @@ -582,7 +600,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, iov, nr_segs, pos); + return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); dprintk("NFS: write(%s/%s, %lu@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -623,10 +641,11 @@ out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); goto out; } +EXPORT_SYMBOL_GPL(nfs_file_write); -static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, - struct file *filp, loff_t *ppos, - size_t count, unsigned int flags) +ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, + struct file *filp, loff_t *ppos, + size_t count, unsigned int flags) { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; @@ -654,6 +673,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); return ret; } +EXPORT_SYMBOL_GPL(nfs_file_splice_write); static int do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) @@ -670,7 +690,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) } fl->fl_type = saved_type; - if (nfs_have_delegation(inode, FMODE_READ)) + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) goto out_noconflict; if (is_local) @@ -765,7 +785,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!nfs_have_delegation(inode, FMODE_READ)) { + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { if (is_time_granular(&NFS_SERVER(inode)->time_delta)) __nfs_revalidate_inode(NFS_SERVER(inode), inode); else @@ -778,7 +798,7 @@ out: /* * Lock a (portion of) a file */ -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) +int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = filp->f_mapping->host; int ret = -ENOLCK; @@ -814,11 +834,12 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) out_err: return ret; } +EXPORT_SYMBOL_GPL(nfs_lock); /* * Lock a (portion of) a file */ -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) +int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = filp->f_mapping->host; int is_local = 0; @@ -831,6 +852,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; + /* + * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of + * any standard. In principle we might be able to support LOCK_MAND + * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the + * NFS code is not set up for it. + */ + if (fl->fl_type & LOCK_MAND) + return -EINVAL; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; @@ -843,18 +873,20 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) return do_unlk(filp, cmd, fl, is_local); return do_setlk(filp, cmd, fl, is_local); } +EXPORT_SYMBOL_GPL(nfs_flock); /* * There is no protocol support for leases, so we have no way to implement * them correctly in the face of opens by other clients. */ -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) +int nfs_setlease(struct file *file, long arg, struct file_lock **fl) { dprintk("NFS: setlease(%s/%s, arg=%ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, arg); return -EINVAL; } +EXPORT_SYMBOL_GPL(nfs_setlease); const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, @@ -874,104 +906,4 @@ const struct file_operations nfs_file_operations = { .check_flags = nfs_check_flags, .setlease = nfs_setlease, }; - -#ifdef CONFIG_NFS_V4 -static int -nfs4_file_open(struct inode *inode, struct file *filp) -{ - struct nfs_open_context *ctx; - struct dentry *dentry = filp->f_path.dentry; - struct dentry *parent = NULL; - struct inode *dir; - unsigned openflags = filp->f_flags; - struct iattr attr; - int err; - - BUG_ON(inode != dentry->d_inode); - /* - * If no cached dentry exists or if it's negative, NFSv4 handled the - * opens in ->lookup() or ->create(). - * - * We only get this far for a cached positive dentry. We skipped - * revalidation, so handle it here by dropping the dentry and returning - * -EOPENSTALE. The VFS will retry the lookup/create/open. - */ - - dprintk("NFS: open file(%s/%s)\n", - dentry->d_parent->d_name.name, - dentry->d_name.name); - - if ((openflags & O_ACCMODE) == 3) - openflags--; - - /* We can't create new files here */ - openflags &= ~(O_CREAT|O_EXCL); - - parent = dget_parent(dentry); - dir = parent->d_inode; - - ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); - err = PTR_ERR(ctx); - if (IS_ERR(ctx)) - goto out; - - attr.ia_valid = ATTR_OPEN; - if (openflags & O_TRUNC) { - attr.ia_valid |= ATTR_SIZE; - attr.ia_size = 0; - nfs_wb_all(inode); - } - - inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - switch (err) { - case -EPERM: - case -EACCES: - case -EDQUOT: - case -ENOSPC: - case -EROFS: - goto out_put_ctx; - default: - goto out_drop; - } - } - iput(inode); - if (inode != dentry->d_inode) - goto out_drop; - - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - nfs_file_set_open_context(filp, ctx); - err = 0; - -out_put_ctx: - put_nfs_open_context(ctx); -out: - dput(parent); - return err; - -out_drop: - d_drop(dentry); - err = -EOPENSTALE; - goto out_put_ctx; -} - -const struct file_operations nfs4_file_operations = { - .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, - .mmap = nfs_file_mmap, - .open = nfs4_file_open, - .flush = nfs_file_flush, - .release = nfs_file_release, - .fsync = nfs_file_fsync, - .lock = nfs_lock, - .flock = nfs_flock, - .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, - .check_flags = nfs_check_flags, - .setlease = nfs_setlease, -}; -#endif /* CONFIG_NFS_V4 */ +EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index a67990f90bd7..4654ced096a6 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -23,21 +23,15 @@ #include <linux/sunrpc/stats.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> -#include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/namei.h> #include <linux/security.h> #include <asm/uaccess.h> -#include "nfs4_fs.h" -#include "delegation.h" -#include "internal.h" - #define NFSDBG_FACILITY NFSDBG_CLIENT /* @@ -135,47 +129,3 @@ out: nfs_free_fattr(fsinfo.fattr); return ret; } - -#ifdef CONFIG_NFS_V4 - -int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) -{ - struct nfs_fsinfo fsinfo; - int ret = -ENOMEM; - - dprintk("--> nfs4_get_rootfh()\n"); - - fsinfo.fattr = nfs_alloc_fattr(); - if (fsinfo.fattr == NULL) - goto out; - - /* Start by getting the root filehandle from the server */ - ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo); - if (ret < 0) { - dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); - goto out; - } - - if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) - || !S_ISDIR(fsinfo.fattr->mode)) { - printk(KERN_ERR "nfs4_get_rootfh:" - " getroot encountered non-directory\n"); - ret = -ENOTDIR; - goto out; - } - - if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { - printk(KERN_ERR "nfs4_get_rootfh:" - " getroot obtained referral\n"); - ret = -EREMOTE; - goto out; - } - - memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); -out: - nfs_free_fattr(fsinfo.fattr); - dprintk("<-- nfs4_get_rootfh() = %d\n", ret); - return ret; -} - -#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 864c51e4b400..b701358c39c3 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -52,8 +52,6 @@ #define NFS_UINT_MAXLEN 11 -/* Default cache timeout is 10 minutes */ -unsigned int nfs_idmap_cache_timeout = 600; static const struct cred *id_resolver_cache; static struct key_type key_type_id_resolver_legacy; @@ -205,12 +203,18 @@ static int nfs_idmap_init_keyring(void) if (ret < 0) goto failed_put_key; + ret = register_key_type(&key_type_id_resolver_legacy); + if (ret < 0) + goto failed_reg_legacy; + set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); cred->thread_keyring = keyring; cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; id_resolver_cache = cred; return 0; +failed_reg_legacy: + unregister_key_type(&key_type_id_resolver); failed_put_key: key_put(keyring); failed_put_cred: @@ -222,6 +226,7 @@ static void nfs_idmap_quit_keyring(void) { key_revoke(id_resolver_cache->thread_keyring); unregister_key_type(&key_type_id_resolver); + unregister_key_type(&key_type_id_resolver_legacy); put_cred(id_resolver_cache); } @@ -359,7 +364,6 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *typ } /* idmap classic begins here */ -module_param(nfs_idmap_cache_timeout, int, 0644); enum { Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err @@ -385,7 +389,7 @@ static const struct rpc_pipe_ops idmap_upcall_ops = { }; static struct key_type key_type_id_resolver_legacy = { - .name = "id_resolver", + .name = "id_legacy", .instantiate = user_instantiate, .match = user_match, .revoke = user_revoke, @@ -674,6 +678,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, if (ret < 0) goto out2; + BUG_ON(idmap->idmap_key_cons != NULL); idmap->idmap_key_cons = cons; ret = rpc_queue_upcall(idmap->idmap_pipe, msg); @@ -687,8 +692,7 @@ out2: out1: kfree(msg); out0: - key_revoke(cons->key); - key_revoke(cons->authkey); + complete_request_key(cons, ret); return ret; } @@ -722,11 +726,18 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); struct idmap *idmap = (struct idmap *)rpci->private; - struct key_construction *cons = idmap->idmap_key_cons; + struct key_construction *cons; struct idmap_msg im; size_t namelen_in; int ret; + /* If instantiation is successful, anyone waiting for key construction + * will have been woken up and someone else may now have used + * idmap_key_cons - so after this point we may no longer touch it. + */ + cons = ACCESS_ONCE(idmap->idmap_key_cons); + idmap->idmap_key_cons = NULL; + if (mlen != sizeof(im)) { ret = -ENOSPC; goto out; @@ -739,7 +750,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { ret = mlen; - complete_request_key(idmap->idmap_key_cons, -ENOKEY); + complete_request_key(cons, -ENOKEY); goto out_incomplete; } @@ -756,7 +767,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) } out: - complete_request_key(idmap->idmap_key_cons, ret); + complete_request_key(cons, ret); out_incomplete: return ret; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f7296983eba6..c6e895f0fbf3 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -32,7 +32,6 @@ #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/nfs_xdr.h> @@ -51,6 +50,7 @@ #include "fscache.h" #include "dns_resolve.h" #include "pnfs.h" +#include "nfs.h" #include "netns.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -82,6 +82,7 @@ int nfs_wait_bit_killable(void *word) freezable_schedule(); return 0; } +EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); /** * nfs_compat_user_ino64 - returns the user-visible inode number @@ -106,7 +107,7 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -static void nfs_clear_inode(struct inode *inode) +void nfs_clear_inode(struct inode *inode) { /* * The following should never happen... @@ -117,6 +118,7 @@ static void nfs_clear_inode(struct inode *inode) nfs_access_zap_cache(inode); nfs_fscache_release_inode_cookie(inode); } +EXPORT_SYMBOL_GPL(nfs_clear_inode); void nfs_evict_inode(struct inode *inode) { @@ -186,6 +188,7 @@ void nfs_zap_acl_cache(struct inode *inode) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { @@ -193,6 +196,7 @@ void nfs_invalidate_atime(struct inode *inode) NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_invalidate_atime); /* * Invalidate, but do not unhash, the inode. @@ -391,6 +395,7 @@ out_no_inode: dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); goto out; } +EXPORT_SYMBOL_GPL(nfs_fhget); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) @@ -430,7 +435,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) * Return any delegations if we're going to change ACLs */ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) - nfs_inode_return_delegation(inode); + NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) nfs_refresh_inode(inode, fattr); @@ -438,6 +443,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) out: return error; } +EXPORT_SYMBOL_GPL(nfs_setattr); /** * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall @@ -496,6 +502,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) nfs_vmtruncate(inode, attr->ia_size); } } +EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -535,6 +542,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) out: return err; } +EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { @@ -623,6 +631,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) return; nfs_revalidate_inode(server, inode); } +EXPORT_SYMBOL_GPL(nfs_close_context); struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) { @@ -649,6 +658,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f ctx->mdsthreshold = NULL; return ctx; } +EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { @@ -656,6 +666,7 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) atomic_inc(&ctx->lock_context.count); return ctx; } +EXPORT_SYMBOL_GPL(get_nfs_open_context); static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { @@ -683,6 +694,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx) { __put_nfs_open_context(ctx, 0); } +EXPORT_SYMBOL_GPL(put_nfs_open_context); /* * Ensure that mmap has a recent RPC credential for use when writing out @@ -698,6 +710,7 @@ void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) list_add(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_file_set_open_context); /* * Given an inode, search for an open context with the desired characteristics @@ -842,6 +855,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); } +EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { @@ -883,6 +897,10 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) struct nfs_inode *nfsi = NFS_I(inode); int ret = 0; + /* swapfiles are not supposed to be shared. */ + if (IS_SWAPFILE(inode)) + goto out; + if (nfs_mapping_need_revalidate_inode(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) @@ -1028,6 +1046,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->owner_name = NULL; fattr->group_name = NULL; } +EXPORT_SYMBOL_GPL(nfs_fattr_init); struct nfs_fattr *nfs_alloc_fattr(void) { @@ -1038,6 +1057,7 @@ struct nfs_fattr *nfs_alloc_fattr(void) nfs_fattr_init(fattr); return fattr; } +EXPORT_SYMBOL_GPL(nfs_alloc_fattr); struct nfs_fh *nfs_alloc_fhandle(void) { @@ -1048,6 +1068,7 @@ struct nfs_fh *nfs_alloc_fhandle(void) fh->size = 0; return fh; } +EXPORT_SYMBOL_GPL(nfs_alloc_fhandle); #ifdef NFS_DEBUG /* @@ -1168,6 +1189,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) return status; } +EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { @@ -1204,6 +1226,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) spin_unlock(&inode->i_lock); return status; } +EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache @@ -1255,6 +1278,7 @@ out_noforce: spin_unlock(&inode->i_lock); return status; } +EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); /* * Many nfs protocol calls return the new file attributes after @@ -1457,7 +1481,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; - if (!nfs_have_delegation(inode, FMODE_READ) || + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || (save_cache_validity & NFS_INO_REVAL_FORCED)) nfsi->cache_validity |= invalid; @@ -1472,27 +1496,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) return -ESTALE; } - -#ifdef CONFIG_NFS_V4 - -/* - * Clean out any remaining NFSv4 state that might be left over due - * to open() calls that passed nfs_atomic_lookup, but failed to call - * nfs_open(). - */ -void nfs4_evict_inode(struct inode *inode) -{ - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); - pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); - /* If we are holding a delegation, return it! */ - nfs_inode_return_delegation_noreclaim(inode); - /* First call standard NFS clear_inode() code */ - nfs_clear_inode(inode); -} -#endif - struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; @@ -1505,11 +1508,12 @@ struct inode *nfs_alloc_inode(struct super_block *sb) nfsi->acl_access = ERR_PTR(-EAGAIN); nfsi->acl_default = ERR_PTR(-EAGAIN); #endif -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ return &nfsi->vfs_inode; } +EXPORT_SYMBOL_GPL(nfs_alloc_inode); static void nfs_i_callback(struct rcu_head *head) { @@ -1521,10 +1525,11 @@ void nfs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, nfs_i_callback); } +EXPORT_SYMBOL_GPL(nfs_destroy_inode); static inline void nfs4_init_once(struct nfs_inode *nfsi) { -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) INIT_LIST_HEAD(&nfsi->open_states); nfsi->delegation = NULL; nfsi->delegation_state = 0; @@ -1570,6 +1575,7 @@ static void nfs_destroy_inodecache(void) } struct workqueue_struct *nfsiod_workqueue; +EXPORT_SYMBOL_GPL(nfsiod_workqueue); /* * start up the nfsiod workqueue @@ -1628,81 +1634,76 @@ static int __init init_nfs_fs(void) { int err; - err = nfs_idmap_init(); - if (err < 0) - goto out10; - err = nfs_dns_resolver_init(); if (err < 0) - goto out9; + goto out10;; err = register_pernet_subsys(&nfs_net_ops); if (err < 0) - goto out8; + goto out9; err = nfs_fscache_register(); if (err < 0) - goto out7; + goto out8; err = nfsiod_start(); if (err) - goto out6; + goto out7; err = nfs_fs_proc_init(); if (err) - goto out5; + goto out6; err = nfs_init_nfspagecache(); if (err) - goto out4; + goto out5; err = nfs_init_inodecache(); if (err) - goto out3; + goto out4; err = nfs_init_readpagecache(); if (err) - goto out2; + goto out3; err = nfs_init_writepagecache(); if (err) - goto out1; + goto out2; err = nfs_init_directcache(); if (err) - goto out0; + goto out1; #ifdef CONFIG_PROC_FS rpc_proc_register(&init_net, &nfs_rpcstat); #endif if ((err = register_nfs_fs()) != 0) - goto out; + goto out0; + return 0; -out: +out0: #ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); #endif nfs_destroy_directcache(); -out0: - nfs_destroy_writepagecache(); out1: - nfs_destroy_readpagecache(); + nfs_destroy_writepagecache(); out2: - nfs_destroy_inodecache(); + nfs_destroy_readpagecache(); out3: - nfs_destroy_nfspagecache(); + nfs_destroy_inodecache(); out4: - nfs_fs_proc_exit(); + nfs_destroy_nfspagecache(); out5: - nfsiod_stop(); + nfs_fs_proc_exit(); out6: - nfs_fscache_unregister(); + nfsiod_stop(); out7: - unregister_pernet_subsys(&nfs_net_ops); + nfs_fscache_unregister(); out8: - nfs_dns_resolver_destroy(); + unregister_pernet_subsys(&nfs_net_ops); out9: - nfs_idmap_quit(); + nfs_dns_resolver_destroy(); out10: return err; } @@ -1717,7 +1718,6 @@ static void __exit exit_nfs_fs(void) nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); nfs_dns_resolver_destroy(); - nfs_idmap_quit(); #ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 18f99ef71343..31fdb03225cd 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -85,6 +85,17 @@ struct nfs_clone_mount { */ #define NFS_MAX_READDIR_PAGES 8 +struct nfs_client_initdata { + unsigned long init_flags; + const char *hostname; + const struct sockaddr *addr; + size_t addrlen; + struct nfs_subversion *nfs_mod; + int proto; + u32 minorversion; + struct net *net; +}; + /* * In-kernel mount arguments */ @@ -142,25 +153,45 @@ struct nfs_mount_request { struct net *net; }; +struct nfs_mount_info { + void (*fill_super)(struct super_block *, struct nfs_mount_info *); + int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); + struct nfs_parsed_mount_data *parsed; + struct nfs_clone_mount *cloned; + struct nfs_fh *mntfh; +}; + extern int nfs_mount(struct nfs_mount_request *info); extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); +extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); +int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, + const struct rpc_timeout *, const char *, + rpc_authflavor_t); +int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); +void nfs_server_insert_lists(struct nfs_server *); +void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); +int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, + rpc_authflavor_t); +struct nfs_server *nfs_alloc_server(void); +void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); extern void nfs_cleanup_cb_ident_idr(struct net *); extern void nfs_put_client(struct nfs_client *); +extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, struct nfs4_sessionid *); -extern struct nfs_server *nfs_create_server( - const struct nfs_parsed_mount_data *, - struct nfs_fh *); +extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, + struct nfs_subversion *); extern struct nfs_server *nfs4_create_server( - const struct nfs_parsed_mount_data *, - struct nfs_fh *); + struct nfs_mount_info *, + struct nfs_subversion *); extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, struct nfs_fh *); extern void nfs_free_server(struct nfs_server *server); @@ -188,6 +219,17 @@ static inline void nfs_fs_proc_exit(void) } #endif +#ifdef CONFIG_NFS_V4_1 +int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); +#endif + +/* nfs3client.c */ +#if IS_ENABLED(CONFIG_NFS_V3) +struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); +struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); +#endif + /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; @@ -220,7 +262,7 @@ extern int nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, int); /* nfs4xdr.c */ -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, int); #endif @@ -230,7 +272,7 @@ extern const u32 nfs41_maxwrite_overhead; #endif /* nfs4proc.c */ -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) extern struct rpc_procinfo nfs4_procedures[]; #endif @@ -245,25 +287,63 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, /* dir.c */ extern int nfs_access_cache_shrinker(struct shrinker *shrink, struct shrink_control *sc); +struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); +int nfs_create(struct inode *, struct dentry *, umode_t, bool); +int nfs_mkdir(struct inode *, struct dentry *, umode_t); +int nfs_rmdir(struct inode *, struct dentry *); +int nfs_unlink(struct inode *, struct dentry *); +int nfs_symlink(struct inode *, struct dentry *, const char *); +int nfs_link(struct dentry *, struct inode *, struct dentry *); +int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); + +/* file.c */ +int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); +loff_t nfs_file_llseek(struct file *, loff_t, int); +int nfs_file_flush(struct file *, fl_owner_t); +ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); +ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, + size_t, unsigned int); +int nfs_file_mmap(struct file *, struct vm_area_struct *); +ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); +int nfs_file_release(struct inode *, struct file *); +int nfs_lock(struct file *, int, struct file_lock *); +int nfs_flock(struct file *, int, struct file_lock *); +ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, + size_t, unsigned int); +int nfs_check_flags(int); +int nfs_setlease(struct file *, long, struct file_lock **); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); +extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); -#ifdef CONFIG_NFS_V4 -extern void nfs4_evict_inode(struct inode *); -#endif void nfs_zap_acl_cache(struct inode *inode); extern int nfs_wait_bit_killable(void *word); /* super.c */ +extern const struct super_operations nfs_sops; +extern struct file_system_type nfs_fs_type; extern struct file_system_type nfs_xdev_fs_type; -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) extern struct file_system_type nfs4_xdev_fs_type; extern struct file_system_type nfs4_referral_fs_type; #endif +struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, + struct nfs_subversion *); +void nfs_initialise_sb(struct super_block *); +int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); +int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); +struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *, + struct nfs_mount_info *, struct nfs_subversion *); +struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *); +struct dentry * nfs_xdev_mount_common(struct file_system_type *, int, + const char *, struct nfs_mount_info *); +void nfs_kill_super(struct super_block *); +void nfs_fill_super(struct super_block *, struct nfs_mount_info *); extern struct rpc_stat nfs_rpcstat; @@ -284,7 +364,7 @@ struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *, /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, const char *); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, const char *); @@ -304,12 +384,23 @@ extern int nfs_initiate_read(struct rpc_clnt *clnt, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr); -extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, +extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_readdata_release(struct nfs_read_data *rdata); +/* super.c */ +void nfs_clone_super(struct super_block *, struct nfs_mount_info *); +void nfs_umount_begin(struct super_block *); +int nfs_statfs(struct dentry *, struct kstatfs *); +int nfs_show_options(struct seq_file *, struct dentry *); +int nfs_show_devname(struct seq_file *, struct dentry *); +int nfs_show_path(struct seq_file *, struct dentry *); +int nfs_show_stats(struct seq_file *, struct dentry *); +void nfs_put_super(struct super_block *); +int nfs_remount(struct super_block *sb, int *flags, char *raw_data); + /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, @@ -318,7 +409,7 @@ extern struct nfs_write_header *nfs_writehdr_alloc(void); extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr); -extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, +extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); @@ -463,13 +554,14 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) static inline unsigned int nfs_page_length(struct page *page) { - loff_t i_size = i_size_read(page->mapping->host); + loff_t i_size = i_size_read(page_file_mapping(page)->host); if (i_size > 0) { + pgoff_t page_index = page_file_index(page); pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; - if (page->index < end_index) + if (page_index < end_index) return PAGE_CACHE_SIZE; - if (page->index == end_index) + if (page_index == end_index) return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; } return 0; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 08b9c93675da..655925373b91 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -7,6 +7,7 @@ * NFS namespace */ +#include <linux/module.h> #include <linux/dcache.h> #include <linux/gfp.h> #include <linux/mount.h> @@ -112,6 +113,7 @@ Elong_unlock: Elong: return ERR_PTR(-ENAMETOOLONG); } +EXPORT_SYMBOL_GPL(nfs_path); /* * nfs_d_automount - Handle crossing a mountpoint on the server @@ -195,20 +197,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { -#ifdef CONFIG_NFS_V4 - struct vfsmount *mnt = ERR_PTR(-EINVAL); - switch (server->nfs_client->rpc_ops->version) { - case 2: - case 3: - mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); - break; - case 4: - mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata); - } - return mnt; -#else return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); -#endif } /** @@ -253,6 +242,7 @@ out: dprintk("<-- nfs_do_submount() = %p\n", mnt); return mnt; } +EXPORT_SYMBOL_GPL(nfs_do_submount); struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr) @@ -268,3 +258,4 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor); } +EXPORT_SYMBOL_GPL(nfs_submount); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 8a6394edb8b0..0539de1b8d1f 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -20,7 +20,7 @@ struct nfs_net { wait_queue_head_t bl_wq; struct list_head nfs_client_list; struct list_head nfs_volume_list; -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) struct idr cb_ident_idr; /* Protected by nfs_client_lock */ #endif spinlock_t nfs_client_lock; diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h new file mode 100644 index 000000000000..43679df56cd0 --- /dev/null +++ b/fs/nfs/nfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + * + * Function and structures exported by the NFS module + * for use by NFS version-specific modules. + */ +#ifndef __LINUX_INTERNAL_NFS_H +#define __LINUX_INTERNAL_NFS_H + +#include <linux/fs.h> +#include <linux/sunrpc/sched.h> +#include <linux/nfs_xdr.h> + +struct nfs_subversion { + struct module *owner; /* THIS_MODULE pointer */ + struct file_system_type *nfs_fs; /* NFS filesystem type */ + const struct rpc_version *rpc_vers; /* NFS version information */ + const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ + const struct super_operations *sops; /* NFS Super operations */ + const struct xattr_handler **xattr; /* NFS xattr handlers */ + struct list_head list; /* List of NFS versions */ +}; + +struct nfs_subversion *get_nfs_version(unsigned int); +void put_nfs_version(struct nfs_subversion *); +void register_nfs_version(struct nfs_subversion *); +void unregister_nfs_version(struct nfs_subversion *); + +#endif /* __LINUX_INTERNAL_NFS_H */ diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c new file mode 100644 index 000000000000..0a9782c9171a --- /dev/null +++ b/fs/nfs/nfs2super.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include "internal.h" +#include "nfs.h" + +static struct nfs_subversion nfs_v2 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs_fs_type, + .rpc_vers = &nfs_version2, + .rpc_ops = &nfs_v2_clientops, + .sops = &nfs_sops, +}; + +static int __init init_nfs_v2(void) +{ + register_nfs_version(&nfs_v2); + return 0; +} + +static void __exit exit_nfs_v2(void) +{ + unregister_nfs_version(&nfs_v2); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v2); +module_exit(exit_nfs_v2); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index baf759bccd05..d04f0df7be55 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -106,19 +106,16 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) { u32 recvd, count; - size_t hdrlen; __be32 *p; p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) goto out_overflow; count = be32_to_cpup(p); - hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; - recvd = xdr->buf->len - hdrlen; + recvd = xdr_read_pages(xdr, count); if (unlikely(count > recvd)) goto out_cheating; out: - xdr_read_pages(xdr, count); result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */ result->count = count; return count; @@ -440,7 +437,6 @@ static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) static int decode_path(struct xdr_stream *xdr) { u32 length, recvd; - size_t hdrlen; __be32 *p; p = xdr_inline_decode(xdr, 4); @@ -449,12 +445,9 @@ static int decode_path(struct xdr_stream *xdr) length = be32_to_cpup(p); if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) goto out_size; - hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; - recvd = xdr->buf->len - hdrlen; + recvd = xdr_read_pages(xdr, length); if (unlikely(length > recvd)) goto out_cheating; - - xdr_read_pages(xdr, length); xdr_terminate_string(xdr->buf, length); return 0; out_size: @@ -972,22 +965,7 @@ out_overflow: */ static int decode_readdirok(struct xdr_stream *xdr) { - u32 recvd, pglen; - size_t hdrlen; - - pglen = xdr->buf->page_len; - hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; - recvd = xdr->buf->len - hdrlen; - if (unlikely(pglen > recvd)) - goto out_cheating; -out: - xdr_read_pages(xdr, pglen); - return pglen; -out_cheating: - dprintk("NFS: server cheating in readdir result: " - "pglen %u > recvd %u\n", pglen, recvd); - pglen = recvd; - goto out; + return xdr_read_pages(xdr, xdr->buf->page_len); } static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c new file mode 100644 index 000000000000..b3fc65ef39ca --- /dev/null +++ b/fs/nfs/nfs3client.c @@ -0,0 +1,65 @@ +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include "internal.h" + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static const struct rpc_version *nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +const struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = ARRAY_SIZE(nfsacl_version), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; + +/* + * Initialise an NFSv3 ACL client connection + */ +static void nfs_init_server_aclclient(struct nfs_server *server) +{ + if (server->flags & NFS_MOUNT_NOACL) + goto out_noacl; + + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + if (IS_ERR(server->client_acl)) + goto out_noacl; + + /* No errors! Assume that Sun nfsacls are supported */ + server->caps |= NFS_CAP_ACLS; + return; + +out_noacl: + server->caps &= ~NFS_CAP_ACLS; +} +#else +static inline void nfs_init_server_aclclient(struct nfs_server *server) +{ + server->flags &= ~NFS_MOUNT_NOACL; + server->caps &= ~NFS_CAP_ACLS; +} +#endif + +struct nfs_server *nfs3_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server = nfs_create_server(mount_info, nfs_mod); + /* Create a client RPC handle for the NFS v3 ACL management interface */ + if (!IS_ERR(server)) + nfs_init_server_aclclient(server); + return server; +} + +struct nfs_server *nfs3_clone_server(struct nfs_server *source, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t flavor) +{ + struct nfs_server *server = nfs_clone_server(source, fh, fattr, flavor); + if (!IS_ERR(server) && !IS_ERR(source->client_acl)) + nfs_init_server_aclclient(server); + return server; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 3187e24e8f78..0952c791df36 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -877,6 +877,46 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); } +static int nfs3_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static int nfs3_return_delegation(struct inode *inode) +{ + nfs_wb_all(inode); + return 0; +} + +static const struct inode_operations nfs3_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; + +static const struct inode_operations nfs3_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; + const struct nfs_rpc_ops nfs_v3_clientops = { .version = 3, /* protocol version */ .dentry_ops = &nfs_dentry_operations, @@ -885,6 +925,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .file_ops = &nfs_file_operations, .getroot = nfs3_proc_get_root, .submount = nfs_submount, + .try_mount = nfs_try_mount, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, .lookup = nfs3_proc_lookup, @@ -910,9 +951,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, .read_setup = nfs3_proc_read_setup, + .read_pageio_init = nfs_pageio_init_read, .read_rpc_prepare = nfs3_proc_read_rpc_prepare, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, + .write_pageio_init = nfs_pageio_init_write, .write_rpc_prepare = nfs3_proc_write_rpc_prepare, .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, @@ -921,5 +964,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, .close_context = nfs_close_context, + .have_delegation = nfs3_have_delegation, + .return_delegation = nfs3_return_delegation, + .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, + .free_client = nfs_free_client, + .create_server = nfs3_create_server, + .clone_server = nfs3_clone_server, }; diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c new file mode 100644 index 000000000000..cc471c725230 --- /dev/null +++ b/fs/nfs/nfs3super.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include "internal.h" +#include "nfs.h" + +static struct nfs_subversion nfs_v3 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs_fs_type, + .rpc_vers = &nfs_version3, + .rpc_ops = &nfs_v3_clientops, + .sops = &nfs_sops, +}; + +static int __init init_nfs_v3(void) +{ + register_nfs_version(&nfs_v3); + return 0; +} + +static void __exit exit_nfs_v3(void) +{ + unregister_nfs_version(&nfs_v3); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v3); +module_exit(exit_nfs_v3); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 902de489ec9b..6cbe89400dfc 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -246,7 +246,6 @@ static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, static int decode_nfspath3(struct xdr_stream *xdr) { u32 recvd, count; - size_t hdrlen; __be32 *p; p = xdr_inline_decode(xdr, 4); @@ -255,12 +254,9 @@ static int decode_nfspath3(struct xdr_stream *xdr) count = be32_to_cpup(p); if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) goto out_nametoolong; - hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; - recvd = xdr->buf->len - hdrlen; + recvd = xdr_read_pages(xdr, count); if (unlikely(count > recvd)) goto out_cheating; - - xdr_read_pages(xdr, count); xdr_terminate_string(xdr->buf, count); return 0; @@ -329,14 +325,14 @@ static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) memcpy(p, verifier, NFS3_CREATEVERFSIZE); } -static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier) +static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *verifier) { __be32 *p; p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); if (unlikely(p == NULL)) goto out_overflow; - memcpy(verifier, p, NFS3_WRITEVERFSIZE); + memcpy(verifier->data, p, NFS3_WRITEVERFSIZE); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -1587,7 +1583,6 @@ static int decode_read3resok(struct xdr_stream *xdr, struct nfs_readres *result) { u32 eof, count, ocount, recvd; - size_t hdrlen; __be32 *p; p = xdr_inline_decode(xdr, 4 + 4 + 4); @@ -1598,13 +1593,10 @@ static int decode_read3resok(struct xdr_stream *xdr, ocount = be32_to_cpup(p++); if (unlikely(ocount != count)) goto out_mismatch; - hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; - recvd = xdr->buf->len - hdrlen; + recvd = xdr_read_pages(xdr, count); if (unlikely(count > recvd)) goto out_cheating; - out: - xdr_read_pages(xdr, count); result->eof = eof; result->count = count; return count; @@ -1676,20 +1668,22 @@ static int decode_write3resok(struct xdr_stream *xdr, { __be32 *p; - p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE); + p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(p == NULL)) goto out_overflow; result->count = be32_to_cpup(p++); result->verf->committed = be32_to_cpup(p++); if (unlikely(result->verf->committed > NFS_FILE_SYNC)) goto out_badvalue; - memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE); + if (decode_writeverf3(xdr, &result->verf->verifier)) + goto out_eio; return result->count; out_badvalue: dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); return -EIO; out_overflow: print_overflow_msg(__func__, xdr); +out_eio: return -EIO; } @@ -2039,22 +2033,7 @@ out_truncated: */ static int decode_dirlist3(struct xdr_stream *xdr) { - u32 recvd, pglen; - size_t hdrlen; - - pglen = xdr->buf->page_len; - hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; - recvd = xdr->buf->len - hdrlen; - if (unlikely(pglen > recvd)) - goto out_cheating; -out: - xdr_read_pages(xdr, pglen); - return pglen; -out_cheating: - dprintk("NFS: server cheating in readdir result: " - "pglen %u > recvd %u\n", pglen, recvd); - pglen = recvd; - goto out; + return xdr_read_pages(xdr, xdr->buf->page_len); } static int decode_readdir3resok(struct xdr_stream *xdr, @@ -2337,7 +2316,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_status; - error = decode_writeverf3(xdr, result->verf->verifier); + error = decode_writeverf3(xdr, &result->verf->verifier); out: return error; out_status: @@ -2364,7 +2343,7 @@ static inline int decode_getacl3resok(struct xdr_stream *xdr, if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) goto out; - hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + hdrlen = xdr_stream_pos(xdr); acl = NULL; if (result->mask & NFS_ACL) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index cc5900ac61b5..3b950dd81e81 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -9,7 +9,7 @@ #ifndef __LINUX_FS_NFS_NFS4_FS_H #define __LINUX_FS_NFS_NFS4_FS_H -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) struct idmap; @@ -200,7 +200,10 @@ struct nfs4_state_maintenance_ops { }; extern const struct dentry_operations nfs4_dentry_operations; -extern const struct inode_operations nfs4_dir_inode_operations; + +/* dir.c */ +int nfs_atomic_open(struct inode *, struct dentry *, struct file *, + unsigned, umode_t, int *); /* nfs4namespace.c */ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); @@ -301,6 +304,10 @@ extern const u32 nfs4_pathconf_bitmap[2]; extern const u32 nfs4_fsinfo_bitmap[3]; extern const u32 nfs4_fs_locations_bitmap[2]; +void nfs4_free_client(struct nfs_client *); + +struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); + /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs_client *); extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); @@ -354,6 +361,29 @@ extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta extern const nfs4_stateid zero_stateid; +/* nfs4super.c */ +struct nfs_mount_info; +extern struct nfs_subversion nfs_v4; +struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); +extern bool nfs4_disable_idmapping; +extern unsigned short max_session_slots; +extern unsigned short send_implementation_id; + +/* nfs4sysctl.c */ +#ifdef CONFIG_SYSCTL +int nfs4_register_sysctl(void); +void nfs4_unregister_sysctl(void); +#else +static inline int nfs4_register_sysctl(void) +{ + return 0; +} + +static inline void nfs4_unregister_sysctl(void) +{ +} +#endif + /* nfs4xdr.c */ extern struct rpc_procinfo nfs4_procedures[]; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c new file mode 100644 index 000000000000..cbcdfaf32505 --- /dev/null +++ b/fs/nfs/nfs4client.c @@ -0,0 +1,656 @@ +/* + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs_mount.h> +#include <linux/sunrpc/auth.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/bc_xprt.h> +#include "internal.h" +#include "callback.h" +#include "delegation.h" +#include "pnfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; +retry: + if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL)) + return -ENOMEM; + spin_lock(&nn->nfs_client_lock); + ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident); + spin_unlock(&nn->nfs_client_lock); + if (ret == -EAGAIN) + goto retry; + return ret; +} + +#ifdef CONFIG_NFS_V4_1 +static void nfs4_shutdown_session(struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) { + nfs4_destroy_session(clp->cl_session); + nfs4_destroy_clientid(clp); + } + +} +#else /* CONFIG_NFS_V4_1 */ +static void nfs4_shutdown_session(struct nfs_client *clp) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) +{ + int err; + struct nfs_client *clp = nfs_alloc_client(cl_init); + if (IS_ERR(clp)) + return clp; + + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error; + + spin_lock_init(&clp->cl_lock); + INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); + rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + return clp; + +error: + kfree(clp); + return ERR_PTR(err); +} + +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(clp->cl_mvops->minor_version); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + nfs4_shutdown_session(clp); + nfs4_destroy_callback(clp); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); + kfree(clp->cl_serverowner); + kfree(clp->cl_serverscope); + kfree(clp->cl_implid); +} + +void nfs4_free_client(struct nfs_client *clp) +{ + nfs4_shutdown_client(clp); + nfs_free_client(clp); +} + +/* + * Initialize the NFS4 callback service + */ +static int nfs4_init_callback(struct nfs_client *clp) +{ + int error; + + if (clp->rpc_ops->version == 4) { + struct rpc_xprt *xprt; + + xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); + + if (nfs4_has_session(clp)) { + error = xprt_setup_backchannel(xprt, + NFS41_BC_MIN_CALLBACKS); + if (error < 0) + return error; + } + + error = nfs_callback_up(clp->cl_mvops->minor_version, xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", + __func__, error); + return error; + } + __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); + } + return 0; +} + +/* + * Initialize the minor version specific parts of an NFS4 client record + */ +static int nfs4_init_client_minor_version(struct nfs_client *clp) +{ +#if defined(CONFIG_NFS_V4_1) + if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. + * When a SEQUENCE operation encounters the expired session + * it will do session recovery to initialize it. + */ + session = nfs4_alloc_session(clp); + if (!session) + return -ENOMEM; + + clp->cl_session = session; + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING); + } +#endif /* CONFIG_NFS_V4_1 */ + + return nfs4_init_callback(clp); +} + +/** + * nfs4_init_client - Initialise an NFS4 client record + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: callback IP address in presentation format + * @authflavor: authentication flavor for underlying RPC transport + * + * Returns pointer to an NFS client, or an ERR_PTR value. + */ +struct nfs_client *nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour) +{ + char buf[INET6_ADDRSTRLEN + 1]; + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is initialised already */ + dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); + return clp; + } + + /* Check NFS protocol revision and initialize RPC op vector */ + clp->rpc_ops = &nfs_v4_clientops; + + __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + error = nfs_create_rpc_client(clp, timeparms, authflavour); + if (error < 0) + goto error; + + /* If no clientaddr= option was specified, find a usable cb address */ + if (ip_addr == NULL) { + struct sockaddr_storage cb_addr; + struct sockaddr *sap = (struct sockaddr *)&cb_addr; + + error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); + if (error < 0) + goto error; + error = rpc_ntop(sap, buf, sizeof(buf)); + if (error < 0) + goto error; + ip_addr = (const char *)buf; + } + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + error = nfs_idmap_new(clp); + if (error < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, error); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + + error = nfs4_init_client_minor_version(clp); + if (error < 0) + goto error; + + if (!nfs4_has_session(clp)) + nfs_mark_client_ready(clp, NFS_CS_READY); + return clp; + +error: + nfs_mark_client_ready(clp, error); + nfs_put_client(clp); + dprintk("<-- nfs4_init_client() = xerror %d\n", error); + return ERR_PTR(error); +} + +static void nfs4_destroy_server(struct nfs_server *server) +{ + nfs_server_return_all_delegations(server); + unset_pnfs_layoutdriver(server); + nfs4_purge_state_owners(server); +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(struct net *net, int cb_ident) +{ + struct nfs_client *clp; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + spin_lock(&nn->nfs_client_lock); + clp = idr_find(&nn->cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* Common match routine for v4.0 and v4.1 callback services */ +static bool nfs4_cb_match_client(const struct sockaddr *addr, + struct nfs_client *clp, u32 minorversion) +{ + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; + + smp_rmb(); + + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; + + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; + + return true; +} + +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, + struct nfs4_sessionid *sid) +{ + struct nfs_client *clp; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 1) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid*/ + if (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + return clp; + } + spin_unlock(&nn->nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, + struct nfs4_sessionid *sid) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Set up an NFS4 client + */ +static int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, + const char *ip_addr, + rpc_authflavor_t authflavour, + int proto, const struct rpc_timeout *timeparms, + u32 minorversion, struct net *net) +{ + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = addr, + .addrlen = addrlen, + .nfs_mod = &nfs_v4, + .proto = proto, + .minorversion = minorversion, + .net = net, + }; + struct nfs_client *clp; + int error; + + dprintk("--> nfs4_set_client()\n"); + + if (server->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); + if (IS_ERR(clp)) { + error = PTR_ERR(clp); + goto error; + } + + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. + */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); + + server->nfs_client = clp; + dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); + return 0; +error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; +} + +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .nfs_mod = &nfs_v4, + .proto = ds_proto, + .minorversion = mds_clp->cl_minorversion, + .net = mds_clp->cl_net, + }; + struct rpc_timeout ds_timeout; + struct nfs_client *clp; + + /* + * Set an authflavor equual to the MDS value. Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + mds_clp->cl_rpcclient->cl_auth->au_flavor); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL_GPL(nfs4_set_ds_client); + +/* + * Session has been established, and the client marked ready. + * Set the mount rsize and wsize with negotiated fore channel + * attributes which will be bound checked in nfs_server_set_fsinfo. + */ +static void nfs4_session_set_rwsize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_1 + struct nfs4_session *sess; + u32 server_resp_sz; + u32 server_rqst_sz; + + if (!nfs4_has_session(server->nfs_client)) + return; + sess = server->nfs_client->cl_session; + server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; + server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + + if (server->rsize > server_resp_sz) + server->rsize = server_resp_sz; + if (server->wsize > server_rqst_sz) + server->wsize = server_rqst_sz; +#endif /* CONFIG_NFS_V4_1 */ +} + +static int nfs4_server_common_setup(struct nfs_server *server, + struct nfs_fh *mntfh) +{ + struct nfs_fattr *fattr; + int error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* We must ensure the session is initialised first */ + error = nfs4_init_session(server); + if (error < 0) + goto out; + + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto out; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + dprintk("Mount FH: %d\n", mntfh->size); + + nfs4_session_set_rwsize(server); + + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto out; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + server->destroy = nfs4_destroy_server; +out: + nfs_free_fattr(fattr); + return error; +} + +/* + * Create a version 4 volume record + */ +static int nfs4_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data) +{ + struct rpc_timeout timeparms; + int error; + + dprintk("--> nfs4_init_server()\n"); + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + server->options = data->options; + + /* Get a client record */ + error = nfs4_set_client(server, + data->nfs_server.hostname, + (const struct sockaddr *)&data->nfs_server.address, + data->nfs_server.addrlen, + data->client_address, + data->auth_flavors[0], + data->nfs_server.protocol, + &timeparms, + data->minorversion, + data->net); + if (error < 0) + goto error; + + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + +error: + /* Done */ + dprintk("<-- nfs4_init_server() = %d\n", error); + return error; +} + +/* + * Create a version 4 volume record + * - keyed on server and FSID + */ +/*struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh)*/ +struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server; + int error; + + dprintk("--> nfs4_create_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + /* set up the general RPC client */ + error = nfs4_init_server(server, mount_info->parsed); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mount_info->mntfh); + if (error < 0) + goto error; + + dprintk("<-- nfs4_create_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 referral server record + */ +struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, + struct nfs_fh *mntfh) +{ + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + + /* Initialise the client representation from the parent server */ + nfs_server_copy_userdata(server, parent_server); + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; + + /* Get a client representation. + * Note: NFSv4 always uses TCP, */ + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + data->authflavor, + rpc_protocol(parent_server->client), + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version, + parent_client->cl_net); + if (error < 0) + goto error; + + error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mntfh); + if (error < 0) + goto error; + + dprintk("<-- nfs_create_referral_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +} diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c new file mode 100644 index 000000000000..acb65e7887f8 --- /dev/null +++ b/fs/nfs/nfs4file.c @@ -0,0 +1,126 @@ +/* + * linux/fs/nfs/file.c + * + * Copyright (C) 1992 Rick Sladkey + */ +#include <linux/nfs_fs.h> +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_FILE + +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + struct nfs_open_context *ctx; + struct dentry *dentry = filp->f_path.dentry; + struct dentry *parent = NULL; + struct inode *dir; + unsigned openflags = filp->f_flags; + struct iattr attr; + int err; + + BUG_ON(inode != dentry->d_inode); + /* + * If no cached dentry exists or if it's negative, NFSv4 handled the + * opens in ->lookup() or ->create(). + * + * We only get this far for a cached positive dentry. We skipped + * revalidation, so handle it here by dropping the dentry and returning + * -EOPENSTALE. The VFS will retry the lookup/create/open. + */ + + dprintk("NFS: open file(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + + if ((openflags & O_ACCMODE) == 3) + openflags--; + + /* We can't create new files here */ + openflags &= ~(O_CREAT|O_EXCL); + + parent = dget_parent(dentry); + dir = parent->d_inode; + + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + err = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + + attr.ia_valid = ATTR_OPEN; + if (openflags & O_TRUNC) { + attr.ia_valid |= ATTR_SIZE; + attr.ia_size = 0; + nfs_wb_all(inode); + } + + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + switch (err) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + iput(inode); + if (inode != dentry->d_inode) + goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_file_set_open_context(filp, ctx); + err = 0; + +out_put_ctx: + put_nfs_open_context(ctx); +out: + dput(parent); + return err; + +out_drop: + d_drop(dentry); + err = -EOPENSTALE; + goto out_put_ctx; +} + +static int +nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + int ret; + struct inode *inode = file->f_path.dentry->d_inode; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, true); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs4_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index e1340293872c..53f94d915bd1 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -205,9 +205,9 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -EPIPE: dprintk("%s DS connection error %d\n", __func__, task->tk_status); - if (!filelayout_test_devid_invalid(devid)) - _pnfs_return_layout(inode); filelayout_mark_devid_invalid(devid); + clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); + _pnfs_return_layout(inode); rpc_wake_up(&tbl->slot_tbl_waitq); nfs4_ds_disconnect(clp); /* fall through */ @@ -351,9 +351,9 @@ static void prepare_to_resend_writes(struct nfs_commit_data *data) struct nfs_page *first = nfs_list_entry(data->pages.next); data->task.tk_status = 0; - memcpy(data->verf.verifier, first->wb_verf.verifier, - sizeof(first->wb_verf.verifier)); - data->verf.verifier[0]++; /* ensure verifier mismatch */ + memcpy(&data->verf.verifier, &first->wb_verf, + sizeof(data->verf.verifier)); + data->verf.verifier.data[0]++; /* ensure verifier mismatch */ } static int filelayout_commit_done_cb(struct rpc_task *task, diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index a1fab8da7f03..f81231f30d94 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -728,7 +728,7 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_fla pdev->layout_type = LAYOUT_NFSV4_1_FILES; pdev->pages = pages; pdev->pgbase = 0; - pdev->pglen = PAGE_SIZE * max_pages; + pdev->pglen = max_resp_sz; pdev->mincount = 0; rc = nfs4_proc_getdeviceinfo(server, pdev); diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c new file mode 100644 index 000000000000..6a83780e0ce6 --- /dev/null +++ b/fs/nfs/nfs4getroot.c @@ -0,0 +1,49 @@ +/* +* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. +* Written by David Howells (dhowells@redhat.com) +*/ + +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) +{ + struct nfs_fsinfo fsinfo; + int ret = -ENOMEM; + + dprintk("--> nfs4_get_rootfh()\n"); + + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) + goto out; + + /* Start by getting the root filehandle from the server */ + ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo); + if (ret < 0) { + dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); + goto out; + } + + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) + || !S_ISDIR(fsinfo.fattr->mode)) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); + ret = -ENOTDIR; + goto out; + } + + if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); + ret = -EREMOTE; + goto out; + } + + memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +out: + nfs_free_fattr(fsinfo.fattr); + dprintk("<-- nfs4_get_rootfh() = %d\n", ret); + return ret; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c157b2089b47..a99a8d948721 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -43,7 +43,6 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/gss_api.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> @@ -73,8 +72,6 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) -static unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; - struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); @@ -259,7 +256,12 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp) res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, nfs_wait_bit_killable, TASK_KILLABLE); - return res; + if (res) + return res; + + if (clp->cl_cons_state < 0) + return clp->cl_cons_state; + return 0; } static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) @@ -294,8 +296,8 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc case 0: return 0; case -NFS4ERR_OPENMODE: - if (inode && nfs_have_delegation(inode, FMODE_READ)) { - nfs_inode_return_delegation(inode); + if (inode && nfs4_have_delegation(inode, FMODE_READ)) { + nfs4_inode_return_delegation(inode); exception->retry = 1; return 0; } @@ -1065,7 +1067,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo return; } rcu_read_unlock(); - nfs_inode_return_delegation(inode); + nfs4_inode_return_delegation(inode); } static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) @@ -1756,33 +1758,70 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta } #if defined(CONFIG_NFS_V4_1) -static int nfs41_check_expired_stateid(struct nfs4_state *state, nfs4_stateid *stateid, unsigned int flags) +static void nfs41_clear_delegation_stateid(struct nfs4_state *state) { - int status = NFS_OK; struct nfs_server *server = NFS_SERVER(state->inode); + nfs4_stateid *stateid = &state->stateid; + int status; - if (state->flags & flags) { - status = nfs41_test_stateid(server, stateid); - if (status != NFS_OK) { + /* If a state reset has been done, test_stateid is unneeded */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + return; + + status = nfs41_test_stateid(server, stateid); + if (status != NFS_OK) { + /* Free the stateid unless the server explicitly + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) nfs41_free_stateid(server, stateid); - state->flags &= ~flags; - } + + clear_bit(NFS_DELEGATED_STATE, &state->flags); + } +} + +/** + * nfs41_check_open_stateid - possibly free an open stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. + */ +static int nfs41_check_open_stateid(struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + nfs4_stateid *stateid = &state->stateid; + int status; + + /* If a state reset has been done, test_stateid is unneeded */ + if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) && + (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) && + (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) + return -NFS4ERR_BAD_STATEID; + + status = nfs41_test_stateid(server, stateid); + if (status != NFS_OK) { + /* Free the stateid unless the server explicitly + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) + nfs41_free_stateid(server, stateid); + + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); } return status; } static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) { - int deleg_status, open_status; - int deleg_flags = 1 << NFS_DELEGATED_STATE; - int open_flags = (1 << NFS_O_RDONLY_STATE) | (1 << NFS_O_WRONLY_STATE) | (1 << NFS_O_RDWR_STATE); - - deleg_status = nfs41_check_expired_stateid(state, &state->stateid, deleg_flags); - open_status = nfs41_check_expired_stateid(state, &state->open_stateid, open_flags); + int status; - if ((deleg_status == NFS_OK) && (open_status == NFS_OK)) - return NFS_OK; - return nfs4_open_expired(sp, state); + nfs41_clear_delegation_stateid(state); + status = nfs41_check_open_stateid(state); + if (status != NFS_OK) + status = nfs4_open_expired(sp, state); + return status; } #endif @@ -2375,11 +2414,15 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, int i, len, status = 0; rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; - len = gss_mech_list_pseudoflavors(&flav_array[0]); - flav_array[len] = RPC_AUTH_NULL; - len += 1; + len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array)); + BUG_ON(len < 0); for (i = 0; i < len; i++) { + /* AUTH_UNIX is the default flavor if none was specified, + * thus has already been tried. */ + if (flav_array[i] == RPC_AUTH_UNIX) + continue; + status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; @@ -2766,9 +2809,7 @@ static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) * * In the case of WRITE, we also want to put the GETATTR after * the operation -- in this case because we want to make sure - * we get the post-operation mtime and size. This means that - * we can't use xdr_encode_pages() as written: we need a variant - * of it which would leave room in the 'tail' iovec. + * we get the post-operation mtime and size. * * Both of these changes to the XDR layer would in fact be quite * minor, but I decided to leave them for a subsequent patch. @@ -2821,7 +2862,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, return PTR_ERR(ctx); sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, dentry, ctx->mode, flags, sattr, ctx->cred, NULL); + state = nfs4_do_open(dir, dentry, ctx->mode, + flags, sattr, ctx->cred, + &ctx->mdsthreshold); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -3315,8 +3358,14 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { + int error; + nfs_fattr_init(fsinfo->fattr); - return nfs4_do_fsinfo(server, fhandle, fsinfo); + error = nfs4_do_fsinfo(server, fhandle, fsinfo); + if (error == 0) + set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + + return error; } static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -3443,7 +3492,7 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) /* Otherwise, request attributes if and only if we don't hold * a delegation */ - return nfs_have_delegation(hdr->inode, FMODE_READ) == 0; + return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) @@ -3732,7 +3781,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - int ret = -ENOMEM, npages, i, acl_len = 0; + int ret = -ENOMEM, npages, i; + size_t acl_len = 0; npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; /* As long as we're doing a round trip to the server anyway, @@ -3847,7 +3897,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); if (i < 0) return i; - nfs_inode_return_delegation(inode); + nfs4_inode_return_delegation(inode); ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); /* @@ -3961,6 +4011,16 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } +/** + * nfs4_proc_setclientid - Negotiate client ID + * @clp: state data structure + * @program: RPC program for NFSv4 callback service + * @port: IP port number for NFS4 callback service + * @cred: RPC credential to use for this call + * @res: where to place the result + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + */ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred, struct nfs4_setclientid_res *res) @@ -3977,44 +4037,44 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; - int loop = 0; int status; + /* nfs_client_id4 */ nfs4_init_boot_verifier(clp, &sc_verifier); - - for(;;) { - rcu_read_lock(); - setclientid.sc_name_len = scnprintf(setclientid.sc_name, - sizeof(setclientid.sc_name), "%s/%s %s %s %u", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_PROTO), - clp->cl_rpcclient->cl_auth->au_ops->au_name, - clp->cl_id_uniquifier); - setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, + rcu_read_lock(); + setclientid.sc_name_len = scnprintf(setclientid.sc_name, + sizeof(setclientid.sc_name), "%s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_PROTO)); + /* cb_client4 */ + setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, sizeof(setclientid.sc_netid), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_NETID)); - setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, + rcu_read_unlock(); + setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); - rcu_read_unlock(); - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - if (status != -NFS4ERR_CLID_INUSE) - break; - if (loop != 0) { - ++clp->cl_id_uniquifier; - break; - } - ++loop; - ssleep(clp->cl_lease_time / HZ + 1); - } + dprintk("NFS call setclientid auth=%s, '%.*s'\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + setclientid.sc_name_len, setclientid.sc_name); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + dprintk("NFS reply setclientid: %d\n", status); return status; } +/** + * nfs4_proc_setclientid_confirm - Confirm client ID + * @clp: state data structure + * @res: result of a previous SETCLIENTID + * @cred: RPC credential to use for this call + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + */ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct nfs4_setclientid_res *arg, struct rpc_cred *cred) @@ -4029,6 +4089,9 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, unsigned long now; int status; + dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + clp->cl_clientid); now = jiffies; status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status == 0) { @@ -4037,6 +4100,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, clp->cl_last_renewal = now; spin_unlock(&clp->cl_lock); } + dprintk("NFS reply setclientid_confirm: %d\n", status); return status; } @@ -4681,9 +4745,17 @@ out: } #if defined(CONFIG_NFS_V4_1) +/** + * nfs41_check_expired_locks - possibly free a lock stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. + */ static int nfs41_check_expired_locks(struct nfs4_state *state) { - int status, ret = NFS_OK; + int status, ret = -NFS4ERR_BAD_STATEID; struct nfs4_lock_state *lsp; struct nfs_server *server = NFS_SERVER(state->inode); @@ -4691,7 +4763,11 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { status = nfs41_test_stateid(server, &lsp->ls_stateid); if (status != NFS_OK) { - nfs41_free_stateid(server, &lsp->ls_stateid); + /* Free the stateid unless the server + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) + nfs41_free_stateid(server, + &lsp->ls_stateid); lsp->ls_flags &= ~NFS_LOCK_INITIALIZED; ret = status; } @@ -4707,9 +4783,9 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques if (test_bit(LK_STATE_IN_USE, &state->flags)) status = nfs41_check_expired_locks(state); - if (status == NFS_OK) - return status; - return nfs4_lock_expired(state, request); + if (status != NFS_OK) + status = nfs4_lock_expired(state, request); + return status; } #endif @@ -4807,7 +4883,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) * Don't rely on the VFS having checked the file open mode, * since it won't do this for flock() locks. */ - switch (request->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) { + switch (request->fl_type) { case F_RDLCK: if (!(filp->f_mode & FMODE_READ)) return -EBADF; @@ -5168,6 +5244,8 @@ out: /* * nfs4_proc_exchange_id() * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + * * Since the clientid has expired, all compounds using sessions * associated with the stale clientid will be returning * NFS4ERR_BADSESSION in the sequence operation, and will therefore @@ -5192,16 +5270,14 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) .rpc_cred = cred, }; - dprintk("--> %s\n", __func__); - BUG_ON(clp == NULL); - nfs4_init_boot_verifier(clp, &verifier); - args.id_len = scnprintf(args.id, sizeof(args.id), - "%s/%s/%u", + "%s/%s", clp->cl_ipaddr, - clp->cl_rpcclient->cl_nodename, - clp->cl_rpcclient->cl_auth->au_flavor); + clp->cl_rpcclient->cl_nodename); + dprintk("NFS call exchange_id auth=%s, '%.*s'\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + args.id_len, args.id); res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); @@ -5264,12 +5340,12 @@ out_server_scope: kfree(res.server_scope); out: if (clp->cl_implid != NULL) - dprintk("%s: Server Implementation ID: " + dprintk("NFS reply exchange_id: Server Implementation ID: " "domain: %s, name: %s, date: %llu,%u\n", - __func__, clp->cl_implid->domain, clp->cl_implid->name, + clp->cl_implid->domain, clp->cl_implid->name, clp->cl_implid->date.seconds, clp->cl_implid->date.nseconds); - dprintk("<-- %s status= %d\n", __func__, status); + dprintk("NFS reply exchange_id: %d\n", status); return status; } @@ -6570,22 +6646,36 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) .rpc_resp = &res, }; + dprintk("NFS call test_stateid %p\n", stateid); nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); - - if (status == NFS_OK) - return res.status; - return status; + if (status != NFS_OK) { + dprintk("NFS reply test_stateid: failed, %d\n", status); + return status; + } + dprintk("NFS reply test_stateid: succeeded, %d\n", -res.status); + return -res.status; } +/** + * nfs41_test_stateid - perform a TEST_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to test + * + * Returns NFS_OK if the server recognizes that "stateid" is valid. + * Otherwise a negative NFS4ERR value is returned if the operation + * failed or the state ID is not currently valid. + */ static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) { struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(server, - _nfs41_test_stateid(server, stateid), - &exception); + err = _nfs41_test_stateid(server, stateid); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } @@ -6601,19 +6691,34 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) .rpc_argp = &args, .rpc_resp = &res, }; + int status; + dprintk("NFS call free_stateid %p\n", stateid); nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); - return nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + status = nfs4_call_sync_sequence(server->client, server, &msg, + &args.seq_args, &res.seq_res, 1); + dprintk("NFS reply free_stateid: %d\n", status); + return status; } +/** + * nfs41_free_stateid - perform a FREE_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to release + * + * Returns NFS_OK if the server freed "stateid". Otherwise a + * negative NFS4ERR value is returned. + */ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) { struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(server, - _nfs4_free_stateid(server, stateid), - &exception); + err = _nfs4_free_stateid(server, stateid); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } @@ -6725,6 +6830,26 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; +const struct inode_operations nfs4_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .atomic_open = nfs_atomic_open, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +}; + static const struct inode_operations nfs4_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, @@ -6743,6 +6868,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .file_ops = &nfs4_file_operations, .getroot = nfs4_proc_get_root, .submount = nfs4_submount, + .try_mount = nfs4_try_mount, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, .lookup = nfs4_proc_lookup, @@ -6769,9 +6895,11 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, .read_setup = nfs4_proc_read_setup, + .read_pageio_init = pnfs_pageio_init_read, .read_rpc_prepare = nfs4_proc_read_rpc_prepare, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, + .write_pageio_init = pnfs_pageio_init_write, .write_rpc_prepare = nfs4_proc_write_rpc_prepare, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, @@ -6781,7 +6909,13 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .clear_acl_cache = nfs4_zap_acl_attr, .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, + .have_delegation = nfs4_have_delegation, + .return_delegation = nfs4_inode_return_delegation, + .alloc_client = nfs4_alloc_client, .init_client = nfs4_init_client, + .free_client = nfs4_free_client, + .create_server = nfs4_create_server, + .clone_server = nfs_clone_server, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { @@ -6796,10 +6930,6 @@ const struct xattr_handler *nfs4_xattr_handlers[] = { NULL }; -module_param(max_session_slots, ushort, 0644); -MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " - "requests the client will negotiate"); - /* * Local variables: * c-basic-offset: 8 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f38300e9f171..55148def5540 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1606,10 +1606,15 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) return -ESERVERFAULT; /* Lease confirmation error: retry after purging the lease */ ssleep(1); - case -NFS4ERR_CLID_INUSE: case -NFS4ERR_STALE_CLIENTID: clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); break; + case -NFS4ERR_CLID_INUSE: + pr_err("NFS: Server %s reports our clientid is in use\n", + clp->cl_hostname); + nfs_mark_client_ready(clp, -EPERM); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + return -EPERM; case -EACCES: if (clp->cl_machine_cred == NULL) return -EACCES; @@ -1642,7 +1647,7 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) return 0; } -static int nfs4_reclaim_lease(struct nfs_client *clp) +static int nfs4_establish_lease(struct nfs_client *clp) { struct rpc_cred *cred; const struct nfs4_state_recovery_ops *ops = @@ -1655,7 +1660,41 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) status = ops->establish_clid(clp, cred); put_rpccred(cred); if (status != 0) + return status; + pnfs_destroy_all_layouts(clp); + return 0; +} + +/* + * Returns zero or a negative errno. NFS4ERR values are converted + * to local errno values. + */ +static int nfs4_reclaim_lease(struct nfs_client *clp) +{ + int status; + + status = nfs4_establish_lease(clp); + if (status < 0) + return nfs4_handle_reclaim_lease_error(clp, status); + if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state)) + nfs4_state_start_reclaim_nograce(clp); + if (!test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + return 0; +} + +static int nfs4_purge_lease(struct nfs_client *clp) +{ + int status; + + status = nfs4_establish_lease(clp); + if (status < 0) return nfs4_handle_reclaim_lease_error(clp, status); + clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); return 0; } @@ -1764,6 +1803,8 @@ static int nfs4_reset_session(struct nfs_client *clp) struct rpc_cred *cred; int status; + if (!nfs4_has_session(clp)) + return 0; nfs4_begin_drain_session(clp); cred = nfs4_get_exchange_id_cred(clp); status = nfs4_proc_destroy_session(clp->cl_session, cred); @@ -1792,12 +1833,14 @@ out: static int nfs4_recall_slot(struct nfs_client *clp) { - struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table; - struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs; + struct nfs4_slot_table *fc_tbl; struct nfs4_slot *new, *old; int i; + if (!nfs4_has_session(clp)) + return 0; nfs4_begin_drain_session(clp); + fc_tbl = &clp->cl_session->fc_slot_table; new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), GFP_NOFS); if (!new) @@ -1810,11 +1853,10 @@ static int nfs4_recall_slot(struct nfs_client *clp) fc_tbl->slots = new; fc_tbl->max_slots = fc_tbl->target_max_slots; fc_tbl->target_max_slots = 0; - fc_attrs->max_reqs = fc_tbl->max_slots; + clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots; spin_unlock(&fc_tbl->slot_tbl_lock); kfree(old); - nfs4_end_drain_session(clp); return 0; } @@ -1823,6 +1865,8 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) struct rpc_cred *cred; int ret; + if (!nfs4_has_session(clp)) + return 0; nfs4_begin_drain_session(clp); cred = nfs4_get_exchange_id_cred(clp); ret = nfs4_proc_bind_conn_to_session(clp, cred); @@ -1857,37 +1901,29 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) static void nfs4_state_manager(struct nfs_client *clp) { int status = 0; + const char *section = "", *section_sep = ""; /* Ensure exclusive access to NFSv4 state */ do { if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { - status = nfs4_reclaim_lease(clp); + section = "purge state"; + status = nfs4_purge_lease(clp); if (status < 0) goto out_error; - clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + continue; } - if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { + section = "lease expired"; /* We're going to have to re-establish a clientid */ status = nfs4_reclaim_lease(clp); if (status < 0) goto out_error; - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) - continue; - clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); - - if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, - &clp->cl_state)) - nfs4_state_start_reclaim_nograce(clp); - else - set_bit(NFS4CLNT_RECLAIM_REBOOT, - &clp->cl_state); - - pnfs_destroy_all_layouts(clp); + continue; } if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + section = "check lease"; status = nfs4_check_lease(clp); if (status < 0) goto out_error; @@ -1896,8 +1932,8 @@ static void nfs4_state_manager(struct nfs_client *clp) } /* Initialize or reset the session */ - if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) - && nfs4_has_session(clp)) { + if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { + section = "reset session"; status = nfs4_reset_session(clp); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) continue; @@ -1907,15 +1943,26 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Send BIND_CONN_TO_SESSION */ if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, - &clp->cl_state) && nfs4_has_session(clp)) { + &clp->cl_state)) { + section = "bind conn to session"; status = nfs4_bind_conn_to_session(clp); if (status < 0) goto out_error; continue; } + /* Recall session slots */ + if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)) { + section = "recall slot"; + status = nfs4_recall_slot(clp); + if (status < 0) + goto out_error; + continue; + } + /* First recover reboot state... */ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + section = "reclaim reboot"; status = nfs4_do_reclaim(clp, clp->cl_mvops->reboot_recovery_ops); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || @@ -1930,6 +1977,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Now recover expired state... */ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + section = "reclaim nograce"; status = nfs4_do_reclaim(clp, clp->cl_mvops->nograce_recovery_ops); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || @@ -1945,15 +1993,6 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs_client_return_marked_delegations(clp); continue; } - /* Recall session slots */ - if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state) - && nfs4_has_session(clp)) { - status = nfs4_recall_slot(clp); - if (status < 0) - goto out_error; - continue; - } - nfs4_clear_state_manager_bit(clp); /* Did we race with an attempt to give us more work? */ @@ -1964,8 +2003,11 @@ static void nfs4_state_manager(struct nfs_client *clp) } while (atomic_read(&clp->cl_count) > 1); return; out_error: - pr_warn_ratelimited("NFS: state manager failed on NFSv4 server %s" - " with error %d\n", clp->cl_hostname, -status); + if (strlen(section)) + section_sep = ": "; + pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" + " with error %d\n", section_sep, section, + clp->cl_hostname, -status); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c new file mode 100644 index 000000000000..12a31a9dbcdd --- /dev/null +++ b/fs/nfs/nfs4super.c @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2012 Bryan Schumaker <bjschuma@netapp.com> + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs4_mount.h> +#include <linux/nfs_fs.h> +#include "delegation.h" +#include "internal.h" +#include "nfs4_fs.h" +#include "pnfs.h" +#include "nfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc); +static void nfs4_evict_inode(struct inode *inode); +static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); + +static struct file_system_type nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs_fs_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_referral_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs4_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_referral_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static const struct super_operations nfs4_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = nfs4_write_inode, + .put_super = nfs_put_super, + .statfs = nfs_statfs, + .evict_inode = nfs4_evict_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, + .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, +}; + +struct nfs_subversion nfs_v4 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs4_fs_type, + .rpc_vers = &nfs_version4, + .rpc_ops = &nfs_v4_clientops, + .sops = &nfs4_sops, + .xattr = nfs4_xattr_handlers, +}; + +static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret = nfs_write_inode(inode, wbc); + + if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { + int status; + bool sync = true; + + if (wbc->sync_mode == WB_SYNC_NONE) + sync = false; + + status = pnfs_layoutcommit_inode(inode, sync); + if (status < 0) + return status; + } + return ret; +} + +/* + * Clean out any remaining NFSv4 state that might be left over due + * to open() calls that passed nfs_atomic_lookup, but failed to call + * nfs_open(). + */ +static void nfs4_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); + /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); +} + +/* + * Get the superblock for the NFS4 root partition + */ +static struct dentry * +nfs4_remote_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *info) +{ + struct nfs_mount_info *mount_info = info; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + + mount_info->set_security = nfs_set_sb_security; + + /* Get a volume representation */ + server = nfs4_create_server(mount_info, &nfs_v4); + if (IS_ERR(server)) { + mntroot = ERR_CAST(server); + goto out; + } + + mntroot = nfs_fs_mount_common(server, flags, dev_name, mount_info, &nfs_v4); + +out: + return mntroot; +} + +static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, + int flags, void *data, const char *hostname) +{ + struct vfsmount *root_mnt; + char *root_devname; + size_t len; + + len = strlen(hostname) + 5; + root_devname = kmalloc(len, GFP_KERNEL); + if (root_devname == NULL) + return ERR_PTR(-ENOMEM); + /* Does hostname needs to be enclosed in brackets? */ + if (strchr(hostname, ':')) + snprintf(root_devname, len, "[%s]:/", hostname); + else + snprintf(root_devname, len, "%s:/", hostname); + root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); + kfree(root_devname); + return root_mnt; +} + +struct nfs_referral_count { + struct list_head list; + const struct task_struct *task; + unsigned int referral_count; +}; + +static LIST_HEAD(nfs_referral_count_list); +static DEFINE_SPINLOCK(nfs_referral_count_list_lock); + +static struct nfs_referral_count *nfs_find_referral_count(void) +{ + struct nfs_referral_count *p; + + list_for_each_entry(p, &nfs_referral_count_list, list) { + if (p->task == current) + return p; + } + return NULL; +} + +#define NFS_MAX_NESTED_REFERRALS 2 + +static int nfs_referral_loop_protect(void) +{ + struct nfs_referral_count *p, *new; + int ret = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + new->task = current; + new->referral_count = 1; + + ret = 0; + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + if (p != NULL) { + if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) + ret = -ELOOP; + else + p->referral_count++; + } else { + list_add(&new->list, &nfs_referral_count_list); + new = NULL; + } + spin_unlock(&nfs_referral_count_list_lock); + kfree(new); +out: + return ret; +} + +static void nfs_referral_loop_unprotect(void) +{ + struct nfs_referral_count *p; + + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + p->referral_count--; + if (p->referral_count == 0) + list_del(&p->list); + else + p = NULL; + spin_unlock(&nfs_referral_count_list_lock); + kfree(p); +} + +static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path) +{ + struct dentry *dentry; + int err; + + if (IS_ERR(root_mnt)) + return ERR_CAST(root_mnt); + + err = nfs_referral_loop_protect(); + if (err) { + mntput(root_mnt); + return ERR_PTR(err); + } + + dentry = mount_subtree(root_mnt, export_path); + nfs_referral_loop_unprotect(); + + return dentry; +} + +struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + struct nfs_parsed_mount_data *data = mount_info->parsed; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + res = nfs_follow_remote_path(root_mnt, export_path); + + dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", + IS_ERR(res) ? PTR_ERR(res) : 0, + IS_ERR(res) ? " [error]" : ""); + return res; +} + +static struct dentry * +nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_mount_info mount_info = { + .fill_super = nfs_fill_super, + .set_security = nfs_clone_sb_security, + .cloned = raw_data, + }; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + + dprintk("--> nfs4_referral_get_sb()\n"); + + mount_info.mntfh = nfs_alloc_fhandle(); + if (mount_info.cloned == NULL || mount_info.mntfh == NULL) + goto out; + + /* create a new volume representation */ + server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh); + if (IS_ERR(server)) { + mntroot = ERR_CAST(server); + goto out; + } + + mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, &nfs_v4); +out: + nfs_free_fhandle(mount_info.mntfh); + return mntroot; +} + +/* + * Create an NFS4 server record on referral traversal + */ +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + + dprintk("--> nfs4_referral_mount()\n"); + + export_path = data->mnt_path; + data->mnt_path = "/"; + + root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, + flags, data, data->hostname); + data->mnt_path = export_path; + + res = nfs_follow_remote_path(root_mnt, export_path); + dprintk("<-- nfs4_referral_mount() = %ld%s\n", + IS_ERR(res) ? PTR_ERR(res) : 0, + IS_ERR(res) ? " [error]" : ""); + return res; +} + + +static int __init init_nfs_v4(void) +{ + int err; + + err = nfs_idmap_init(); + if (err) + goto out; + + err = nfs4_register_sysctl(); + if (err) + goto out1; + + err = register_filesystem(&nfs4_fs_type); + if (err < 0) + goto out2; + + register_nfs_version(&nfs_v4); + return 0; +out2: + nfs4_unregister_sysctl(); +out1: + nfs_idmap_quit(); +out: + return err; +} + +static void __exit exit_nfs_v4(void) +{ + unregister_nfs_version(&nfs_v4); + unregister_filesystem(&nfs4_fs_type); + nfs4_unregister_sysctl(); + nfs_idmap_quit(); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v4); +module_exit(exit_nfs_v4); diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c new file mode 100644 index 000000000000..5729bc8aa75d --- /dev/null +++ b/fs/nfs/nfs4sysctl.c @@ -0,0 +1,68 @@ +/* + * linux/fs/nfs/nfs4sysctl.c + * + * Sysctl interface to NFS v4 parameters + * + * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/sysctl.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs_fs.h> + +#include "callback.h" + +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +static struct ctl_table_header *nfs4_callback_sysctl_table; + +static ctl_table nfs4_cb_sysctls[] = { + { + .procname = "nfs_callback_tcpport", + .data = &nfs_callback_set_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&nfs_set_port_min, + .extra2 = (int *)&nfs_set_port_max, + }, + { + .procname = "idmap_cache_timeout", + .data = &nfs_idmap_cache_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +static ctl_table nfs4_cb_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nfs4_cb_sysctls, + }, + { } +}; + +static ctl_table nfs4_cb_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nfs4_cb_sysctl_dir, + }, + { } +}; + +int nfs4_register_sysctl(void) +{ + nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root); + if (nfs4_callback_sysctl_table == NULL) + return -ENOMEM; + return 0; +} + +void nfs4_unregister_sysctl(void) +{ + unregister_sysctl_table(nfs4_callback_sysctl_table); + nfs4_callback_sysctl_table = NULL; +} diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 18fae29b0301..ca13483edd60 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -852,12 +852,6 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + XDR_UNIT); #endif /* CONFIG_NFS_V4_1 */ -static unsigned short send_implementation_id = 1; - -module_param(send_implementation_id, ushort, 0644); -MODULE_PARM_DESC(send_implementation_id, - "Send implementation ID with NFSv4.1 exchange_id"); - static const umode_t nfs_type2fmt[] = { [NF4BAD] = 0, [NF4REG] = S_IFREG, @@ -1236,7 +1230,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct static inline int nfs4_lock_type(struct file_lock *fl, int block) { - if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK) + if (fl->fl_type == F_RDLCK) return block ? NFS4_READW_LT : NFS4_READ_LT; return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT; } @@ -3078,7 +3072,7 @@ out_overflow: return -EIO; } -static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) +static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep) { __be32 *p; @@ -3086,7 +3080,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, if (unlikely(!p)) goto out_overflow; *attrlen = be32_to_cpup(p); - *savep = xdr->p; + *savep = xdr_stream_pos(xdr); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4068,10 +4062,10 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str return status; } -static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen) +static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen) { unsigned int attrwords = XDR_QUADLEN(attrlen); - unsigned int nwords = xdr->p - savep; + unsigned int nwords = (xdr_stream_pos(xdr) - savep) >> 2; if (unlikely(attrwords != nwords)) { dprintk("%s: server returned incorrect attribute length: " @@ -4158,13 +4152,18 @@ static int decode_verifier(struct xdr_stream *xdr, void *verifier) return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE); } +static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifier *verifier) +{ + return decode_opaque_fixed(xdr, verifier->data, NFS4_VERIFIER_SIZE); +} + static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res) { int status; status = decode_op_hdr(xdr, OP_COMMIT); if (!status) - status = decode_verifier(xdr, res->verf->verifier); + status = decode_write_verifier(xdr, &res->verf->verifier); return status; } @@ -4193,7 +4192,7 @@ out_overflow: static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) { - __be32 *savep; + unsigned int savep; uint32_t attrlen, bitmap[3] = {0}; int status; @@ -4222,7 +4221,7 @@ xdr_error: static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) { - __be32 *savep; + unsigned int savep; uint32_t attrlen, bitmap[3] = {0}; int status; @@ -4254,7 +4253,7 @@ xdr_error: static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) { - __be32 *savep; + unsigned int savep; uint32_t attrlen, bitmap[3] = {0}; int status; @@ -4299,7 +4298,8 @@ out_overflow: static int decode_first_threshold_item4(struct xdr_stream *xdr, struct nfs4_threshold *res) { - __be32 *p, *savep; + __be32 *p; + unsigned int savep; uint32_t bitmap[3] = {0,}, attrlen; int status; @@ -4503,7 +4503,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, const struct nfs_server *server) { - __be32 *savep; + unsigned int savep; uint32_t attrlen, bitmap[3] = {0}; int status; @@ -4615,7 +4615,7 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { - __be32 *savep; + unsigned int savep; uint32_t attrlen, bitmap[3]; int status; @@ -4920,9 +4920,8 @@ static int decode_putrootfh(struct xdr_stream *xdr) static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) { - struct kvec *iov = req->rq_rcv_buf.head; __be32 *p; - uint32_t count, eof, recvd, hdrlen; + uint32_t count, eof, recvd; int status; status = decode_op_hdr(xdr, OP_READ); @@ -4933,15 +4932,13 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ goto out_overflow; eof = be32_to_cpup(p++); count = be32_to_cpup(p); - hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; - recvd = req->rq_rcv_buf.len - hdrlen; + recvd = xdr_read_pages(xdr, count); if (count > recvd) { dprintk("NFS: server cheating in read reply: " "count %u > recvd %u\n", count, recvd); count = recvd; eof = 0; } - xdr_read_pages(xdr, count); res->eof = eof; res->count = count; return 0; @@ -4952,10 +4949,6 @@ out_overflow: static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; - u32 recvd, pglen = rcvbuf->page_len; int status; __be32 verf[2]; @@ -4967,22 +4960,12 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n memcpy(verf, readdir->verifier.data, sizeof(verf)); dprintk("%s: verifier = %08x:%08x\n", __func__, verf[0], verf[1]); - - hdrlen = (char *) xdr->p - (char *) iov->iov_base; - recvd = rcvbuf->len - hdrlen; - if (pglen > recvd) - pglen = recvd; - xdr_read_pages(xdr, pglen); - - - return pglen; + return xdr_read_pages(xdr, xdr->buf->page_len); } static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) { struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; u32 len, recvd; __be32 *p; int status; @@ -5000,14 +4983,12 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; } - hdrlen = (char *) xdr->p - (char *) iov->iov_base; - recvd = req->rq_rcv_buf.len - hdrlen; + recvd = xdr_read_pages(xdr, len); if (recvd < len) { dprintk("NFS: server cheating in readlink reply: " "count %u > recvd %u\n", len, recvd); return -EIO; } - xdr_read_pages(xdr, len); /* * The XDR encode routine has set things up so that * the link text will be copied directly into the @@ -5063,10 +5044,10 @@ decode_restorefh(struct xdr_stream *xdr) static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_getaclres *res) { - __be32 *savep, *bm_p; + unsigned int savep; + __be32 *bm_p; uint32_t attrlen, bitmap[3] = {0}; - struct kvec *iov = req->rq_rcv_buf.head; int status; size_t page_len = xdr->buf->page_len; @@ -5089,7 +5070,6 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { - size_t hdrlen; /* The bitmap (xdr len + bitmaps) and the attr xdr len words * are stored with the acl data to handle the problem of @@ -5098,7 +5078,6 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, /* We ignore &savep and don't do consistency checks on * the attr length. Let userspace figure it out.... */ - hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; attrlen += res->acl_data_offset; if (attrlen > page_len) { if (res->acl_flags & NFS4_ACL_LEN_REQUEST) { @@ -5212,13 +5191,12 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) if (status) return status; - p = xdr_inline_decode(xdr, 16); + p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; res->count = be32_to_cpup(p++); res->verf->committed = be32_to_cpup(p++); - memcpy(res->verf->verifier, p, NFS4_VERIFIER_SIZE); - return 0; + return decode_write_verifier(xdr, &res->verf->verifier); out_overflow: print_overflow_msg(__func__, xdr); return -EIO; @@ -5599,7 +5577,7 @@ static int decode_getdevicelist(struct xdr_stream *xdr, { __be32 *p; int status, i; - struct nfs_writeverf verftemp; + nfs4_verifier verftemp; status = decode_op_hdr(xdr, OP_GETDEVICELIST); if (status) @@ -5613,7 +5591,7 @@ static int decode_getdevicelist(struct xdr_stream *xdr, p += 2; /* Read verifier */ - p = xdr_decode_opaque_fixed(p, verftemp.verifier, NFS4_VERIFIER_SIZE); + p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE); res->num_devs = be32_to_cpup(p); @@ -5707,9 +5685,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, __be32 *p; int status; u32 layout_count; - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - u32 hdrlen, recvd; + u32 recvd; status = decode_op_hdr(xdr, OP_LAYOUTGET); if (status) @@ -5746,8 +5722,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, res->type, res->layoutp->len); - hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; - recvd = req->rq_rcv_buf.len - hdrlen; + recvd = xdr_read_pages(xdr, res->layoutp->len); if (res->layoutp->len > recvd) { dprintk("NFS: server cheating in layoutget reply: " "layout len %u > recvd %u\n", @@ -5755,8 +5730,6 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, return -EINVAL; } - xdr_read_pages(xdr, res->layoutp->len); - if (layout_count > 1) { /* We only handle a length one array at the moment. Any * further entries are just ignored. Note that this means @@ -7103,6 +7076,7 @@ out: int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) { + unsigned int savep; uint32_t bitmap[3] = {0}; uint32_t len; __be32 *p = xdr_inline_decode(xdr, 4); @@ -7141,7 +7115,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (decode_attr_bitmap(xdr, bitmap) < 0) goto out_overflow; - if (decode_attr_length(xdr, &len, &p) < 0) + if (decode_attr_length(xdr, &len, &savep) < 0) goto out_overflow; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index aed913c833f4..1a6732ed04a4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -54,6 +54,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, if (hdr->completion_ops->init_hdr) hdr->completion_ops->init_hdr(hdr); } +EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { @@ -70,7 +71,7 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) static inline struct nfs_page * nfs_page_alloc(void) { - struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); if (p) INIT_LIST_HEAD(&p->wb_list); return p; @@ -117,7 +118,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ req->wb_page = page; - req->wb_index = page->index; + req->wb_index = page_file_index(page); page_cache_get(page); req->wb_offset = offset; req->wb_pgbase = offset; @@ -268,6 +269,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_lseg = NULL; desc->pg_dreq = NULL; } +EXPORT_SYMBOL_GPL(nfs_pageio_init); /** * nfs_can_coalesce_requests - test two requests for compatibility @@ -409,6 +411,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, } while (ret); return ret; } +EXPORT_SYMBOL_GPL(nfs_pageio_add_request); /** * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor @@ -424,6 +427,7 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) break; } } +EXPORT_SYMBOL_GPL(nfs_pageio_complete); /** * nfs_pageio_cond_complete - Conditional I/O completion diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bbc49caa7a82..76875bfcf19c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -651,7 +651,14 @@ out_err_free: return NULL; } -/* Initiates a LAYOUTRETURN(FILE) */ +/* + * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr + * when the layout segment list is empty. + * + * Note that a pnfs_layout_hdr can exist with an empty layout segment + * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the + * deviceid is marked invalid. + */ int _pnfs_return_layout(struct inode *ino) { @@ -660,22 +667,31 @@ _pnfs_return_layout(struct inode *ino) LIST_HEAD(tmp_list); struct nfs4_layoutreturn *lrp; nfs4_stateid stateid; - int status = 0; + int status = 0, empty; - dprintk("--> %s\n", __func__); + dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo) { + if (!lo || pnfs_test_layout_returned(lo)) { spin_unlock(&ino->i_lock); - dprintk("%s: no layout to return\n", __func__); - return status; + dprintk("NFS: %s no layout to return\n", __func__); + goto out; } stateid = nfsi->layout->plh_stateid; /* Reference matched in nfs4_layoutreturn_release */ get_layout_hdr(lo); + empty = list_empty(&lo->plh_segs); mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + /* Don't send a LAYOUTRETURN if list was initially empty */ + if (empty) { + spin_unlock(&ino->i_lock); + put_layout_hdr(lo); + dprintk("NFS: %s no layout segments to return\n", __func__); + goto out; + } lo->plh_block_lgets++; + pnfs_mark_layout_returned(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); @@ -686,6 +702,7 @@ _pnfs_return_layout(struct inode *ino) status = -ENOMEM; set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); + pnfs_clear_layout_returned(lo); put_layout_hdr(lo); goto out; } @@ -1075,6 +1092,10 @@ pnfs_update_layout(struct inode *ino, get_layout_hdr(lo); if (list_empty(&lo->plh_segs)) first = true; + + /* Enable LAYOUTRETURNs */ + pnfs_clear_layout_returned(lo); + spin_unlock(&ino->i_lock); if (first) { /* The lo must be on the clp list if there is any @@ -1209,7 +1230,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page * } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); -bool +void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, const struct nfs_pgio_completion_ops *compl_ops) { @@ -1217,13 +1238,12 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; if (ld == NULL) - return false; - nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, - server->rsize, 0); - return true; + nfs_pageio_init_read(pgio, inode, compl_ops); + else + nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); } -bool +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, const struct nfs_pgio_completion_ops *compl_ops) @@ -1232,10 +1252,9 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; if (ld == NULL) - return false; - nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, - server->wsize, ioflags); - return true; + nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); + else + nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags); } bool @@ -1272,7 +1291,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE, compl_ops); + nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1388,6 +1407,7 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) put_lseg(hdr->lseg); nfs_writehdr_free(hdr); } +EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) @@ -1427,7 +1447,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_read_mds(&pgio, inode, compl_ops); + nfs_pageio_init_read(&pgio, inode, compl_ops); while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1542,6 +1562,7 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) put_lseg(hdr->lseg); nfs_readhdr_free(hdr); } +EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 64f90d845f6a..2c6c80503ba4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -64,6 +64,7 @@ enum { NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ NFS_LAYOUT_INVALID, /* layout is being destroyed */ + NFS_LAYOUT_RETURNED, /* layout has already been returned */ }; enum layoutdriver_policy_flags { @@ -178,9 +179,9 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); void get_layout_hdr(struct pnfs_layout_hdr *lo); void put_lseg(struct pnfs_layout_segment *lseg); -bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, +void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, const struct nfs_pgio_completion_ops *); -bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int, const struct nfs_pgio_completion_ops *); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); @@ -255,6 +256,24 @@ struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node * bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); void nfs4_deviceid_purge_client(const struct nfs_client *); +static inline void +pnfs_mark_layout_returned(struct pnfs_layout_hdr *lo) +{ + set_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); +} + +static inline void +pnfs_clear_layout_returned(struct pnfs_layout_hdr *lo) +{ + clear_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); +} + +static inline bool +pnfs_test_layout_returned(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); +} + static inline int lo_fail_bit(u32 iomode) { return iomode == IOMODE_RW ? @@ -438,16 +457,16 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, +static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, const struct nfs_pgio_completion_ops *compl_ops) { - return false; + nfs_pageio_init_read(pgio, inode, compl_ops); } -static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, +static inline void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, const struct nfs_pgio_completion_ops *compl_ops) { - return false; + nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); } static inline int diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 4433806e116f..50a88c3546ed 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -734,6 +734,38 @@ out_einval: return -EINVAL; } +static int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static int nfs_return_delegation(struct inode *inode) +{ + nfs_wb_all(inode); + return 0; +} + +static const struct inode_operations nfs_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +static const struct inode_operations nfs_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + const struct nfs_rpc_ops nfs_v2_clientops = { .version = 2, /* protocol version */ .dentry_ops = &nfs_dentry_operations, @@ -742,6 +774,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .file_ops = &nfs_file_operations, .getroot = nfs_proc_get_root, .submount = nfs_submount, + .try_mount = nfs_try_mount, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, .lookup = nfs_proc_lookup, @@ -767,9 +800,11 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .pathconf = nfs_proc_pathconf, .decode_dirent = nfs2_decode_dirent, .read_setup = nfs_proc_read_setup, + .read_pageio_init = nfs_pageio_init_read, .read_rpc_prepare = nfs_proc_read_rpc_prepare, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, + .write_pageio_init = nfs_pageio_init_write, .write_rpc_prepare = nfs_proc_write_rpc_prepare, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, @@ -777,5 +812,11 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, + .have_delegation = nfs_have_delegation, + .return_delegation = nfs_return_delegation, + .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, + .free_client = nfs_free_client, + .create_server = nfs_create_server, + .clone_server = nfs_clone_server, }; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 86ced7836214..b6bdb18e892c 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -20,8 +20,6 @@ #include <linux/nfs_page.h> #include <linux/module.h> -#include "pnfs.h" - #include "nfs4_fs.h" #include "internal.h" #include "iostat.h" @@ -50,6 +48,7 @@ struct nfs_read_header *nfs_readhdr_alloc(void) } return rhdr; } +EXPORT_SYMBOL_GPL(nfs_readhdr_alloc); static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, unsigned int pagecount) @@ -82,6 +81,7 @@ void nfs_readhdr_free(struct nfs_pgio_header *hdr) kmem_cache_free(nfs_rdata_cachep, rhdr); } +EXPORT_SYMBOL_GPL(nfs_readhdr_free); void nfs_readdata_release(struct nfs_read_data *rdata) { @@ -98,6 +98,7 @@ void nfs_readdata_release(struct nfs_read_data *rdata) if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); } +EXPORT_SYMBOL_GPL(nfs_readdata_release); static int nfs_return_empty_page(struct page *page) @@ -108,13 +109,14 @@ int nfs_return_empty_page(struct page *page) return 0; } -void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, const struct nfs_pgio_completion_ops *compl_ops) { nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops, NFS_SERVER(inode)->rsize, 0); } +EXPORT_SYMBOL_GPL(nfs_pageio_init_read); void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { @@ -123,14 +125,6 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); -void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - if (!pnfs_pageio_init_read(pgio, inode, compl_ops)) - nfs_pageio_init_read_mds(pgio, inode, compl_ops); -} - int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -149,7 +143,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops); + NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); NFS_I(inode)->read_io += pgio.pg_bytes_written; @@ -407,6 +401,7 @@ int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, return nfs_pagein_multi(desc, hdr); return nfs_pagein_one(desc, hdr); } +EXPORT_SYMBOL_GPL(nfs_generic_pagein); static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { @@ -532,11 +527,11 @@ static const struct rpc_call_ops nfs_read_common_ops = { int nfs_readpage(struct file *file, struct page *page) { struct nfs_open_context *ctx; - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_CACHE_SIZE, page->index); + page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); nfs_add_stats(inode, NFSIOS_READPAGES, 1); @@ -590,7 +585,7 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *new; unsigned int len; int error; @@ -652,7 +647,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops); + NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 8b2a2977b720..ac6a3c55dce4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -64,11 +64,12 @@ #include "internal.h" #include "fscache.h" #include "pnfs.h" +#include "nfs.h" #define NFSDBG_FACILITY NFSDBG_VFS #define NFS_TEXT_DATA 1 -#ifdef CONFIG_NFS_V3 +#if IS_ENABLED(CONFIG_NFS_V3) #define NFS_DEFAULT_VERSION 3 #else #define NFS_DEFAULT_VERSION 2 @@ -278,37 +279,17 @@ static match_table_t nfs_vers_tokens = { { Opt_vers_err, NULL } }; -struct nfs_mount_info { - void (*fill_super)(struct super_block *, struct nfs_mount_info *); - int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); - struct nfs_parsed_mount_data *parsed; - struct nfs_clone_mount *cloned; - struct nfs_fh *mntfh; -}; - -static void nfs_umount_begin(struct super_block *); -static int nfs_statfs(struct dentry *, struct kstatfs *); -static int nfs_show_options(struct seq_file *, struct dentry *); -static int nfs_show_devname(struct seq_file *, struct dentry *); -static int nfs_show_path(struct seq_file *, struct dentry *); -static int nfs_show_stats(struct seq_file *, struct dentry *); -static struct dentry *nfs_fs_mount_common(struct file_system_type *, - struct nfs_server *, int, const char *, struct nfs_mount_info *); -static struct dentry *nfs_fs_mount(struct file_system_type *, - int, const char *, void *); static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data); -static void nfs_put_super(struct super_block *); -static void nfs_kill_super(struct super_block *); -static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); -static struct file_system_type nfs_fs_type = { +struct file_system_type nfs_fs_type = { .owner = THIS_MODULE, .name = "nfs", .mount = nfs_fs_mount, .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; +EXPORT_SYMBOL_GPL(nfs_fs_type); struct file_system_type nfs_xdev_fs_type = { .owner = THIS_MODULE, @@ -318,7 +299,7 @@ struct file_system_type nfs_xdev_fs_type = { .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; -static const struct super_operations nfs_sops = { +const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, @@ -332,77 +313,12 @@ static const struct super_operations nfs_sops = { .show_stats = nfs_show_stats, .remount_fs = nfs_remount, }; +EXPORT_SYMBOL_GPL(nfs_sops); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *); static int nfs4_validate_mount_data(void *options, struct nfs_parsed_mount_data *args, const char *dev_name); -static struct dentry *nfs4_try_mount(int flags, const char *dev_name, - struct nfs_mount_info *mount_info); -static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); -static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); -static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); -static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); -static void nfs4_kill_super(struct super_block *sb); - -static struct file_system_type nfs4_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs_fs_mount, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; - -static struct file_system_type nfs4_remote_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs4_remote_mount, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; - -struct file_system_type nfs4_xdev_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs4_xdev_mount, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; - -static struct file_system_type nfs4_remote_referral_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs4_remote_referral_mount, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; - -struct file_system_type nfs4_referral_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs4_referral_mount, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; - -static const struct super_operations nfs4_sops = { - .alloc_inode = nfs_alloc_inode, - .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, - .put_super = nfs_put_super, - .statfs = nfs_statfs, - .evict_inode = nfs4_evict_inode, - .umount_begin = nfs_umount_begin, - .show_options = nfs_show_options, - .show_devname = nfs_show_devname, - .show_path = nfs_show_path, - .show_stats = nfs_show_stats, - .remount_fs = nfs_remount, -}; #endif static struct shrinker acl_shrinker = { @@ -424,18 +340,9 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_1; -#ifdef CONFIG_NFS_V4 - ret = register_filesystem(&nfs4_fs_type); - if (ret < 0) - goto error_2; -#endif register_shrinker(&acl_shrinker); return 0; -#ifdef CONFIG_NFS_V4 -error_2: - nfs_unregister_sysctl(); -#endif error_1: unregister_filesystem(&nfs_fs_type); error_0: @@ -448,9 +355,6 @@ error_0: void __exit unregister_nfs_fs(void) { unregister_shrinker(&acl_shrinker); -#ifdef CONFIG_NFS_V4 - unregister_filesystem(&nfs4_fs_type); -#endif nfs_unregister_sysctl(); unregister_filesystem(&nfs_fs_type); } @@ -462,6 +366,7 @@ void nfs_sb_active(struct super_block *sb) if (atomic_inc_return(&server->active) == 1) atomic_inc(&sb->s_active); } +EXPORT_SYMBOL_GPL(nfs_sb_active); void nfs_sb_deactive(struct super_block *sb) { @@ -470,11 +375,12 @@ void nfs_sb_deactive(struct super_block *sb) if (atomic_dec_and_test(&server->active)) deactivate_super(sb); } +EXPORT_SYMBOL_GPL(nfs_sb_deactive); /* * Deliver file system statistics to userspace */ -static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) +int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct nfs_server *server = NFS_SB(dentry->d_sb); unsigned char blockbits; @@ -535,6 +441,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) dprintk("%s: statfs error = %d\n", __func__, -error); return error; } +EXPORT_SYMBOL_GPL(nfs_statfs); /* * Map the security flavour number to a name @@ -640,7 +547,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, nfs_show_mountd_netid(m, nfss, showdefaults); } -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { @@ -757,7 +664,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, /* * Describe the mount options on this VFS mountpoint */ -static int nfs_show_options(struct seq_file *m, struct dentry *root) +int nfs_show_options(struct seq_file *m, struct dentry *root) { struct nfs_server *nfss = NFS_SB(root->d_sb); @@ -771,8 +678,9 @@ static int nfs_show_options(struct seq_file *m, struct dentry *root) return 0; } +EXPORT_SYMBOL_GPL(nfs_show_options); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) #ifdef CONFIG_NFS_V4_1 static void show_sessions(struct seq_file *m, struct nfs_server *server) { @@ -805,7 +713,7 @@ static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) } } #else -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) static void show_pnfs(struct seq_file *m, struct nfs_server *server) { } @@ -815,7 +723,7 @@ static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) } #endif -static int nfs_show_devname(struct seq_file *m, struct dentry *root) +int nfs_show_devname(struct seq_file *m, struct dentry *root) { char *page = (char *) __get_free_page(GFP_KERNEL); char *devname, *dummy; @@ -830,17 +738,19 @@ static int nfs_show_devname(struct seq_file *m, struct dentry *root) free_page((unsigned long)page); return err; } +EXPORT_SYMBOL_GPL(nfs_show_devname); -static int nfs_show_path(struct seq_file *m, struct dentry *dentry) +int nfs_show_path(struct seq_file *m, struct dentry *dentry) { seq_puts(m, "/"); return 0; } +EXPORT_SYMBOL_GPL(nfs_show_path); /* * Present statistical information for this VFS mountpoint */ -static int nfs_show_stats(struct seq_file *m, struct dentry *root) +int nfs_show_stats(struct seq_file *m, struct dentry *root) { int i, cpu; struct nfs_server *nfss = NFS_SB(root->d_sb); @@ -870,7 +780,7 @@ static int nfs_show_stats(struct seq_file *m, struct dentry *root) seq_printf(m, ",bsize=%u", nfss->bsize); seq_printf(m, ",namlen=%u", nfss->namelen); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) if (nfss->nfs_client->rpc_ops->version == 4) { seq_printf(m, "\n\tnfsv4:\t"); seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); @@ -928,12 +838,13 @@ static int nfs_show_stats(struct seq_file *m, struct dentry *root) return 0; } +EXPORT_SYMBOL_GPL(nfs_show_stats); /* * Begin unmount by attempting to remove all automounted mountpoints we added * in response to xdev traversals and referrals */ -static void nfs_umount_begin(struct super_block *sb) +void nfs_umount_begin(struct super_block *sb) { struct nfs_server *server; struct rpc_clnt *rpc; @@ -947,6 +858,7 @@ static void nfs_umount_begin(struct super_block *sb) if (!IS_ERR(rpc)) rpc_killall_tasks(rpc); } +EXPORT_SYMBOL_GPL(nfs_umount_begin); static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) { @@ -1748,8 +1660,9 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, return nfs_walk_authlist(args, &request); } -static struct dentry *nfs_try_mount(int flags, const char *dev_name, - struct nfs_mount_info *mount_info) +struct dentry *nfs_try_mount(int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) { int status; struct nfs_server *server; @@ -1761,12 +1674,13 @@ static struct dentry *nfs_try_mount(int flags, const char *dev_name, } /* Get a volume representation */ - server = nfs_create_server(mount_info->parsed, mount_info->mntfh); + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); if (IS_ERR(server)) return ERR_CAST(server); - return nfs_fs_mount_common(&nfs_fs_type, server, flags, dev_name, mount_info); + return nfs_fs_mount_common(server, flags, dev_name, mount_info, nfs_mod); } +EXPORT_SYMBOL_GPL(nfs_try_mount); /* * Split "dev_name" into "hostname:export_path". @@ -1970,7 +1884,7 @@ static int nfs23_validate_mount_data(void *options, return NFS_TEXT_DATA; } -#ifndef CONFIG_NFS_V3 +#if !IS_ENABLED(CONFIG_NFS_V3) if (args->version == 3) goto out_v3_not_compiled; #endif /* !CONFIG_NFS_V3 */ @@ -1990,7 +1904,7 @@ out_no_sec: dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); return -EINVAL; -#ifndef CONFIG_NFS_V3 +#if !IS_ENABLED(CONFIG_NFS_V3) out_v3_not_compiled: dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); return -EPROTONOSUPPORT; @@ -2009,7 +1923,7 @@ out_invalid_fh: return -EINVAL; } -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) static int nfs_validate_mount_data(struct file_system_type *fs_type, void *options, struct nfs_parsed_mount_data *args, @@ -2047,7 +1961,7 @@ static int nfs_validate_text_mount_data(void *options, goto out_no_address; if (args->version == 4) { -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) port = NFS_PORT; max_namelen = NFS4_MAXNAMLEN; max_pathlen = NFS4_MAXPATHLEN; @@ -2070,7 +1984,7 @@ static int nfs_validate_text_mount_data(void *options, &args->nfs_server.export_path, max_pathlen); -#ifndef CONFIG_NFS_V4 +#if !IS_ENABLED(CONFIG_NFS_V4) out_v4_not_compiled: dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); return -EPROTONOSUPPORT; @@ -2108,7 +2022,7 @@ nfs_compare_remount_data(struct nfs_server *nfss, return 0; } -static int +int nfs_remount(struct super_block *sb, int *flags, char *raw_data) { int error; @@ -2169,11 +2083,12 @@ out: kfree(data); return error; } +EXPORT_SYMBOL_GPL(nfs_remount); /* * Initialise the common bits of the superblock */ -static inline void nfs_initialise_sb(struct super_block *sb) +inline void nfs_initialise_sb(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); @@ -2195,18 +2110,19 @@ static inline void nfs_initialise_sb(struct super_block *sb) /* * Finish setting up an NFS2/3 superblock */ -static void nfs_fill_super(struct super_block *sb, - struct nfs_mount_info *mount_info) +void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) { struct nfs_parsed_mount_data *data = mount_info->parsed; struct nfs_server *server = NFS_SB(sb); sb->s_blocksize_bits = 0; sb->s_blocksize = 0; - if (data->bsize) + sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr; + sb->s_op = server->nfs_client->cl_nfs_mod->sops; + if (data && data->bsize) sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); - if (server->nfs_client->rpc_ops->version == 3) { + if (server->nfs_client->rpc_ops->version != 2) { /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ @@ -2214,15 +2130,14 @@ static void nfs_fill_super(struct super_block *sb, sb->s_time_gran = 1; } - sb->s_op = &nfs_sops; nfs_initialise_sb(sb); } +EXPORT_SYMBOL_GPL(nfs_fill_super); /* - * Finish setting up a cloned NFS2/3 superblock + * Finish setting up a cloned NFS2/3/4 superblock */ -static void nfs_clone_super(struct super_block *sb, - struct nfs_mount_info *mount_info) +void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) { const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); @@ -2230,16 +2145,17 @@ static void nfs_clone_super(struct super_block *sb, sb->s_blocksize_bits = old_sb->s_blocksize_bits; sb->s_blocksize = old_sb->s_blocksize; sb->s_maxbytes = old_sb->s_maxbytes; + sb->s_xattr = old_sb->s_xattr; + sb->s_op = old_sb->s_op; + sb->s_time_gran = 1; - if (server->nfs_client->rpc_ops->version == 3) { + if (server->nfs_client->rpc_ops->version != 2) { /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ sb->s_flags |= MS_POSIXACL; - sb->s_time_gran = 1; } - sb->s_op = old_sb->s_op; nfs_initialise_sb(sb); } @@ -2381,14 +2297,15 @@ static int nfs_bdi_register(struct nfs_server *server) return bdi_register_dev(&server->backing_dev_info, server->s_dev); } -static int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, - struct nfs_mount_info *mount_info) +int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, + struct nfs_mount_info *mount_info) { return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts); } +EXPORT_SYMBOL_GPL(nfs_set_sb_security); -static int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, - struct nfs_mount_info *mount_info) +int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, + struct nfs_mount_info *mount_info) { /* clone any lsm security options from the parent to the new sb */ security_sb_clone_mnt_opts(mount_info->cloned->sb, s); @@ -2396,11 +2313,12 @@ static int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, return -ESTALE; return 0; } +EXPORT_SYMBOL_GPL(nfs_clone_sb_security); -static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type, - struct nfs_server *server, - int flags, const char *dev_name, - struct nfs_mount_info *mount_info) +struct dentry *nfs_fs_mount_common(struct nfs_server *server, + int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) { struct super_block *s; struct dentry *mntroot = ERR_PTR(-ENOMEM); @@ -2419,7 +2337,7 @@ static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type, sb_mntdata.mntflags |= MS_SYNCHRONOUS; /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(fs_type, compare_super, nfs_set_super, flags, &sb_mntdata); + s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); if (IS_ERR(s)) { mntroot = ERR_CAST(s); goto out_err_nosb; @@ -2469,8 +2387,9 @@ error_splat_bdi: deactivate_locked_super(s); goto out; } +EXPORT_SYMBOL_GPL(nfs_fs_mount_common); -static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, +struct dentry *nfs_fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { struct nfs_mount_info mount_info = { @@ -2478,6 +2397,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, .set_security = nfs_set_sb_security, }; struct dentry *mntroot = ERR_PTR(-ENOMEM); + struct nfs_subversion *nfs_mod; int error; mount_info.parsed = nfs_alloc_parsed_mount_data(); @@ -2494,34 +2414,38 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, goto out; } -#ifdef CONFIG_NFS_V4 - if (mount_info.parsed->version == 4) - mntroot = nfs4_try_mount(flags, dev_name, &mount_info); - else -#endif /* CONFIG_NFS_V4 */ - mntroot = nfs_try_mount(flags, dev_name, &mount_info); + nfs_mod = get_nfs_version(mount_info.parsed->version); + if (IS_ERR(nfs_mod)) { + mntroot = ERR_CAST(nfs_mod); + goto out; + } + + mntroot = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod); + put_nfs_version(nfs_mod); out: nfs_free_parsed_mount_data(mount_info.parsed); nfs_free_fhandle(mount_info.mntfh); return mntroot; } +EXPORT_SYMBOL_GPL(nfs_fs_mount); /* * Ensure that we unregister the bdi before kill_anon_super * releases the device name */ -static void nfs_put_super(struct super_block *s) +void nfs_put_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); bdi_unregister(&server->backing_dev_info); } +EXPORT_SYMBOL_GPL(nfs_put_super); /* * Destroy an NFS2/3 superblock */ -static void nfs_kill_super(struct super_block *s) +void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); @@ -2529,31 +2453,38 @@ static void nfs_kill_super(struct super_block *s) nfs_fscache_release_super_cookie(s); nfs_free_server(server); } +EXPORT_SYMBOL_GPL(nfs_kill_super); /* * Clone an NFS2/3/4 server record on xdev traversal (FSID-change) */ -static struct dentry * -nfs_xdev_mount_common(struct file_system_type *fs_type, int flags, - const char *dev_name, struct nfs_mount_info *mount_info) +struct dentry * +nfs_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { - struct nfs_clone_mount *data = mount_info->cloned; + struct nfs_clone_mount *data = raw_data; + struct nfs_mount_info mount_info = { + .fill_super = nfs_clone_super, + .set_security = nfs_clone_sb_security, + .cloned = data, + }; struct nfs_server *server; struct dentry *mntroot = ERR_PTR(-ENOMEM); + struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; int error; dprintk("--> nfs_xdev_mount_common()\n"); - mount_info->mntfh = data->fh; + mount_info.mntfh = mount_info.cloned->fh; /* create a new volume representation */ - server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); + server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); if (IS_ERR(server)) { error = PTR_ERR(server); goto out_err; } - mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mount_info); + mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); dprintk("<-- nfs_xdev_mount_common() = 0\n"); out: return mntroot; @@ -2563,60 +2494,7 @@ out_err: goto out; } -/* - * Clone an NFS2/3 server record on xdev traversal (FSID-change) - */ -static struct dentry * -nfs_xdev_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) -{ - struct nfs_mount_info mount_info = { - .fill_super = nfs_clone_super, - .set_security = nfs_clone_sb_security, - .cloned = raw_data, - }; - return nfs_xdev_mount_common(&nfs_fs_type, flags, dev_name, &mount_info); -} - -#ifdef CONFIG_NFS_V4 - -/* - * Finish setting up a cloned NFS4 superblock - */ -static void nfs4_clone_super(struct super_block *sb, - struct nfs_mount_info *mount_info) -{ - const struct super_block *old_sb = mount_info->cloned->sb; - sb->s_blocksize_bits = old_sb->s_blocksize_bits; - sb->s_blocksize = old_sb->s_blocksize; - sb->s_maxbytes = old_sb->s_maxbytes; - sb->s_time_gran = 1; - sb->s_op = old_sb->s_op; - /* - * The VFS shouldn't apply the umask to mode bits. We will do - * so ourselves when necessary. - */ - sb->s_flags |= MS_POSIXACL; - sb->s_xattr = old_sb->s_xattr; - nfs_initialise_sb(sb); -} - -/* - * Set up an NFS4 superblock - */ -static void nfs4_fill_super(struct super_block *sb, - struct nfs_mount_info *mount_info) -{ - sb->s_time_gran = 1; - sb->s_op = &nfs4_sops; - /* - * The VFS shouldn't apply the umask to mode bits. We will do - * so ourselves when necessary. - */ - sb->s_flags |= MS_POSIXACL; - sb->s_xattr = nfs4_xattr_handlers; - nfs_initialise_sb(sb); -} +#if IS_ENABLED(CONFIG_NFS_V4) static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) { @@ -2716,249 +2594,55 @@ out_no_address: } /* - * Get the superblock for the NFS4 root partition + * NFS v4 module parameters need to stay in the + * NFS client for backwards compatibility */ -static struct dentry * -nfs4_remote_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *info) -{ - struct nfs_mount_info *mount_info = info; - struct nfs_server *server; - struct dentry *mntroot = ERR_PTR(-ENOMEM); - - mount_info->fill_super = nfs4_fill_super; - mount_info->set_security = nfs_set_sb_security; - - /* Get a volume representation */ - server = nfs4_create_server(mount_info->parsed, mount_info->mntfh); - if (IS_ERR(server)) { - mntroot = ERR_CAST(server); - goto out; - } - - mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mount_info); - -out: - return mntroot; -} - -static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, - int flags, void *data, const char *hostname) -{ - struct vfsmount *root_mnt; - char *root_devname; - size_t len; +unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_tcpport; +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600; +/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ +bool nfs4_disable_idmapping = true; +unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +unsigned short send_implementation_id = 1; + +EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); +EXPORT_SYMBOL_GPL(nfs_callback_tcpport); +EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); +EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); +EXPORT_SYMBOL_GPL(max_session_slots); +EXPORT_SYMBOL_GPL(send_implementation_id); + +#define NFS_CALLBACK_MAXPORTNR (65535U) + +static int param_set_portnr(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret; - len = strlen(hostname) + 5; - root_devname = kmalloc(len, GFP_KERNEL); - if (root_devname == NULL) - return ERR_PTR(-ENOMEM); - /* Does hostname needs to be enclosed in brackets? */ - if (strchr(hostname, ':')) - snprintf(root_devname, len, "[%s]:/", hostname); - else - snprintf(root_devname, len, "%s:/", hostname); - root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); - kfree(root_devname); - return root_mnt; + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; } - -struct nfs_referral_count { - struct list_head list; - const struct task_struct *task; - unsigned int referral_count; +static struct kernel_param_ops param_ops_portnr = { + .set = param_set_portnr, + .get = param_get_uint, }; - -static LIST_HEAD(nfs_referral_count_list); -static DEFINE_SPINLOCK(nfs_referral_count_list_lock); - -static struct nfs_referral_count *nfs_find_referral_count(void) -{ - struct nfs_referral_count *p; - - list_for_each_entry(p, &nfs_referral_count_list, list) { - if (p->task == current) - return p; - } - return NULL; -} - -#define NFS_MAX_NESTED_REFERRALS 2 - -static int nfs_referral_loop_protect(void) -{ - struct nfs_referral_count *p, *new; - int ret = -ENOMEM; - - new = kmalloc(sizeof(*new), GFP_KERNEL); - if (!new) - goto out; - new->task = current; - new->referral_count = 1; - - ret = 0; - spin_lock(&nfs_referral_count_list_lock); - p = nfs_find_referral_count(); - if (p != NULL) { - if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) - ret = -ELOOP; - else - p->referral_count++; - } else { - list_add(&new->list, &nfs_referral_count_list); - new = NULL; - } - spin_unlock(&nfs_referral_count_list_lock); - kfree(new); -out: - return ret; -} - -static void nfs_referral_loop_unprotect(void) -{ - struct nfs_referral_count *p; - - spin_lock(&nfs_referral_count_list_lock); - p = nfs_find_referral_count(); - p->referral_count--; - if (p->referral_count == 0) - list_del(&p->list); - else - p = NULL; - spin_unlock(&nfs_referral_count_list_lock); - kfree(p); -} - -static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, - const char *export_path) -{ - struct dentry *dentry; - int err; - - if (IS_ERR(root_mnt)) - return ERR_CAST(root_mnt); - - err = nfs_referral_loop_protect(); - if (err) { - mntput(root_mnt); - return ERR_PTR(err); - } - - dentry = mount_subtree(root_mnt, export_path); - nfs_referral_loop_unprotect(); - - return dentry; -} - -static struct dentry *nfs4_try_mount(int flags, const char *dev_name, - struct nfs_mount_info *mount_info) -{ - char *export_path; - struct vfsmount *root_mnt; - struct dentry *res; - struct nfs_parsed_mount_data *data = mount_info->parsed; - - dfprintk(MOUNT, "--> nfs4_try_mount()\n"); - - mount_info->fill_super = nfs4_fill_super; - - export_path = data->nfs_server.export_path; - data->nfs_server.export_path = "/"; - root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, - data->nfs_server.hostname); - data->nfs_server.export_path = export_path; - - res = nfs_follow_remote_path(root_mnt, export_path); - - dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", - IS_ERR(res) ? PTR_ERR(res) : 0, - IS_ERR(res) ? " [error]" : ""); - return res; -} - -static void nfs4_kill_super(struct super_block *sb) -{ - struct nfs_server *server = NFS_SB(sb); - - dprintk("--> %s\n", __func__); - nfs_super_return_all_delegations(sb); - kill_anon_super(sb); - nfs_fscache_release_super_cookie(sb); - nfs_free_server(server); - dprintk("<-- %s\n", __func__); -} - -/* - * Clone an NFS4 server record on xdev traversal (FSID-change) - */ -static struct dentry * -nfs4_xdev_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) -{ - struct nfs_mount_info mount_info = { - .fill_super = nfs4_clone_super, - .set_security = nfs_clone_sb_security, - .cloned = raw_data, - }; - return nfs_xdev_mount_common(&nfs4_fs_type, flags, dev_name, &mount_info); -} - -static struct dentry * -nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) -{ - struct nfs_mount_info mount_info = { - .fill_super = nfs4_fill_super, - .set_security = nfs_clone_sb_security, - .cloned = raw_data, - }; - struct nfs_server *server; - struct dentry *mntroot = ERR_PTR(-ENOMEM); - - dprintk("--> nfs4_referral_get_sb()\n"); - - mount_info.mntfh = nfs_alloc_fhandle(); - if (mount_info.cloned == NULL || mount_info.mntfh == NULL) - goto out; - - /* create a new volume representation */ - server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh); - if (IS_ERR(server)) { - mntroot = ERR_CAST(server); - goto out; - } - - mntroot = nfs_fs_mount_common(&nfs4_fs_type, server, flags, dev_name, &mount_info); -out: - nfs_free_fhandle(mount_info.mntfh); - return mntroot; -} - -/* - * Create an NFS4 server record on referral traversal - */ -static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) -{ - struct nfs_clone_mount *data = raw_data; - char *export_path; - struct vfsmount *root_mnt; - struct dentry *res; - - dprintk("--> nfs4_referral_mount()\n"); - - export_path = data->mnt_path; - data->mnt_path = "/"; - - root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, - flags, data, data->hostname); - data->mnt_path = export_path; - - res = nfs_follow_remote_path(root_mnt, export_path); - dprintk("<-- nfs4_referral_mount() = %ld%s\n", - IS_ERR(res) ? PTR_ERR(res) : 0, - IS_ERR(res) ? " [error]" : ""); - return res; -} - +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +module_param(nfs_idmap_cache_timeout, int, 0644); +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); +module_param(max_session_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " + "requests the client will negotiate"); +module_param(send_implementation_id, ushort, 0644); +MODULE_PARM_DESC(send_implementation_id, + "Send implementation ID with NFSv4.1 exchange_id"); #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index ad4d2e787b20..6b3f2535a3ec 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -9,37 +9,11 @@ #include <linux/fs.h> #include <linux/sysctl.h> #include <linux/module.h> -#include <linux/nfs4.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_fs.h> -#include "callback.h" - -#ifdef CONFIG_NFS_V4 -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; -#endif static struct ctl_table_header *nfs_callback_sysctl_table; static ctl_table nfs_cb_sysctls[] = { -#ifdef CONFIG_NFS_V4 - { - .procname = "nfs_callback_tcpport", - .data = &nfs_callback_set_tcpport, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (int *)&nfs_set_port_min, - .extra2 = (int *)&nfs_set_port_max, - }, - { - .procname = "idmap_cache_timeout", - .data = &nfs_idmap_cache_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#endif { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3210a03342f9..13cea637eff8 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -501,7 +501,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) (unsigned long long)NFS_FILEID(dentry->d_inode)); /* Return delegation in anticipation of the rename */ - nfs_inode_return_delegation(dentry->d_inode); + NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode); sdentry = NULL; do { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4d6861c0dc14..5829d0ce7cfb 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -52,7 +52,7 @@ static mempool_t *nfs_commit_mempool; struct nfs_commit_data *nfs_commitdata_alloc(void) { - struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); + struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); if (p) { memset(p, 0, sizeof(*p)); @@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); struct nfs_write_header *nfs_writehdr_alloc(void) { - struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); + struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); if (p) { struct nfs_pgio_header *hdr = &p->header; @@ -84,6 +84,7 @@ struct nfs_write_header *nfs_writehdr_alloc(void) } return p; } +EXPORT_SYMBOL_GPL(nfs_writehdr_alloc); static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, unsigned int pagecount) @@ -115,6 +116,7 @@ void nfs_writehdr_free(struct nfs_pgio_header *hdr) struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header); mempool_free(whdr, nfs_wdata_mempool); } +EXPORT_SYMBOL_GPL(nfs_writehdr_free); void nfs_writedata_release(struct nfs_write_data *wdata) { @@ -131,6 +133,7 @@ void nfs_writedata_release(struct nfs_write_data *wdata) if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); } +EXPORT_SYMBOL_GPL(nfs_writedata_release); static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) { @@ -139,25 +142,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } -static struct nfs_page *nfs_page_find_request_locked(struct page *page) +static struct nfs_page * +nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) { struct nfs_page *req = NULL; - if (PagePrivate(page)) { + if (PagePrivate(page)) req = (struct nfs_page *)page_private(page); - if (req != NULL) - kref_get(&req->wb_kref); + else if (unlikely(PageSwapCache(page))) { + struct nfs_page *freq, *t; + + /* Linearly search the commit list for the correct req */ + list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { + if (freq->wb_page == page) { + req = freq; + break; + } + } } + + if (req) + kref_get(&req->wb_kref); + return req; } static struct nfs_page *nfs_page_find_request(struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(page); + req = nfs_page_find_request_locked(NFS_I(inode), page); spin_unlock(&inode->i_lock); return req; } @@ -165,16 +181,16 @@ static struct nfs_page *nfs_page_find_request(struct page *page) /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; loff_t end, i_size; pgoff_t end_index; spin_lock(&inode->i_lock); i_size = i_size_read(inode); end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; - if (i_size > 0 && page->index < end_index) + if (i_size > 0 && page_file_index(page) < end_index) goto out; - end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); + end = page_file_offset(page) + ((loff_t)offset+count); if (i_size >= end) goto out; i_size_write(inode, end); @@ -187,7 +203,7 @@ out: static void nfs_set_pageerror(struct page *page) { SetPageError(page); - nfs_zap_mapping(page->mapping->host, page->mapping); + nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } /* We can set the PG_uptodate flag if we see that a write request @@ -228,7 +244,7 @@ static int nfs_set_page_writeback(struct page *page) int ret = test_set_page_writeback(page); if (!ret) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_server *nfss = NFS_SERVER(inode); if (atomic_long_inc_return(&nfss->writeback) > @@ -242,7 +258,7 @@ static int nfs_set_page_writeback(struct page *page) static void nfs_end_page_writeback(struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_server *nfss = NFS_SERVER(inode); end_page_writeback(page); @@ -252,13 +268,13 @@ static void nfs_end_page_writeback(struct page *page) static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req; int ret; spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(page); + req = nfs_page_find_request_locked(NFS_I(inode), page); if (req == NULL) break; if (nfs_lock_request(req)) @@ -313,13 +329,13 @@ out: static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); - nfs_pageio_cond_complete(pgio, page->index); + nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); @@ -336,8 +352,10 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc struct nfs_pageio_descriptor pgio; int err; - nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), - &nfs_async_write_completion_ops); + NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, + page->mapping->host, + wb_priority(wbc), + &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) @@ -380,8 +398,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), - &nfs_async_write_completion_ops); + NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); @@ -410,11 +427,17 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) nfs_lock_request(req); spin_lock(&inode->i_lock); - if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) + if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) inode->i_version++; - set_bit(PG_MAPPED, &req->wb_flags); - SetPagePrivate(req->wb_page); - set_page_private(req->wb_page, (unsigned long)req); + /* + * Swap-space should not get truncated. Hence no need to plug the race + * with invalidate/truncate. + */ + if (likely(!PageSwapCache(req->wb_page))) { + set_bit(PG_MAPPED, &req->wb_flags); + SetPagePrivate(req->wb_page); + set_page_private(req->wb_page, (unsigned long)req); + } nfsi->npages++; kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); @@ -431,9 +454,11 @@ static void nfs_inode_remove_request(struct nfs_page *req) BUG_ON (!NFS_WBACK_BUSY(req)); spin_lock(&inode->i_lock); - set_page_private(req->wb_page, 0); - ClearPagePrivate(req->wb_page); - clear_bit(PG_MAPPED, &req->wb_flags); + if (likely(!PageSwapCache(req->wb_page))) { + set_page_private(req->wb_page, 0); + ClearPagePrivate(req->wb_page); + clear_bit(PG_MAPPED, &req->wb_flags); + } nfsi->npages--; spin_unlock(&inode->i_lock); nfs_release_request(req); @@ -445,7 +470,7 @@ nfs_mark_request_dirty(struct nfs_page *req) __set_page_dirty_nobuffers(req->wb_page); } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page @@ -470,7 +495,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(req->wb_page->mapping->backing_dev_info, + inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); @@ -537,7 +562,7 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); } static void @@ -620,7 +645,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto next; } if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { - memcpy(&req->wb_verf, hdr->verf, sizeof(req->wb_verf)); + memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; } @@ -635,7 +660,7 @@ out: hdr->release(hdr); } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { @@ -729,7 +754,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(page); + req = nfs_page_find_request_locked(NFS_I(inode), page); if (req == NULL) goto out_unlock; @@ -788,7 +813,7 @@ out_err: static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, struct page *page, unsigned int offset, unsigned int bytes) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req; req = nfs_try_to_update_request(inode, page, offset, bytes); @@ -841,7 +866,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) nfs_release_request(req); if (!do_flush) return 0; - status = nfs_wb_page(page->mapping->host, page); + status = nfs_wb_page(page_file_mapping(page)->host, page); } while (status == 0); return status; } @@ -871,7 +896,7 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -879,7 +904,7 @@ int nfs_updatepage(struct file *file, struct page *page, dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, - (long long)(page_offset(page) + offset)); + (long long)(page_file_offset(page) + offset)); /* If we're not using byte range locks, and we know the page * is up to date, it may be more efficient to extend the write @@ -1172,6 +1197,7 @@ int nfs_generic_flush(struct nfs_pageio_descriptor *desc, return nfs_flush_multi(desc, hdr); return nfs_flush_one(desc, hdr); } +EXPORT_SYMBOL_GPL(nfs_generic_flush); static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { @@ -1202,13 +1228,14 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = { .pg_doio = nfs_generic_pg_writepages, }; -void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, const struct nfs_pgio_completion_ops *compl_ops) { nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops, NFS_SERVER(inode)->wsize, ioflags); } +EXPORT_SYMBOL_GPL(nfs_pageio_init_write); void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { @@ -1217,13 +1244,6 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - if (!pnfs_pageio_init_write(pgio, inode, ioflags, compl_ops)) - nfs_pageio_init_write_mds(pgio, inode, ioflags, compl_ops); -} void nfs_write_prepare(struct rpc_task *task, void *calldata) { @@ -1303,7 +1323,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) return; nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count); -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) if (resp->verf->committed < argp->stable && task->tk_status >= 0) { /* We tried a write call, but the server did not * commit data to stable storage even though we @@ -1363,7 +1383,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) { int ret; @@ -1475,7 +1495,7 @@ void nfs_retry_commit(struct list_head *page_list, nfs_mark_request_commit(req, lseg, cinfo); if (!cinfo->dreq) { dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, BDI_RECLAIMABLE); } nfs_unlock_and_release_request(req); @@ -1547,7 +1567,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. */ - if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { + if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { /* We have a match */ nfs_inode_remove_request(req); dprintk(" OK\n"); @@ -1677,22 +1697,9 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - - ret = nfs_commit_unstable_pages(inode, wbc); - if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { - int status; - bool sync = true; - - if (wbc->sync_mode == WB_SYNC_NONE) - sync = false; - - status = pnfs_layoutcommit_inode(inode, sync); - if (status < 0) - return status; - } - return ret; + return nfs_commit_unstable_pages(inode, wbc); } +EXPORT_SYMBOL_GPL(nfs_write_inode); /* * flush the inode to disk. @@ -1708,6 +1715,7 @@ int nfs_wb_all(struct inode *inode) return sync_inode(inode, &wbc); } +EXPORT_SYMBOL_GPL(nfs_wb_all); int nfs_wb_page_cancel(struct inode *inode, struct page *page) { @@ -1744,7 +1752,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) */ int nfs_wb_page(struct inode *inode, struct page *page) { - loff_t range_start = page_offset(page); + loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ba233499b9a5..a3946cf13fc8 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -398,7 +398,7 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) int migrated, i, err; /* listsize */ - err = get_int(mesg, &fsloc->locations_count); + err = get_uint(mesg, &fsloc->locations_count); if (err) return err; if (fsloc->locations_count > MAX_FS_LOCATIONS) @@ -456,7 +456,7 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { - err = get_int(mesg, &f->pseudoflavor); + err = get_uint(mesg, &f->pseudoflavor); if (err) return err; /* @@ -465,7 +465,7 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) * problem at export time instead of when a client fails * to authenticate. */ - err = get_int(mesg, &f->flags); + err = get_uint(mesg, &f->flags); if (err) return err; /* Only some flags are allowed to differ between flavors: */ @@ -929,7 +929,7 @@ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); - struct nfsd_net *nn = net_generic(rqstp->rq_xprt->xpt_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) @@ -960,7 +960,7 @@ struct svc_export * rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); - struct nfsd_net *nn = net_generic(rqstp->rq_xprt->xpt_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 39365636b244..65c2431ea32f 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -34,6 +34,10 @@ struct nfsd_net { struct cache_detail *idtoname_cache; struct cache_detail *nametoid_cache; + + struct lock_manager nfsd4_manager; + bool grace_ended; + time_t boot_time; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a5fd6b982f27..cbaf4f8bb7b7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -756,7 +756,6 @@ static void do_probe_callback(struct nfs4_client *clp) */ void nfsd4_probe_callback(struct nfs4_client *clp) { - /* XXX: atomicity? Also, should we be using cl_flags? */ clp->cl_cb_state = NFSD4_CB_UNKNOWN; set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); do_probe_callback(clp); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index dae36f1dee95..fdc91a6fc9c4 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -546,7 +546,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen .type = type, }; int ret; - struct nfsd_net *nn = net_generic(rqstp->rq_xprt->xpt_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (namelen + 1 > sizeof(key.name)) return nfserr_badowner; @@ -571,7 +571,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) .type = type, }; int ret; - struct nfsd_net *nn = net_generic(rqstp->rq_xprt->xpt_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 987e719fbae8..c9c1c0a25417 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -354,10 +354,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -686,7 +686,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid, + if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), + cstate, &read->rd_stateid, RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -741,7 +742,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (locks_in_grace()) + if (locks_in_grace(SVC_NET(rqstp))) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -760,8 +761,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags - & NFSEXP_NOSUBTREECHECK)) + if (locks_in_grace(SVC_NET(rqstp)) && + !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, rename->rn_snamelen, &cstate->current_fh, @@ -845,7 +846,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(cstate, + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, &setattr->sa_stateid, WR_STATE, NULL); nfs4_unlock_state(); if (status) { @@ -890,7 +891,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp); + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), + cstate, stateid, WR_STATE, &filp); if (filp) get_file(filp); nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5ff0b7b9fc08..43295d45cc2b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -154,6 +154,10 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (status < 0) return; + status = mnt_want_write_file(rec_file); + if (status) + return; + dir = rec_file->f_path.dentry; /* lock the parent */ mutex_lock(&dir->d_inode->i_mutex); @@ -173,11 +177,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = mnt_want_write_file(rec_file); - if (status) - goto out_put; status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); - mnt_drop_write_file(rec_file); out_put: dput(dentry); out_unlock: @@ -189,6 +189,7 @@ out_unlock: " (err %d); please check that %s exists" " and is writeable", status, user_recovery_dirname); + mnt_drop_write_file(rec_file); nfs4_reset_creds(original_cred); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 94effd5bc4a1..cc894eda385a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -38,18 +38,21 @@ #include <linux/namei.h> #include <linux/swap.h> #include <linux/pagemap.h> +#include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/clnt.h> #include "xdr4.h" #include "vfs.h" #include "current_stateid.h" +#include "fault_inject.h" + +#include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; -static time_t boot_time; #define all_ones {{~0,~0},~0} static const stateid_t one_stateid = { @@ -862,6 +865,11 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, if (ret) /* oops; xprt is already down: */ nfsd4_conn_lost(&conn->cn_xpt_user); + if (ses->se_client->cl_cb_state == NFSD4_CB_DOWN && + dir & NFS4_CDFC4_BACK) { + /* callback channel may be back up */ + nfsd4_probe_callback(ses->se_client); + } return nfs_ok; } @@ -1047,12 +1055,12 @@ renew_client(struct nfs4_client *clp) /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ static int -STALE_CLIENTID(clientid_t *clid) +STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) { - if (clid->cl_boot == boot_time) + if (clid->cl_boot == nn->boot_time) return 0; dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", - clid->cl_boot, clid->cl_id, boot_time); + clid->cl_boot, clid->cl_id, nn->boot_time); return 1; } @@ -1215,7 +1223,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2) return true; } -static int +static bool same_creds(struct svc_cred *cr1, struct svc_cred *cr2) { if ((cr1->cr_flavor != cr2->cr_flavor) @@ -1227,14 +1235,15 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2) return true; if (!cr1->cr_principal || !cr2->cr_principal) return false; - return 0 == strcmp(cr1->cr_principal, cr1->cr_principal); + return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); } static void gen_clid(struct nfs4_client *clp) { static u32 current_clientid = 1; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - clp->cl_clientid.cl_boot = boot_time; + clp->cl_clientid.cl_boot = nn->boot_time; clp->cl_clientid.cl_id = current_clientid++; } @@ -2217,8 +2226,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - if (STALE_CLIENTID(clid)) + if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; nfs4_lock_state(); @@ -2577,8 +2587,9 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, unsigned int strhashval; struct nfs4_openowner *oo = NULL; __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - if (STALE_CLIENTID(&open->op_clientid)) + if (STALE_CLIENTID(&open->op_clientid, nn)) return nfserr_stale_clientid; /* * In case we need it later, after we've already created the @@ -2876,7 +2887,8 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) * Attempt to hand out a delegation. */ static void -nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp) +nfs4_open_delegation(struct net *net, struct svc_fh *fh, + struct nfsd4_open *open, struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); @@ -2897,7 +2909,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_ case NFS4_OPEN_CLAIM_NULL: /* Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (locks_in_grace()) + if (locks_in_grace(net)) goto out; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out; @@ -3007,14 +3019,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); if (status) goto out; + status = nfsd4_truncate(rqstp, current_fh, open); + if (status) + goto out; stp = open->op_stp; open->op_stp = NULL; init_open_stateid(stp, fp, open); - status = nfsd4_truncate(rqstp, current_fh, open); - if (status) { - release_open_stateid(stp); - goto out; - } } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -3033,7 +3043,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ - nfs4_open_delegation(current_fh, open, stp); + nfs4_open_delegation(SVC_NET(rqstp), current_fh, open, stp); nodeleg: status = nfs_ok; @@ -3087,12 +3097,13 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_client *clp; __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); nfs4_lock_state(); dprintk("process_renew(%08x/%08x): starting\n", clid->cl_boot, clid->cl_id); status = nfserr_stale_clientid; - if (STALE_CLIENTID(clid)) + if (STALE_CLIENTID(clid, nn)) goto out; clp = find_confirmed_client(clid); status = nfserr_expired; @@ -3111,22 +3122,19 @@ out: return status; } -static struct lock_manager nfsd4_manager = { -}; - -static bool grace_ended; - static void -nfsd4_end_grace(void) +nfsd4_end_grace(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + /* do nothing if grace period already ended */ - if (grace_ended) + if (nn->grace_ended) return; dprintk("NFSD: end of grace period\n"); - grace_ended = true; - nfsd4_record_grace_done(&init_net, boot_time); - locks_end_grace(&nfsd4_manager); + nn->grace_ended = true; + nfsd4_record_grace_done(net, nn->boot_time); + locks_end_grace(&nn->nfsd4_manager); /* * Now that every NFSv4 client has had the chance to recover and * to see the (possibly new, possibly shorter) lease time, we @@ -3149,7 +3157,7 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); - nfsd4_end_grace(); + nfsd4_end_grace(&init_net); INIT_LIST_HEAD(&reaplist); spin_lock(&client_lock); list_for_each_safe(pos, next, &client_lru) { @@ -3231,9 +3239,9 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *s } static int -STALE_STATEID(stateid_t *stateid) +STALE_STATEID(stateid_t *stateid, struct nfsd_net *nn) { - if (stateid->si_opaque.so_clid.cl_boot == boot_time) + if (stateid->si_opaque.so_clid.cl_boot == nn->boot_time) return 0; dprintk("NFSD: stale stateid " STATEID_FMT "!\n", STATEID_VAL(stateid)); @@ -3273,11 +3281,11 @@ out: } static inline __be32 -check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) +check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, int flags) { if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (locks_in_grace()) { + else if (locks_in_grace(net)) { /* Answer in remaining cases depends on existence of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; @@ -3294,9 +3302,9 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) * that are not able to provide mandatory locking. */ static inline int -grace_disallows_io(struct inode *inode) +grace_disallows_io(struct net *net, struct inode *inode) { - return locks_in_grace() && mandatory_lock(inode); + return locks_in_grace(net) && mandatory_lock(inode); } /* Returns true iff a is later than b: */ @@ -3333,18 +3341,26 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } -__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) +static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; struct nfs4_ol_stateid *ols; __be32 status; - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + /* Client debugging aid. */ + if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str, + sizeof(addr_str)); + pr_warn_ratelimited("NFSD: client %s testing state ID " + "with incorrect client ID\n", addr_str); + return nfserr_bad_stateid; + } s = find_stateid(cl, stateid); if (!s) - return nfserr_stale_stateid; + return nfserr_bad_stateid; status = check_stateid_generation(stateid, &s->sc_stateid, 1); if (status) return status; @@ -3360,10 +3376,11 @@ __be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) { struct nfs4_client *cl; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; - if (STALE_STATEID(stateid)) + if (STALE_STATEID(stateid, nn)) return nfserr_stale_stateid; cl = find_confirmed_client(&stateid->si_opaque.so_clid); if (!cl) @@ -3379,7 +3396,7 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, s * Checks for stateid operations */ __be32 -nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, +nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { struct nfs4_stid *s; @@ -3392,11 +3409,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (filpp) *filpp = NULL; - if (grace_disallows_io(ino)) + if (grace_disallows_io(net, ino)) return nfserr_grace; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - return check_special_stateids(current_fh, stateid, flags); + return check_special_stateids(net, current_fh, stateid, flags); status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); if (status) @@ -3463,7 +3480,8 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) - stateid->ts_id_status = nfs4_validate_stateid(cl, &stateid->ts_id_stateid); + stateid->ts_id_status = + nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); nfs4_unlock_state(); return nfs_ok; @@ -3750,12 +3768,19 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_close_open_stateid(stp); oo->oo_last_closed_stid = stp; - /* place unused nfs4_stateowners on so_close_lru list to be - * released by the laundromat service after the lease period - * to enable us to handle CLOSE replay - */ - if (list_empty(&oo->oo_owner.so_stateids)) - move_to_close_lru(oo); + if (list_empty(&oo->oo_owner.so_stateids)) { + if (cstate->minorversion) { + release_openowner(oo); + cstate->replay_owner = NULL; + } else { + /* + * In the 4.0 case we need to keep the owners around a + * little while to handle CLOSE replay. + */ + if (list_empty(&oo->oo_owner.so_stateids)) + move_to_close_lru(oo); + } + } out: if (!cstate->replay_owner) nfs4_unlock_state(); @@ -4027,6 +4052,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, bool new_state = false; int lkflg; int err; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", (long long) lock->lk_offset, @@ -4044,11 +4070,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); if (lock->lk_is_new) { - /* - * Client indicates that this is a new lockowner. - * Use open owner and open stateid to create lock owner and - * lock stateid. - */ struct nfs4_ol_stateid *open_stp = NULL; if (nfsd4_has_session(cstate)) @@ -4058,7 +4079,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, sizeof(clientid_t)); status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lock->lk_new_clientid)) + if (STALE_CLIENTID(&lock->lk_new_clientid, nn)) goto out; /* validate and update open stateid and open seqid */ @@ -4075,17 +4096,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new_state); - if (status) - goto out; - } else { - /* lock (lock owner + lock stateid) already exists */ + } else status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, NFS4_LOCK_STID, &lock_stp); - if (status) - goto out; - } + if (status) + goto out; lock_sop = lockowner(lock_stp->st_stateowner); lkflg = setlkflg(lock->lk_type); @@ -4094,10 +4111,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = nfserr_grace; - if (locks_in_grace() && !lock->lk_reclaim) + if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim) goto out; status = nfserr_no_grace; - if (!locks_in_grace() && lock->lk_reclaim) + if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) goto out; locks_init_lock(&file_lock); @@ -4196,8 +4213,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock file_lock; struct nfs4_lockowner *lo; __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - if (locks_in_grace()) + if (locks_in_grace(SVC_NET(rqstp))) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) @@ -4206,7 +4224,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfserr_stale_clientid; - if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) + if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn)) goto out; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) @@ -4355,6 +4373,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct list_head matches; unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -4362,7 +4381,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, /* XXX check for lease expiration */ status = nfserr_stale_clientid; - if (STALE_CLIENTID(clid)) + if (STALE_CLIENTID(clid, nn)) return status; nfs4_lock_state(); @@ -4564,7 +4583,7 @@ void nfsd_forget_openowners(u64 num) printk(KERN_INFO "NFSD: Forgot %d open owners", count); } -int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *)) +int nfsd_process_n_delegations(u64 num, struct list_head *list) { int i, count = 0; struct nfs4_file *fp, *fnext; @@ -4573,7 +4592,7 @@ int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegatio for (i = 0; i < FILE_HASH_SIZE; i++) { list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { - deleg_func(dp); + list_move(&dp->dl_recall_lru, list); if (++count == num) return count; } @@ -4586,9 +4605,16 @@ int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegatio void nfsd_forget_delegations(u64 num) { unsigned int count; + LIST_HEAD(victims); + struct nfs4_delegation *dp, *dnext; + + spin_lock(&recall_lock); + count = nfsd_process_n_delegations(num, &victims); + spin_unlock(&recall_lock); nfs4_lock_state(); - count = nfsd_process_n_delegations(num, unhash_delegation); + list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) + unhash_delegation(dp); nfs4_unlock_state(); printk(KERN_INFO "NFSD: Forgot %d delegations", count); @@ -4597,12 +4623,16 @@ void nfsd_forget_delegations(u64 num) void nfsd_recall_delegations(u64 num) { unsigned int count; + LIST_HEAD(victims); + struct nfs4_delegation *dp, *dnext; - nfs4_lock_state(); spin_lock(&recall_lock); - count = nfsd_process_n_delegations(num, nfsd_break_one_deleg); + count = nfsd_process_n_delegations(num, &victims); + list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) { + list_del(&dp->dl_recall_lru); + nfsd_break_one_deleg(dp); + } spin_unlock(&recall_lock); - nfs4_unlock_state(); printk(KERN_INFO "NFSD: Recalled %d delegations", count); } @@ -4665,6 +4695,8 @@ set_max_delegations(void) int nfs4_state_start(void) { + struct net *net = &init_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; /* @@ -4674,11 +4706,11 @@ nfs4_state_start(void) * to that instead and then do most of the rest of this on a per-net * basis. */ - get_net(&init_net); - nfsd4_client_tracking_init(&init_net); - boot_time = get_seconds(); - locks_start_grace(&nfsd4_manager); - grace_ended = false; + get_net(net); + nfsd4_client_tracking_init(net); + nn->boot_time = get_seconds(); + locks_start_grace(net, &nn->nfsd4_manager); + nn->grace_ended = false; printk(KERN_INFO "NFSD: starting %ld-second grace period\n", nfsd4_grace); ret = set_callback_cred(); @@ -4700,8 +4732,8 @@ nfs4_state_start(void) out_free_laundry: destroy_workqueue(laundry_wq); out_recovery: - nfsd4_client_tracking_exit(&init_net); - put_net(&init_net); + nfsd4_client_tracking_exit(net); + put_net(net); return ret; } @@ -4742,9 +4774,12 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + struct net *net = &init_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + cancel_delayed_work_sync(&laundromat_work); destroy_workqueue(laundry_wq); - locks_end_grace(&nfsd4_manager); + locks_end_grace(&nn->nfsd4_manager); nfs4_lock_state(); __nfs4_state_shutdown(); nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 4949667c84ea..6322df36031f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2259,7 +2259,7 @@ out_acl: if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32(1); + WRITE32(0); } if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { if ((buflen -= 4) < 0) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c55298ed5772..fa49cff5ee65 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -673,9 +673,7 @@ static ssize_t __write_ports_addfd(char *buf) err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); if (err < 0) { - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); + nfsd_destroy(net); return err; } @@ -744,9 +742,7 @@ out_close: svc_xprt_put(xprt); } out_err: - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); + nfsd_destroy(net); return err; } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 1671429ffa66..2244222368ab 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -72,6 +72,19 @@ int nfsd_nrthreads(void); int nfsd_nrpools(void); int nfsd_get_nrthreads(int n, int *); int nfsd_set_nrthreads(int n, int *); +int nfsd_pool_stats_open(struct inode *, struct file *); +int nfsd_pool_stats_release(struct inode *, struct file *); + +static inline void nfsd_destroy(struct net *net) +{ + int destroy = (nfsd_serv->sv_nrthreads == 1); + + if (destroy) + svc_shutdown_net(nfsd_serv, net); + svc_destroy(nfsd_serv); + if (destroy) + nfsd_serv = NULL; +} #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) #ifdef CONFIG_NFSD_V2_ACL diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index cc793005a87c..032af381b3aa 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -635,6 +635,7 @@ fh_put(struct svc_fh *fhp) fhp->fh_post_saved = 0; #endif } + fh_drop_write(fhp); if (exp) { exp_put(exp); fhp->fh_export = NULL; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index e15dc45fc5ec..aad6d457b9e8 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -196,6 +196,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, struct dentry *dchild; int type, mode; __be32 nfserr; + int hosterr; dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); dprintk("nfsd: CREATE %s %.*s\n", @@ -214,6 +215,12 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; + hosterr = fh_want_write(dirfhp); + if (hosterr) { + nfserr = nfserrno(hosterr); + goto done; + } + fh_lock_nested(dirfhp, I_MUTEX_PARENT); dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); if (IS_ERR(dchild)) { @@ -330,7 +337,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, out_unlock: /* We don't really need to unlock, as fh_put does it. */ fh_unlock(dirfhp); - + fh_drop_write(dirfhp); done: fh_put(dirfhp); return nfsd_return_dirop(nfserr, resp); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ee709fc8f58b..240473cb708f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -254,8 +254,6 @@ static void nfsd_shutdown(void) static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { - /* When last nfsd thread exits we need to do some clean-up */ - nfsd_serv = NULL; nfsd_shutdown(); svc_rpcb_cleanup(serv, net); @@ -332,6 +330,7 @@ static int nfsd_get_default_max_blksize(void) int nfsd_create_serv(void) { int error; + struct net *net = current->nsproxy->net_ns; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { @@ -346,7 +345,7 @@ int nfsd_create_serv(void) if (nfsd_serv == NULL) return -ENOMEM; - error = svc_bind(nfsd_serv, current->nsproxy->net_ns); + error = svc_bind(nfsd_serv, net); if (error < 0) { svc_destroy(nfsd_serv); return error; @@ -427,11 +426,7 @@ int nfsd_set_nrthreads(int n, int *nthreads) if (err) break; } - - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); - + nfsd_destroy(net); return err; } @@ -478,9 +473,7 @@ out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown(); out_destroy: - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); /* Release server */ + nfsd_destroy(net); /* Release server */ out: mutex_unlock(&nfsd_mutex); return error; @@ -563,12 +556,13 @@ nfsd(void *vrqstp) nfsdstats.th_cnt --; out: - if (rqstp->rq_server->sv_nrthreads == 1) - svc_shutdown_net(rqstp->rq_server, &init_net); + rqstp->rq_server = NULL; /* Release the thread */ svc_exit_thread(rqstp); + nfsd_destroy(&init_net); + /* Release module */ mutex_unlock(&nfsd_mutex); module_put_and_exit(0); @@ -682,9 +676,7 @@ int nfsd_pool_stats_release(struct inode *inode, struct file *file) mutex_lock(&nfsd_mutex); /* this function really, really should have been called svc_put() */ - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); + nfsd_destroy(net); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 849091e16ea6..e6173147f982 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -450,8 +450,10 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) #define WR_STATE 0x00000020 struct nfsd4_compound_state; +struct nfsd_net; -extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, +extern __be32 nfs4_preprocess_stateid_op(struct net *net, + struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filp); extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); @@ -475,7 +477,6 @@ extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); extern int nfs4_client_to_reclaim(const char *name); extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); extern void release_session_client(struct nfsd4_session *); -extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); /* nfs4recover operations */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4700a0a929d7..a9269f142cc4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -757,8 +757,16 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_len that doesn't check permissions. */ - err = fh_verify(rqstp, fhp, type, may_flags | NFSD_MAY_OWNER_OVERRIDE); + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; + err = fh_verify(rqstp, fhp, type, may_flags); if (err) goto out; @@ -1276,6 +1284,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, * If it has, the parent directory should already be locked. */ if (!resfhp->fh_dentry) { + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ fh_lock_nested(fhp, I_MUTEX_PARENT); dchild = lookup_one_len(fname, dentry, flen); @@ -1319,14 +1331,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - /* * Get the dir op function pointer. */ err = 0; + host_err = 0; switch (type) { case S_IFREG: host_err = vfs_create(dirp, dchild, iap->ia_mode, true); @@ -1343,10 +1352,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; } - if (host_err < 0) { - fh_drop_write(fhp); + if (host_err < 0) goto out_nfserr; - } err = nfsd_create_setattr(rqstp, resfhp, iap); @@ -1358,7 +1365,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = nfserrno(commit_metadata(fhp)); if (err2) err = err2; - fh_drop_write(fhp); /* * Update the file handle to get the new inode info. */ @@ -1417,6 +1423,11 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notdir; if (!dirp->i_op->lookup) goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock_nested(fhp, I_MUTEX_PARENT); /* @@ -1449,9 +1460,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; if (dchild->d_inode) { err = 0; @@ -1522,7 +1530,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!err) err = nfserrno(commit_metadata(fhp)); - fh_drop_write(fhp); /* * Update the filehandle to get the new inode info. */ @@ -1533,6 +1540,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, fh_unlock(fhp); if (dchild && !IS_ERR(dchild)) dput(dchild); + fh_drop_write(fhp); return err; out_nfserr: @@ -1613,6 +1621,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock(fhp); dentry = fhp->fh_dentry; dnew = lookup_one_len(fname, dentry, flen); @@ -1620,10 +1633,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - if (unlikely(path[plen] != 0)) { char *path_alloced = kmalloc(plen+1, GFP_KERNEL); if (path_alloced == NULL) @@ -1683,6 +1692,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (isdotent(name, len)) goto out; + host_err = fh_want_write(tfhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; dirp = ddir->d_inode; @@ -1694,18 +1709,13 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; - host_err = fh_want_write(tfhp); - if (host_err) { - err = nfserrno(host_err); - goto out_dput; - } err = nfserr_noent; if (!dold->d_inode) - goto out_drop_write; + goto out_dput; host_err = nfsd_break_lease(dold->d_inode); if (host_err) { err = nfserrno(host_err); - goto out_drop_write; + goto out_dput; } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { @@ -1718,12 +1728,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, else err = nfserrno(host_err); } -out_drop_write: - fh_drop_write(tfhp); out_dput: dput(dnew); out_unlock: fh_unlock(ffhp); + fh_drop_write(tfhp); out: return err; @@ -1766,6 +1775,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; + host_err = fh_want_write(ffhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + /* cannot use fh_lock as we need deadlock protective ordering * so do it by hand */ trap = lock_rename(tdentry, fdentry); @@ -1796,17 +1811,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; - host_err = fh_want_write(ffhp); - if (host_err) - goto out_dput_new; host_err = nfsd_break_lease(odentry->d_inode); if (host_err) - goto out_drop_write; + goto out_dput_new; if (ndentry->d_inode) { host_err = nfsd_break_lease(ndentry->d_inode); if (host_err) - goto out_drop_write; + goto out_dput_new; } host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err) { @@ -1814,8 +1826,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!host_err) host_err = commit_metadata(ffhp); } -out_drop_write: - fh_drop_write(ffhp); out_dput_new: dput(ndentry); out_dput_old: @@ -1831,6 +1841,7 @@ out_drop_write: fill_post_wcc(tfhp); unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = 0; + fh_drop_write(ffhp); out: return err; @@ -1856,6 +1867,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (err) goto out; + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = fhp->fh_dentry; dirp = dentry->d_inode; @@ -1874,21 +1889,15 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = fh_want_write(fhp); - if (host_err) - goto out_put; - host_err = nfsd_break_lease(rdentry->d_inode); if (host_err) - goto out_drop_write; + goto out_put; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); else host_err = vfs_rmdir(dirp, rdentry); if (!host_err) host_err = commit_metadata(fhp); -out_drop_write: - fh_drop_write(fhp); out_put: dput(rdentry); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index ec0611b2b738..359594c393d2 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -110,12 +110,19 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); static inline int fh_want_write(struct svc_fh *fh) { - return mnt_want_write(fh->fh_export->ex_path.mnt); + int ret = mnt_want_write(fh->fh_export->ex_path.mnt); + + if (!ret) + fh->fh_want_write = 1; + return ret; } static inline void fh_drop_write(struct svc_fh *fh) { - mnt_drop_write(fh->fh_export->ex_path.mnt); + if (fh->fh_want_write) { + fh->fh_want_write = 0; + mnt_drop_write(fh->fh_export->ex_path.mnt); + } } #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index f5fde36b9e28..fb7238100548 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -76,15 +76,23 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t); #define nilfs_clear_bit_atomic ext2_clear_bit_atomic #define nilfs_find_next_zero_bit find_next_zero_bit_le -/* - * persistent object allocator cache +/** + * struct nilfs_bh_assoc - block offset and buffer head association + * @blkoff: block offset + * @bh: buffer head */ - struct nilfs_bh_assoc { unsigned long blkoff; struct buffer_head *bh; }; +/** + * struct nilfs_palloc_cache - persistent object allocator cache + * @lock: cache protecting lock + * @prev_desc: blockgroup descriptors cache + * @prev_bitmap: blockgroup bitmap cache + * @prev_entry: translation entries cache + */ struct nilfs_palloc_cache { spinlock_t lock; struct nilfs_bh_assoc prev_desc; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index 40d9f453d31c..b89e68076adc 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -135,6 +135,13 @@ struct nilfs_bmap { /* state */ #define NILFS_BMAP_DIRTY 0x00000001 +/** + * struct nilfs_bmap_store - shadow copy of bmap state + * @data: cached raw block mapping of on-disk inode + * @last_allocated_key: cached value of last allocated key for data block + * @last_allocated_ptr: cached value of last allocated ptr for data block + * @state: cached value of state field of bmap structure + */ struct nilfs_bmap_store { __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)]; __u64 last_allocated_key; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 3a4dd2d8d3fc..d876b565ce64 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -29,7 +29,13 @@ #include <linux/fs.h> #include <linux/backing-dev.h> - +/** + * struct nilfs_btnode_chkey_ctxt - change key context + * @oldkey: old key of block's moving content + * @newkey: new key for block's content + * @bh: buffer head of old buffer + * @newbh: buffer head of new buffer + */ struct nilfs_btnode_chkey_ctxt { __u64 oldkey; __u64 newkey; diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index dab5c4c6dfaf..deaa3d33a0aa 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -286,7 +286,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, __u64 cno; void *kaddr; unsigned long tnicps; - int ret, ncps, nicps, count, i; + int ret, ncps, nicps, nss, count, i; if (unlikely(start == 0 || start > end)) { printk(KERN_ERR "%s: invalid range of checkpoint numbers: " @@ -301,6 +301,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, if (ret < 0) goto out_sem; tnicps = 0; + nss = 0; for (cno = start; cno < end; cno += ncps) { ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); @@ -318,8 +319,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, cpfile, cno, cp_bh, kaddr); nicps = 0; for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { - WARN_ON(nilfs_checkpoint_snapshot(cp)); - if (!nilfs_checkpoint_invalid(cp)) { + if (nilfs_checkpoint_snapshot(cp)) { + nss++; + } else if (!nilfs_checkpoint_invalid(cp)) { nilfs_checkpoint_set_invalid(cp); nicps++; } @@ -364,6 +366,8 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, } brelse(header_bh); + if (nss > 0) + ret = -EBUSY; out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index b5c13f3576b9..fa0f80308c2d 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -33,6 +33,12 @@ #define NILFS_CNO_MIN ((__u64)1) #define NILFS_CNO_MAX (~(__u64)0) +/** + * struct nilfs_dat_info - on-memory private data of DAT file + * @mi: on-memory private data of metadata file + * @palloc_cache: persistent object allocator cache of DAT file + * @shadow: shadow map of DAT file + */ struct nilfs_dat_info { struct nilfs_mdt_info mi; struct nilfs_palloc_cache palloc_cache; diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h index a71cc412b651..19ccbf9522ab 100644 --- a/fs/nilfs2/export.h +++ b/fs/nilfs2/export.h @@ -5,6 +5,14 @@ extern const struct export_operations nilfs_export_ops; +/** + * struct nilfs_fid - NILFS file id type + * @cno: checkpoint number + * @ino: inode number + * @gen: file generation (version) for NFS + * @parent_gen: parent generation (version) for NFS + * @parent_ino: parent inode number + */ struct nilfs_fid { u64 cno; u64 ino; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 62cebc8e1a1f..a4d56ac02e6c 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -69,16 +69,18 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_dentry->d_inode; struct nilfs_transaction_info ti; - int ret; + int ret = 0; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; /* -ENOSPC */ + sb_start_pagefault(inode->i_sb); lock_page(page); if (page->mapping != inode->i_mapping || page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { unlock_page(page); - return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + ret = -EFAULT; /* make the VM retry the fault */ + goto out; } /* @@ -112,19 +114,21 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); /* never returns -ENOMEM, but may return -ENOSPC */ if (unlikely(ret)) - return VM_FAULT_SIGBUS; + goto out; - ret = block_page_mkwrite(vma, vmf, nilfs_get_block); - if (ret != VM_FAULT_LOCKED) { + ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); + if (ret) { nilfs_transaction_abort(inode->i_sb); - return ret; + goto out; } nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); nilfs_transaction_commit(inode->i_sb); mapped: wait_on_page_writeback(page); - return VM_FAULT_LOCKED; + out: + sb_end_pagefault(inode->i_sb); + return block_page_mkwrite_return(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 5a48df79d674..d8e65bde083c 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -29,7 +29,11 @@ #include "alloc.h" #include "ifile.h" - +/** + * struct nilfs_ifile_info - on-memory private data of ifile + * @mi: on-memory private data of metadata file + * @palloc_cache: persistent object allocator cache of ifile + */ struct nilfs_ifile_info { struct nilfs_mdt_info mi; struct nilfs_palloc_cache palloc_cache; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7cc64465ec26..6e2c3db976b2 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -34,6 +34,13 @@ #include "cpfile.h" #include "ifile.h" +/** + * struct nilfs_iget_args - arguments used during comparison between inodes + * @ino: inode number + * @cno: checkpoint number + * @root: pointer on NILFS root object (mounted checkpoint) + * @for_gc: inode for GC flag + */ struct nilfs_iget_args { u64 ino; __u64 cno; diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 06658caa18bd..fdb180769485 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -182,7 +182,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, if (copy_from_user(&cpmode, argp, sizeof(cpmode))) goto out; - down_read(&inode->i_sb->s_umount); + mutex_lock(&nilfs->ns_snapshot_mount_mutex); nilfs_transaction_begin(inode->i_sb, &ti, 0); ret = nilfs_cpfile_change_cpmode( @@ -192,7 +192,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, else nilfs_transaction_commit(inode->i_sb); /* never fails */ - up_read(&inode->i_sb->s_umount); + mutex_unlock(&nilfs->ns_snapshot_mount_mutex); out: mnt_drop_write_file(filp); return ret; @@ -660,8 +660,6 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, goto out_free; } - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); if (ret < 0) printk(KERN_ERR "NILFS: GC failed during preparation: " diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index ab20a4baa50f..ab172e8549c5 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -28,6 +28,13 @@ #include "nilfs.h" #include "page.h" +/** + * struct nilfs_shadow_map - shadow mapping of meta data file + * @bmap_store: shadow copy of bmap state + * @frozen_data: shadowed dirty data pages + * @frozen_btnodes: shadowed dirty b-tree nodes' pages + * @frozen_buffers: list of frozen buffers + */ struct nilfs_shadow_map { struct nilfs_bmap_store bmap_store; struct address_space frozen_data; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 250add84da76..74cece80e9a3 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -32,8 +32,21 @@ #include "the_nilfs.h" #include "bmap.h" -/* - * nilfs inode data in memory +/** + * struct nilfs_inode_info - nilfs inode data in memory + * @i_flags: inode flags + * @i_state: dynamic state flags + * @i_bmap: pointer on i_bmap_data + * @i_bmap_data: raw block mapping + * @i_xattr: <TODO> + * @i_dir_start_lookup: page index of last successful search + * @i_cno: checkpoint number for GC inode + * @i_btnode_cache: cached pages of b-tree nodes + * @i_dirty: list for connecting dirty files + * @xattr_sem: semaphore for extended attributes processing + * @i_bh: buffer contains disk inode + * @i_root: root object of the current filesystem tree + * @vfs_inode: VFS inode object */ struct nilfs_inode_info { __u32 i_flags; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 88e11fb346b6..a5752a589932 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -189,7 +189,7 @@ int nilfs_transaction_begin(struct super_block *sb, if (ret > 0) return 0; - vfs_check_frozen(sb, SB_FREEZE_WRITE); + sb_start_intwrite(sb); nilfs = sb->s_fs_info; down_read(&nilfs->ns_segctor_sem); @@ -205,6 +205,7 @@ int nilfs_transaction_begin(struct super_block *sb, current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); return ret; } @@ -246,6 +247,7 @@ int nilfs_transaction_commit(struct super_block *sb) err = nilfs_construct_segment(sb); if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); return err; } @@ -264,6 +266,7 @@ void nilfs_transaction_abort(struct super_block *sb) current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); } void nilfs_relax_pressure_in_lock(struct super_block *sb) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index c5b7653a4391..3127e9f438a7 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -30,7 +30,13 @@ #include "mdt.h" #include "sufile.h" - +/** + * struct nilfs_sufile_info - on-memory private data of sufile + * @mi: on-memory private data of metadata file + * @ncleansegs: number of clean segments + * @allocmin: lower limit of allocatable segment range + * @allocmax: upper limit of allocatable segment range + */ struct nilfs_sufile_info { struct nilfs_mdt_info mi; unsigned long ncleansegs;/* number of clean segments */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index d57c42f974ea..6a10812711c1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -676,20 +676,13 @@ static const struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, .destroy_inode = nilfs_destroy_inode, .dirty_inode = nilfs_dirty_inode, - /* .write_inode = nilfs_write_inode, */ - /* .put_inode = nilfs_put_inode, */ - /* .drop_inode = nilfs_drop_inode, */ .evict_inode = nilfs_evict_inode, .put_super = nilfs_put_super, - /* .write_super = nilfs_write_super, */ .sync_fs = nilfs_sync_fs, .freeze_fs = nilfs_freeze, .unfreeze_fs = nilfs_unfreeze, - /* .write_super_lockfs */ - /* .unlockfs */ .statfs = nilfs_statfs, .remount_fs = nilfs_remount, - /* .umount_begin */ .show_options = nilfs_show_options }; @@ -948,6 +941,8 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, struct nilfs_root *root; int ret; + mutex_lock(&nilfs->ns_snapshot_mount_mutex); + down_read(&nilfs->ns_segctor_sem); ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno); up_read(&nilfs->ns_segctor_sem); @@ -972,6 +967,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, ret = nilfs_get_root_dentry(s, root, root_dentry); nilfs_put_root(root); out: + mutex_unlock(&nilfs->ns_snapshot_mount_mutex); return ret; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 501b7f8b739f..41e6a04a561f 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -76,6 +76,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_bdev = bdev; atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); + mutex_init(&nilfs->ns_snapshot_mount_mutex); INIT_LIST_HEAD(&nilfs->ns_dirty_files); INIT_LIST_HEAD(&nilfs->ns_gc_inodes); spin_lock_init(&nilfs->ns_inode_lock); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 9992b11312ff..be1267a34cea 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -47,11 +47,13 @@ enum { * @ns_flags: flags * @ns_bdev: block device * @ns_sem: semaphore for shared states + * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data * @ns_sbwtime: previous write time of super block * @ns_sbwcount: write count of super block * @ns_sbsize: size of valid data in super block + * @ns_mount_state: file system state * @ns_seg_seq: segment sequence counter * @ns_segnum: index number of the latest full segment. * @ns_nextnum: index number of the full segment index to be used next @@ -99,13 +101,12 @@ struct the_nilfs { struct block_device *ns_bdev; struct rw_semaphore ns_sem; + struct mutex ns_snapshot_mount_mutex; /* * used for * - loading the latest checkpoint exclusively. * - allocating a new full segment. - * - protecting s_dirt in the super_block struct - * (see nilfs_write_super) and the following fields. */ struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; @@ -229,9 +230,8 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty) * @count: refcount of this structure * @nilfs: nilfs object * @ifile: inode file - * @root: root inode * @inodes_count: number of inodes - * @blocks_count: number of blocks (Reserved) + * @blocks_count: number of blocks */ struct nilfs_root { __u64 cno; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7389d2d5e51d..1ecf46448f85 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2084,7 +2084,6 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, if (err) return err; pos = *ppos; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); /* We can write back this queue in page reclaim. */ current->backing_dev_info = mapping->backing_dev_info; written = 0; @@ -2119,6 +2118,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2127,6 +2127,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0) ret = err; } + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index b341492542ca..2bc149d6a784 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2660,31 +2660,14 @@ static const struct super_operations ntfs_sops = { .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ .destroy_inode = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */ #ifdef NTFS_RW - //.dirty_inode = NULL, /* VFS: Called from - // __mark_inode_dirty(). */ .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to disk. */ - //.drop_inode = NULL, /* VFS: Called just after the - // inode reference count has - // been decreased to zero. - // NOTE: The inode lock is - // held. See fs/inode.c:: - // generic_drop_inode(). */ - //.delete_inode = NULL, /* VFS: Delete inode from disk. - // Called when i_count becomes - // 0 and i_nlink is also 0. */ - //.write_super = NULL, /* Flush dirty super block to - // disk. */ - //.sync_fs = NULL, /* ? */ - //.write_super_lockfs = NULL, /* ? */ - //.unlockfs = NULL, /* ? */ #endif /* NTFS_RW */ .put_super = ntfs_put_super, /* Syscall: umount. */ .statfs = ntfs_statfs, /* Syscall: statfs */ .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is removed from memory. */ - //.umount_begin = NULL, /* Forced umount. */ .show_options = ntfs_show_options, /* Show mount options in proc. */ }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7602783d7f41..46a1f6d75104 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1971,6 +1971,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, { struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret; if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && !ocfs2_writes_unwritten_extents(osb)) @@ -1985,7 +1986,12 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); + ret = mnt_want_write_file(file); + if (ret) + return ret; + ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); + mnt_drop_write_file(file); + return ret; } static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, @@ -2261,7 +2267,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); appending = file->f_flags & O_APPEND ? 1 : 0; direct_io = file->f_flags & O_DIRECT ? 1 : 0; @@ -2436,6 +2442,7 @@ out_sems: ocfs2_iocb_clear_sem_locked(iocb); mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); if (written) ret = written; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index d96f7f81d8dd..f20edcbfe700 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -928,7 +928,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(new_clusters, (int __user *)arg)) return -EFAULT; - return ocfs2_group_extend(inode, new_clusters); + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_group_extend(inode, new_clusters); + mnt_drop_write_file(filp); + return status; case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: if (!capable(CAP_SYS_RESOURCE)) @@ -937,7 +942,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (copy_from_user(&input, (int __user *) arg, sizeof(input))) return -EFAULT; - return ocfs2_group_add(inode, &input); + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_group_add(inode, &input); + mnt_drop_write_file(filp); + return status; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 0a42ae96dca7..2dd36af79e26 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -355,11 +355,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) if (journal_current_handle()) return jbd2_journal_start(journal, max_buffs); + sb_start_intwrite(osb->sb); + down_read(&osb->journal->j_trans_barrier); handle = jbd2_journal_start(journal, max_buffs); if (IS_ERR(handle)) { up_read(&osb->journal->j_trans_barrier); + sb_end_intwrite(osb->sb); mlog_errno(PTR_ERR(handle)); @@ -388,8 +391,10 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, if (ret < 0) mlog_errno(ret); - if (!nested) + if (!nested) { up_read(&journal->j_trans_barrier); + sb_end_intwrite(osb->sb); + } return ret; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 210c35237548..a9f78c74d687 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -784,14 +784,10 @@ bail: static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) { - int i; - u8 *buffer; - u32 count = 0; + u32 count; struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); - buffer = la->la_bitmap; - for (i = 0; i < le16_to_cpu(la->la_size); i++) - count += hweight8(buffer[i]); + count = memweight(la->la_bitmap, le16_to_cpu(la->la_size)); trace_ocfs2_local_alloc_count_bits(count); return count; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9cd41083e991..d150372fd81d 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -136,6 +136,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sigset_t oldset; int ret; + sb_start_pagefault(inode->i_sb); ocfs2_block_signals(&oldset); /* @@ -165,6 +166,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out: ocfs2_unblock_signals(&oldset); + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9f32d7cbb7a3..30a055049e16 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4466,20 +4466,11 @@ int ocfs2_reflink_ioctl(struct inode *inode, goto out_dput; } - error = mnt_want_write(new_path.mnt); - if (error) { - mlog_errno(error); - goto out_dput; - } - error = ocfs2_vfs_reflink(old_path.dentry, new_path.dentry->d_inode, new_dentry, preserve); - mnt_drop_write(new_path.mnt); out_dput: - dput(new_dentry); - mutex_unlock(&new_path.dentry->d_inode->i_mutex); - path_put(&new_path); + done_path_create(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/open.c b/fs/open.c index 1e914b397e12..e1f2cdb91a4d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -164,11 +164,13 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(inode)) goto out_putf; + sb_start_write(inode->i_sb); error = locks_verify_truncate(inode, file, length); if (!error) error = security_path_truncate(&file->f_path); if (!error) error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); + sb_end_write(inode->i_sb); out_putf: fput(file); out: @@ -266,7 +268,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!file->f_op->fallocate) return -EOPNOTSUPP; - return file->f_op->fallocate(file, mode, offset, len); + sb_start_write(inode->i_sb); + ret = file->f_op->fallocate(file, mode, offset, len); + sb_end_write(inode->i_sb); + return ret; } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) @@ -620,7 +625,7 @@ static inline int __get_file_write_access(struct inode *inode, /* * Balanced in __fput() */ - error = mnt_want_write(mnt); + error = __mnt_want_write(mnt); if (error) put_write_access(inode); } @@ -654,6 +659,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) f->f_mode = FMODE_PATH; + path_get(&f->f_path); inode = f->f_path.dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, f->f_path.mnt); @@ -711,7 +717,7 @@ cleanup_all: * here, so just reset the state. */ file_reset_write(f); - mnt_drop_write(f->f_path.mnt); + __mnt_drop_write(f->f_path.mnt); } } cleanup_file: @@ -739,9 +745,7 @@ int finish_open(struct file *file, struct dentry *dentry, int error; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ - mntget(file->f_path.mnt); - file->f_path.dentry = dget(dentry); - + file->f_path.dentry = dentry; error = do_dentry_open(file, open, current_cred()); if (!error) *opened |= FILE_OPENED; @@ -784,7 +788,6 @@ struct file *dentry_open(const struct path *path, int flags, f->f_flags = flags; f->f_path = *path; - path_get(&f->f_path); error = do_dentry_open(f, NULL, cred); if (!error) { error = open_check_o_direct(f); @@ -849,9 +852,10 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode; - if (!(flags & O_CREAT)) - mode = 0; - op->mode = mode; + if (flags & O_CREAT) + op->mode = (mode & S_IALLUGO) | S_IFREG; + else + op->mode = 0; /* Must never be set by userspace */ flags &= ~FMODE_NONOTIFY; diff --git a/fs/pipe.c b/fs/pipe.c index 95cbd6b227e6..8d85d7068c1e 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1016,18 +1016,16 @@ fail_inode: return NULL; } -struct file *create_write_pipe(int flags) +int create_pipe_files(struct file **res, int flags) { int err; - struct inode *inode; + struct inode *inode = get_pipe_inode(); struct file *f; struct path path; - struct qstr name = { .name = "" }; + static struct qstr name = { .name = "" }; - err = -ENFILE; - inode = get_pipe_inode(); if (!inode) - goto err; + return -ENFILE; err = -ENOMEM; path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); @@ -1041,62 +1039,43 @@ struct file *create_write_pipe(int flags) f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); if (!f) goto err_dentry; - f->f_mapping = inode->i_mapping; f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); - f->f_version = 0; - return f; + res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); + if (!res[0]) + goto err_file; + + path_get(&path); + res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); + res[1] = f; + return 0; - err_dentry: +err_file: + put_filp(f); +err_dentry: free_pipe_info(inode); path_put(&path); - return ERR_PTR(err); + return err; - err_inode: +err_inode: free_pipe_info(inode); iput(inode); - err: - return ERR_PTR(err); -} - -void free_write_pipe(struct file *f) -{ - free_pipe_info(f->f_dentry->d_inode); - path_put(&f->f_path); - put_filp(f); -} - -struct file *create_read_pipe(struct file *wrf, int flags) -{ - /* Grab pipe from the writer */ - struct file *f = alloc_file(&wrf->f_path, FMODE_READ, - &read_pipefifo_fops); - if (!f) - return ERR_PTR(-ENFILE); - - path_get(&wrf->f_path); - f->f_flags = O_RDONLY | (flags & O_NONBLOCK); - - return f; + return err; } int do_pipe_flags(int *fd, int flags) { - struct file *fw, *fr; + struct file *files[2]; int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) return -EINVAL; - fw = create_write_pipe(flags); - if (IS_ERR(fw)) - return PTR_ERR(fw); - fr = create_read_pipe(fw, flags); - error = PTR_ERR(fr); - if (IS_ERR(fr)) - goto err_write_pipe; + error = create_pipe_files(files, flags); + if (error) + return error; error = get_unused_fd_flags(flags); if (error < 0) @@ -1109,8 +1088,8 @@ int do_pipe_flags(int *fd, int flags) fdw = error; audit_fd_pair(fdr, fdw); - fd_install(fdr, fr); - fd_install(fdw, fw); + fd_install(fdr, files[0]); + fd_install(fdw, files[1]); fd[0] = fdr; fd[1] = fdw; @@ -1119,10 +1098,8 @@ int do_pipe_flags(int *fd, int flags) err_fdr: put_unused_fd(fdr); err_read_pipe: - path_put(&fr->f_path); - put_filp(fr); - err_write_pipe: - free_write_pipe(fw); + fput(files[0]); + fput(files[1]); return error; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 2772208338f8..1b6c84cbdb73 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -695,8 +695,6 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) mmput(mm); } - /* OK to pass negative loff_t, we can catch out-of-range */ - file->f_mode |= FMODE_UNSIGNED_OFFSET; file->private_data = mm; return 0; @@ -704,7 +702,12 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) static int mem_open(struct inode *inode, struct file *file) { - return __mem_open(inode, file, PTRACE_MODE_ATTACH); + int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); + + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; + + return ret; } static ssize_t mem_rw(struct file *file, char __user *buf, @@ -827,15 +830,16 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; while (count > 0) { - int this_len, retval, max_len; - - this_len = mm->env_end - (mm->env_start + src); + size_t this_len, max_len; + int retval; - if (this_len <= 0) + if (src >= (mm->env_end - mm->env_start)) break; - max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - this_len = (this_len > max_len) ? max_len : this_len; + this_len = mm->env_end - (mm->env_start + src); + + max_len = min_t(size_t, PAGE_SIZE, count); + this_len = min(max_len, this_len); retval = access_remote_vm(mm, (mm->env_start + src), page, this_len, 0); diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index 22e0d60e53ef..76a7a697b778 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c @@ -17,23 +17,6 @@ #include <linux/bitops.h> #include "qnx4.h" -static void count_bits(register const char *bmPart, register int size, - int *const tf) -{ - char b; - int tot = *tf; - - if (size > QNX4_BLOCK_SIZE) { - size = QNX4_BLOCK_SIZE; - } - do { - b = *bmPart++; - tot += 8 - hweight8(b); - size--; - } while (size != 0); - *tf = tot; -} - unsigned long qnx4_count_free_blocks(struct super_block *sb) { int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1; @@ -44,13 +27,16 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb) struct buffer_head *bh; while (total < size) { + int bytes = min(size - total, QNX4_BLOCK_SIZE); + if ((bh = sb_bread(sb, start + offset)) == NULL) { printk(KERN_ERR "qnx4: I/O error in counting free blocks\n"); break; } - count_bits(bh->b_data, size - total, &total_free); + total_free += bytes * BITS_PER_BYTE - + memweight(bh->b_data, bytes); brelse(bh); - total += QNX4_BLOCK_SIZE; + total += bytes; offset++; } diff --git a/fs/splice.c b/fs/splice.c index 7bf08fa22ec9..41514dd89462 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -996,6 +996,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; + sb_start_write(inode->i_sb); + pipe_lock(pipe); splice_from_pipe_begin(&sd); @@ -1034,6 +1036,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/super.c b/fs/super.c index c743fb3be4b8..0902cfa6a12e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,12 +33,19 @@ #include <linux/rculist_bl.h> #include <linux/cleancache.h> #include <linux/fsnotify.h> +#include <linux/lockdep.h> #include "internal.h" LIST_HEAD(super_blocks); DEFINE_SPINLOCK(sb_lock); +static char *sb_writers_name[SB_FREEZE_LEVELS] = { + "sb_writers", + "sb_pagefaults", + "sb_internal", +}; + /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. @@ -62,7 +69,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return -1; if (!grab_super_passive(sb)) - return !sc->nr_to_scan ? 0 : -1; + return -1; if (sb->s_op && sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); @@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return total_objects; } +static int init_sb_writers(struct super_block *s, struct file_system_type *type) +{ + int err; + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) { + err = percpu_counter_init(&s->s_writers.counter[i], 0); + if (err < 0) + goto err_out; + lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], + &type->s_writers_key[i], 0); + } + init_waitqueue_head(&s->s_writers.wait); + init_waitqueue_head(&s->s_writers.wait_unfrozen); + return 0; +err_out: + while (--i >= 0) + percpu_counter_destroy(&s->s_writers.counter[i]); + return err; +} + +static void destroy_sb_writers(struct super_block *s) +{ + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_counter_destroy(&s->s_writers.counter[i]); +} + /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to @@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (s) { if (security_sb_alloc(s)) { + /* + * We cannot call security_sb_free() without + * security_sb_alloc() succeeding. So bail out manually + */ kfree(s); s = NULL; goto out; } #ifdef CONFIG_SMP s->s_files = alloc_percpu(struct list_head); - if (!s->s_files) { - security_sb_free(s); - kfree(s); - s = NULL; - goto out; - } else { + if (!s->s_files) + goto err_out; + else { int i; for_each_possible_cpu(i) @@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) #else INIT_LIST_HEAD(&s->s_files); #endif + if (init_sb_writers(s, type)) + goto err_out; s->s_flags = flags; s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); @@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); init_rwsem(&s->s_dquot.dqptr_sem); - init_waitqueue_head(&s->s_wait_unfrozen); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; @@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) } out: return s; +err_out: + security_sb_free(s); +#ifdef CONFIG_SMP + if (s->s_files) + free_percpu(s->s_files); +#endif + destroy_sb_writers(s); + kfree(s); + s = NULL; + goto out; } /** @@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s) #ifdef CONFIG_SMP free_percpu(s->s_files); #endif + destroy_sb_writers(s); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); @@ -320,7 +369,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock) /* * grab_super_passive - acquire a passive reference - * @s: reference we are trying to grab + * @sb: reference we are trying to grab * * Tries to acquire a passive reference. This is used in places where we * cannot take an active reference but we need to ensure that the @@ -488,46 +537,6 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); /** - * sync_supers - helper for periodic superblock writeback - * - * Call the write_super method if present on all dirty superblocks in - * the system. This is for the periodic writeback used by most older - * filesystems. For data integrity superblock writeback use - * sync_filesystems() instead. - * - * Note: check the dirty flag before waiting, so we don't - * hold up the sync while mounting a device. (The newly - * mounted device won't need syncing.) - */ -void sync_supers(void) -{ - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - if (sb->s_op->write_super && sb->s_dirt) { - sb->s_count++; - spin_unlock(&sb_lock); - - down_read(&sb->s_umount); - if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN)) - sb->s_op->write_super(sb); - up_read(&sb->s_umount); - - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; - } - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); -} - -/** * iterate_supers - call function for all active superblocks * @f: function to call * @arg: argument to pass to it @@ -651,10 +660,11 @@ struct super_block *get_super_thawed(struct block_device *bdev) { while (1) { struct super_block *s = get_super(bdev); - if (!s || s->s_frozen == SB_UNFROZEN) + if (!s || s->s_writers.frozen == SB_UNFROZEN) return s; up_read(&s->s_umount); - vfs_check_frozen(s, SB_FREEZE_WRITE); + wait_event(s->s_writers.wait_unfrozen, + s->s_writers.frozen == SB_UNFROZEN); put_super(s); } } @@ -732,7 +742,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) int retval; int remount_ro; - if (sb->s_frozen != SB_UNFROZEN) + if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; #ifdef CONFIG_BLOCK @@ -1163,6 +1173,120 @@ out: return ERR_PTR(error); } +/* + * This is an internal function, please use sb_end_{write,pagefault,intwrite} + * instead. + */ +void __sb_end_write(struct super_block *sb, int level) +{ + percpu_counter_dec(&sb->s_writers.counter[level-1]); + /* + * Make sure s_writers are updated before we wake up waiters in + * freeze_super(). + */ + smp_mb(); + if (waitqueue_active(&sb->s_writers.wait)) + wake_up(&sb->s_writers.wait); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); +} +EXPORT_SYMBOL(__sb_end_write); + +#ifdef CONFIG_LOCKDEP +/* + * We want lockdep to tell us about possible deadlocks with freezing but + * it's it bit tricky to properly instrument it. Getting a freeze protection + * works as getting a read lock but there are subtle problems. XFS for example + * gets freeze protection on internal level twice in some cases, which is OK + * only because we already hold a freeze protection also on higher level. Due + * to these cases we have to tell lockdep we are doing trylock when we + * already hold a freeze protection for a higher freeze level. + */ +static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, + unsigned long ip) +{ + int i; + + if (!trylock) { + for (i = 0; i < level - 1; i++) + if (lock_is_held(&sb->s_writers.lock_map[i])) { + trylock = true; + break; + } + } + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); +} +#endif + +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +int __sb_start_write(struct super_block *sb, int level, bool wait) +{ +retry: + if (unlikely(sb->s_writers.frozen >= level)) { + if (!wait) + return 0; + wait_event(sb->s_writers.wait_unfrozen, + sb->s_writers.frozen < level); + } + +#ifdef CONFIG_LOCKDEP + acquire_freeze_lock(sb, level, !wait, _RET_IP_); +#endif + percpu_counter_inc(&sb->s_writers.counter[level-1]); + /* + * Make sure counter is updated before we check for frozen. + * freeze_super() first sets frozen and then checks the counter. + */ + smp_mb(); + if (unlikely(sb->s_writers.frozen >= level)) { + __sb_end_write(sb, level); + goto retry; + } + return 1; +} +EXPORT_SYMBOL(__sb_start_write); + +/** + * sb_wait_write - wait until all writers to given file system finish + * @sb: the super for which we wait + * @level: type of writers we wait for (normal vs page fault) + * + * This function waits until there are no writers of given type to given file + * system. Caller of this function should make sure there can be no new writers + * of type @level before calling this function. Otherwise this function can + * livelock. + */ +static void sb_wait_write(struct super_block *sb, int level) +{ + s64 writers; + + /* + * We just cycle-through lockdep here so that it does not complain + * about returning with lock to userspace + */ + rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + + do { + DEFINE_WAIT(wait); + + /* + * We use a barrier in prepare_to_wait() to separate setting + * of frozen and checking of the counter + */ + prepare_to_wait(&sb->s_writers.wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); + if (writers) + schedule(); + + finish_wait(&sb->s_writers.wait, &wait); + } while (writers); +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock @@ -1170,6 +1294,31 @@ out: * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs will return * -EBUSY. + * + * During this function, sb->s_writers.frozen goes through these values: + * + * SB_UNFROZEN: File system is normal, all writes progress as usual. + * + * SB_FREEZE_WRITE: The file system is in the process of being frozen. New + * writes should be blocked, though page faults are still allowed. We wait for + * all writes to complete and then proceed to the next stage. + * + * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked + * but internal fs threads can still modify the filesystem (although they + * should not dirty new pages or inodes), writeback can run etc. After waiting + * for all running page faults we sync the filesystem which will clean all + * dirty pages and inodes (no new dirty pages or inodes can be created when + * sync is running). + * + * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs + * modification are blocked (e.g. XFS preallocation truncation on inode + * reclaim). This is usually implemented by blocking new transactions for + * filesystems that have them and need this additional guard. After all + * internal writers are finished we call ->freeze_fs() to finish filesystem + * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is + * mostly auxiliary for filesystems to verify they do not modify frozen fs. + * + * sb->s_writers.frozen is protected by sb->s_umount. */ int freeze_super(struct super_block *sb) { @@ -1177,7 +1326,7 @@ int freeze_super(struct super_block *sb) atomic_inc(&sb->s_active); down_write(&sb->s_umount); - if (sb->s_frozen) { + if (sb->s_writers.frozen != SB_UNFROZEN) { deactivate_locked_super(sb); return -EBUSY; } @@ -1188,33 +1337,53 @@ int freeze_super(struct super_block *sb) } if (sb->s_flags & MS_RDONLY) { - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); + /* Nothing to do really... */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } - sb->s_frozen = SB_FREEZE_WRITE; + /* From now on, no new normal writers can start */ + sb->s_writers.frozen = SB_FREEZE_WRITE; smp_wmb(); + /* Release s_umount to preserve sb_start_write -> s_umount ordering */ + up_write(&sb->s_umount); + + sb_wait_write(sb, SB_FREEZE_WRITE); + + /* Now we go and block page faults... */ + down_write(&sb->s_umount); + sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; + smp_wmb(); + + sb_wait_write(sb, SB_FREEZE_PAGEFAULT); + + /* All writers are done so after syncing there won't be dirty data */ sync_filesystem(sb); - sb->s_frozen = SB_FREEZE_TRANS; + /* Now wait for internal filesystem counter */ + sb->s_writers.frozen = SB_FREEZE_FS; smp_wmb(); + sb_wait_write(sb, SB_FREEZE_FS); - sync_blockdev(sb->s_bdev); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; + sb->s_writers.frozen = SB_UNFROZEN; smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; } } + /* + * This is just for debugging purposes so that fs can warn if it + * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } @@ -1231,7 +1400,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen == SB_UNFROZEN) { up_write(&sb->s_umount); return -EINVAL; } @@ -1244,16 +1413,15 @@ int thaw_super(struct super_block *sb) if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; up_write(&sb->s_umount); return error; } } out: - sb->s_frozen = SB_UNFROZEN; + sb->s_writers.frozen = SB_UNFROZEN; smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return 0; diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index a4759833d62d..614b2b544880 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -228,6 +228,8 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = 0; if (bb->vm_ops->page_mkwrite) ret = bb->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); sysfs_put_active(attr_sd); return ret; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 35389ca2d267..7bd6e72afd11 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -37,11 +37,11 @@ * * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we * implement. However, this is not true for 'ubifs_writepage()', which may be - * called with @i_mutex unlocked. For example, when pdflush is doing background - * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal" - * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the - * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()' - * we are only guaranteed that the page is locked. + * called with @i_mutex unlocked. For example, when flusher thread is doing + * background write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. + * At "normal" work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. + * in the "sys_write -> alloc_pages -> direct reclaim path". So, in + * 'ubifs_writepage()' we are only guaranteed that the page is locked. * * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1c766c39c038..c3fa6c5327a3 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -303,7 +303,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) mutex_lock(&ui->ui_mutex); /* * Due to races between write-back forced by budgeting - * (see 'sync_some_inodes()') and pdflush write-back, the inode may + * (see 'sync_some_inodes()') and background write-back, the inode may * have already been synchronized, do not do this again. This might * also happen if it was synchronized in an VFS operation, e.g. * 'ubifs_link()'. diff --git a/fs/xattr.c b/fs/xattr.c index 1d7ac3790458..4d45b7189e7e 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -427,6 +427,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, { ssize_t error; void *kvalue = NULL; + void *vvalue = NULL; char kname[XATTR_NAME_MAX + 1]; error = strncpy_from_user(kname, name, sizeof(kname)); @@ -438,9 +439,13 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, if (size) { if (size > XATTR_SIZE_MAX) size = XATTR_SIZE_MAX; - kvalue = kzalloc(size, GFP_KERNEL); - if (!kvalue) - return -ENOMEM; + kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!kvalue) { + vvalue = vmalloc(size); + if (!vvalue) + return -ENOMEM; + kvalue = vvalue; + } } error = vfs_getxattr(d, kname, kvalue, size); @@ -452,7 +457,10 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } - kfree(kvalue); + if (vvalue) + vfree(vvalue); + else + kfree(kvalue); return error; } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index a6caa0022c9b..359fb86ed876 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -51,20 +51,6 @@ typedef struct xfs_alloc_rec_incore { typedef __be32 xfs_alloc_ptr_t; /* - * Minimum and maximum blocksize and sectorsize. - * The blocksize upper limit is pretty much arbitrary. - * The sectorsize upper limit is due to sizeof(sb_sectsize). - */ -#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ -#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ -#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) -#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) -#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ -#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ -#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) -#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) - -/* * Block numbers in the AG: * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8dad722c0041..e562dd43f41f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,6 +124,12 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* + * We will pass freeze protection with a transaction. So tell lockdep + * we released it. + */ + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + /* * We hand off the transaction to the completion thread now, so * clear the flag here. */ @@ -179,7 +185,7 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == IO_UNWRITTEN) + if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -199,6 +205,15 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (ioend->io_append_trans) { + /* + * We've got freeze protection passed with the transaction. + * Tell lockdep about it. + */ + rwsem_acquire_read( + &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -210,7 +225,7 @@ xfs_end_io( * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IO_UNWRITTEN) { + if (ioend->io_type == XFS_IO_UNWRITTEN) { /* * For buffered I/O we never preallocate a transaction when * doing the unwritten extent conversion, but for direct I/O @@ -312,7 +327,7 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (type == IO_UNWRITTEN) + if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { @@ -323,10 +338,10 @@ xfs_map_blocks( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_maxioffset); + ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; + if (offset + count > mp->m_super->s_maxbytes) + count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, @@ -336,7 +351,7 @@ xfs_map_blocks( if (error) return -XFS_ERROR(error); - if (type == IO_DELALLOC && + if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { error = xfs_iomap_write_allocate(ip, offset, count, imap); if (!error) @@ -345,7 +360,7 @@ xfs_map_blocks( } #ifdef DEBUG - if (type == IO_UNWRITTEN) { + if (type == XFS_IO_UNWRITTEN) { ASSERT(nimaps); ASSERT(imap->br_startblock != HOLESTARTBLOCK); ASSERT(imap->br_startblock != DELAYSTARTBLOCK); @@ -634,11 +649,11 @@ xfs_check_page_type( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable += (type == IO_UNWRITTEN); + acceptable += (type == XFS_IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable += (type == IO_DELALLOC); + acceptable += (type == XFS_IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable += (type == IO_OVERWRITE); + acceptable += (type == XFS_IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -721,11 +736,11 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh) || buffer_mapped(bh)) { if (buffer_unwritten(bh)) - type = IO_UNWRITTEN; + type = XFS_IO_UNWRITTEN; else if (buffer_delay(bh)) - type = IO_DELALLOC; + type = XFS_IO_DELALLOC; else - type = IO_OVERWRITE; + type = XFS_IO_OVERWRITE; if (!xfs_imap_valid(inode, imap, offset)) { done = 1; @@ -733,7 +748,7 @@ xfs_convert_page( } lock_buffer(bh); - if (type != IO_OVERWRITE) + if (type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, imap, offset); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -831,7 +846,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_check_page_type(page, IO_DELALLOC)) + if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -927,11 +942,26 @@ xfs_vm_writepage( end_index = offset >> PAGE_CACHE_SHIFT; last_index = (offset - 1) >> PAGE_CACHE_SHIFT; if (page->index >= end_index) { - if ((page->index >= end_index + 1) || - !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); + + /* + * Just skip the page if it is fully outside i_size, e.g. due + * to a truncate operation that is in progress. + */ + if (page->index >= end_index + 1 || offset_into_page == 0) { unlock_page(page); return 0; } + + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining + * memory is zeroed when mapped, and writes to that region are + * not written out to the file." + */ + zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); } end_offset = min_t(unsigned long long, @@ -941,7 +971,7 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - type = IO_OVERWRITE; + type = XFS_IO_OVERWRITE; if (wbc->sync_mode == WB_SYNC_NONE) nonblocking = 1; @@ -966,18 +996,18 @@ xfs_vm_writepage( } if (buffer_unwritten(bh)) { - if (type != IO_UNWRITTEN) { - type = IO_UNWRITTEN; + if (type != XFS_IO_UNWRITTEN) { + type = XFS_IO_UNWRITTEN; imap_valid = 0; } } else if (buffer_delay(bh)) { - if (type != IO_DELALLOC) { - type = IO_DELALLOC; + if (type != XFS_IO_DELALLOC) { + type = XFS_IO_DELALLOC; imap_valid = 0; } } else if (buffer_uptodate(bh)) { - if (type != IO_OVERWRITE) { - type = IO_OVERWRITE; + if (type != XFS_IO_OVERWRITE) { + type = XFS_IO_OVERWRITE; imap_valid = 0; } } else { @@ -1013,7 +1043,7 @@ xfs_vm_writepage( } if (imap_valid) { lock_buffer(bh); - if (type != IO_OVERWRITE) + if (type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, &imap, offset); xfs_add_to_ioend(inode, bh, offset, type, &ioend, new_ioend); @@ -1054,7 +1084,7 @@ xfs_vm_writepage( * Reserve log space if we might write beyond the on-disk * inode size. */ - if (ioend->io_type != IO_UNWRITTEN && + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) { err = xfs_setfilesize_trans_alloc(ioend); if (err) @@ -1162,9 +1192,9 @@ __xfs_get_blocks( lockmode = xfs_ilock_map_shared(ip); } - ASSERT(offset <= mp->m_maxioffset); - if (offset + size > mp->m_maxioffset) - size = mp->m_maxioffset - offset; + ASSERT(offset <= mp->m_super->s_maxbytes); + if (offset + size > mp->m_super->s_maxbytes) + size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1351,7 +1381,7 @@ xfs_end_io_direct_write( ioend->io_iocb = iocb; ioend->io_result = ret; if (private && size > 0) - ioend->io_type = IO_UNWRITTEN; + ioend->io_type = XFS_IO_UNWRITTEN; if (is_async) { ioend->io_isasync = 1; @@ -1383,7 +1413,7 @@ xfs_vm_direct_IO( * and converts at least on unwritten extent we will cancel * the still clean transaction after the I/O has finished. */ - iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT); + iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); if (offset + size > XFS_I(inode)->i_d.di_size) { ret = xfs_setfilesize_trans_alloc(ioend); if (ret) @@ -1410,6 +1440,9 @@ out_trans_cancel: if (ioend->io_append_trans) { current_set_flags_nested(&ioend->io_append_trans->t_pflags, PF_FSTRANS); + rwsem_acquire_read( + &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_trans_cancel(ioend->io_append_trans, 0); } out_destroy_ioend: diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 84eafbcb0d9d..c325abb8d61a 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -24,17 +24,17 @@ extern mempool_t *xfs_ioend_pool; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { - IO_DIRECT = 0, /* special case for direct I/O ioends */ - IO_DELALLOC, /* mapping covers delalloc region */ - IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ - IO_OVERWRITE, /* mapping covers already allocated extent */ + XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */ + XFS_IO_DELALLOC, /* covers delalloc region */ + XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ + XFS_IO_OVERWRITE, /* covers already allocated extent */ }; #define XFS_IO_TYPES \ { 0, "" }, \ - { IO_DELALLOC, "delalloc" }, \ - { IO_UNWRITTEN, "unwritten" }, \ - { IO_OVERWRITE, "overwrite" } + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" } /* * xfs_ioend struct manages large extent writes for XFS. diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index a17ff01b5adf..0ca1f0be62d2 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -893,7 +893,7 @@ STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int retval, error, committed, forkoff; trace_xfs_attr_leaf_addname(args); @@ -915,11 +915,11 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ retval = xfs_attr_leaf_lookup_int(bp, args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(retval); } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(retval); } @@ -937,7 +937,6 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * if required. */ retval = xfs_attr_leaf_add(bp, args); - xfs_da_buf_done(bp); if (retval == ENOSPC) { /* * Promote the attribute list to the Btree format, then @@ -1065,8 +1064,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ if (committed) xfs_trans_ijoin(args->trans, dp, 0); - } else - xfs_da_buf_done(bp); + } /* * Commit the remove and start the next trans in series. @@ -1092,7 +1090,7 @@ STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args) { xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error, committed, forkoff; trace_xfs_attr_leaf_removename(args); @@ -1111,7 +1109,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) ASSERT(bp != NULL); error = xfs_attr_leaf_lookup_int(bp, args); if (error == ENOATTR) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(error); } @@ -1141,8 +1139,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ if (committed) xfs_trans_ijoin(args->trans, dp, 0); - } else - xfs_da_buf_done(bp); + } return(0); } @@ -1155,7 +1152,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) STATIC int xfs_attr_leaf_get(xfs_da_args_t *args) { - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; args->blkno = 0; @@ -1167,11 +1164,11 @@ xfs_attr_leaf_get(xfs_da_args_t *args) error = xfs_attr_leaf_lookup_int(bp, args); if (error != EEXIST) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(error); } error = xfs_attr_leaf_getvalue(bp, args); - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { error = xfs_attr_rmtval_get(args); } @@ -1186,23 +1183,23 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) { xfs_attr_leafblock_t *leaf; int error; - xfs_dabuf_t *bp; + struct xfs_buf *bp; context->cursor->blkno = 0; error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return XFS_ERROR(error); ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, leaf); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return XFS_ERROR(EFSCORRUPTED); } error = xfs_attr_leaf_list_int(bp, context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return XFS_ERROR(error); } @@ -1489,7 +1486,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_da_state_t *state; xfs_da_state_blk_t *blk; xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int retval, error, committed, forkoff; trace_xfs_attr_node_removename(args); @@ -1601,14 +1598,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ ASSERT(state->path.active == 1); ASSERT(state->path.blk[0].bp); - xfs_da_buf_done(state->path.blk[0].bp); state->path.blk[0].bp = NULL; error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) goto out; - ASSERT((((xfs_attr_leafblock_t *)bp->data)->hdr.info.magic) == + ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { @@ -1635,7 +1631,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (committed) xfs_trans_ijoin(args->trans, dp, 0); } else - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); } error = 0; @@ -1665,8 +1661,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = xfs_da_blkno(blk->bp); - xfs_da_buf_done(blk->bp); + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1681,8 +1676,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = xfs_da_blkno(blk->bp); - xfs_da_buf_done(blk->bp); + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1792,7 +1786,7 @@ xfs_attr_node_get(xfs_da_args_t *args) * If not in a transaction, we have to release all the buffers. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } @@ -1808,7 +1802,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) xfs_da_intnode_t *node; xfs_da_node_entry_t *btree; int error, i; - xfs_dabuf_t *bp; + struct xfs_buf *bp; cursor = context->cursor; cursor->initted = 1; @@ -1825,30 +1819,30 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { - node = bp->data; + node = bp->b_addr; switch (be16_to_cpu(node->hdr.info.magic)) { case XFS_DA_NODE_MAGIC: trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; break; case XFS_ATTR_LEAF_MAGIC: - leaf = bp->data; + leaf = bp->b_addr; if (cursor->hashval > be32_to_cpu(leaf->entries[ be16_to_cpu(leaf->hdr.count)-1].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } else if (cursor->hashval <= be32_to_cpu(leaf->entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } break; default: trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } } @@ -1873,7 +1867,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) context->dp->i_mount); return(XFS_ERROR(EFSCORRUPTED)); } - node = bp->data; + node = bp->b_addr; if (node->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) break; @@ -1883,7 +1877,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) XFS_ERRLEVEL_LOW, context->dp->i_mount, node); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(XFS_ERROR(EFSCORRUPTED)); } btree = node->btree; @@ -1898,10 +1892,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) } } if (i == be16_to_cpu(node->hdr.count)) { - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(0); } - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); } } ASSERT(bp != NULL); @@ -1912,24 +1906,24 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) * adding the information. */ for (;;) { - leaf = bp->data; + leaf = bp->b_addr; if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", XFS_ERRLEVEL_LOW, context->dp->i_mount, leaf); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(XFS_ERROR(EFSCORRUPTED)); } error = xfs_attr_leaf_list_int(bp, context); if (error) { - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return error; } if (context->seen_enough || leaf->hdr.info.forw == 0) break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) @@ -1941,7 +1935,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) return(XFS_ERROR(EFSCORRUPTED)); } } - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(0); } diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 7d89d800f517..d330111ca738 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -54,10 +54,10 @@ * Routines used for growing the Btree. */ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, - xfs_dabuf_t **bpp); -STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, - int freemap_index); -STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); + struct xfs_buf **bpp); +STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, + xfs_da_args_t *args, int freemap_index); +STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -71,9 +71,9 @@ STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, * Routines used for shrinking the Btree. */ STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dabuf_t *bp, int level); + struct xfs_buf *bp, int level); STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dabuf_t *bp); + struct xfs_buf *bp); STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dablk_t blkno, int blkcnt); @@ -480,7 +480,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) char *tmpbuffer; int error, i, size; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; xfs_ifork_t *ifp; trace_xfs_attr_sf_to_leaf(args); @@ -550,8 +550,6 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) error = 0; out: - if(bp) - xfs_da_buf_done(bp); kmem_free(tmpbuffer); return(error); } @@ -737,14 +735,16 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * a shortform attribute list. */ int -xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) +xfs_attr_shortform_allfit( + struct xfs_buf *bp, + struct xfs_inode *dp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_local_t *name_loc; int bytes, i; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); entry = &leaf->entries[0]; @@ -774,7 +774,10 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) * Convert a leaf attribute list to shortform attribute list */ int -xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) +xfs_attr_leaf_to_shortform( + struct xfs_buf *bp, + xfs_da_args_t *args, + int forkoff) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -791,10 +794,10 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) ASSERT(tmpbuffer != NULL); ASSERT(bp != NULL); - memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); + memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); leaf = (xfs_attr_leafblock_t *)tmpbuffer; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); + memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); /* * Clean out the prior contents of the attribute list. @@ -855,7 +858,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_da_intnode_t *node; xfs_inode_t *dp; - xfs_dabuf_t *bp1, *bp2; + struct xfs_buf *bp1, *bp2; xfs_dablk_t blkno; int error; @@ -877,10 +880,9 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) if (error) goto out; ASSERT(bp2 != NULL); - memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); - xfs_da_buf_done(bp1); + memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; - xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); /* * Set up the new root node. @@ -888,21 +890,17 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; - node = bp1->data; - leaf = bp2->data; + node = bp1->b_addr; + leaf = bp2->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); /* both on-disk, don't endian-flip twice */ node->btree[0].hashval = leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; node->btree[0].before = cpu_to_be32(blkno); node->hdr.count = cpu_to_be16(1); - xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); error = 0; out: - if (bp1) - xfs_da_buf_done(bp1); - if (bp2) - xfs_da_buf_done(bp2); return(error); } @@ -916,12 +914,15 @@ out: * or a leaf in a node attribute list. */ STATIC int -xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) +xfs_attr_leaf_create( + xfs_da_args_t *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_attr_leaf_create(args); @@ -933,7 +934,7 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) if (error) return(error); ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); @@ -947,7 +948,7 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - sizeof(xfs_attr_leaf_hdr_t)); - xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); *bpp = bp; return(0); @@ -1014,7 +1015,9 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Add a name to the leaf attribute list structure. */ int -xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_add( + struct xfs_buf *bp, + struct xfs_da_args *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1023,7 +1026,7 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) trace_xfs_attr_leaf_add(args); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(leaf->hdr.count))); @@ -1085,7 +1088,10 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) * Add a name to a leaf attribute list structure. */ STATIC int -xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) +xfs_attr_leaf_add_work( + struct xfs_buf *bp, + xfs_da_args_t *args, + int mapindex) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1096,7 +1102,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) xfs_mount_t *mp; int tmp, i; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); @@ -1110,7 +1116,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) tmp = be16_to_cpu(hdr->count) - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); memmove((char *)(entry+1), (char *)entry, tmp); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } be16_add_cpu(&hdr->count, 1); @@ -1142,7 +1148,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) args->index2++; } } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); @@ -1174,7 +1180,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) args->rmtblkno = 1; args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); @@ -1198,7 +1204,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) } } be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); return(0); } @@ -1207,7 +1213,9 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) * Garbage collect a leaf attribute list block by copying it to a new buffer. */ STATIC void -xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) +xfs_attr_leaf_compact( + struct xfs_trans *trans, + struct xfs_buf *bp) { xfs_attr_leafblock_t *leaf_s, *leaf_d; xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; @@ -1217,14 +1225,14 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) mp = trans->t_mountp; tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp)); - memset(bp->data, 0, XFS_LBSIZE(mp)); + memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); + memset(bp->b_addr, 0, XFS_LBSIZE(mp)); /* * Copy basic information */ leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->data; + leaf_d = bp->b_addr; hdr_s = &leaf_s->hdr; hdr_d = &leaf_d->hdr; hdr_d->info = hdr_s->info; /* struct copy */ @@ -1247,7 +1255,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) */ xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, be16_to_cpu(hdr_s->count), mp); - xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); kmem_free(tmpbuffer); } @@ -1279,8 +1287,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, */ ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); args = state->args; @@ -1298,8 +1306,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, tmp_blk = blk1; blk1 = blk2; blk2 = tmp_blk; - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; swap = 1; } hdr1 = &leaf1->hdr; @@ -1346,8 +1354,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, leaf2, 0, count, state->mp); - xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); } else if (count > be16_to_cpu(hdr1->count)) { /* * I assert that since all callers pass in an empty @@ -1378,8 +1386,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_attr_leaf_moveents(leaf2, 0, leaf1, be16_to_cpu(hdr1->count), count, state->mp); - xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); } /* @@ -1448,8 +1456,8 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, /* * Set up environment. */ - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; hdr1 = &leaf1->hdr; hdr2 = &leaf2->hdr; foundit = 0; @@ -1551,7 +1559,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) xfs_da_blkinfo_t *info; int count, bytes, forward, error, retval, i; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; /* * Check for the degenerate case of the block being over 50% full. @@ -1559,7 +1567,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); @@ -1622,13 +1630,13 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) count = be16_to_cpu(leaf->hdr.count); bytes = state->blocksize - (state->blocksize>>2); bytes -= be16_to_cpu(leaf->hdr.usedbytes); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); count += be16_to_cpu(leaf->hdr.count); bytes -= be16_to_cpu(leaf->hdr.usedbytes); bytes -= count * sizeof(xfs_attr_leaf_entry_t); bytes -= sizeof(xfs_attr_leaf_hdr_t); - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ } @@ -1666,7 +1674,9 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * If two leaves are 37% full, when combined they will leave 25% free. */ int -xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_remove( + struct xfs_buf *bp, + xfs_da_args_t *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1676,7 +1686,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) int tablesize, tmp, i; xfs_mount_t *mp; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; mp = args->trans->t_mountp; @@ -1769,7 +1779,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) */ memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); be16_add_cpu(&hdr->usedbytes, -entsize); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), entsize)); @@ -1777,7 +1787,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) * sizeof(xfs_attr_leaf_entry_t); memmove((char *)entry, (char *)(entry+1), tmp); be16_add_cpu(&hdr->count, -1); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); entry = &leaf->entries[be16_to_cpu(hdr->count)]; memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); @@ -1807,7 +1817,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) } else { hdr->holes = 1; /* mark as needing compaction */ } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); /* @@ -1840,8 +1850,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, mp = state->mp; ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); - drop_leaf = drop_blk->bp->data; - save_leaf = save_blk->bp->data; + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); drop_hdr = &drop_leaf->hdr; @@ -1906,7 +1916,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, kmem_free(tmpbuffer); } - xfs_da_log_buf(state->args->trans, save_blk->bp, 0, + xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->blocksize - 1); /* @@ -1934,7 +1944,9 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Don't change the args->value unless we find the attribute. */ int -xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_lookup_int( + struct xfs_buf *bp, + xfs_da_args_t *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -1945,7 +1957,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) trace_xfs_attr_leaf_lookup(args); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8)); @@ -2041,7 +2053,9 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) * list structure. */ int -xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_getvalue( + struct xfs_buf *bp, + xfs_da_args_t *args) { int valuelen; xfs_attr_leafblock_t *leaf; @@ -2049,7 +2063,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8)); @@ -2247,12 +2261,14 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * Return 0 unless leaf2 should go before leaf1. */ int -xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) { xfs_attr_leafblock_t *leaf1, *leaf2; - leaf1 = leaf1_bp->data; - leaf2 = leaf2_bp->data; + leaf1 = leaf1_bp->b_addr; + leaf2 = leaf2_bp->b_addr; ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) && (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC))); if ((be16_to_cpu(leaf1->hdr.count) > 0) && @@ -2272,11 +2288,13 @@ xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) * Pick up the last hashvalue from a leaf block. */ xfs_dahash_t -xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) +xfs_attr_leaf_lasthash( + struct xfs_buf *bp, + int *count) { xfs_attr_leafblock_t *leaf; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if (count) *count = be16_to_cpu(leaf->hdr.count); @@ -2337,7 +2355,9 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ int -xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) +xfs_attr_leaf_list_int( + struct xfs_buf *bp, + xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; xfs_attr_leafblock_t *leaf; @@ -2345,7 +2365,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) int retval, i; ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; cursor = context->cursor; cursor->initted = 1; @@ -2463,7 +2483,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; #ifdef DEBUG xfs_attr_leaf_name_local_t *name_loc; @@ -2482,7 +2502,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) } ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); @@ -2505,7 +2525,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) #endif /* DEBUG */ entry->flags &= ~XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if (args->rmtblkno) { @@ -2513,10 +2533,9 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp); /* * Commit the flag value change and start the next trans in series. @@ -2533,7 +2552,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_attr_leaf_setflag(args); @@ -2548,7 +2567,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) } ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); @@ -2556,16 +2575,15 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp); /* * Commit the flag value change and start the next trans in series. @@ -2586,7 +2604,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf1, *leaf2; xfs_attr_leaf_entry_t *entry1, *entry2; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp1, *bp2; + struct xfs_buf *bp1, *bp2; int error; #ifdef DEBUG xfs_attr_leaf_name_local_t *name_loc; @@ -2620,13 +2638,13 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) bp2 = bp1; } - leaf1 = bp1->data; + leaf1 = bp1->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); ASSERT(args->index >= 0); entry1 = &leaf1->entries[ args->index ]; - leaf2 = bp2->data; + leaf2 = bp2->b_addr; ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); ASSERT(args->index2 >= 0); @@ -2660,30 +2678,27 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); entry1->flags &= ~XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp1, + xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); - xfs_da_log_buf(args->trans, bp1, + xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } entry2->flags |= XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp2, + xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; - xfs_da_log_buf(args->trans, bp2, + xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp1); - if (bp1 != bp2) - xfs_da_buf_done(bp2); /* * Commit the flag value change and start the next trans in series. @@ -2706,7 +2721,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) { xfs_da_blkinfo_t *info; xfs_daddr_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; /* @@ -2718,20 +2733,20 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return(error); - blkno = xfs_da_blkno(bp); + blkno = XFS_BUF_ADDR(bp); /* * Invalidate the tree, even if the "tree" is only a single leaf block. * This is a depth-first traversal! */ - info = bp->data; + info = bp->b_addr; if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { error = xfs_attr_node_inactive(trans, dp, bp, 1); } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { error = xfs_attr_leaf_inactive(trans, dp, bp); } else { error = XFS_ERROR(EIO); - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); } if (error) return(error); @@ -2742,7 +2757,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); if (error) return(error); - xfs_da_binval(*trans, bp); /* remove from cache */ + xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. */ @@ -2756,34 +2771,37 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * We're doing a depth-first traversal in order to invalidate everything. */ STATIC int -xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, - int level) +xfs_attr_node_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) { xfs_da_blkinfo_t *info; xfs_da_intnode_t *node; xfs_dablk_t child_fsb; xfs_daddr_t parent_blkno, child_blkno; int error, count, i; - xfs_dabuf_t *child_bp; + struct xfs_buf *child_bp; /* * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { - xfs_da_brelse(*trans, bp); /* no locks for later trans */ + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ return(XFS_ERROR(EIO)); } - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - parent_blkno = xfs_da_blkno(bp); /* save for re-read later */ + parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */ count = be16_to_cpu(node->hdr.count); if (!count) { - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); return(0); } child_fsb = be32_to_cpu(node->btree[0].before); - xfs_da_brelse(*trans, bp); /* no locks for later trans */ + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ /* * If this is the node level just above the leaves, simply loop @@ -2803,12 +2821,12 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, return(error); if (child_bp) { /* save for re-read later */ - child_blkno = xfs_da_blkno(child_bp); + child_blkno = XFS_BUF_ADDR(child_bp); /* * Invalidate the subtree, however we have to. */ - info = child_bp->data; + info = child_bp->b_addr; if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { error = xfs_attr_node_inactive(trans, dp, child_bp, level+1); @@ -2817,7 +2835,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, child_bp); } else { error = XFS_ERROR(EIO); - xfs_da_brelse(*trans, child_bp); + xfs_trans_brelse(*trans, child_bp); } if (error) return(error); @@ -2830,7 +2848,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, &child_bp, XFS_ATTR_FORK); if (error) return(error); - xfs_da_binval(*trans, child_bp); + xfs_trans_binval(*trans, child_bp); } /* @@ -2843,7 +2861,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); } /* * Atomically commit the whole invalidate stuff. @@ -2863,7 +2881,10 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, * caught holding something that the logging code wants to flush to disk. */ STATIC int -xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) +xfs_attr_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -2871,7 +2892,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) xfs_attr_inactive_list_t *list, *lp; int error, count, size, tmp, i; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); /* @@ -2892,7 +2913,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) * If there are no "remote" values, we're done. */ if (count == 0) { - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); return(0); } @@ -2919,7 +2940,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) } } } - xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */ + xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ /* * Invalidate each of the "remote" value extents. diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 9c7d22fdcf4d..dea17722945e 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -31,7 +31,6 @@ struct attrlist; struct attrlist_cursor_kern; struct xfs_attr_list_context; -struct xfs_dabuf; struct xfs_da_args; struct xfs_da_state; struct xfs_da_state_blk; @@ -215,7 +214,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_list(struct xfs_attr_list_context *context); -int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp); +int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); @@ -223,7 +222,7 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); * Internal routines when attribute fork size == XFS_LBSIZE(mp). */ int xfs_attr_leaf_to_node(struct xfs_da_args *args); -int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp, +int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, struct xfs_da_args *args, int forkoff); int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); @@ -235,14 +234,14 @@ int xfs_attr_leaf_flipflags(xfs_da_args_t *args); int xfs_attr_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); -int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf, +int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); -int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args); -int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer, +int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer, +int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_list_int(struct xfs_dabuf *bp, +int xfs_attr_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* @@ -257,9 +256,9 @@ int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); /* * Utility routines. */ -xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count); -int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 58b815ec8c91..848ffa77707b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5517,7 +5517,7 @@ xfs_getbmap( if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; - fixlen = XFS_MAXIOFFSET(mp); + fixlen = mp->m_super->s_maxbytes; } else { prealloced = 0; fixlen = XFS_ISIZE(ip); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 269b35c084da..d7a9dd735e1e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -164,14 +164,49 @@ xfs_buf_stale( ASSERT(atomic_read(&bp->b_hold) >= 1); } +static int +xfs_buf_get_maps( + struct xfs_buf *bp, + int map_count) +{ + ASSERT(bp->b_maps == NULL); + bp->b_map_count = map_count; + + if (map_count == 1) { + bp->b_maps = &bp->b_map; + return 0; + } + + bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), + KM_NOFS); + if (!bp->b_maps) + return ENOMEM; + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +static void +xfs_buf_free_maps( + struct xfs_buf *bp) +{ + if (bp->b_maps != &bp->b_map) { + kmem_free(bp->b_maps); + bp->b_maps = NULL; + } +} + struct xfs_buf * -xfs_buf_alloc( +_xfs_buf_alloc( struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags) { struct xfs_buf *bp; + int error; + int i; bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) @@ -192,16 +227,28 @@ xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; + bp->b_flags = flags; /* * Set length and io_length to the same value initially. * I/O routines should use io_length, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - bp->b_length = numblks; - bp->b_io_length = numblks; - bp->b_flags = flags; - bp->b_bn = blkno; + error = xfs_buf_get_maps(bp, nmaps); + if (error) { + kmem_zone_free(xfs_buf_zone, bp); + return NULL; + } + + bp->b_bn = map[0].bm_bn; + bp->b_length = 0; + for (i = 0; i < nmaps; i++) { + bp->b_maps[i].bm_bn = map[i].bm_bn; + bp->b_maps[i].bm_len = map[i].bm_len; + bp->b_length += map[i].bm_len; + } + bp->b_io_length = bp->b_length; + atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -280,6 +327,7 @@ xfs_buf_free( } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); + xfs_buf_free_maps(bp); kmem_zone_free(xfs_buf_zone, bp); } @@ -327,8 +375,9 @@ xfs_buf_allocate_memory( } use_alloc_page: - start = BBTOB(bp->b_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1) + >> PAGE_SHIFT; page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) @@ -425,8 +474,8 @@ _xfs_buf_map_pages( xfs_buf_t * _xfs_buf_find( struct xfs_buftarg *btp, - xfs_daddr_t blkno, - size_t numblks, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, xfs_buf_t *new_bp) { @@ -435,7 +484,12 @@ _xfs_buf_find( struct rb_node **rbp; struct rb_node *parent; xfs_buf_t *bp; + xfs_daddr_t blkno = map[0].bm_bn; + int numblks = 0; + int i; + for (i = 0; i < nmaps; i++) + numblks += map[i].bm_len; numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ @@ -527,31 +581,31 @@ found: * more hits than misses. */ struct xfs_buf * -xfs_buf_get( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks, +xfs_buf_get_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, blkno, numblks, flags, NULL); + bp = _xfs_buf_find(target, map, nmaps, flags, NULL); if (likely(bp)) goto found; - new_bp = xfs_buf_alloc(target, blkno, numblks, flags); + new_bp = _xfs_buf_alloc(target, map, nmaps, flags); if (unlikely(!new_bp)) return NULL; error = xfs_buf_allocate_memory(new_bp, flags); if (error) { - kmem_zone_free(xfs_buf_zone, new_bp); + xfs_buf_free(new_bp); return NULL; } - bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp); + bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); if (!bp) { xfs_buf_free(new_bp); return NULL; @@ -560,8 +614,6 @@ xfs_buf_get( if (bp != new_bp) xfs_buf_free(new_bp); - bp->b_io_length = bp->b_length; - found: if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); @@ -584,7 +636,7 @@ _xfs_buf_read( xfs_buf_flags_t flags) { ASSERT(!(flags & XBF_WRITE)); - ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); + ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); @@ -596,17 +648,17 @@ _xfs_buf_read( } xfs_buf_t * -xfs_buf_read( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks, +xfs_buf_read_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags) { - xfs_buf_t *bp; + struct xfs_buf *bp; flags |= XBF_READ; - bp = xfs_buf_get(target, blkno, numblks, flags); + bp = xfs_buf_get_map(target, map, nmaps, flags); if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); @@ -634,15 +686,15 @@ xfs_buf_read( * safe manner. */ void -xfs_buf_readahead( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks) +xfs_buf_readahead_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps) { if (bdi_read_congested(target->bt_bdi)) return; - xfs_buf_read(target, blkno, numblks, + xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); } @@ -665,8 +717,10 @@ xfs_buf_read_uncached( return NULL; /* set up the buffer for a read IO */ - XFS_BUF_SET_ADDR(bp, daddr); - XFS_BUF_READ(bp); + ASSERT(bp->b_map_count == 1); + bp->b_bn = daddr; + bp->b_maps[0].bm_bn = daddr; + bp->b_flags |= XBF_READ; xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); @@ -694,7 +748,11 @@ xfs_buf_set_empty( bp->b_addr = NULL; bp->b_length = numblks; bp->b_io_length = numblks; + + ASSERT(bp->b_map_count == 1); bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_len = bp->b_length; } static inline struct page * @@ -758,9 +816,10 @@ xfs_buf_get_uncached( { unsigned long page_count; int error, i; - xfs_buf_t *bp; + struct xfs_buf *bp; + DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0); + bp = _xfs_buf_alloc(target, &map, 1, 0); if (unlikely(bp == NULL)) goto fail; @@ -791,6 +850,7 @@ xfs_buf_get_uncached( __free_page(bp->b_pages[i]); _xfs_buf_free_pages(bp); fail_free_buf: + xfs_buf_free_maps(bp); kmem_zone_free(xfs_buf_zone, bp); fail: return NULL; @@ -1144,36 +1204,39 @@ xfs_buf_bio_end_io( bio_put(bio); } -STATIC void -_xfs_buf_ioapply( - xfs_buf_t *bp) +static void +xfs_buf_ioapply_map( + struct xfs_buf *bp, + int map, + int *buf_offset, + int *count, + int rw) { - int rw, map_i, total_nr_pages, nr_pages; - struct bio *bio; - int offset = bp->b_offset; - int size = BBTOB(bp->b_io_length); - sector_t sector = bp->b_bn; + int page_index; + int total_nr_pages = bp->b_page_count; + int nr_pages; + struct bio *bio; + sector_t sector = bp->b_maps[map].bm_bn; + int size; + int offset; total_nr_pages = bp->b_page_count; - map_i = 0; - if (bp->b_flags & XBF_WRITE) { - if (bp->b_flags & XBF_SYNCIO) - rw = WRITE_SYNC; - else - rw = WRITE; - if (bp->b_flags & XBF_FUA) - rw |= REQ_FUA; - if (bp->b_flags & XBF_FLUSH) - rw |= REQ_FLUSH; - } else if (bp->b_flags & XBF_READ_AHEAD) { - rw = READA; - } else { - rw = READ; + /* skip the pages in the buffer before the start offset */ + page_index = 0; + offset = *buf_offset; + while (offset >= PAGE_SIZE) { + page_index++; + offset -= PAGE_SIZE; } - /* we only use the buffer cache for meta-data */ - rw |= REQ_META; + /* + * Limit the IO size to the length of the current vector, and update the + * remaining IO count for the next time around. + */ + size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); + *count -= size; + *buf_offset += size; next_chunk: atomic_inc(&bp->b_io_remaining); @@ -1188,13 +1251,14 @@ next_chunk: bio->bi_private = bp; - for (; size && nr_pages; nr_pages--, map_i++) { + for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; if (nbytes > size) nbytes = size; - rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); + rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, + offset); if (rbytes < nbytes) break; @@ -1216,6 +1280,54 @@ next_chunk: xfs_buf_ioerror(bp, EIO); bio_put(bio); } + +} + +STATIC void +_xfs_buf_ioapply( + struct xfs_buf *bp) +{ + struct blk_plug plug; + int rw; + int offset; + int size; + int i; + + if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_SYNCIO) + rw = WRITE_SYNC; + else + rw = WRITE; + if (bp->b_flags & XBF_FUA) + rw |= REQ_FUA; + if (bp->b_flags & XBF_FLUSH) + rw |= REQ_FLUSH; + } else if (bp->b_flags & XBF_READ_AHEAD) { + rw = READA; + } else { + rw = READ; + } + + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + + /* + * Walk all the vectors issuing IO on them. Set up the initial offset + * into the buffer and the desired IO size before we start - + * _xfs_buf_ioapply_vec() will modify them appropriately for each + * subsequent call. + */ + offset = bp->b_offset; + size = BBTOB(bp->b_io_length); + blk_start_plug(&plug); + for (i = 0; i < bp->b_map_count; i++) { + xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + if (bp->b_error) + break; + if (size <= 0) + break; /* all done */ + } + blk_finish_plug(&plug); } void @@ -1557,7 +1669,7 @@ xfs_buf_cmp( struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; - diff = ap->b_bn - bp->b_bn; + diff = ap->b_map.bm_bn - bp->b_map.bm_bn; if (diff < 0) return -1; if (diff > 0) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 79344c48008e..d03b73b9604e 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -58,6 +58,7 @@ typedef enum { #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ typedef unsigned int xfs_buf_flags_t; @@ -75,7 +76,8 @@ typedef unsigned int xfs_buf_flags_t; { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ - { _XBF_DELWRI_Q, "DELWRI_Q" } + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_COMPOUND, "COMPOUND" } typedef struct xfs_buftarg { dev_t bt_dev; @@ -98,6 +100,14 @@ typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); #define XB_PAGES 2 +struct xfs_buf_map { + xfs_daddr_t bm_bn; /* block number for I/O */ + int bm_len; /* size of I/O */ +}; + +#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ + struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -107,7 +117,7 @@ typedef struct xfs_buf { * fast-path on locking. */ struct rb_node b_rbnode; /* rbtree node */ - xfs_daddr_t b_bn; /* block number for I/O */ + xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ @@ -127,12 +137,16 @@ typedef struct xfs_buf { struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ + struct xfs_buf_map *b_maps; /* compound buffer map */ + struct xfs_buf_map b_map; /* inline compound buffer map */ + int b_map_count; int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ + #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif @@ -140,22 +154,78 @@ typedef struct xfs_buf { /* Finding and Reading Buffers */ -struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags, - struct xfs_buf *new_bp); -#define xfs_incore(buftarg,blkno,len,lockit) \ - _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) - -struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); -void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks); +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, struct xfs_buf *new_bp); + +static inline struct xfs_buf * +xfs_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_find(target, &map, 1, flags, NULL); +} + +struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); + +static inline struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_alloc(target, &map, 1, flags); +} + +struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); +void xfs_buf_readahead_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps); + +static inline struct xfs_buf * +xfs_buf_get( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_get_map(target, &map, 1, flags); +} + +static inline struct xfs_buf * +xfs_buf_read( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_read_map(target, &map, 1, flags); +} + +static inline void +xfs_buf_readahead( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_readahead_map(target, &map, 1); +} struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); -struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); @@ -232,8 +302,18 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_ADDR(bp) ((bp)->b_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) +/* + * These macros use the IO block map rather than b_bn. b_bn is now really + * just for the buffer cache index for cached buffers. As IO does not use b_bn + * anymore, uncached buffers do not use b_bn at all and hence must modify the IO + * map directly. Uncached buffers are not allowed to be discontiguous, so this + * is safe to do. + * + * In future, uncached buffers will pass the block number directly to the io + * request function and hence these macros will go away at that point. + */ +#define XFS_BUF_ADDR(bp) ((bp)->b_map.bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_map.bm_bn = (xfs_daddr_t)(bno)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index d9e451115f98..a8d0ed911196 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -153,33 +153,25 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); * If the XFS_BLI_STALE flag has been set, then log nothing. */ STATIC uint -xfs_buf_item_size( - struct xfs_log_item *lip) +xfs_buf_item_size_segment( + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp) { - struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; uint nvecs; int next_bit; int last_bit; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - if (bip->bli_flags & XFS_BLI_STALE) { - /* - * The buffer is stale, so all we need to log - * is the buf log format structure with the - * cancel flag in it. - */ - trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - return 1; - } + last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (last_bit == -1) + return 0; + + /* + * initial count for a dirty buffer is 2 vectors - the format structure + * and the first dirty region. + */ + nvecs = 2; - ASSERT(bip->bli_flags & XFS_BLI_LOGGED); - nvecs = 1; - last_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); - ASSERT(last_bit != -1); - nvecs++; while (last_bit != -1) { /* * This takes the bit number to start looking from and @@ -187,16 +179,15 @@ xfs_buf_item_size( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + last_bit + 1); /* * If we run out of bits, leave the loop, * else if we find a new set of bits bump the number of vecs, * else keep scanning the current set of bits. */ if (next_bit == -1) { - last_bit = -1; + break; } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; @@ -210,22 +201,73 @@ xfs_buf_item_size( } } - trace_xfs_buf_item_size(bip); return nvecs; } /* - * This is called to fill in the vector of log iovecs for the - * given log buf item. It fills the first entry with a buf log - * format structure, and the rest point to contiguous chunks - * within the buffer. + * This returns the number of log iovecs needed to log the given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. + * + * Discontiguous buffers need a format structure per region that that is being + * logged. This makes the changes in the buffer appear to log recovery as though + * they came from separate buffers, just like would occur if multiple buffers + * were used instead of a single discontiguous buffer. This enables + * discontiguous buffers to be in-memory constructs, completely transparent to + * what ends up on disk. + * + * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log + * format structures. */ -STATIC void -xfs_buf_item_format( - struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) +STATIC uint +xfs_buf_item_size( + struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); + uint nvecs; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + return bip->bli_format_count; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + + /* + * the vector count is based on the number of buffer vectors we have + * dirty bits in. This will only be greater than one when we have a + * compound buffer with more than one segment dirty. Hence for compound + * buffers we need to track which segment the dirty bits correspond to, + * and when we move from one segment to the next increment the vector + * count for the extra buf log format structure that will need to be + * written. + */ + nvecs = 0; + for (i = 0; i < bip->bli_format_count; i++) { + nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]); + } + + trace_xfs_buf_item_size(bip); + return nvecs; +} + +static struct xfs_log_iovec * +xfs_buf_item_format_segment( + struct xfs_buf_log_item *bip, + struct xfs_log_iovec *vecp, + uint offset, + struct xfs_buf_log_format *blfp) +{ struct xfs_buf *bp = bip->bli_buf; uint base_size; uint nvecs; @@ -235,40 +277,22 @@ xfs_buf_item_format( uint nbits; uint buffer_offset; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || - (bip->bli_flags & XFS_BLI_STALE)); + /* copy the flags across from the base format item */ + blfp->blf_flags = bip->bli_format.blf_flags; /* - * The size of the base structure is the size of the - * declared structure plus the space for the extra words - * of the bitmap. We subtract one from the map size, because - * the first element of the bitmap is accounted for in the - * size of the base structure. + * Base size is the actual size of the ondisk structure - it reflects + * the actual size of the dirty bitmap rather than the size of the in + * memory structure. */ - base_size = - (uint)(sizeof(xfs_buf_log_format_t) + - ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); - vecp->i_addr = &bip->bli_format; + base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + vecp->i_addr = blfp; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; vecp++; nvecs = 1; - /* - * If it is an inode buffer, transfer the in-memory state to the - * format flags and clear the in-memory state. We do not transfer - * this state if the inode buffer allocation has not yet been committed - * to the log as setting the XFS_BLI_INODE_BUF flag will prevent - * correct replay of the inode allocation. - */ - if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - xfs_log_item_in_current_chkpt(lip))) - bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; - bip->bli_flags &= ~XFS_BLI_INODE_BUF; - } - if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -276,16 +300,15 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - bip->bli_format.blf_size = nvecs; - return; + ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); + blfp->blf_size = nvecs; + return vecp; } /* * Fill in an iovec for each set of contiguous chunks. */ - first_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); ASSERT(first_bit != -1); last_bit = first_bit; nbits = 1; @@ -296,9 +319,8 @@ xfs_buf_item_format( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - (uint)last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)last_bit + 1); /* * If we run out of bits fill in the last iovec and get * out of the loop. @@ -309,14 +331,14 @@ xfs_buf_item_format( * keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; @@ -325,14 +347,17 @@ xfs_buf_item_format( first_bit = next_bit; last_bit = next_bit; nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + } else if (xfs_buf_offset(bp, offset + + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + + (last_bit << XFS_BLF_SHIFT)) + XFS_BLF_CHUNK)) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* You would think we need to bump the nvecs here too, but we do not +/* + * You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary * split here * nvecs++; @@ -347,6 +372,48 @@ xfs_buf_item_format( } } bip->bli_format.blf_size = nvecs; + return vecp; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint offset = 0; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + for (i = 0; i < bip->bli_format_count; i++) { + vecp = xfs_buf_item_format_segment(bip, vecp, offset, + &bip->bli_formats[i]); + offset += bp->b_maps[i].bm_len; + } /* * Check to make sure everything is consistent. @@ -622,6 +689,35 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_committing = xfs_buf_item_committing }; +STATIC int +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->bli_format; + return 0; + } + + bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), + KM_SLEEP); + if (!bip->bli_formats) + return ENOMEM; + return 0; +} + +STATIC void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->bli_format) { + kmem_free(bip->bli_formats); + bip->bli_formats = NULL; + } +} /* * Allocate a new buf log item to go with the given buffer. @@ -639,6 +735,8 @@ xfs_buf_item_init( xfs_buf_log_item_t *bip; int chunks; int map_size; + int error; + int i; /* * Check to see if there is already a buf log item for @@ -650,25 +748,33 @@ xfs_buf_item_init( if (lip != NULL && lip->li_type == XFS_LI_BUF) return; - /* - * chunks is the number of XFS_BLF_CHUNK size pieces - * the buffer can be divided into. Make sure not to - * truncate any pieces. map_size is the size of the - * bitmap needed to describe the chunks of the buffer. - */ - chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >> - XFS_BLF_SHIFT); - map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); - - bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, - KM_SLEEP); + bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; xfs_buf_hold(bp); - bip->bli_format.blf_type = XFS_LI_BUF; - bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)bp->b_length; - bip->bli_format.blf_map_size = map_size; + + /* + * chunks is the number of XFS_BLF_CHUNK size pieces the buffer + * can be divided into. Make sure not to truncate any pieces. + * map_size is the size of the bitmap needed to describe the + * chunks of the buffer. + * + * Discontiguous buffer support follows the layout of the underlying + * buffer. This makes the implementation as simple as possible. + */ + error = xfs_buf_item_get_format(bip, bp->b_map_count); + ASSERT(error == 0); + + for (i = 0; i < bip->bli_format_count; i++) { + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + + bip->bli_formats[i].blf_type = XFS_LI_BUF; + bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; + bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; + bip->bli_formats[i].blf_map_size = map_size; + } #ifdef XFS_TRANS_DEBUG /* @@ -699,10 +805,11 @@ xfs_buf_item_init( * item's bitmap. */ void -xfs_buf_item_log( - xfs_buf_log_item_t *bip, +xfs_buf_item_log_segment( + struct xfs_buf_log_item *bip, uint first, - uint last) + uint last, + uint *map) { uint first_bit; uint last_bit; @@ -715,12 +822,6 @@ xfs_buf_item_log( uint mask; /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - - /* * Convert byte offsets to bit numbers. */ first_bit = first >> XFS_BLF_SHIFT; @@ -736,7 +837,7 @@ xfs_buf_item_log( * to set a bit in. */ word_num = first_bit >> BIT_TO_WORD_SHIFT; - wordp = &(bip->bli_format.blf_data_map[word_num]); + wordp = &map[word_num]; /* * Calculate the starting bit in the first word. @@ -783,6 +884,51 @@ xfs_buf_item_log( xfs_buf_item_log_debug(bip, first, last); } +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + int i; + uint start; + uint end; + struct xfs_buf *bp = bip->bli_buf; + + /* + * Mark the item as having some dirty data for + * quick reference in xfs_buf_item_dirty. + */ + bip->bli_flags |= XFS_BLI_DIRTY; + + /* + * walk each buffer segment and mark them dirty appropriately. + */ + start = 0; + for (i = 0; i < bip->bli_format_count; i++) { + if (start > last) + break; + end = start + BBTOB(bp->b_maps[i].bm_len); + if (first > end) { + start += BBTOB(bp->b_maps[i].bm_len); + continue; + } + if (first < start) + first = start; + if (end > last) + end = last; + + xfs_buf_item_log_segment(bip, first, end, + &bip->bli_formats[i].blf_data_map[0]); + + start += bp->b_maps[i].bm_len; + } +} + /* * Return 1 if the buffer has some data that has been logged (at any @@ -804,6 +950,7 @@ xfs_buf_item_free( kmem_free(bip->bli_logged); #endif /* XFS_TRANS_DEBUG */ + xfs_buf_item_free_format(bip); kmem_zone_free(xfs_buf_item_zone, bip); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index b6ecd2061e7c..6850f49f4af3 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -21,23 +21,6 @@ extern kmem_zone_t *xfs_buf_item_zone; /* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. - */ -typedef struct xfs_buf_log_format { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - __int64_t blf_blkno; /* starting blkno of this buf */ - unsigned int blf_map_size; /* size of data bitmap in words */ - unsigned int blf_data_map[1];/* variable size bitmap of */ - /* regions of buffer in this item */ -} xfs_buf_log_format_t; - -/* * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ @@ -61,6 +44,23 @@ typedef struct xfs_buf_log_format { #define NBWORD (NBBY * sizeof(unsigned int)) /* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + */ +#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* used size of data bitmap in words */ + unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ +} xfs_buf_log_format_t; + +/* * buf log item flags */ #define XFS_BLI_HOLD 0x01 @@ -102,7 +102,9 @@ typedef struct xfs_buf_log_item { char *bli_orig; /* original buffer copy */ char *bli_logged; /* bytes logged (bitmap) */ #endif - xfs_buf_log_format_t bli_format; /* in-log header */ + int bli_format_count; /* count of headers */ + struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ + struct xfs_buf_log_format bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 015b946c5808..7bfb7dd334fc 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -83,9 +83,9 @@ STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, /* * Utility routines. */ -STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); -STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); -STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps); +STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count); +STATIC int xfs_da_node_order(struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp); STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); @@ -100,10 +100,10 @@ STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); */ int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - xfs_dabuf_t **bpp, int whichfork) + struct xfs_buf **bpp, int whichfork) { xfs_da_intnode_t *node; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; xfs_trans_t *tp; @@ -114,7 +114,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, if (error) return(error); ASSERT(bp != NULL); - node = bp->data; + node = bp->b_addr; node->hdr.info.forw = 0; node->hdr.info.back = 0; node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); @@ -122,7 +122,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, node->hdr.count = 0; node->hdr.level = cpu_to_be16(level); - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); *bpp = bp; @@ -138,7 +138,7 @@ xfs_da_split(xfs_da_state_t *state) { xfs_da_state_blk_t *oldblk, *newblk, *addblk; xfs_da_intnode_t *node; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int max, action, error, i; trace_xfs_da_split(state->args); @@ -203,7 +203,6 @@ xfs_da_split(xfs_da_state_t *state) case XFS_DA_NODE_MAGIC: error = xfs_da_node_split(state, oldblk, newblk, addblk, max - i, &action); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; if (error) return(error); /* GROT: dir is inconsistent */ @@ -221,13 +220,6 @@ xfs_da_split(xfs_da_state_t *state) * Update the btree to show the new hashval for this child. */ xfs_da_fixhashpath(state, &state->path); - /* - * If we won't need this block again, it's getting dropped - * from the active path by the loop control, so we need - * to mark it done now. - */ - if (i > 0 || !addblk) - xfs_da_buf_done(oldblk->bp); } if (!addblk) return(0); @@ -239,8 +231,6 @@ xfs_da_split(xfs_da_state_t *state) oldblk = &state->path.blk[0]; error = xfs_da_root_split(state, oldblk, addblk); if (error) { - xfs_da_buf_done(oldblk->bp); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; return(error); /* GROT: dir is inconsistent */ } @@ -252,7 +242,7 @@ xfs_da_split(xfs_da_state_t *state) * and the original block 0 could be at any position in the list. */ - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { bp = addblk->bp; @@ -260,13 +250,13 @@ xfs_da_split(xfs_da_state_t *state) ASSERT(state->extravalid); bp = state->extrablk.bp; } - node = bp->data; + node = bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_da_log_buf(state->args->trans, bp, + xfs_trans_log_buf(state->args->trans, bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { bp = addblk->bp; @@ -274,14 +264,12 @@ xfs_da_split(xfs_da_state_t *state) ASSERT(state->extravalid); bp = state->extrablk.bp; } - node = bp->data; + node = bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_da_log_buf(state->args->trans, bp, + xfs_trans_log_buf(state->args->trans, bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } - xfs_da_buf_done(oldblk->bp); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; return(0); } @@ -298,7 +286,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_intnode_t *node, *oldroot; xfs_da_args_t *args; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error, size; xfs_inode_t *dp; xfs_trans_t *tp; @@ -323,8 +311,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, if (error) return(error); ASSERT(bp != NULL); - node = bp->data; - oldroot = blk1->bp->data; + node = bp->b_addr; + oldroot = blk1->bp->b_addr; if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - (char *)oldroot); @@ -335,8 +323,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, (char *)leaf); } memcpy(node, oldroot, size); - xfs_da_log_buf(tp, bp, 0, size - 1); - xfs_da_buf_done(blk1->bp); + xfs_trans_log_buf(tp, bp, 0, size - 1); blk1->bp = bp; blk1->blkno = blkno; @@ -348,7 +335,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); if (error) return(error); - node = bp->data; + node = bp->b_addr; node->btree[0].hashval = cpu_to_be32(blk1->hashval); node->btree[0].before = cpu_to_be32(blk1->blkno); node->btree[1].hashval = cpu_to_be32(blk2->hashval); @@ -365,10 +352,9 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, #endif /* Header is already logged by xfs_da_node_create */ - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, node->btree, sizeof(xfs_da_node_entry_t) * 2)); - xfs_da_buf_done(bp); return(0); } @@ -389,7 +375,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, trace_xfs_da_node_split(state->args); - node = oldblk->bp->data; + node = oldblk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); /* @@ -436,7 +422,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * * If we had double-split op below us, then add the extra block too. */ - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (oldblk->index <= be16_to_cpu(node->hdr.count)) { oldblk->index++; xfs_da_node_add(state, oldblk, addblk); @@ -477,8 +463,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, trace_xfs_da_node_rebalance(state->args); - node1 = blk1->bp->data; - node2 = blk2->bp->data; + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; /* * Figure out how many entries need to move, and in which direction. * Swap the nodes around if that makes it simpler. @@ -532,7 +518,7 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; memcpy(btree_d, btree_s, tmp); be16_add_cpu(&node1->hdr.count, count); - xfs_da_log_buf(tp, blk1->bp, + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); /* @@ -549,9 +535,9 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Log header of node 1 and all current bits of node 2. */ - xfs_da_log_buf(tp, blk1->bp, + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); - xfs_da_log_buf(tp, blk2->bp, + xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, sizeof(node2->hdr) + sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); @@ -560,8 +546,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Record the last hashval from each block for upward propagation. * (note: don't use the swapped node pointers) */ - node1 = blk1->bp->data; - node2 = blk2->bp->data; + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); @@ -587,7 +573,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, trace_xfs_da_node_add(state->args); - node = oldblk->bp->data; + node = oldblk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); ASSERT(newblk->blkno != 0); @@ -606,10 +592,10 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, } btree->hashval = cpu_to_be32(newblk->hashval); btree->before = cpu_to_be32(newblk->blkno); - xfs_da_log_buf(state->args->trans, oldblk->bp, + xfs_trans_log_buf(state->args->trans, oldblk->bp, XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); be16_add_cpu(&node->hdr.count, 1); - xfs_da_log_buf(state->args->trans, oldblk->bp, + xfs_trans_log_buf(state->args->trans, oldblk->bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); /* @@ -735,7 +721,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_da_intnode_t *oldroot; xfs_da_args_t *args; xfs_dablk_t child; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_da_root_join(state->args); @@ -743,7 +729,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) args = state->args; ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); - oldroot = root_blk->bp->data; + oldroot = root_blk->bp->b_addr; ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT(!oldroot->hdr.info.forw); ASSERT(!oldroot->hdr.info.back); @@ -765,11 +751,11 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) if (error) return(error); ASSERT(bp != NULL); - xfs_da_blkinfo_onlychild_validate(bp->data, + xfs_da_blkinfo_onlychild_validate(bp->b_addr, be16_to_cpu(oldroot->hdr.level)); - memcpy(root_blk->bp->data, bp->data, state->blocksize); - xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); } @@ -791,7 +777,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) xfs_da_blkinfo_t *info; int count, forward, error, retval, i; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; /* * Check for the degenerate case of the block being over 50% full. @@ -799,7 +785,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); node = (xfs_da_intnode_t *)info; count = be16_to_cpu(node->hdr.count); @@ -859,10 +845,10 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) count = state->node_ents; count -= state->node_ents >> 2; count -= be16_to_cpu(node->hdr.count); - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); count -= be16_to_cpu(node->hdr.count); - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); if (count >= 0) break; /* fits with at least 25% to spare */ } @@ -934,14 +920,14 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) break; } for (blk--, level--; level >= 0; blk--, level--) { - node = blk->bp->data; + node = blk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); btree = &node->btree[ blk->index ]; if (be32_to_cpu(btree->hashval) == lasthash) break; blk->hashval = lasthash; btree->hashval = cpu_to_be32(lasthash); - xfs_da_log_buf(state->args->trans, blk->bp, + xfs_trans_log_buf(state->args->trans, blk->bp, XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); @@ -960,7 +946,7 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) trace_xfs_da_node_remove(state->args); - node = drop_blk->bp->data; + node = drop_blk->bp->b_addr; ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); ASSERT(drop_blk->index >= 0); @@ -972,15 +958,15 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); memmove(btree, btree + 1, tmp); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, btree, tmp)); btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; } memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); be16_add_cpu(&node->hdr.count, -1); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); /* @@ -1005,8 +991,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_node_unbalance(state->args); - drop_node = drop_blk->bp->data; - save_node = save_blk->bp->data; + drop_node = drop_blk->bp->b_addr; + save_node = save_blk->bp->b_addr; ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); tp = state->args->trans; @@ -1023,13 +1009,13 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); memmove(btree, &save_node->btree[0], tmp); btree = &save_node->btree[0]; - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, btree, (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * sizeof(xfs_da_node_entry_t))); } else { btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, btree, be16_to_cpu(drop_node->hdr.count) * sizeof(xfs_da_node_entry_t))); @@ -1042,7 +1028,7 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, memcpy(btree, &drop_node->btree[0], tmp); be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, sizeof(save_node->hdr))); @@ -1100,7 +1086,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) state->path.active--; return(error); } - curr = blk->bp->data; + curr = blk->bp->b_addr; blk->magic = be16_to_cpu(curr->magic); ASSERT(blk->magic == XFS_DA_NODE_MAGIC || blk->magic == XFS_DIR2_LEAFN_MAGIC || @@ -1110,7 +1096,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Search an intermediate node for a match. */ if (blk->magic == XFS_DA_NODE_MAGIC) { - node = blk->bp->data; + node = blk->bp->b_addr; max = be16_to_cpu(node->hdr.count); blk->hashval = be32_to_cpu(node->btree[max-1].hashval); @@ -1216,15 +1202,15 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; xfs_da_args_t *args; int before=0, error; - xfs_dabuf_t *bp; + struct xfs_buf *bp; /* * Set up environment. */ args = state->args; ASSERT(args != NULL); - old_info = old_blk->bp->data; - new_info = new_blk->bp->data; + old_info = old_blk->bp->b_addr; + new_info = new_blk->bp->b_addr; ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || old_blk->magic == XFS_DIR2_LEAFN_MAGIC || old_blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1261,12 +1247,11 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); tmp_info->forw = cpu_to_be32(new_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); - xfs_da_buf_done(bp); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); } old_info->back = cpu_to_be32(new_blk->blkno); } else { @@ -1283,18 +1268,17 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno); tmp_info->back = cpu_to_be32(new_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); - xfs_da_buf_done(bp); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); } old_info->forw = cpu_to_be32(new_blk->blkno); } - xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); - xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); return(0); } @@ -1302,12 +1286,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, * Compare two intermediate nodes for "order". */ STATIC int -xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) +xfs_da_node_order( + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) { xfs_da_intnode_t *node1, *node2; - node1 = node1_bp->data; - node2 = node2_bp->data; + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) && node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && @@ -1324,11 +1310,13 @@ xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) * Pick up the last hashvalue from an intermediate node. */ STATIC uint -xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) +xfs_da_node_lasthash( + struct xfs_buf *bp, + int *count) { xfs_da_intnode_t *node; - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if (count) *count = be16_to_cpu(node->hdr.count); @@ -1346,7 +1334,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, { xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; xfs_da_args_t *args; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; /* @@ -1354,8 +1342,8 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, */ args = state->args; ASSERT(args != NULL); - save_info = save_blk->bp->data; - drop_info = drop_blk->bp->data; + save_info = save_blk->bp->b_addr; + drop_info = drop_blk->bp->b_addr; ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || save_blk->magic == XFS_DIR2_LEAFN_MAGIC || save_blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1380,13 +1368,12 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno); tmp_info->forw = cpu_to_be32(save_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info) - 1); - xfs_da_buf_done(bp); } } else { trace_xfs_da_unlink_forward(args); @@ -1398,17 +1385,16 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno); tmp_info->back = cpu_to_be32(save_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info) - 1); - xfs_da_buf_done(bp); } } - xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); return(0); } @@ -1443,7 +1429,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { ASSERT(blk->bp != NULL); - node = blk->bp->data; + node = blk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { blk->index++; @@ -1471,7 +1457,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * (if it's dirty, trans won't actually let go) */ if (release) - xfs_da_brelse(args->trans, blk->bp); + xfs_trans_brelse(args->trans, blk->bp); /* * Read the next child block. @@ -1482,7 +1468,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, if (error) return(error); ASSERT(blk->bp != NULL); - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); @@ -1702,11 +1688,13 @@ xfs_da_grow_inode( * a bmap btree split to do that. */ STATIC int -xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, - xfs_dabuf_t **dead_bufp) +xfs_da_swap_lastblock( + xfs_da_args_t *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) { xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; - xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf; + struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf; xfs_fileoff_t lastoff; xfs_inode_t *ip; xfs_trans_t *tp; @@ -1744,9 +1732,9 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize); - xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); - dead_info = dead_buf->data; + memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize); + xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + dead_info = dead_buf->b_addr; /* * Get values from the moved block. */ @@ -1767,7 +1755,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, if ((sib_blkno = be32_to_cpu(dead_info->back))) { if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) goto done; - sib_info = sib_buf->data; + sib_info = sib_buf->b_addr; if (unlikely( be32_to_cpu(sib_info->forw) != last_blkno || sib_info->magic != dead_info->magic)) { @@ -1777,10 +1765,9 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, goto done; } sib_info->forw = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, sib_buf, + xfs_trans_log_buf(tp, sib_buf, XFS_DA_LOGRANGE(sib_info, &sib_info->forw, sizeof(sib_info->forw))); - xfs_da_buf_done(sib_buf); sib_buf = NULL; } /* @@ -1789,7 +1776,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, if ((sib_blkno = be32_to_cpu(dead_info->forw))) { if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) goto done; - sib_info = sib_buf->data; + sib_info = sib_buf->b_addr; if (unlikely( be32_to_cpu(sib_info->back) != last_blkno || sib_info->magic != dead_info->magic)) { @@ -1799,10 +1786,9 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, goto done; } sib_info->back = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, sib_buf, + xfs_trans_log_buf(tp, sib_buf, XFS_DA_LOGRANGE(sib_info, &sib_info->back, sizeof(sib_info->back))); - xfs_da_buf_done(sib_buf); sib_buf = NULL; } par_blkno = mp->m_dirleafblk; @@ -1813,7 +1799,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, for (;;) { if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) goto done; - par_node = par_buf->data; + par_node = par_buf->b_addr; if (unlikely(par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC) || (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { @@ -1837,7 +1823,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, par_blkno = be32_to_cpu(par_node->btree[entno].before); if (level == dead_level + 1) break; - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); par_buf = NULL; } /* @@ -1853,7 +1839,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, if (entno < be16_to_cpu(par_node->hdr.count)) break; par_blkno = be32_to_cpu(par_node->hdr.info.forw); - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (unlikely(par_blkno == 0)) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", @@ -1863,7 +1849,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, } if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) goto done; - par_node = par_buf->data; + par_node = par_buf->b_addr; if (unlikely( be16_to_cpu(par_node->hdr.level) != level || par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) { @@ -1878,20 +1864,18 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, * Update the parent entry pointing to the moved block. */ par_node->btree[entno].before = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, par_buf, + xfs_trans_log_buf(tp, par_buf, XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, sizeof(par_node->btree[entno].before))); - xfs_da_buf_done(par_buf); - xfs_da_buf_done(dead_buf); *dead_blknop = last_blkno; *dead_bufp = last_buf; return 0; done: if (par_buf) - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); if (sib_buf) - xfs_da_brelse(tp, sib_buf); - xfs_da_brelse(tp, last_buf); + xfs_trans_brelse(tp, sib_buf); + xfs_trans_brelse(tp, last_buf); return error; } @@ -1899,8 +1883,10 @@ done: * Remove a btree block from a directory or attribute. */ int -xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, - xfs_dabuf_t *dead_buf) +xfs_da_shrink_inode( + xfs_da_args_t *args, + xfs_dablk_t dead_blkno, + struct xfs_buf *dead_buf) { xfs_inode_t *dp; int done, error, w, count; @@ -1935,7 +1921,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, break; } } - xfs_da_binval(tp, dead_buf); + xfs_trans_binval(tp, dead_buf); return error; } @@ -1967,35 +1953,75 @@ xfs_da_map_covers_blocks( } /* - * Make a dabuf. - * Used for get_buf, read_buf, read_bufr, and reada_buf. + * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map. + * + * For the single map case, it is assumed that the caller has provided a pointer + * to a valid xfs_buf_map. For the multiple map case, this function will + * allocate the xfs_buf_map to hold all the maps and replace the caller's single + * map pointer with the allocated map. */ -STATIC int -xfs_da_do_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t *mappedbnop, - xfs_dabuf_t **bpp, - int whichfork, - int caller) +static int +xfs_buf_map_from_irec( + struct xfs_mount *mp, + struct xfs_buf_map **mapp, + unsigned int *nmaps, + struct xfs_bmbt_irec *irecs, + unsigned int nirecs) { - xfs_buf_t *bp = NULL; - xfs_buf_t **bplist; - int error=0; - int i; - xfs_bmbt_irec_t map; - xfs_bmbt_irec_t *mapp; - xfs_daddr_t mappedbno; - xfs_mount_t *mp; - int nbplist=0; - int nfsb; - int nmap; - xfs_dabuf_t *rbp; + struct xfs_buf_map *map; + int i; + + ASSERT(*nmaps == 1); + ASSERT(nirecs >= 1); + + if (nirecs > 1) { + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + if (!map) + return ENOMEM; + *mapp = map; + } + + *nmaps = nirecs; + map = *mapp; + for (i = 0; i < *nmaps; i++) { + ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK && + irecs[i].br_startblock != HOLESTARTBLOCK); + map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock); + map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount); + } + return 0; +} + +/* + * Map the block we are given ready for reading. There are three possible return + * values: + * -1 - will be returned if we land in a hole and mappedbno == -2 so the + * caller knows not to execute a subsequent read. + * 0 - if we mapped the block successfully + * >0 - positive error number if there was an error. + */ +static int +xfs_dabuf_map( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + struct xfs_buf_map **map, + int *nmaps) +{ + struct xfs_mount *mp = dp->i_mount; + int nfsb; + int error = 0; + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec *irecs = &irec; + int nirecs; + + ASSERT(map && *map); + ASSERT(*nmaps == 1); - mp = dp->i_mount; nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; - mappedbno = *mappedbnop; + /* * Caller doesn't have a mapping. -2 means don't complain * if we land in a hole. @@ -2004,112 +2030,150 @@ xfs_da_do_buf( /* * Optimize the one-block case. */ - if (nfsb == 1) - mapp = ↦ - else - mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); + if (nfsb != 1) + irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); - nmap = nfsb; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, - &nmap, xfs_bmapi_aflag(whichfork)); + nirecs = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, + &nirecs, xfs_bmapi_aflag(whichfork)); if (error) - goto exit0; + goto out; } else { - map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = nfsb; - mapp = ↦ - nmap = 1; + irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + irecs->br_startoff = (xfs_fileoff_t)bno; + irecs->br_blockcount = nfsb; + irecs->br_state = 0; + nirecs = 1; } - if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) { - error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); + + if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { + error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED); if (unlikely(error == EFSCORRUPTED)) { if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + int i; xfs_alert(mp, "%s: bno %lld dir: inode %lld", __func__, (long long)bno, (long long)dp->i_ino); - for (i = 0; i < nmap; i++) { + for (i = 0; i < *nmaps; i++) { xfs_alert(mp, "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", i, - (long long)mapp[i].br_startoff, - (long long)mapp[i].br_startblock, - (long long)mapp[i].br_blockcount, - mapp[i].br_state); + (long long)irecs[i].br_startoff, + (long long)irecs[i].br_startblock, + (long long)irecs[i].br_blockcount, + irecs[i].br_state); } } XFS_ERROR_REPORT("xfs_da_do_buf(1)", XFS_ERRLEVEL_LOW, mp); } - goto exit0; + goto out; } - if (caller != 3 && nmap > 1) { - bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP); - nbplist = 0; - } else - bplist = NULL; - /* - * Turn the mapping(s) into buffer(s). - */ - for (i = 0; i < nmap; i++) { - int nmapped; - - mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock); - if (i == 0) - *mappedbnop = mappedbno; - nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount); - switch (caller) { - case 0: - bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, - mappedbno, nmapped, 0); - error = bp ? bp->b_error : XFS_ERROR(EIO); - break; - case 1: - case 2: - bp = NULL; - error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp, - mappedbno, nmapped, 0, &bp); - break; - case 3: - xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped); + error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs); +out: + if (irecs != &irec) + kmem_free(irecs); + return error; +} + +/* + * Get a buffer for the dir/attr block. + */ +int +xfs_da_get_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) error = 0; - bp = NULL; - break; - } - if (error) { - if (bp) - xfs_trans_brelse(trans, bp); - goto exit1; - } - if (!bp) - continue; - if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) - xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); - else - xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); - } - if (bplist) { - bplist[nbplist++] = bp; - } + goto out_free; } - /* - * Build a dabuf structure. - */ - if (bplist) { - rbp = xfs_da_buf_make(nbplist, bplist); - } else if (bp) - rbp = xfs_da_buf_make(1, &bp); + + bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, + mapp, nmap, 0); + error = bp ? bp->b_error : XFS_ERROR(EIO); + if (error) { + xfs_trans_brelse(trans, bp); + goto out_free; + } + + *bpp = bp; + +out_free: + if (mapp != &map) + kmem_free(mapp); + + return error; +} + +/* + * Get a buffer for the dir/attr block, fill in the contents. + */ +int +xfs_da_read_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + error = xfs_trans_read_buf_map(dp->i_mount, trans, + dp->i_mount->m_ddev_targp, + mapp, nmap, 0, &bp); + if (error) + goto out_free; + + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); else - rbp = NULL; + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); + /* - * For read_buf, check the magic number. + * This verification code will be moved to a CRC verification callback + * function so just leave it here unchanged until then. */ - if (caller == 1) { - xfs_dir2_data_hdr_t *hdr = rbp->data; - xfs_dir2_free_t *free = rbp->data; - xfs_da_blkinfo_t *info = rbp->data; + { + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + xfs_dir2_free_t *free = bp->b_addr; + xfs_da_blkinfo_t *info = bp->b_addr; uint magic, magic1; + struct xfs_mount *mp = dp->i_mount; magic = be16_to_cpu(info->magic); magic1 = be32_to_cpu(hdr->magic); @@ -2123,66 +2187,20 @@ xfs_da_do_buf( (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)), mp, XFS_ERRTAG_DA_READ_BUF, XFS_RANDOM_DA_READ_BUF))) { - trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_); + trace_xfs_da_btree_corrupt(bp, _RET_IP_); XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", XFS_ERRLEVEL_LOW, mp, info); error = XFS_ERROR(EFSCORRUPTED); - xfs_da_brelse(trans, rbp); - nbplist = 0; - goto exit1; + xfs_trans_brelse(trans, bp); + goto out_free; } } - if (bplist) { - kmem_free(bplist); - } - if (mapp != &map) { - kmem_free(mapp); - } - if (bpp) - *bpp = rbp; - return 0; -exit1: - if (bplist) { - for (i = 0; i < nbplist; i++) - xfs_trans_brelse(trans, bplist[i]); - kmem_free(bplist); - } -exit0: + *bpp = bp; +out_free: if (mapp != &map) kmem_free(mapp); - if (bpp) - *bpp = NULL; - return error; -} - -/* - * Get a buffer for the dir/attr block. - */ -int -xfs_da_get_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, - int whichfork) -{ - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0); -} -/* - * Get a buffer for the dir/attr block, fill in the contents. - */ -int -xfs_da_read_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, - int whichfork) -{ - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1); + return error; } /* @@ -2190,22 +2208,41 @@ xfs_da_read_buf( */ xfs_daddr_t xfs_da_reada_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - int whichfork) + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + int whichfork) { - xfs_daddr_t rval; + xfs_daddr_t mappedbno = -1; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } - rval = -1; - if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3)) + mappedbno = mapp[0].bm_bn; + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); + +out_free: + if (mapp != &map) + kmem_free(mapp); + + if (error) return -1; - else - return rval; + return mappedbno; } kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ -kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ /* * Allocate a dir-state structure. @@ -2225,13 +2262,8 @@ xfs_da_state_kill_altpath(xfs_da_state_t *state) { int i; - for (i = 0; i < state->altpath.active; i++) { - if (state->altpath.blk[i].bp) { - if (state->altpath.blk[i].bp != state->path.blk[i].bp) - xfs_da_buf_done(state->altpath.blk[i].bp); - state->altpath.blk[i].bp = NULL; - } - } + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; state->altpath.active = 0; } @@ -2241,204 +2273,9 @@ xfs_da_state_kill_altpath(xfs_da_state_t *state) void xfs_da_state_free(xfs_da_state_t *state) { - int i; - xfs_da_state_kill_altpath(state); - for (i = 0; i < state->path.active; i++) { - if (state->path.blk[i].bp) - xfs_da_buf_done(state->path.blk[i].bp); - } - if (state->extravalid && state->extrablk.bp) - xfs_da_buf_done(state->extrablk.bp); #ifdef DEBUG memset((char *)state, 0, sizeof(*state)); #endif /* DEBUG */ kmem_zone_free(xfs_da_state_zone, state); } - -/* - * Create a dabuf. - */ -/* ARGSUSED */ -STATIC xfs_dabuf_t * -xfs_da_buf_make(int nbuf, xfs_buf_t **bps) -{ - xfs_buf_t *bp; - xfs_dabuf_t *dabuf; - int i; - int off; - - if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); - else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); - dabuf->dirty = 0; - if (nbuf == 1) { - dabuf->nbuf = 1; - bp = bps[0]; - dabuf->bbcount = bp->b_length; - dabuf->data = bp->b_addr; - dabuf->bps[0] = bp; - } else { - dabuf->nbuf = nbuf; - for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { - dabuf->bps[i] = bp = bps[i]; - dabuf->bbcount += bp->b_length; - } - dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); - for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) { - bp = bps[i]; - memcpy((char *)dabuf->data + off, bp->b_addr, - BBTOB(bp->b_length)); - } - } - return dabuf; -} - -/* - * Un-dirty a dabuf. - */ -STATIC void -xfs_da_buf_clean(xfs_dabuf_t *dabuf) -{ - xfs_buf_t *bp; - int i; - int off; - - if (dabuf->dirty) { - ASSERT(dabuf->nbuf > 1); - dabuf->dirty = 0; - for (i = off = 0; i < dabuf->nbuf; - i++, off += BBTOB(bp->b_length)) { - bp = dabuf->bps[i]; - memcpy(bp->b_addr, dabuf->data + off, - BBTOB(bp->b_length)); - } - } -} - -/* - * Release a dabuf. - */ -void -xfs_da_buf_done(xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf); - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if (dabuf->dirty) - xfs_da_buf_clean(dabuf); - if (dabuf->nbuf > 1) { - kmem_free(dabuf->data); - kmem_free(dabuf); - } else { - kmem_zone_free(xfs_dabuf_zone, dabuf); - } -} - -/* - * Log transaction from a dabuf. - */ -void -xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) -{ - xfs_buf_t *bp; - uint f; - int i; - uint l; - int off; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if (dabuf->nbuf == 1) { - ASSERT(dabuf->data == dabuf->bps[0]->b_addr); - xfs_trans_log_buf(tp, dabuf->bps[0], first, last); - return; - } - dabuf->dirty = 1; - ASSERT(first <= last); - for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) { - bp = dabuf->bps[i]; - f = off; - l = f + BBTOB(bp->b_length) - 1; - if (f < first) - f = first; - if (l > last) - l = last; - if (f <= l) - xfs_trans_log_buf(tp, bp, f - off, l - off); - /* - * B_DONE is set by xfs_trans_log buf. - * If we don't set it on a new buffer (get not read) - * then if we don't put anything in the buffer it won't - * be set, and at commit it it released into the cache, - * and then a read will fail. - */ - else if (!(XFS_BUF_ISDONE(bp))) - XFS_BUF_DONE(bp); - } - ASSERT(last < off); -} - -/* - * Release dabuf from a transaction. - * Have to free up the dabuf before the buffers are released, - * since the synchronization on the dabuf is really the lock on the buffer. - */ -void -xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) -{ - xfs_buf_t *bp; - xfs_buf_t **bplist; - int i; - int nbuf; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if ((nbuf = dabuf->nbuf) == 1) { - bplist = &bp; - bp = dabuf->bps[0]; - } else { - bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); - memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); - } - xfs_da_buf_done(dabuf); - for (i = 0; i < nbuf; i++) - xfs_trans_brelse(tp, bplist[i]); - if (bplist != &bp) - kmem_free(bplist); -} - -/* - * Invalidate dabuf from a transaction. - */ -void -xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) -{ - xfs_buf_t *bp; - xfs_buf_t **bplist; - int i; - int nbuf; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if ((nbuf = dabuf->nbuf) == 1) { - bplist = &bp; - bp = dabuf->bps[0]; - } else { - bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); - memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); - } - xfs_da_buf_done(dabuf); - for (i = 0; i < nbuf; i++) - xfs_trans_binval(tp, bplist[i]); - if (bplist != &bp) - kmem_free(bplist); -} - -/* - * Get the first daddr from a dabuf. - */ -xfs_daddr_t -xfs_da_blkno(xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf->nbuf); - ASSERT(dabuf->data); - return XFS_BUF_ADDR(dabuf->bps[0]); -} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index dbf7c074ae73..132adafb041e 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -32,7 +32,7 @@ struct zone; /* * This structure is common to both leaf nodes and non-leaf nodes in the Btree. * - * Is is used to manage a doubly linked list of all blocks at the same + * It is used to manage a doubly linked list of all blocks at the same * level in the Btree, and to identify which type of block this is. */ #define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ @@ -133,24 +133,6 @@ typedef struct xfs_da_args { { XFS_DA_OP_CILOOKUP, "CILOOKUP" } /* - * Structure to describe buffer(s) for a block. - * This is needed in the directory version 2 format case, when - * multiple non-contiguous fsblocks might be needed to cover one - * logical directory block. - * If the buffer count is 1 then the data pointer points to the - * same place as the b_addr field for the buffer, else to kmem_alloced memory. - */ -typedef struct xfs_dabuf { - int nbuf; /* number of buffer pointers present */ - short dirty; /* data needs to be copied back */ - short bbcount; /* how large is data in bbs */ - void *data; /* pointer for buffers' data */ - struct xfs_buf *bps[1]; /* actually nbuf of these */ -} xfs_dabuf_t; -#define XFS_DA_BUF_SIZE(n) \ - (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1)) - -/* * Storage for holding state during Btree searches and split/join ops. * * Only need space for 5 intermediate nodes. With a minimum of 62-way @@ -158,7 +140,7 @@ typedef struct xfs_dabuf { * which is slightly more than enough. */ typedef struct xfs_da_state_blk { - xfs_dabuf_t *bp; /* buffer containing block */ + struct xfs_buf *bp; /* buffer containing block */ xfs_dablk_t blkno; /* filesystem blkno of buffer */ xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */ int index; /* relevant index into block */ @@ -211,7 +193,7 @@ struct xfs_nameops { * Routines used for growing the Btree. */ int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - xfs_dabuf_t **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork); int xfs_da_split(xfs_da_state_t *state); /* @@ -241,14 +223,14 @@ int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, int count); int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bp, int whichfork); + struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, int whichfork); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, - xfs_dabuf_t *dead_buf); + struct xfs_buf *dead_buf); uint xfs_da_hashname(const __uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, @@ -258,15 +240,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); -void xfs_da_buf_done(xfs_dabuf_t *dabuf); -void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first, - uint last); -void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf); -void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf); -xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); - extern struct kmem_zone *xfs_da_state_zone; -extern struct kmem_zone *xfs_dabuf_zone; extern const struct xfs_nameops xfs_default_nameops; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index a3721633abc8..1d9643b3dce6 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -33,7 +33,7 @@ typedef struct xfs_timestamp { * variable size the leftover area split into a data and an attribute fork. * The format of the data and attribute fork depends on the format of the * inode as indicated by di_format and di_aformat. To access the data and - * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros + * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros * below. * * There is a very similar struct icdinode in xfs_inode which matches the diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 67a250c36d41..b26a50f9921d 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -592,7 +592,7 @@ int xfs_dir2_shrink_inode( xfs_da_args_t *args, xfs_dir2_db_t db, - xfs_dabuf_t *bp) + struct xfs_buf *bp) { xfs_fileoff_t bno; /* directory file offset */ xfs_dablk_t da; /* directory file offset */ @@ -634,7 +634,7 @@ xfs_dir2_shrink_inode( /* * Invalidate the buffer from the transaction. */ - xfs_da_binval(tp, bp); + xfs_trans_binval(tp, bp); /* * If it's not a data block, we're done. */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 586732f2d80d..e93ca8f054f4 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -37,10 +37,10 @@ /* * Local function prototypes. */ -static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first, - int last); -static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp); -static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp, +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp, + int first, int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp, int *entno); static int xfs_dir2_block_sort(const void *a, const void *b); @@ -66,7 +66,7 @@ xfs_dir2_block_addname( xfs_dir2_data_free_t *bf; /* bestfree table in block */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* buffer for block */ + struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ int compact; /* need to compact leaf ents */ xfs_dir2_data_entry_t *dep; /* block data entry */ @@ -102,14 +102,14 @@ xfs_dir2_block_addname( return error; } ASSERT(bp != NULL); - hdr = bp->data; + hdr = bp->b_addr; /* * Check the magic number, corrupted if wrong. */ if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", XFS_ERRLEVEL_LOW, mp, hdr); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(EFSCORRUPTED); } len = xfs_dir2_data_entsize(args->namelen); @@ -212,7 +212,7 @@ xfs_dir2_block_addname( * If this isn't a real add, we're done with the buffer. */ if (args->op_flags & XFS_DA_OP_JUSTCHECK) - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); /* * If we don't have space for the new entry & leaf ... */ @@ -228,7 +228,6 @@ xfs_dir2_block_addname( * Then add the new entry in that format. */ error = xfs_dir2_block_to_leaf(args, bp); - xfs_da_buf_done(bp); if (error) return error; return xfs_dir2_leaf_addname(args); @@ -422,7 +421,6 @@ xfs_dir2_block_addname( xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_log_entry(tp, bp, dep); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } @@ -437,7 +435,7 @@ xfs_dir2_block_getdents( filldir_t filldir) { xfs_dir2_data_hdr_t *hdr; /* block header */ - xfs_dabuf_t *bp; /* buffer for block */ + struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_dir2_data_unused_t *dup; /* block unused entry */ @@ -469,7 +467,7 @@ xfs_dir2_block_getdents( * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(mp, *offset); - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); /* * Set up values for the loop. @@ -514,7 +512,7 @@ xfs_dir2_block_getdents( cook & 0x7fffffff, be64_to_cpu(dep->inumber), DT_UNKNOWN)) { *offset = cook & 0x7fffffff; - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return 0; } } @@ -525,7 +523,7 @@ xfs_dir2_block_getdents( */ *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 0x7fffffff; - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return 0; } @@ -535,17 +533,17 @@ xfs_dir2_block_getdents( static void xfs_dir2_block_log_leaf( xfs_trans_t *tp, /* transaction structure */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_buf *bp, /* block buffer */ int first, /* index of first logged leaf */ int last) /* index of last logged leaf */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_leaf_entry_t *blp; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); blp = xfs_dir2_block_leaf_p(btp); - xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); } @@ -555,13 +553,13 @@ xfs_dir2_block_log_leaf( static void xfs_dir2_block_log_tail( xfs_trans_t *tp, /* transaction structure */ - xfs_dabuf_t *bp) /* block buffer */ + struct xfs_buf *bp) /* block buffer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); - xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), (uint)((char *)(btp + 1) - (char *)hdr - 1)); } @@ -575,7 +573,7 @@ xfs_dir2_block_lookup( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -593,7 +591,7 @@ xfs_dir2_block_lookup( return error; dp = args->dp; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -607,7 +605,7 @@ xfs_dir2_block_lookup( */ args->inumber = be64_to_cpu(dep->inumber); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return XFS_ERROR(error); } @@ -617,13 +615,13 @@ xfs_dir2_block_lookup( static int /* error */ xfs_dir2_block_lookup_int( xfs_da_args_t *args, /* dir lookup arguments */ - xfs_dabuf_t **bpp, /* returned block buffer */ + struct xfs_buf **bpp, /* returned block buffer */ int *entno) /* returned entry number */ { xfs_dir2_dataptr_t addr; /* data entry address */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -647,7 +645,7 @@ xfs_dir2_block_lookup_int( return error; } ASSERT(bp != NULL); - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -666,7 +664,7 @@ xfs_dir2_block_lookup_int( high = mid - 1; if (low > high) { ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(ENOENT); } } @@ -714,7 +712,7 @@ xfs_dir2_block_lookup_int( /* * No match, release the buffer and return ENOENT. */ - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(ENOENT); } @@ -728,7 +726,7 @@ xfs_dir2_block_removename( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -753,7 +751,7 @@ xfs_dir2_block_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -790,10 +788,9 @@ xfs_dir2_block_removename( * See if the size as a shortform is good enough. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) { - xfs_da_buf_done(bp); + if (size > XFS_IFORK_DSIZE(dp)) return 0; - } + /* * If it works, do the conversion. */ @@ -810,7 +807,7 @@ xfs_dir2_block_replace( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -829,7 +826,7 @@ xfs_dir2_block_replace( } dp = args->dp; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -844,7 +841,6 @@ xfs_dir2_block_replace( dep->inumber = cpu_to_be64(args->inumber); xfs_dir2_data_log_entry(args->trans, bp, dep); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } @@ -871,8 +867,8 @@ xfs_dir2_block_sort( int /* error */ xfs_dir2_leaf_to_block( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp, /* leaf buffer */ - xfs_dabuf_t *dbp) /* data buffer */ + struct xfs_buf *lbp, /* leaf buffer */ + struct xfs_buf *dbp) /* data buffer */ { __be16 *bestsp; /* leaf bests table */ xfs_dir2_data_hdr_t *hdr; /* block header */ @@ -898,7 +894,7 @@ xfs_dir2_leaf_to_block( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = lbp->data; + leaf = lbp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* @@ -914,11 +910,9 @@ xfs_dir2_leaf_to_block( if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) - goto out; - } else { - error = 0; - goto out; - } + return error; + } else + return 0; } /* * Read the data block if we don't already have it, give up if it fails. @@ -926,9 +920,9 @@ xfs_dir2_leaf_to_block( if (dbp == NULL && (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, XFS_DATA_FORK))) { - goto out; + return error; } - hdr = dbp->data; + hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); /* * Size of the "leaf" area in the block. @@ -944,10 +938,9 @@ xfs_dir2_leaf_to_block( * If it's not free or is too short we can't do it. */ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG || - be16_to_cpu(dup->length) < size) { - error = 0; - goto out; - } + be16_to_cpu(dup->length) < size) + return 0; + /* * Start converting it to block form. */ @@ -989,25 +982,17 @@ xfs_dir2_leaf_to_block( * Pitch the old leaf block. */ error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); - lbp = NULL; - if (error) { - goto out; - } + if (error) + return error; + /* * Now see if the resulting block can be shrunken to shortform. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) { - error = 0; - goto out; - } + if (size > XFS_IFORK_DSIZE(dp)) + return 0; + return xfs_dir2_block_to_sf(args, dbp, size, &sfh); -out: - if (lbp) - xfs_da_buf_done(lbp); - if (dbp) - xfs_da_buf_done(dbp); - return error; } /* @@ -1020,7 +1005,7 @@ xfs_dir2_sf_to_block( xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ @@ -1088,7 +1073,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } - hdr = bp->data; + hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* * Compute size of block "tail" area. @@ -1217,6 +1202,5 @@ xfs_dir2_sf_to_block( xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 2046988e9eb2..44ffd4d6bc91 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -42,8 +42,8 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); */ void xfs_dir2_data_check( - xfs_inode_t *dp, /* incore inode pointer */ - xfs_dabuf_t *bp) /* data block's buffer */ + struct xfs_inode *dp, /* incore inode pointer */ + struct xfs_buf *bp) /* data block's buffer */ { xfs_dir2_dataptr_t addr; /* addr for leaf lookup */ xfs_dir2_data_free_t *bf; /* bestfree table */ @@ -65,7 +65,7 @@ xfs_dir2_data_check( struct xfs_name name; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; bf = hdr->bestfree; p = (char *)(hdr + 1); @@ -389,9 +389,9 @@ int /* error */ xfs_dir2_data_init( xfs_da_args_t *args, /* directory operation args */ xfs_dir2_db_t blkno, /* logical dir block number */ - xfs_dabuf_t **bpp) /* output block buffer */ + struct xfs_buf **bpp) /* output block buffer */ { - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ @@ -417,7 +417,7 @@ xfs_dir2_data_init( /* * Initialize the header. */ - hdr = bp->data; + hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr)); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { @@ -449,16 +449,16 @@ xfs_dir2_data_init( */ void xfs_dir2_data_log_entry( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - (char *)hdr - 1)); } @@ -468,15 +468,15 @@ xfs_dir2_data_log_entry( */ void xfs_dir2_data_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - xfs_da_log_buf(tp, bp, 0, sizeof(*hdr) - 1); + xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1); } /* @@ -484,11 +484,11 @@ xfs_dir2_data_log_header( */ void xfs_dir2_data_log_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_unused_t *dup) /* data unused pointer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); @@ -496,13 +496,13 @@ xfs_dir2_data_log_unused( /* * Log the first part of the unused entry. */ - xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), (uint)((char *)&dup->length + sizeof(dup->length) - 1 - (char *)hdr)); /* * Log the end (tag) of the unused entry. */ - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + sizeof(xfs_dir2_data_off_t) - 1)); @@ -514,8 +514,8 @@ xfs_dir2_data_log_unused( */ void xfs_dir2_data_make_free( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, /* starting byte offset */ xfs_dir2_data_aoff_t len, /* length in bytes */ int *needlogp, /* out: log header */ @@ -531,7 +531,7 @@ xfs_dir2_data_make_free( xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ mp = tp->t_mountp; - hdr = bp->data; + hdr = bp->b_addr; /* * Figure out where the end of the data area is. @@ -696,8 +696,8 @@ xfs_dir2_data_make_free( */ void xfs_dir2_data_use_free( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* data block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_unused_t *dup, /* unused entry */ xfs_dir2_data_aoff_t offset, /* starting offset to use */ xfs_dir2_data_aoff_t len, /* length to use */ @@ -713,7 +713,7 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ - hdr = bp->data; + hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 397ffbcbab1d..0b296253bd01 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -38,15 +38,15 @@ * Local function declarations. */ #ifdef DEBUG -static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir2_leaf_check(dp, bp) #endif -static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp, - int *indexp, xfs_dabuf_t **dbpp); -static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, + int *indexp, struct xfs_buf **dbpp); +static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); +static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); /* @@ -55,7 +55,7 @@ static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); int /* error */ xfs_dir2_block_to_leaf( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *dbp) /* input block's buffer */ + struct xfs_buf *dbp) /* input block's buffer */ { __be16 *bestsp; /* leaf's bestsp entries */ xfs_dablk_t blkno; /* leaf block's bno */ @@ -64,7 +64,7 @@ xfs_dir2_block_to_leaf( xfs_dir2_block_tail_t *btp; /* block's tail */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ - xfs_dabuf_t *lbp; /* leaf block's buffer */ + struct xfs_buf *lbp; /* leaf block's buffer */ xfs_dir2_db_t ldb; /* leaf block's bno */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ @@ -95,8 +95,8 @@ xfs_dir2_block_to_leaf( return error; } ASSERT(lbp != NULL); - leaf = lbp->data; - hdr = dbp->data; + leaf = lbp->b_addr; + hdr = dbp->b_addr; xfs_dir2_data_check(dp, dbp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -143,7 +143,6 @@ xfs_dir2_block_to_leaf( xfs_dir2_leaf_check(dp, lbp); xfs_dir2_data_check(dp, dbp); xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); - xfs_da_buf_done(lbp); return 0; } @@ -282,7 +281,7 @@ xfs_dir2_leaf_addname( __be16 *bestsp; /* freespace table in leaf */ int compact; /* need to compact leaves */ xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* data unused entry */ @@ -291,7 +290,7 @@ xfs_dir2_leaf_addname( int highstale; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - xfs_dabuf_t *lbp; /* leaf's buffer */ + struct xfs_buf *lbp; /* leaf's buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ @@ -328,7 +327,7 @@ xfs_dir2_leaf_addname( * But if there are dup hash values the index is of the first of those. */ index = xfs_dir2_leaf_search_hash(args, lbp); - leaf = lbp->data; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); length = xfs_dir2_data_entsize(args->namelen); @@ -402,14 +401,13 @@ xfs_dir2_leaf_addname( */ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } /* * Convert to node form. */ error = xfs_dir2_leaf_to_node(args, lbp); - xfs_da_buf_done(lbp); if (error) return error; /* @@ -427,7 +425,7 @@ xfs_dir2_leaf_addname( * a new data block. */ if (args->op_flags & XFS_DA_OP_JUSTCHECK) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; } /* @@ -435,7 +433,7 @@ xfs_dir2_leaf_addname( * changed anything. */ if (args->total == 0 && use_block == -1) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } /* @@ -466,14 +464,14 @@ xfs_dir2_leaf_addname( */ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &use_block))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } /* * Initialize the block. */ if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } /* @@ -493,7 +491,7 @@ xfs_dir2_leaf_addname( */ else xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); - hdr = dbp->data; + hdr = dbp->b_addr; bestsp[use_block] = hdr->bestfree[0].length; grown = 1; } @@ -505,10 +503,10 @@ xfs_dir2_leaf_addname( if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), -1, &dbp, XFS_DATA_FORK))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } - hdr = dbp->data; + hdr = dbp->b_addr; grown = 0; } xfs_dir2_data_check(dp, dbp); @@ -570,9 +568,7 @@ xfs_dir2_leaf_addname( xfs_dir2_leaf_log_header(tp, lbp); xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); xfs_dir2_leaf_check(dp, lbp); - xfs_da_buf_done(lbp); xfs_dir2_data_check(dp, dbp); - xfs_da_buf_done(dbp); return 0; } @@ -583,8 +579,8 @@ xfs_dir2_leaf_addname( */ STATIC void xfs_dir2_leaf_check( - xfs_inode_t *dp, /* incore directory inode */ - xfs_dabuf_t *bp) /* leaf's buffer */ + struct xfs_inode *dp, /* incore directory inode */ + struct xfs_buf *bp) /* leaf's buffer */ { int i; /* leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -592,7 +588,7 @@ xfs_dir2_leaf_check( xfs_mount_t *mp; /* filesystem mount point */ int stale; /* count of stale leaves */ - leaf = bp->data; + leaf = bp->b_addr; mp = dp->i_mount; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); /* @@ -628,14 +624,14 @@ xfs_dir2_leaf_check( void xfs_dir2_leaf_compact( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_buf *bp) /* leaf buffer */ { int from; /* source leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ - leaf = bp->data; + leaf = bp->b_addr; if (!leaf->hdr.stale) { return; } @@ -677,7 +673,7 @@ xfs_dir2_leaf_compact( */ void xfs_dir2_leaf_compact_x1( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int *indexp, /* insertion index */ int *lowstalep, /* out: stale entry before us */ int *highstalep, /* out: stale entry after us */ @@ -693,7 +689,7 @@ xfs_dir2_leaf_compact_x1( int newindex=0; /* new insertion index */ int to; /* destination copy index */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); index = *indexp; @@ -763,6 +759,218 @@ xfs_dir2_leaf_compact_x1( *highstalep = highstale; } +struct xfs_dir2_leaf_map_info { + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_db_t curdb; /* db for current block */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + struct xfs_bmbt_irec map[]; /* map vector for blocks */ +}; + +STATIC int +xfs_dir2_leaf_readbuf( + struct xfs_inode *dp, + size_t bufsize, + struct xfs_dir2_leaf_map_info *mip, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp = *bpp; + struct xfs_bmbt_irec *map = mip->map; + int error = 0; + int length; + int i; + int j; + + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + mip->map_blocks -= mp->m_dirblkfsbs; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = mp->m_dirblkfsbs; i > 0; ) { + j = min_t(int, map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --mip->map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * mip->map_valid); + i -= j; + } + } + + /* + * Recalculate the readahead blocks wanted. + */ + mip->ra_want = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize) - 1; + ASSERT(mip->ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + mip->ra_want > mip->map_blocks && + mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + mip->nmap = mip->map_size - mip->map_valid; + error = xfs_bmapi_read(dp, mip->map_off, + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - + mip->map_off, + &map[mip->map_valid], &mip->nmap, 0); + + /* + * Don't know if we should ignore this or try to return an + * error. The trouble with returning errors is that readdir + * will just stop without actually passing the error through. + */ + if (error) + goto out; /* XXX */ + + /* + * If we got all the mappings we asked for, set the final map + * offset based on the last bmap value received. Otherwise, + * we've reached the end. + */ + if (mip->nmap == mip->map_size - mip->map_valid) { + i = mip->map_valid + mip->nmap - 1; + mip->map_off = map[i].br_startoff + map[i].br_blockcount; + } else + mip->map_off = xfs_dir2_byte_to_da(mp, + XFS_DIR2_LEAF_OFFSET); + + /* + * Look for holes in the mapping, and eliminate them. Count up + * the valid blocks. + */ + for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { + if (map[i].br_startblock == HOLESTARTBLOCK) { + mip->nmap--; + length = mip->map_valid + mip->nmap - i; + if (length) + memmove(&map[i], &map[i + 1], + sizeof(map[i]) * length); + } else { + mip->map_blocks += map[i].br_blockcount; + i++; + } + } + mip->map_valid += mip->nmap; + } + + /* + * No valid mappings, so no more data blocks. + */ + if (!mip->map_valid) { + *curoff = xfs_dir2_da_to_byte(mp, mip->map_off); + goto out; + } + + /* + * Read the directory block starting at the first mapping. + */ + mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + error = xfs_da_read_buf(NULL, dp, map->br_startoff, + map->br_blockcount >= mp->m_dirblkfsbs ? + XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, + &bp, XFS_DATA_FORK); + + /* + * Should just skip over the data block instead of giving up. + */ + if (error) + goto out; /* XXX */ + + /* + * Adjust the current amount of read-ahead: we just read a block that + * was previously ra. + */ + if (mip->ra_current) + mip->ra_current -= mp->m_dirblkfsbs; + + /* + * Do we need more readahead? + */ + for (mip->ra_index = mip->ra_offset = i = 0; + mip->ra_want > mip->ra_current && i < mip->map_blocks; + i += mp->m_dirblkfsbs) { + ASSERT(mip->ra_index < mip->map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > mip->ra_current && + map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, + map[mip->ra_index].br_startblock + + mip->ra_offset), + (int)BTOBB(mp->m_dirblksize)); + mip->ra_current = i; + } + + /* + * Read-ahead a non-contiguous directory block. This doesn't + * use our mapping, but this is a very rare case. + */ + else if (i > mip->ra_current) { + xfs_da_reada_buf(NULL, dp, + map[mip->ra_index].br_startoff + + mip->ra_offset, + XFS_DATA_FORK); + mip->ra_current = i; + } + + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < mp->m_dirblkfsbs; j++) { + /* + * The rest of this extent but not more than a dir + * block. + */ + length = min_t(int, mp->m_dirblkfsbs, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + j += length; + mip->ra_offset += length; + + /* + * Advance to the next mapping if this one is used up. + */ + if (mip->ra_offset == map[mip->ra_index].br_blockcount) { + mip->ra_offset = 0; + mip->ra_index++; + } + } + } + +out: + *bpp = bp; + return error; +} + /* * Getdents (readdir) for leaf and node directories. * This reads the data blocks only, so is the same for both forms. @@ -775,30 +983,18 @@ xfs_dir2_leaf_getdents( xfs_off_t *offset, filldir_t filldir) { - xfs_dabuf_t *bp; /* data block buffer */ - int byteoff; /* offset in current block */ - xfs_dir2_db_t curdb; /* db for current block */ - xfs_dir2_off_t curoff; /* current overall offset */ + struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ int error = 0; /* error return value */ - int i; /* temporary loop index */ - int j; /* temporary loop index */ int length; /* temporary length value */ - xfs_bmbt_irec_t *map; /* map vector for blocks */ - xfs_extlen_t map_blocks; /* number of fsbs in map */ - xfs_dablk_t map_off; /* last mapped file offset */ - int map_size; /* total entries in *map */ - int map_valid; /* valid entries in *map */ xfs_mount_t *mp; /* filesystem mount point */ + int byteoff; /* offset in current block */ + xfs_dir2_off_t curoff; /* current overall offset */ xfs_dir2_off_t newoff; /* new curoff after new blk */ - int nmap; /* mappings to ask xfs_bmapi */ char *ptr = NULL; /* pointer to current data */ - int ra_current; /* number of read-ahead blks */ - int ra_index; /* *map index for read-ahead */ - int ra_offset; /* map entry offset for ra */ - int ra_want; /* readahead count wanted */ + struct xfs_dir2_leaf_map_info *map_info; /* * If the offset is at or past the largest allowed value, @@ -814,10 +1010,12 @@ xfs_dir2_leaf_getdents( * buffer size, the directory block size, and the filesystem * block size. */ - map_size = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize); - map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP); - map_valid = ra_index = ra_offset = ra_current = map_blocks = 0; - bp = NULL; + length = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize); + map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + + (length * sizeof(struct xfs_bmbt_irec)), + KM_SLEEP); + map_info->map_size = length; /* * Inside the loop we keep the main offset value as a byte offset @@ -829,7 +1027,9 @@ xfs_dir2_leaf_getdents( * Force this conversion through db so we truncate the offset * down to get the start of the data block. */ - map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff)); + map_info->map_off = xfs_dir2_db_to_da(mp, + xfs_dir2_byte_to_db(mp, curoff)); + /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. @@ -839,191 +1039,17 @@ xfs_dir2_leaf_getdents( * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ - if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) { - /* - * If we have a buffer, we need to release it and - * take it out of the mapping. - */ - if (bp) { - xfs_da_brelse(NULL, bp); - bp = NULL; - map_blocks -= mp->m_dirblkfsbs; - /* - * Loop to get rid of the extents for the - * directory block. - */ - for (i = mp->m_dirblkfsbs; i > 0; ) { - j = MIN((int)map->br_blockcount, i); - map->br_blockcount -= j; - map->br_startblock += j; - map->br_startoff += j; - /* - * If mapping is done, pitch it from - * the table. - */ - if (!map->br_blockcount && --map_valid) - memmove(&map[0], &map[1], - sizeof(map[0]) * - map_valid); - i -= j; - } - } - /* - * Recalculate the readahead blocks wanted. - */ - ra_want = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize) - 1; - ASSERT(ra_want >= 0); + if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { - /* - * If we don't have as many as we want, and we haven't - * run out of data blocks, get some more mappings. - */ - if (1 + ra_want > map_blocks && - map_off < - xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { - /* - * Get more bmaps, fill in after the ones - * we already have in the table. - */ - nmap = map_size - map_valid; - error = xfs_bmapi_read(dp, map_off, - xfs_dir2_byte_to_da(mp, - XFS_DIR2_LEAF_OFFSET) - map_off, - &map[map_valid], &nmap, 0); - /* - * Don't know if we should ignore this or - * try to return an error. - * The trouble with returning errors - * is that readdir will just stop without - * actually passing the error through. - */ - if (error) - break; /* XXX */ - /* - * If we got all the mappings we asked for, - * set the final map offset based on the - * last bmap value received. - * Otherwise, we've reached the end. - */ - if (nmap == map_size - map_valid) - map_off = - map[map_valid + nmap - 1].br_startoff + - map[map_valid + nmap - 1].br_blockcount; - else - map_off = - xfs_dir2_byte_to_da(mp, - XFS_DIR2_LEAF_OFFSET); - /* - * Look for holes in the mapping, and - * eliminate them. Count up the valid blocks. - */ - for (i = map_valid; i < map_valid + nmap; ) { - if (map[i].br_startblock == - HOLESTARTBLOCK) { - nmap--; - length = map_valid + nmap - i; - if (length) - memmove(&map[i], - &map[i + 1], - sizeof(map[i]) * - length); - } else { - map_blocks += - map[i].br_blockcount; - i++; - } - } - map_valid += nmap; - } - /* - * No valid mappings, so no more data blocks. - */ - if (!map_valid) { - curoff = xfs_dir2_da_to_byte(mp, map_off); + error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info, + &curoff, &bp); + if (error || !map_info->map_valid) break; - } - /* - * Read the directory block starting at the first - * mapping. - */ - curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_da_read_buf(NULL, dp, map->br_startoff, - map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : - -1, - &bp, XFS_DATA_FORK); - /* - * Should just skip over the data block instead - * of giving up. - */ - if (error) - break; /* XXX */ - /* - * Adjust the current amount of read-ahead: we just - * read a block that was previously ra. - */ - if (ra_current) - ra_current -= mp->m_dirblkfsbs; - /* - * Do we need more readahead? - */ - for (ra_index = ra_offset = i = 0; - ra_want > ra_current && i < map_blocks; - i += mp->m_dirblkfsbs) { - ASSERT(ra_index < map_valid); - /* - * Read-ahead a contiguous directory block. - */ - if (i > ra_current && - map[ra_index].br_blockcount >= - mp->m_dirblkfsbs) { - xfs_buf_readahead(mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, - map[ra_index].br_startblock + - ra_offset), - (int)BTOBB(mp->m_dirblksize)); - ra_current = i; - } - /* - * Read-ahead a non-contiguous directory block. - * This doesn't use our mapping, but this - * is a very rare case. - */ - else if (i > ra_current) { - (void)xfs_da_reada_buf(NULL, dp, - map[ra_index].br_startoff + - ra_offset, XFS_DATA_FORK); - ra_current = i; - } - /* - * Advance offset through the mapping table. - */ - for (j = 0; j < mp->m_dirblkfsbs; j++) { - /* - * The rest of this extent but not - * more than a dir block. - */ - length = MIN(mp->m_dirblkfsbs, - (int)(map[ra_index].br_blockcount - - ra_offset)); - j += length; - ra_offset += length; - /* - * Advance to the next mapping if - * this one is used up. - */ - if (ra_offset == - map[ra_index].br_blockcount) { - ra_offset = 0; - ra_index++; - } - } - } + /* * Having done a read, we need to set a new offset. */ - newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0); + newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0); /* * Start of the current block. */ @@ -1034,8 +1060,8 @@ xfs_dir2_leaf_getdents( */ else if (curoff > newoff) ASSERT(xfs_dir2_byte_to_db(mp, curoff) == - curdb); - hdr = bp->data; + map_info->curdb); + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); /* * Find our position in the block. @@ -1117,9 +1143,9 @@ xfs_dir2_leaf_getdents( *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; - kmem_free(map); + kmem_free(map_info); if (bp) - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return error; } @@ -1130,10 +1156,10 @@ int xfs_dir2_leaf_init( xfs_da_args_t *args, /* operation arguments */ xfs_dir2_db_t bno, /* directory block number */ - xfs_dabuf_t **bpp, /* out: leaf buffer */ + struct xfs_buf **bpp, /* out: leaf buffer */ int magic) /* magic number for block */ { - xfs_dabuf_t *bp; /* leaf buffer */ + struct xfs_buf *bp; /* leaf buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1156,7 +1182,7 @@ xfs_dir2_leaf_init( return error; } ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; /* * Initialize the header. */ @@ -1186,7 +1212,7 @@ xfs_dir2_leaf_init( static void xfs_dir2_leaf_log_bests( xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ { @@ -1195,12 +1221,12 @@ xfs_dir2_leaf_log_bests( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; - xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1210,7 +1236,7 @@ xfs_dir2_leaf_log_bests( void xfs_dir2_leaf_log_ents( xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ { @@ -1218,12 +1244,12 @@ xfs_dir2_leaf_log_ents( xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); firstlep = &leaf->ents[first]; lastlep = &leaf->ents[last]; - xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1232,15 +1258,15 @@ xfs_dir2_leaf_log_ents( */ void xfs_dir2_leaf_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), (uint)(sizeof(leaf->hdr) - 1)); } @@ -1249,18 +1275,18 @@ xfs_dir2_leaf_log_header( */ STATIC void xfs_dir2_leaf_log_tail( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ xfs_mount_t *mp; /* filesystem mount point */ mp = tp->t_mountp; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); - xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); } @@ -1273,12 +1299,12 @@ int xfs_dir2_leaf_lookup( xfs_da_args_t *args) /* operation arguments */ { - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* found entry index */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ @@ -1294,7 +1320,7 @@ xfs_dir2_leaf_lookup( tp = args->trans; dp = args->dp; xfs_dir2_leaf_check(dp, lbp); - leaf = lbp->data; + leaf = lbp->b_addr; /* * Get to the leaf entry and contained data entry address. */ @@ -1303,15 +1329,15 @@ xfs_dir2_leaf_lookup( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + + ((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); /* * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); - xfs_da_brelse(tp, dbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(error); } @@ -1324,17 +1350,17 @@ xfs_dir2_leaf_lookup( static int /* error */ xfs_dir2_leaf_lookup_int( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t **lbpp, /* out: leaf buffer */ + struct xfs_buf **lbpp, /* out: leaf buffer */ int *indexp, /* out: index in leaf block */ - xfs_dabuf_t **dbpp) /* out: data buffer */ + struct xfs_buf **dbpp) /* out: data buffer */ { xfs_dir2_db_t curdb = -1; /* current data block number */ - xfs_dabuf_t *dbp = NULL; /* data buffer */ + struct xfs_buf *dbp = NULL; /* data buffer */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* index in leaf block */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ @@ -1354,7 +1380,7 @@ xfs_dir2_leaf_lookup_int( if (error) return error; *lbpp = lbp; - leaf = lbp->data; + leaf = lbp->b_addr; xfs_dir2_leaf_check(dp, lbp); /* * Look for the first leaf entry with our hash value. @@ -1382,12 +1408,12 @@ xfs_dir2_leaf_lookup_int( */ if (newdb != curdb) { if (dbp) - xfs_da_brelse(tp, dbp); + xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newdb), -1, &dbp, XFS_DATA_FORK); if (error) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } xfs_dir2_data_check(dp, dbp); @@ -1396,7 +1422,7 @@ xfs_dir2_leaf_lookup_int( /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *)((char *)dbp->data + + dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* * Compare name and if it's an exact match, return the index @@ -1424,12 +1450,12 @@ xfs_dir2_leaf_lookup_int( if (args->cmpresult == XFS_CMP_CASE) { ASSERT(cidb != -1); if (cidb != curdb) { - xfs_da_brelse(tp, dbp); + xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, cidb), -1, &dbp, XFS_DATA_FORK); if (error) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } } @@ -1441,8 +1467,8 @@ xfs_dir2_leaf_lookup_int( */ ASSERT(cidb == -1); if (dbp) - xfs_da_brelse(tp, dbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOENT); } @@ -1456,13 +1482,13 @@ xfs_dir2_leaf_removename( __be16 *bestsp; /* leaf block best freespace */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry structure */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_db_t i; /* temporary data block # */ int index; /* index into leaf entries */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ @@ -1483,8 +1509,8 @@ xfs_dir2_leaf_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = lbp->data; - hdr = dbp->data; + leaf = lbp->b_addr; + hdr = dbp->b_addr; xfs_dir2_data_check(dp, dbp); /* * Point to the leaf entry, use that to point to the data entry. @@ -1541,12 +1567,9 @@ xfs_dir2_leaf_removename( * Just go on, returning success, leaving the * empty block in place. */ - if (error == ENOSPC && args->total == 0) { - xfs_da_buf_done(dbp); + if (error == ENOSPC && args->total == 0) error = 0; - } xfs_dir2_leaf_check(dp, lbp); - xfs_da_buf_done(lbp); return error; } dbp = NULL; @@ -1577,10 +1600,9 @@ xfs_dir2_leaf_removename( /* * If the data block was not the first one, drop it. */ - else if (db != mp->m_dirdatablk && dbp != NULL) { - xfs_da_buf_done(dbp); + else if (db != mp->m_dirdatablk) dbp = NULL; - } + xfs_dir2_leaf_check(dp, lbp); /* * See if we can convert to block form. @@ -1595,12 +1617,12 @@ int /* error */ xfs_dir2_leaf_replace( xfs_da_args_t *args) /* operation arguments */ { - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* index of leaf entry */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ @@ -1614,7 +1636,7 @@ xfs_dir2_leaf_replace( return error; } dp = args->dp; - leaf = lbp->data; + leaf = lbp->b_addr; /* * Point to the leaf entry, get data address from it. */ @@ -1623,7 +1645,7 @@ xfs_dir2_leaf_replace( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + + ((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* @@ -1632,9 +1654,8 @@ xfs_dir2_leaf_replace( dep->inumber = cpu_to_be64(args->inumber); tp = args->trans; xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_da_buf_done(dbp); xfs_dir2_leaf_check(dp, lbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return 0; } @@ -1646,7 +1667,7 @@ xfs_dir2_leaf_replace( int /* index value */ xfs_dir2_leaf_search_hash( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp) /* leaf buffer */ + struct xfs_buf *lbp) /* leaf buffer */ { xfs_dahash_t hash=0; /* hash from this entry */ xfs_dahash_t hashwant; /* hash value looking for */ @@ -1656,7 +1677,7 @@ xfs_dir2_leaf_search_hash( xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ - leaf = lbp->data; + leaf = lbp->b_addr; #ifndef __KERNEL__ if (!leaf->hdr.count) return 0; @@ -1699,11 +1720,11 @@ xfs_dir2_leaf_search_hash( int /* error */ xfs_dir2_leaf_trim_data( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp, /* leaf buffer */ + struct xfs_buf *lbp, /* leaf buffer */ xfs_dir2_db_t db) /* data block number */ { __be16 *bestsp; /* leaf bests table */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1722,12 +1743,12 @@ xfs_dir2_leaf_trim_data( return error; } - leaf = lbp->data; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); #ifdef DEBUG { - struct xfs_dir2_data_hdr *hdr = dbp->data; + struct xfs_dir2_data_hdr *hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); ASSERT(be16_to_cpu(hdr->bestfree[0].length) == @@ -1741,7 +1762,7 @@ xfs_dir2_leaf_trim_data( */ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { ASSERT(error != ENOSPC); - xfs_da_brelse(tp, dbp); + xfs_trans_brelse(tp, dbp); return error; } /* @@ -1781,10 +1802,10 @@ xfs_dir2_node_to_leaf( xfs_da_args_t *args; /* operation arguments */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ - xfs_dabuf_t *fbp; /* buffer for freespace block */ + struct xfs_buf *fbp; /* buffer for freespace block */ xfs_fileoff_t fo; /* freespace file offset */ xfs_dir2_free_t *free; /* freespace structure */ - xfs_dabuf_t *lbp; /* buffer for leaf block */ + struct xfs_buf *lbp; /* buffer for leaf block */ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ @@ -1838,7 +1859,7 @@ xfs_dir2_node_to_leaf( if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize) return 0; lbp = state->path.blk[0].bp; - leaf = lbp->data; + leaf = lbp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * Read the freespace block. @@ -1847,7 +1868,7 @@ xfs_dir2_node_to_leaf( XFS_DATA_FORK))) { return error; } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(!free->hdr.firstdb); @@ -1857,7 +1878,7 @@ xfs_dir2_node_to_leaf( */ if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) > mp->m_dirblksize) { - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); return 0; } diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index b0f26780449d..6c7052406605 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -36,20 +36,20 @@ /* * Function declarations. */ -static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp); -static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index); +static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, + int index); #ifdef DEBUG -static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir2_leafn_check(dp, bp) #endif -static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s, - int start_s, xfs_dabuf_t *bp_d, int start_d, - int count); +static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s, + int start_s, struct xfs_buf *bp_d, + int start_d, int count); static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp, +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, int index, xfs_da_state_blk_t *dblk, int *rval); static int xfs_dir2_node_addname_int(xfs_da_args_t *args, @@ -60,16 +60,16 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args, */ STATIC void xfs_dir2_free_log_bests( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* freespace buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ { xfs_dir2_free_t *free; /* freespace structure */ - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, (uint)((char *)&free->bests[first] - (char *)free), (uint)((char *)&free->bests[last] - (char *)free + sizeof(free->bests[0]) - 1)); @@ -80,14 +80,14 @@ xfs_dir2_free_log_bests( */ static void xfs_dir2_free_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* freespace buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_free_t *free; /* freespace structure */ - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), + xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); } @@ -99,11 +99,11 @@ xfs_dir2_free_log_header( int /* error */ xfs_dir2_leaf_to_node( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp) /* leaf buffer */ + struct xfs_buf *lbp) /* leaf buffer */ { xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ - xfs_dabuf_t *fbp; /* freespace buffer */ + struct xfs_buf *fbp; /* freespace buffer */ xfs_dir2_db_t fdb; /* freespace block number */ xfs_dir2_free_t *free; /* freespace structure */ __be16 *from; /* pointer to freespace entry */ @@ -136,8 +136,8 @@ xfs_dir2_leaf_to_node( return error; } ASSERT(fbp != NULL); - free = fbp->data; - leaf = lbp->data; + free = fbp->b_addr; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * Initialize the freespace block header. @@ -164,7 +164,6 @@ xfs_dir2_leaf_to_node( xfs_dir2_leaf_log_header(tp, lbp); xfs_dir2_free_log_header(tp, fbp); xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); - xfs_da_buf_done(fbp); xfs_dir2_leafn_check(dp, lbp); return 0; } @@ -175,7 +174,7 @@ xfs_dir2_leaf_to_node( */ static int /* error */ xfs_dir2_leafn_add( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int index) /* insertion pt for new entry */ { @@ -195,7 +194,7 @@ xfs_dir2_leafn_add( dp = args->dp; mp = dp->i_mount; tp = args->trans; - leaf = bp->data; + leaf = bp->b_addr; /* * Quick check just to make sure we are not going to index @@ -261,15 +260,15 @@ xfs_dir2_leafn_add( */ void xfs_dir2_leafn_check( - xfs_inode_t *dp, /* incore directory inode */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_inode *dp, + struct xfs_buf *bp) { int i; /* leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ int stale; /* count of stale leaves */ - leaf = bp->data; + leaf = bp->b_addr; mp = dp->i_mount; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); @@ -291,12 +290,12 @@ xfs_dir2_leafn_check( */ xfs_dahash_t /* hash value */ xfs_dir2_leafn_lasthash( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (count) *count = be16_to_cpu(leaf->hdr.count); @@ -311,12 +310,12 @@ xfs_dir2_leafn_lasthash( */ STATIC int xfs_dir2_leafn_lookup_for_addname( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ xfs_dir2_db_t curdb = -1; /* current data block number */ xfs_dir2_db_t curfdb = -1; /* current free block number */ xfs_inode_t *dp; /* incore directory inode */ @@ -335,7 +334,7 @@ xfs_dir2_leafn_lookup_for_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); #ifdef __KERNEL__ ASSERT(be16_to_cpu(leaf->hdr.count) > 0); @@ -352,7 +351,7 @@ xfs_dir2_leafn_lookup_for_addname( /* If so, it's a free block buffer, get the block number. */ curbp = state->extrablk.bp; curfdb = state->extrablk.blkno; - free = curbp->data; + free = curbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); } length = xfs_dir2_data_entsize(args->namelen); @@ -394,7 +393,7 @@ xfs_dir2_leafn_lookup_for_addname( * If we had one before, drop it. */ if (curbp) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); /* * Read the free block. */ @@ -403,7 +402,7 @@ xfs_dir2_leafn_lookup_for_addname( -1, &curbp, XFS_DATA_FORK); if (error) return error; - free = curbp->data; + free = curbp->b_addr; ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); ASSERT((be32_to_cpu(free->hdr.firstdb) % @@ -424,7 +423,7 @@ xfs_dir2_leafn_lookup_for_addname( XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); return XFS_ERROR(EFSCORRUPTED); } curfdb = newfdb; @@ -459,12 +458,12 @@ out: */ STATIC int xfs_dir2_leafn_lookup_for_entry( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ xfs_dir2_db_t curdb = -1; /* current data block number */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ @@ -480,7 +479,7 @@ xfs_dir2_leafn_lookup_for_entry( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); #ifdef __KERNEL__ ASSERT(be16_to_cpu(leaf->hdr.count) > 0); @@ -525,7 +524,7 @@ xfs_dir2_leafn_lookup_for_entry( */ if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || curdb != state->extrablk.blkno)) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); /* * If needing the block that is saved with a CI match, * use it otherwise read in the new data block. @@ -547,7 +546,7 @@ xfs_dir2_leafn_lookup_for_entry( /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *)((char *)curbp->data + + dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* * Compare the entry and if it's an exact match, return @@ -559,7 +558,7 @@ xfs_dir2_leafn_lookup_for_entry( /* If there is a CI match block, drop it */ if (args->cmpresult != XFS_CMP_DIFFERENT && curdb != state->extrablk.blkno) - xfs_da_brelse(tp, state->extrablk.bp); + xfs_trans_brelse(tp, state->extrablk.bp); args->cmpresult = cmp; args->inumber = be64_to_cpu(dep->inumber); *indexp = index; @@ -567,7 +566,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.bp = curbp; state->extrablk.blkno = curdb; state->extrablk.index = (int)((char *)dep - - (char *)curbp->data); + (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); @@ -586,7 +585,7 @@ xfs_dir2_leafn_lookup_for_entry( } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); } } else { state->extravalid = 0; @@ -602,7 +601,7 @@ xfs_dir2_leafn_lookup_for_entry( */ int xfs_dir2_leafn_lookup_int( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ @@ -620,9 +619,9 @@ xfs_dir2_leafn_lookup_int( static void xfs_dir2_leafn_moveents( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp_s, /* source leaf buffer */ + struct xfs_buf *bp_s, /* source leaf buffer */ int start_s, /* source leaf index */ - xfs_dabuf_t *bp_d, /* destination leaf buffer */ + struct xfs_buf *bp_d, /* destination leaf buffer */ int start_d, /* destination leaf index */ int count) /* count of leaves to copy */ { @@ -640,8 +639,8 @@ xfs_dir2_leafn_moveents( return; } tp = args->trans; - leaf_s = bp_s->data; - leaf_d = bp_d->data; + leaf_s = bp_s->b_addr; + leaf_d = bp_d->b_addr; /* * If the destination index is not the end of the current * destination leaf entries, open up a hole in the destination @@ -702,14 +701,14 @@ xfs_dir2_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( - xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */ - xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */ + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ - leaf1 = leaf1_bp->data; - leaf2 = leaf2_bp->data; + leaf1 = leaf1_bp->b_addr; + leaf2 = leaf2_bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (be16_to_cpu(leaf1->hdr.count) > 0 && @@ -757,8 +756,8 @@ xfs_dir2_leafn_rebalance( blk1 = blk2; blk2 = tmp; } - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); #ifdef DEBUG oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); @@ -834,14 +833,14 @@ xfs_dir2_leafn_rebalance( static int /* error */ xfs_dir2_leafn_remove( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int index, /* leaf entry index */ xfs_da_state_blk_t *dblk, /* data block */ int *rval) /* resulting block needs join */ { xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -858,7 +857,7 @@ xfs_dir2_leafn_remove( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * Point to the entry we're removing. @@ -884,7 +883,7 @@ xfs_dir2_leafn_remove( * in the data block in case it changes. */ dbp = dblk->bp; - hdr = dbp->data; + hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); longest = be16_to_cpu(hdr->bestfree[0].length); needlog = needscan = 0; @@ -905,7 +904,7 @@ xfs_dir2_leafn_remove( */ if (longest < be16_to_cpu(hdr->bestfree[0].length)) { int error; /* error return value */ - xfs_dabuf_t *fbp; /* freeblock buffer */ + struct xfs_buf *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ int findex; /* index in freeblock entries */ xfs_dir2_free_t *free; /* freeblock structure */ @@ -920,7 +919,7 @@ xfs_dir2_leafn_remove( -1, &fbp, XFS_DATA_FORK))) { return error; } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(be32_to_cpu(free->hdr.firstdb) == xfs_dir2_free_max_bests(mp) * @@ -948,9 +947,7 @@ xfs_dir2_leafn_remove( * In this case just drop the buffer and some one else * will eventually get rid of the empty block. */ - else if (error == ENOSPC && args->total == 0) - xfs_da_buf_done(dbp); - else + else if (!(error == ENOSPC && args->total == 0)) return error; } /* @@ -1018,11 +1015,6 @@ xfs_dir2_leafn_remove( */ if (logfree) xfs_dir2_free_log_bests(tp, fbp, findex, findex); - /* - * Drop the buffer if we still have it. - */ - if (fbp) - xfs_da_buf_done(fbp); } xfs_dir2_leafn_check(dp, bp); /* @@ -1114,7 +1106,7 @@ xfs_dir2_leafn_toosmall( { xfs_da_state_blk_t *blk; /* leaf block */ xfs_dablk_t blkno; /* leaf block number */ - xfs_dabuf_t *bp; /* leaf buffer */ + struct xfs_buf *bp; /* leaf buffer */ int bytes; /* bytes in use */ int count; /* leaf live entry count */ int error; /* error return value */ @@ -1130,7 +1122,7 @@ xfs_dir2_leafn_toosmall( * to coalesce with a sibling. */ blk = &state->path.blk[state->path.active - 1]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); leaf = (xfs_dir2_leaf_t *)info; count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); @@ -1189,7 +1181,7 @@ xfs_dir2_leafn_toosmall( leaf = (xfs_dir2_leaf_t *)info; count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes = state->blocksize - (state->blocksize >> 2); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes -= count * (uint)sizeof(leaf->ents[0]); @@ -1198,7 +1190,7 @@ xfs_dir2_leafn_toosmall( */ if (bytes >= 0) break; - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); } /* * Didn't like either block, give up. @@ -1207,11 +1199,7 @@ xfs_dir2_leafn_toosmall( *action = 0; return 0; } - /* - * Done with the sibling leaf block here, drop the dabuf - * so path_shift can get it. - */ - xfs_da_buf_done(bp); + /* * Make altpath point to the block we want to keep (the lower * numbered block) and path point to the block we want to drop. @@ -1247,8 +1235,8 @@ xfs_dir2_leafn_unbalance( args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); - drop_leaf = drop_blk->bp->data; - save_leaf = save_blk->bp->data; + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* @@ -1356,13 +1344,13 @@ xfs_dir2_node_addname_int( { xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t dbno; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ int error; /* error return value */ xfs_dir2_db_t fbno; /* freespace block number */ - xfs_dabuf_t *fbp; /* freespace buffer */ + struct xfs_buf *fbp; /* freespace buffer */ int findex; /* freespace entry index */ xfs_dir2_free_t *free=NULL; /* freespace block structure */ xfs_dir2_db_t ifbno; /* initial freespace block no */ @@ -1390,7 +1378,7 @@ xfs_dir2_node_addname_int( * Remember initial freespace block number. */ ifbno = fblk->blkno; - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = fblk->index; /* @@ -1474,7 +1462,7 @@ xfs_dir2_node_addname_int( if (unlikely(fbp == NULL)) { continue; } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; } @@ -1492,7 +1480,7 @@ xfs_dir2_node_addname_int( /* * Drop the block. */ - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); fbp = NULL; if (fblk && fblk->bp) fblk->bp = NULL; @@ -1507,36 +1495,23 @@ xfs_dir2_node_addname_int( /* * Not allowed to allocate, return failure. */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || - args->total == 0) { - /* - * Drop the freespace buffer unless it came from our - * caller. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); - } + /* * Allocate and initialize the new data block. */ if (unlikely((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno)) || - (error = xfs_dir2_data_init(args, dbno, &dbp)))) { - /* - * Drop the freespace buffer unless it came from our - * caller. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + (error = xfs_dir2_data_init(args, dbno, &dbp)))) return error; - } + /* * If (somehow) we have a freespace block, get rid of it. */ if (fbp) - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); if (fblk && fblk->bp) fblk->bp = NULL; @@ -1547,10 +1522,9 @@ xfs_dir2_node_addname_int( fbno = xfs_dir2_db_to_fdb(mp, dbno); if (unlikely(error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { - xfs_da_buf_done(dbp); + XFS_DATA_FORK))) return error; - } + /* * If there wasn't a freespace block, the read will * return a NULL fbp. Allocate and initialize a new one. @@ -1598,7 +1572,7 @@ xfs_dir2_node_addname_int( * Initialize the new block to be empty, and remember * its first slot as our empty slot. */ - free = fbp->data; + free = fbp->b_addr; free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); free->hdr.firstdb = cpu_to_be32( (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * @@ -1606,7 +1580,7 @@ xfs_dir2_node_addname_int( free->hdr.nvalid = 0; free->hdr.nused = 0; } else { - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); } @@ -1639,7 +1613,7 @@ xfs_dir2_node_addname_int( * We haven't allocated the data entry yet so this will * change again. */ - hdr = dbp->data; + hdr = dbp->b_addr; free->bests[findex] = hdr->bestfree[0].length; logfree = 1; } @@ -1650,22 +1624,17 @@ xfs_dir2_node_addname_int( /* * If just checking, we succeeded. */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) { - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; - } + /* * Read the data block in. */ - if (unlikely( - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK))) { - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp, XFS_DATA_FORK); + if (error) return error; - } - hdr = dbp->data; + hdr = dbp->b_addr; logfree = 0; } ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length); @@ -1714,16 +1683,10 @@ xfs_dir2_node_addname_int( if (logfree) xfs_dir2_free_log_bests(tp, fbp, findex, findex); /* - * If the caller didn't hand us the freespace block, drop it. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); - /* * Return the data block and offset in args, then drop the data block. */ args->blkno = (xfs_dablk_t)dbno; args->index = be16_to_cpu(*tagp); - xfs_da_buf_done(dbp); return 0; } @@ -1761,22 +1724,23 @@ xfs_dir2_node_lookup( /* If a CI match, dup the actual name and return EEXIST */ xfs_dir2_data_entry_t *dep; - dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp-> - data + state->extrablk.index); + dep = (xfs_dir2_data_entry_t *) + ((char *)state->extrablk.bp->b_addr + + state->extrablk.index); rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); } /* * Release the btree blocks and leaf block. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } /* * Release the data block if we have it. */ if (state->extravalid && state->extrablk.bp) { - xfs_da_brelse(args->trans, state->extrablk.bp); + xfs_trans_brelse(args->trans, state->extrablk.bp); state->extrablk.bp = NULL; } xfs_da_state_free(state); @@ -1893,13 +1857,13 @@ xfs_dir2_node_replace( */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); - leaf = blk->bp->data; + leaf = blk->bp->b_addr; lep = &leaf->ents[blk->index]; ASSERT(state->extravalid); /* * Point to the data entry. */ - hdr = state->extrablk.bp->data; + hdr = state->extrablk.bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + @@ -1916,14 +1880,14 @@ xfs_dir2_node_replace( * Didn't find it, and we're holding a data block. Drop it. */ else if (state->extravalid) { - xfs_da_brelse(args->trans, state->extrablk.bp); + xfs_trans_brelse(args->trans, state->extrablk.bp); state->extrablk.bp = NULL; } /* * Release all the buffers in the cursor. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } xfs_da_state_free(state); @@ -1940,7 +1904,7 @@ xfs_dir2_node_trim_free( xfs_fileoff_t fo, /* free block number */ int *rvalp) /* out: did something */ { - xfs_dabuf_t *bp; /* freespace buffer */ + struct xfs_buf *bp; /* freespace buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_free_t *free; /* freespace structure */ @@ -1965,13 +1929,13 @@ xfs_dir2_node_trim_free( if (bp == NULL) { return 0; } - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); /* * If there are used entries, there's nothing to do. */ if (be32_to_cpu(free->hdr.nused) > 0) { - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); *rvalp = 0; return 0; } @@ -1987,7 +1951,7 @@ xfs_dir2_node_trim_free( * pieces. This is the last block of an extent. */ ASSERT(error != ENOSPC); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return error; } /* diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 067f403ecf8a..3523d3e15aa8 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -25,7 +25,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, - struct xfs_dabuf *bp); + struct xfs_buf *bp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); @@ -37,11 +37,11 @@ extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); extern int xfs_dir2_block_replace(struct xfs_da_args *args); extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, - struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); + struct xfs_buf *lbp, struct xfs_buf *dbp); /* xfs_dir2_data.c */ #ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); +extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir2_data_check(dp,bp) #endif @@ -51,43 +51,43 @@ xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, extern void xfs_dir2_data_freescan(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, - struct xfs_dabuf **bpp); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf **bpp); +extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); extern void xfs_dir2_data_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, +extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, +extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, - struct xfs_dabuf *dbp); + struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_dabuf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, + struct xfs_buf *bp); +extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_dabuf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf **bpp, int magic); +extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); + struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, - struct xfs_dabuf *lbp); + struct xfs_buf *lbp); extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, - struct xfs_dabuf *lbp, xfs_dir2_db_t db); + struct xfs_buf *lbp, xfs_dir2_db_t db); extern struct xfs_dir2_leaf_entry * xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact, int lowstale, int highstale, @@ -96,13 +96,13 @@ extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, - struct xfs_dabuf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); -extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, + struct xfs_buf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, struct xfs_da_args *args, int *indexp, struct xfs_da_state *state); -extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); +extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); extern int xfs_dir2_leafn_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); @@ -122,7 +122,7 @@ extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep); extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); -extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, int size, xfs_dir2_sf_hdr_t *sfhp); extern int xfs_dir2_sf_addname(struct xfs_da_args *args); extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 19bf0c5e38f4..1b9fc3ec7e4b 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -222,7 +222,7 @@ xfs_dir2_block_sfsize( int /* error */ xfs_dir2_block_to_sf( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_buf *bp, int size, /* shortform directory size */ xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ { @@ -249,7 +249,7 @@ xfs_dir2_block_to_sf( * and add local data. */ hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(hdr, bp->data, mp->m_dirblksize); + memcpy(hdr, bp->b_addr, mp->m_dirblksize); logflags = XFS_ILOG_CORE; if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { ASSERT(error != ENOSPC); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9f7ec15a6522..56afcdb2377d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -236,7 +236,6 @@ xfs_file_aio_read( ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; - unsigned long seg; XFS_STATS_INC(xs_read_calls); @@ -247,19 +246,9 @@ xfs_file_aio_read( if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - /* START copy & waste from filemap.c */ - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - size += iv->iov_len; - if (unlikely((ssize_t)(size|iv->iov_len) < 0)) - return XFS_ERROR(-EINVAL); - } - /* END copy & waste from filemap.c */ + ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); + if (ret < 0) + return ret; if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = @@ -273,7 +262,7 @@ xfs_file_aio_read( } } - n = XFS_MAXIOFFSET(mp) - iocb->ki_pos; + n = mp->m_super->s_maxbytes - iocb->ki_pos; if (n <= 0 || size == 0) return 0; @@ -781,10 +770,12 @@ xfs_file_aio_write( if (ocount == 0) return 0; - xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ret = -EIO; + goto out; + } if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); @@ -803,6 +794,8 @@ xfs_file_aio_write( ret = err; } +out: + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 177a21a7ac49..21e37b55f7e5 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -442,14 +442,13 @@ xfs_ialloc_next_ag( * Select an allocation group to look for a free inode in, based on the parent * inode and then mode. Return the allocation group buffer. */ -STATIC xfs_buf_t * /* allocation group buffer */ +STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ umode_t mode, /* bits set to indicate file type */ int okalloc) /* ok to allocate more space */ { - xfs_buf_t *agbp; /* allocation group header buffer */ xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ int flags; /* alloc buffer locking flags */ @@ -459,6 +458,7 @@ xfs_ialloc_ag_select( int needspace; /* file mode implies space allocated */ xfs_perag_t *pag; /* per allocation group data */ xfs_agnumber_t pagno; /* parent (starting) ag number */ + int error; /* * Files of these types need at least one block if length > 0 @@ -474,7 +474,9 @@ xfs_ialloc_ag_select( if (pagno >= agcount) pagno = 0; } + ASSERT(pagno < agcount); + /* * Loop through allocation groups, looking for one with a little * free space in it. Note we don't look for free inodes, exactly. @@ -486,51 +488,45 @@ xfs_ialloc_ag_select( flags = XFS_ALLOC_FLAG_TRYLOCK; for (;;) { pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + if (!pag->pagi_init) { - if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) goto nextag; - } - } else - agbp = NULL; + } - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto unlock_nextag; + if (pag->pagi_freecount) { + xfs_perag_put(pag); + return agno; } - /* - * Is there enough free space for the file plus a block - * of inodes (if we need to allocate some)? - */ - ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp); - if (ineed && !pag->pagf_init) { - if (agbp == NULL && - xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; + if (!okalloc) + goto nextag; + + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, agno, flags); + if (error) goto nextag; - } - (void)xfs_alloc_pagf_init(mp, tp, agno, flags); } - if (!ineed || pag->pagf_init) { - if (ineed && !(longest = pag->pagf_longest)) - longest = pag->pagf_flcount > 0; - if (!ineed || - (pag->pagf_freeblks >= needspace + ineed && - longest >= ineed && - okalloc)) { - if (agbp == NULL && - xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; - goto nextag; - } - xfs_perag_put(pag); - return agbp; - } + + /* + * Is there enough free space for the file plus a block of + * inodes? (if we need to allocate some)? + */ + ineed = XFS_IALLOC_BLOCKS(mp); + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + + if (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed) { + xfs_perag_put(pag); + return agno; } -unlock_nextag: - if (agbp) - xfs_trans_brelse(tp, agbp); nextag: xfs_perag_put(pag); /* @@ -538,13 +534,13 @@ nextag: * down. */ if (XFS_FORCED_SHUTDOWN(mp)) - return NULL; + return NULLAGNUMBER; agno++; if (agno >= agcount) agno = 0; if (agno == pagno) { if (flags == 0) - return NULL; + return NULLAGNUMBER; flags = 0; } } @@ -607,195 +603,39 @@ xfs_ialloc_get_rec( } /* - * Visible inode allocation functions. - */ -/* - * Find a free (set) bit in the inode bitmask. - */ -static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) -{ - return xfs_lowbit64(*fp); -} - -/* - * Allocate an inode on disk. - * Mode is used to tell whether the new inode will need space, and whether - * it is a directory. - * - * The arguments IO_agbp and alloc_done are defined to work within - * the constraint of one allocation per transaction. - * xfs_dialloc() is designed to be called twice if it has to do an - * allocation to make more free inodes. On the first call, - * IO_agbp should be set to NULL. If an inode is available, - * i.e., xfs_dialloc() did not need to do an allocation, an inode - * number is returned. In this case, IO_agbp would be set to the - * current ag_buf and alloc_done set to false. - * If an allocation needed to be done, xfs_dialloc would return - * the current ag_buf in IO_agbp and set alloc_done to true. - * The caller should then commit the current transaction, allocate a new - * transaction, and call xfs_dialloc() again, passing in the previous - * value of IO_agbp. IO_agbp should be held across the transactions. - * Since the agbp is locked across the two calls, the second call is - * guaranteed to have a free inode available. + * Allocate an inode. * - * Once we successfully pick an inode its number is returned and the - * on-disk data structures are updated. The inode itself is not read - * in, since doing so would break ordering constraints with xfs_reclaim. + * The caller selected an AG for us, and made sure that free inodes are + * available. */ -int -xfs_dialloc( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t parent, /* parent inode (directory) */ - umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ - xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ - boolean_t *alloc_done, /* true if we needed to replenish - inode freelist */ - xfs_ino_t *inop) /* inode number allocated */ +STATIC int +xfs_dialloc_ag( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) { - xfs_agnumber_t agcount; /* number of allocation groups */ - xfs_buf_t *agbp; /* allocation group header's buffer */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agi_t *agi; /* allocation group header structure */ - xfs_btree_cur_t *cur; /* inode allocation btree cursor */ - int error; /* error return value */ - int i; /* result code */ - int ialloced; /* inode allocation status */ - int noroom = 0; /* no space for inode blk allocation */ - xfs_ino_t ino; /* fs-relative inode to be returned */ - /* REFERENCED */ - int j; /* result code */ - xfs_mount_t *mp; /* file system mount structure */ - int offset; /* index of inode in chunk */ - xfs_agino_t pagino; /* parent's AG relative inode # */ - xfs_agnumber_t pagno; /* parent's AG number */ - xfs_inobt_rec_incore_t rec; /* inode allocation record */ - xfs_agnumber_t tagno; /* testing allocation group number */ - xfs_btree_cur_t *tcur; /* temp cursor */ - xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ - struct xfs_perag *pag; - - - if (*IO_agbp == NULL) { - /* - * We do not have an agbp, so select an initial allocation - * group for inode allocation. - */ - agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); - /* - * Couldn't find an allocation group satisfying the - * criteria, give up. - */ - if (!agbp) { - *inop = NULLFSINO; - return 0; - } - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - } else { - /* - * Continue where we left off before. In this case, we - * know that the allocation group has free inodes. - */ - agbp = *IO_agbp; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - } - mp = tp->t_mountp; - agcount = mp->m_sb.sb_agcount; - agno = be32_to_cpu(agi->agi_seqno); - tagno = agno; - pagno = XFS_INO_TO_AGNO(mp, parent); - pagino = XFS_INO_TO_AGINO(mp, parent); - - /* - * If we have already hit the ceiling of inode blocks then clear - * okalloc so we scan all available agi structures for a free - * inode. - */ - - if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { - noroom = 1; - okalloc = 0; - } + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur, *tcur; + struct xfs_inobt_rec_incore rec, trec; + xfs_ino_t ino; + int error; + int offset; + int i, j; - /* - * Loop until we find an allocation group that either has free inodes - * or in which we can allocate some inodes. Iterate through the - * allocation groups upward, wrapping at the end. - */ - *alloc_done = B_FALSE; - while (!agi->agi_freecount) { - /* - * Don't do anything if we're not supposed to allocate - * any blocks, just go on to the next ag. - */ - if (okalloc) { - /* - * Try to allocate some new inodes in the allocation - * group. - */ - if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { - xfs_trans_brelse(tp, agbp); - if (error == ENOSPC) { - *inop = NULLFSINO; - return 0; - } else - return error; - } - if (ialloced) { - /* - * We successfully allocated some inodes, return - * the current context to the caller so that it - * can commit the current transaction and call - * us again where we left off. - */ - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - *alloc_done = B_TRUE; - *IO_agbp = agbp; - *inop = NULLFSINO; - return 0; - } - } - /* - * If it failed, give up on this ag. - */ - xfs_trans_brelse(tp, agbp); - /* - * Go on to the next ag: get its ag header. - */ -nextag: - if (++tagno == agcount) - tagno = 0; - if (tagno == agno) { - *inop = NULLFSINO; - return noroom ? ENOSPC : 0; - } - pag = xfs_perag_get(mp, tagno); - if (pag->pagi_inodeok == 0) { - xfs_perag_put(pag); - goto nextag; - } - error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); - xfs_perag_put(pag); - if (error) - goto nextag; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - } - /* - * Here with an allocation group that has a free inode. - * Reset agno since we may have chosen a new ag in the - * loop above. - */ - agno = tagno; - *IO_agbp = NULL; pag = xfs_perag_get(mp, agno); + ASSERT(pag->pagi_init); + ASSERT(pag->pagi_inodeok); + ASSERT(pag->pagi_freecount > 0); + restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -995,7 +835,7 @@ newino: } alloc_inode: - offset = xfs_ialloc_find_free(&rec.ir_free); + offset = xfs_lowbit64(rec.ir_free); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1028,6 +868,164 @@ error0: } /* + * Allocate an inode on disk. + * + * Mode is used to tell whether the new inode will need space, and whether it + * is a directory. + * + * This function is designed to be called twice if it has to do an allocation + * to make more free inodes. On the first call, *IO_agbp should be set to NULL. + * If an inode is available without having to performn an allocation, an inode + * number is returned. In this case, *IO_agbp would be NULL. If an allocation + * needes to be done, xfs_dialloc would return the current AGI buffer in + * *IO_agbp. The caller should then commit the current transaction, allocate a + * new transaction, and call xfs_dialloc() again, passing in the previous value + * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI + * buffer is locked across the two calls, the second call is guaranteed to have + * a free inode available. + * + * Once we successfully pick an inode its number is returned and the on-disk + * data structures are updated. The inode itself is not read in, since doing so + * would break ordering constraints with xfs_reclaim. + */ +int +xfs_dialloc( + struct xfs_trans *tp, + xfs_ino_t parent, + umode_t mode, + int okalloc, + struct xfs_buf **IO_agbp, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno; + int error; + int ialloced; + int noroom = 0; + xfs_agnumber_t start_agno; + struct xfs_perag *pag; + + if (*IO_agbp) { + /* + * If the caller passes in a pointer to the AGI buffer, + * continue where we left off before. In this case, we + * know that the allocation group has free inodes. + */ + agbp = *IO_agbp; + goto out_alloc; + } + + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + if (start_agno == NULLAGNUMBER) { + *inop = NULLFSINO; + return 0; + } + + /* + * If we have already hit the ceiling of inode blocks then clear + * okalloc so we scan all available agi structures for a free + * inode. + */ + if (mp->m_maxicount && + mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + noroom = 1; + okalloc = 0; + } + + /* + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. + */ + agno = start_agno; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) + goto out_error; + } + + /* + * Do a first racy fast path check if this AG is usable. + */ + if (!pag->pagi_freecount && !okalloc) + goto nextag; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + goto out_error; + + /* + * Once the AGI has been read in we have to recheck + * pagi_freecount with the AGI buffer lock held. + */ + if (pag->pagi_freecount) { + xfs_perag_put(pag); + goto out_alloc; + } + + if (!okalloc) { + xfs_trans_brelse(tp, agbp); + goto nextag; + } + + error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); + if (error) { + xfs_trans_brelse(tp, agbp); + + if (error != ENOSPC) + goto out_error; + + xfs_perag_put(pag); + *inop = NULLFSINO; + return 0; + } + + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(pag->pagi_freecount > 0); + xfs_perag_put(pag); + + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; + } + +nextag: + xfs_perag_put(pag); + if (++agno == mp->m_sb.sb_agcount) + agno = 0; + if (agno == start_agno) { + *inop = NULLFSINO; + return noroom ? ENOSPC : 0; + } + } + +out_alloc: + *IO_agbp = NULL; + return xfs_dialloc_ag(tp, agbp, parent, inop); +out_error: + xfs_perag_put(pag); + return XFS_ERROR(error); +} + +/* * Free disk inode. Carefully avoids touching the incore inode, all * manipulations incore are the caller's responsibility. * The on-disk inode is not changed by this operation, only the diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 65ac57c8063c..1fd6ea4e9c91 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -75,8 +75,6 @@ xfs_dialloc( umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ - boolean_t *alloc_done, /* an allocation was done to replenish - the free inodes */ xfs_ino_t *inop); /* inode number allocated */ /* diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 1bb4365e8c25..784a803383ec 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -41,17 +41,6 @@ /* - * Define xfs inode iolock lockdep classes. We need to ensure that all active - * inodes are considered the same for lockdep purposes, including inodes that - * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to - * guarantee the locks are considered the same when there are multiple lock - * initialisation siteѕ. Also, define a reclaimable inode class so it is - * obvious in lockdep reports which class the report is against. - */ -static struct lock_class_key xfs_iolock_active; -struct lock_class_key xfs_iolock_reclaimable; - -/* * Allocate and initialise an xfs_inode. */ STATIC struct xfs_inode * @@ -80,8 +69,6 @@ xfs_inode_alloc( ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_active, "xfs_iolock_active"); /* initialise the xfs inode */ ip->i_ino = ino; @@ -250,8 +237,6 @@ xfs_iget_cache_hit( ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_active, "xfs_iolock_active"); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a59eea09930a..2778258fcfa2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -132,23 +132,28 @@ xfs_inobp_check( #endif /* - * Find the buffer associated with the given inode map - * We do basic validation checks on the buffer once it has been - * retrieved from disk. + * This routine is called to map an inode to the buffer containing the on-disk + * version of the inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dipp parameter it returns a + * pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and dipp are + * undefined. */ -STATIC int +int xfs_imap_to_bp( - xfs_mount_t *mp, - xfs_trans_t *tp, - struct xfs_imap *imap, - xfs_buf_t **bpp, - uint buf_flags, - uint iget_flags) + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + uint buf_flags, + uint iget_flags) { - int error; - int i; - int ni; - xfs_buf_t *bp; + struct xfs_buf *bp; + int error; + int i; + int ni; buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, @@ -189,8 +194,8 @@ xfs_imap_to_bp( xfs_trans_brelse(tp, bp); return XFS_ERROR(EINVAL); } - XFS_CORRUPTION_ERROR("xfs_imap_to_bp", - XFS_ERRLEVEL_HIGH, mp, dip); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, + mp, dip); #ifdef DEBUG xfs_emerg(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", @@ -204,96 +209,9 @@ xfs_imap_to_bp( } xfs_inobp_check(mp, bp); - *bpp = bp; - return 0; -} - -/* - * This routine is called to map an inode number within a file - * system to the buffer containing the on-disk version of the - * inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dip parameter - * it returns a pointer to the on-disk inode within that buffer. - * - * If a non-zero error is returned, then the contents of bpp and - * dipp are undefined. - * - * Use xfs_imap() to determine the size and location of the - * buffer to read from disk. - */ -int -xfs_inotobp( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - xfs_dinode_t **dipp, - xfs_buf_t **bpp, - int *offset, - uint imap_flags) -{ - struct xfs_imap imap; - xfs_buf_t *bp; - int error; - - imap.im_blkno = 0; - error = xfs_imap(mp, tp, ino, &imap, imap_flags); - if (error) - return error; - - error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags); - if (error) - return error; - - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); - *bpp = bp; - *offset = imap.im_boffset; - return 0; -} - - -/* - * This routine is called to map an inode to the buffer containing - * the on-disk version of the inode. It returns a pointer to the - * buffer containing the on-disk inode in the bpp parameter, and in - * the dip parameter it returns a pointer to the on-disk inode within - * that buffer. - * - * If a non-zero error is returned, then the contents of bpp and - * dipp are undefined. - * - * The inode is expected to already been mapped to its buffer and read - * in once, thus we can use the mapping information stored in the inode - * rather than calling xfs_imap(). This allows us to avoid the overhead - * of looking at the inode btree for small block file systems - * (see xfs_imap()). - */ -int -xfs_itobp( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dinode_t **dipp, - xfs_buf_t **bpp, - uint buf_flags) -{ - xfs_buf_t *bp; - int error; - - ASSERT(ip->i_imap.im_blkno != 0); - - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0); - if (error) - return error; - if (!bp) { - ASSERT(buf_flags & XBF_TRYLOCK); - ASSERT(tp == NULL); - *bpp = NULL; - return EAGAIN; - } - - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); *bpp = bp; + *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); return 0; } @@ -796,10 +714,9 @@ xfs_iread( /* * Get pointers to the on-disk inode and the buffer containing it. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); if (error) return error; - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); /* * If we got something that isn't an inode it means someone @@ -876,7 +793,7 @@ xfs_iread( /* * Use xfs_trans_brelse() to release the buffer containing the * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_itobp() above. If tp is NULL, this is just a normal + * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal * brelse(). If we're within a transaction, then xfs_trans_brelse() * will only release the buffer if it is not dirty within the * transaction. It will be OK to release the buffer in this case, @@ -970,7 +887,6 @@ xfs_ialloc( prid_t prid, int okalloc, xfs_buf_t **ialloc_context, - boolean_t *call_again, xfs_inode_t **ipp) { xfs_ino_t ino; @@ -985,10 +901,10 @@ xfs_ialloc( * the on-disk inode to be allocated. */ error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, - ialloc_context, call_again, &ino); + ialloc_context, &ino); if (error) return error; - if (*call_again || ino == NULLFSINO) { + if (*ialloc_context || ino == NULLFSINO) { *ipp = NULL; return 0; } @@ -1207,7 +1123,9 @@ xfs_itruncate_extents( int error = 0; int done = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!atomic_read(&VFS_I(ip)->i_count) || + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); @@ -1226,7 +1144,7 @@ xfs_itruncate_extents( * then there is nothing to do. */ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (first_unmap_block == last_block) return 0; @@ -1355,7 +1273,8 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) return error; @@ -1429,16 +1348,16 @@ xfs_iunlink_remove( if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { /* - * We're at the head of the list. Get the inode's - * on-disk buffer to see if there is anyone after us - * on the list. Only modify our next pointer if it - * is not already NULLAGINO. This saves us the overhead - * of dealing with the buffer when there is no need to - * change it. + * We're at the head of the list. Get the inode's on-disk + * buffer to see if there is anyone after us on the list. + * Only modify our next pointer if it is not already NULLAGINO. + * This saves us the overhead of dealing with the buffer when + * there is no need to change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) { - xfs_warn(mp, "%s: xfs_itobp() returned error %d.", + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); return error; } @@ -1472,34 +1391,45 @@ xfs_iunlink_remove( next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); last_ibp = NULL; while (next_agino != agino) { - /* - * If the last inode wasn't the one pointing to - * us, then release its buffer since we're not - * going to do anything with it. - */ - if (last_ibp != NULL) { + struct xfs_imap imap; + + if (last_ibp) xfs_trans_brelse(tp, last_ibp); - } + + imap.im_blkno = 0; next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); - error = xfs_inotobp(mp, tp, next_ino, &last_dip, - &last_ibp, &last_offset, 0); + + error = xfs_imap(mp, tp, next_ino, &imap, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } + + error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, + &last_ibp, 0, 0); if (error) { xfs_warn(mp, - "%s: xfs_inotobp() returned error %d.", + "%s: xfs_imap_to_bp returned error %d.", __func__, error); return error; } + + last_offset = imap.im_boffset; next_agino = be32_to_cpu(last_dip->di_next_unlinked); ASSERT(next_agino != NULLAGINO); ASSERT(next_agino != 0); } + /* - * Now last_ibp points to the buffer previous to us on - * the unlinked list. Pull us from the list. + * Now last_ibp points to the buffer previous to us on the + * unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) { - xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", + xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", __func__, error); return error; } @@ -1749,7 +1679,8 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) return error; @@ -2428,7 +2359,7 @@ xfs_iflush( /* * For stale inodes we cannot rely on the backing buffer remaining * stale in cache for the remaining life of the stale inode and so - * xfs_itobp() below may give us a buffer that no longer contains + * xfs_imap_to_bp() below may give us a buffer that no longer contains * inodes below. We have to check this after ensuring the inode is * unpinned so that it is safe to reclaim the stale inode after the * flush call. @@ -2454,7 +2385,8 @@ xfs_iflush( /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, + 0); if (error || !bp) { xfs_ifunlock(ip); return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1efff36a75b6..94b32f906e79 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -487,8 +487,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) -extern struct lock_class_key xfs_iolock_reclaimable; - /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and @@ -517,7 +515,7 @@ void xfs_inode_free(struct xfs_inode *ip); */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, boolean_t *, xfs_inode_t **); + struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); uint xfs_dic2xflags(struct xfs_dinode *); @@ -557,12 +555,9 @@ do { \ #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 -int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, - xfs_ino_t, struct xfs_dinode **, - struct xfs_buf **, int *, uint); -int xfs_itobp(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, struct xfs_dinode **, - struct xfs_buf **, uint); +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_dinode_to_disk(struct xfs_dinode *, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f1535d25a9b..0e0232c3b6d9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -364,9 +364,15 @@ xfs_fssetdm_by_handle( if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(parfilp); + if (error) + return error; + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + mnt_drop_write_file(parfilp); return PTR_ERR(dentry); + } if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); @@ -382,6 +388,7 @@ xfs_fssetdm_by_handle( fsd.fsd_dmstate); out: + mnt_drop_write_file(parfilp); dput(dentry); return error; } @@ -634,7 +641,11 @@ xfs_ioc_space( if (ioflags & IO_INVIS) attr_flags |= XFS_ATTR_DMI; + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + mnt_drop_write_file(filp); return -error; } @@ -1163,6 +1174,7 @@ xfs_ioc_fssetxattr( { struct fsxattr fa; unsigned int mask; + int error; if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; @@ -1171,7 +1183,12 @@ xfs_ioc_fssetxattr( if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) mask |= FSX_NONBLOCK; - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1196,6 +1213,7 @@ xfs_ioc_setxflags( struct fsxattr fa; unsigned int flags; unsigned int mask; + int error; if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -1210,7 +1228,12 @@ xfs_ioc_setxflags( mask |= FSX_NONBLOCK; fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1385,8 +1408,13 @@ xfs_file_ioctl( if (copy_from_user(&dmi, arg, sizeof(dmi))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, dmi.fsd_dmstate); + mnt_drop_write_file(filp); return -error; } @@ -1434,7 +1462,11 @@ xfs_file_ioctl( if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } @@ -1463,9 +1495,14 @@ xfs_file_ioctl( if (copy_from_user(&inout, arg, sizeof(inout))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + /* input parameter is passed in resblks field of structure */ in = inout.resblks; error = xfs_reserve_blocks(mp, &in, &inout); + mnt_drop_write_file(filp); if (error) return -error; @@ -1496,7 +1533,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1506,7 +1547,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_log(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1516,7 +1561,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4f2da0d2bf5..1244274a5674 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -600,7 +600,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_data_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSGROWFSRT_32: { @@ -608,7 +612,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_rt_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } #endif @@ -627,7 +635,11 @@ xfs_file_compat_ioctl( offsetof(struct xfs_swapext, sx_stat)) || xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSBULKSTAT_32: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index aadfce6681ee..973dff6ad935 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -285,7 +285,7 @@ xfs_iomap_eof_want_preallocate( * do any speculative allocation. */ start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; @@ -416,8 +416,8 @@ retry: * Make sure preallocation does not create extents beyond the range we * actually support in this filesystem. */ - if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset)) - last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset); + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); ASSERT(last_fsb > offset_fsb); @@ -680,9 +680,9 @@ xfs_iomap_write_unwritten( * the same inode that we complete here and might deadlock * on the iolock. */ - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + sb_start_intwrite(mp->m_super); tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE; + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 9c4340f5c3e0..4e00cf091d2c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -897,6 +897,47 @@ xfs_vn_setattr( return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0); } +STATIC int +xfs_vn_update_time( + struct inode *inode, + struct timespec *now, + int flags) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + trace_xfs_update_time(ip); + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (flags & S_CTIME) { + inode->i_ctime = *now; + ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_MTIME) { + inode->i_mtime = *now; + ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_ATIME) { + inode->i_atime = *now; + ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; + } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + return -xfs_trans_commit(tp, 0); +} + #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -991,6 +1032,7 @@ static const struct inode_operations xfs_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .fiemap = xfs_vn_fiemap, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_dir_inode_operations = { @@ -1016,6 +1058,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_dir_ci_inode_operations = { @@ -1041,6 +1084,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_symlink_inode_operations = { @@ -1054,6 +1098,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; STATIC void diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index eff577a9b67f..01d10a66e302 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -555,7 +555,7 @@ xfs_bulkstat_single( /* * note that requesting valid inode numbers which are not allocated - * to inodes will most likely cause xfs_itobp to generate warning + * to inodes will most likely cause xfs_imap_to_bp to generate warning * messages about bad magic numbers. This is ok. The fact that * the inode isn't actually an inode is handled by the * error check below. Done this way to make the usual case faster diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d90d4a388609..7f4f9370d0e7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -45,51 +45,85 @@ xlog_commit_record( struct xlog_in_core **iclog, xfs_lsn_t *commitlsnp); -STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks); +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks); STATIC int xlog_space_left( struct xlog *log, atomic64_t *head); -STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); -STATIC void xlog_dealloc_log(xlog_t *log); +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_dealloc_log( + struct xlog *log); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); -STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog); -STATIC int xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclog, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp); -STATIC int xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog); -STATIC void xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size); -STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *iclog); +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclog, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp); +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size); +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog); STATIC void xlog_grant_push_ail( - struct xlog *log, - int need_bytes); -STATIC void xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket); -STATIC void xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket); + struct xlog *log, + int need_bytes); +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket); +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket); #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); +STATIC void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr); STATIC void xlog_verify_grant_tail( - struct xlog *log); -STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, - int count, boolean_t syncing); -STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn); + struct xlog *log); +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + boolean_t syncing); +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) @@ -97,7 +131,9 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, #define xlog_verify_tail_lsn(a,b,c) #endif -STATIC int xlog_iclogs_empty(xlog_t *log); +STATIC int +xlog_iclogs_empty( + struct xlog *log); static void xlog_grant_sub_space( @@ -684,7 +720,7 @@ xfs_log_mount_finish(xfs_mount_t *mp) int xfs_log_unmount_write(xfs_mount_t *mp) { - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; xlog_in_core_t *iclog; #ifdef DEBUG xlog_in_core_t *first_iclog; @@ -893,7 +929,7 @@ int xfs_log_need_covered(xfs_mount_t *mp) { int needed = 0; - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; if (!xfs_fs_writable(mp)) return 0; @@ -1024,9 +1060,9 @@ xlog_space_left( void xlog_iodone(xfs_buf_t *bp) { - xlog_in_core_t *iclog = bp->b_fspriv; - xlog_t *l = iclog->ic_log; - int aborted = 0; + struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog *l = iclog->ic_log; + int aborted = 0; /* * Race to shutdown the filesystem if we see an error. @@ -1067,8 +1103,9 @@ xlog_iodone(xfs_buf_t *bp) */ STATIC void -xlog_get_iclog_buffer_size(xfs_mount_t *mp, - xlog_t *log) +xlog_get_iclog_buffer_size( + struct xfs_mount *mp, + struct xlog *log) { int size; int xhdrs; @@ -1129,13 +1166,14 @@ done: * Its primary purpose is to fill in enough, so recovery can occur. However, * some other stuff may be filled in too. */ -STATIC xlog_t * -xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { - xlog_t *log; + struct xlog *log; xlog_rec_header_t *head; xlog_in_core_t **iclogp; xlog_in_core_t *iclog, *prev_iclog=NULL; @@ -1144,7 +1182,7 @@ xlog_alloc_log(xfs_mount_t *mp, int error = ENOMEM; uint log2_size = 0; - log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); + log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); if (!log) { xfs_warn(mp, "Log allocation failed: No memory!"); goto out; @@ -1434,8 +1472,9 @@ xlog_bdstrat( */ STATIC int -xlog_sync(xlog_t *log, - xlog_in_core_t *iclog) +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog) { xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; @@ -1584,7 +1623,8 @@ xlog_sync(xlog_t *log, * Deallocate a log structure */ STATIC void -xlog_dealloc_log(xlog_t *log) +xlog_dealloc_log( + struct xlog *log) { xlog_in_core_t *iclog, *next_iclog; int i; @@ -1616,10 +1656,11 @@ xlog_dealloc_log(xlog_t *log) */ /* ARGSUSED */ static inline void -xlog_state_finish_copy(xlog_t *log, - xlog_in_core_t *iclog, - int record_cnt, - int copy_bytes) +xlog_state_finish_copy( + struct xlog *log, + struct xlog_in_core *iclog, + int record_cnt, + int copy_bytes) { spin_lock(&log->l_icloglock); @@ -2142,7 +2183,8 @@ xlog_write( * State Change: DIRTY -> ACTIVE */ STATIC void -xlog_state_clean_log(xlog_t *log) +xlog_state_clean_log( + struct xlog *log) { xlog_in_core_t *iclog; int changed = 0; @@ -2222,7 +2264,7 @@ xlog_state_clean_log(xlog_t *log) STATIC xfs_lsn_t xlog_get_lowest_lsn( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *lsn_log; xfs_lsn_t lowest_lsn, lsn; @@ -2245,9 +2287,9 @@ xlog_get_lowest_lsn( STATIC void xlog_state_do_callback( - xlog_t *log, - int aborted, - xlog_in_core_t *ciclog) + struct xlog *log, + int aborted, + struct xlog_in_core *ciclog) { xlog_in_core_t *iclog; xlog_in_core_t *first_iclog; /* used to know when we've @@ -2467,7 +2509,7 @@ xlog_state_done_syncing( xlog_in_core_t *iclog, int aborted) { - xlog_t *log = iclog->ic_log; + struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); @@ -2521,12 +2563,13 @@ xlog_state_done_syncing( * is copied. */ STATIC int -xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclogp, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp) +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclogp, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp) { int log_offset; xlog_rec_header_t *head; @@ -2631,8 +2674,9 @@ restart: * move grant reservation head forward. */ STATIC void -xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { trace_xfs_log_regrant_reserve_enter(log, ticket); @@ -2677,8 +2721,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, * in the current reservation field. */ STATIC void -xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { int bytes; @@ -2717,8 +2762,8 @@ xlog_ungrant_log_space(xlog_t *log, */ STATIC int xlog_state_release_iclog( - xlog_t *log, - xlog_in_core_t *iclog) + struct xlog *log, + struct xlog_in_core *iclog) { int sync = 0; /* do we sync? */ @@ -2768,9 +2813,10 @@ xlog_state_release_iclog( * that every data block. We have run out of space in this log record. */ STATIC void -xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size) +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); if (!eventual_size) @@ -3114,7 +3160,9 @@ xfs_log_force_lsn( * disk. */ STATIC void -xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog) { assert_spin_locked(&log->l_icloglock); @@ -3158,7 +3206,7 @@ xfs_log_ticket_get( /* * Allocate and initialise a new log ticket. */ -xlog_ticket_t * +struct xlog_ticket * xlog_ticket_alloc( struct xlog *log, int unit_bytes, @@ -3346,9 +3394,10 @@ xlog_verify_grant_tail( /* check if it will fit */ STATIC void -xlog_verify_tail_lsn(xlog_t *log, - xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn) +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn) { int blocks; @@ -3385,10 +3434,11 @@ xlog_verify_tail_lsn(xlog_t *log, * the cycle numbers agree with the current cycle number. */ STATIC void -xlog_verify_iclog(xlog_t *log, - xlog_in_core_t *iclog, - int count, - boolean_t syncing) +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + boolean_t syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3482,7 +3532,7 @@ xlog_verify_iclog(xlog_t *log, */ STATIC int xlog_state_ioerror( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *iclog, *ic; @@ -3527,7 +3577,7 @@ xfs_log_force_umount( struct xfs_mount *mp, int logerror) { - xlog_t *log; + struct xlog *log; int retval; log = mp->m_log; @@ -3634,7 +3684,8 @@ xfs_log_force_umount( } STATIC int -xlog_iclogs_empty(xlog_t *log) +xlog_iclogs_empty( + struct xlog *log) { xlog_in_core_t *iclog; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 72eba2201b14..18a801d76a42 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -487,7 +487,7 @@ struct xlog_grant_head { * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. */ -typedef struct xlog { +struct xlog { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ @@ -540,7 +540,7 @@ typedef struct xlog { char *l_iclog_bak[XLOG_MAX_ICLOGS]; #endif -} xlog_t; +}; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) @@ -548,9 +548,17 @@ typedef struct xlog { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ -extern int xlog_recover(xlog_t *log); -extern int xlog_recover_finish(xlog_t *log); -extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); +extern void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a7be98abd6a9..5da3ace352bf 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -43,10 +43,18 @@ #include "xfs_utils.h" #include "xfs_trace.h" -STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); -STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); #if defined(DEBUG) -STATIC void xlog_recover_check_summary(xlog_t *); +STATIC void +xlog_recover_check_summary( + struct xlog *); #else #define xlog_recover_check_summary(log) #endif @@ -74,7 +82,7 @@ struct xfs_buf_cancel { static inline int xlog_buf_bbcount_valid( - xlog_t *log, + struct xlog *log, int bbcount) { return bbcount > 0 && bbcount <= log->l_logBBsize; @@ -87,7 +95,7 @@ xlog_buf_bbcount_valid( */ STATIC xfs_buf_t * xlog_get_bp( - xlog_t *log, + struct xlog *log, int nbblks) { struct xfs_buf *bp; @@ -138,10 +146,10 @@ xlog_put_bp( */ STATIC xfs_caddr_t xlog_align( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); @@ -155,10 +163,10 @@ xlog_align( */ STATIC int xlog_bread_noalign( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; @@ -189,10 +197,10 @@ xlog_bread_noalign( STATIC int xlog_bread( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t *offset) { int error; @@ -211,10 +219,10 @@ xlog_bread( */ STATIC int xlog_bread_offset( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t offset) { xfs_caddr_t orig_offset = bp->b_addr; @@ -241,10 +249,10 @@ xlog_bread_offset( */ STATIC int xlog_bwrite( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; @@ -378,8 +386,8 @@ xlog_recover_iodone( */ STATIC int xlog_find_cycle_start( - xlog_t *log, - xfs_buf_t *bp, + struct xlog *log, + struct xfs_buf *bp, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) @@ -421,7 +429,7 @@ xlog_find_cycle_start( */ STATIC int xlog_find_verify_cycle( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, int nbblks, uint stop_on_cycle_no, @@ -490,7 +498,7 @@ out: */ STATIC int xlog_find_verify_log_record( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, xfs_daddr_t *last_blk, int extra_bblks) @@ -600,7 +608,7 @@ out: */ STATIC int xlog_find_head( - xlog_t *log, + struct xlog *log, xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; @@ -871,7 +879,7 @@ validate_head: */ STATIC int xlog_find_tail( - xlog_t *log, + struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { @@ -1080,7 +1088,7 @@ done: */ STATIC int xlog_find_zeroed( - xlog_t *log, + struct xlog *log, xfs_daddr_t *blk_no) { xfs_buf_t *bp; @@ -1183,7 +1191,7 @@ bp_err: */ STATIC void xlog_add_record( - xlog_t *log, + struct xlog *log, xfs_caddr_t buf, int cycle, int block, @@ -1205,7 +1213,7 @@ xlog_add_record( STATIC int xlog_write_log_records( - xlog_t *log, + struct xlog *log, int cycle, int start_block, int blocks, @@ -1305,7 +1313,7 @@ xlog_write_log_records( */ STATIC int xlog_clear_stale_blocks( - xlog_t *log, + struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; @@ -2050,11 +2058,11 @@ xfs_qm_dqcheck( */ STATIC void xlog_recover_do_dquot_buffer( - xfs_mount_t *mp, - xlog_t *log, - xlog_recover_item_t *item, - xfs_buf_t *bp, - xfs_buf_log_format_t *buf_f) + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint type; @@ -2108,9 +2116,9 @@ xlog_recover_do_dquot_buffer( */ STATIC int xlog_recover_buffer_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp = log->l_mp; @@ -2189,9 +2197,9 @@ xlog_recover_buffer_pass2( STATIC int xlog_recover_inode_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_inode_log_format_t *in_f; xfs_mount_t *mp = log->l_mp; @@ -2452,14 +2460,14 @@ error: } /* - * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * Recover QUOTAOFF records. We simply make a note of it in the xlog * structure, so that we know not to do any dquot item or dquot buffer recovery, * of that type. */ STATIC int xlog_recover_quotaoff_pass1( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); @@ -2483,9 +2491,9 @@ xlog_recover_quotaoff_pass1( */ STATIC int xlog_recover_dquot_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; @@ -2578,9 +2586,9 @@ xlog_recover_dquot_pass2( */ STATIC int xlog_recover_efi_pass2( - xlog_t *log, - xlog_recover_item_t *item, - xfs_lsn_t lsn) + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) { int error; xfs_mount_t *mp = log->l_mp; @@ -2616,8 +2624,8 @@ xlog_recover_efi_pass2( */ STATIC int xlog_recover_efd_pass2( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2812,9 +2820,9 @@ xlog_recover_unmount_trans( */ STATIC int xlog_recover_process_data( - xlog_t *log, + struct xlog *log, struct hlist_head rhash[], - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, int pass) { @@ -2986,7 +2994,7 @@ abort_error: */ STATIC int xlog_recover_process_efis( - xlog_t *log) + struct xlog *log) { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; @@ -3098,7 +3106,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); if (error) goto fail_iput; @@ -3147,7 +3155,7 @@ xlog_recover_process_one_iunlink( */ STATIC void xlog_recover_process_iunlinks( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agnumber_t agno; @@ -3209,9 +3217,9 @@ xlog_recover_process_iunlinks( #ifdef DEBUG STATIC void xlog_pack_data_checksum( - xlog_t *log, - xlog_in_core_t *iclog, - int size) + struct xlog *log, + struct xlog_in_core *iclog, + int size) { int i; __be32 *up; @@ -3234,8 +3242,8 @@ xlog_pack_data_checksum( */ void xlog_pack_data( - xlog_t *log, - xlog_in_core_t *iclog, + struct xlog *log, + struct xlog_in_core *iclog, int roundoff) { int i, j, k; @@ -3274,9 +3282,9 @@ xlog_pack_data( STATIC void xlog_unpack_data( - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, - xlog_t *log) + struct xlog *log) { int i, j, k; @@ -3299,8 +3307,8 @@ xlog_unpack_data( STATIC int xlog_valid_rec_header( - xlog_t *log, - xlog_rec_header_t *rhead, + struct xlog *log, + struct xlog_rec_header *rhead, xfs_daddr_t blkno) { int hlen; @@ -3343,7 +3351,7 @@ xlog_valid_rec_header( */ STATIC int xlog_do_recovery_pass( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass) @@ -3595,7 +3603,7 @@ xlog_do_recovery_pass( */ STATIC int xlog_do_log_recovery( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3646,7 +3654,7 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3721,7 +3729,7 @@ xlog_do_recover( */ int xlog_recover( - xlog_t *log) + struct xlog *log) { xfs_daddr_t head_blk, tail_blk; int error; @@ -3767,7 +3775,7 @@ xlog_recover( */ int xlog_recover_finish( - xlog_t *log) + struct xlog *log) { /* * Now we're ready to do the transactions needed for the @@ -3814,7 +3822,7 @@ xlog_recover_finish( */ void xlog_recover_check_summary( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agf_t *agfp; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 536021fb3d4e..29c2f83d4147 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1200,8 +1200,6 @@ xfs_mountfs( xfs_set_maxicount(mp); - mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); - error = xfs_uuid_mount(mp); if (error) goto out; @@ -1531,6 +1529,15 @@ xfs_unmountfs( xfs_ail_push_all_sync(mp->m_ail); xfs_wait_buftarg(mp->m_ddev_targp); + /* + * The superblock buffer is uncached and xfsaild_push() will lock and + * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() + * here but a lock on the superblock buffer will block until iodone() + * has completed. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); @@ -1544,7 +1551,7 @@ xfs_unmountfs( int xfs_fs_writable(xfs_mount_t *mp) { - return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || + return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 90c1fc9eaea4..05a05a7b6119 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -176,7 +176,6 @@ typedef struct xfs_mount { uint m_qflags; /* quota status flags */ xfs_trans_reservations_t m_reservations;/* precomputed res values */ __uint64_t m_maxicount; /* maximum inode count */ - __uint64_t m_maxioffset; /* maximum inode offset */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ @@ -297,8 +296,6 @@ xfs_preferred_iosize(xfs_mount_t *mp) PAGE_CACHE_SIZE)); } -#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) - #define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) @@ -314,9 +311,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ #define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ -#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) -#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) - /* * Flags for xfs_mountfs */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 249db1987764..2e86fa0cfc0d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -940,7 +940,7 @@ xfs_qm_dqiterate( map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); lblkno = 0; - maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { nmaps = XFS_DQITER_MAP_SIZE; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0d9de41a7151..bdaf4cb9f4a2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -868,67 +868,14 @@ xfs_fs_inode_init_once( "xfsino", ip->i_ino); } -/* - * This is called by the VFS when dirtying inode metadata. This can happen - * for a few reasons, but we only care about timestamp updates, given that - * we handled the rest ourselves. In theory no other calls should happen, - * but for example generic_write_end() keeps dirtying the inode after - * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC, - * and skip this call otherwise. - * - * We'll hopefull get a different method just for updating timestamps soon, - * at which point this hack can go away, and maybe we'll also get real - * error handling here. - */ -STATIC void -xfs_fs_dirty_inode( - struct inode *inode, - int flags) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - if (flags != I_DIRTY_SYNC) - return; - - trace_xfs_dirty_inode(ip); - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - goto trouble; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - /* - * Grab all the latest timestamps from the Linux inode. - */ - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; - ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - error = xfs_trans_commit(tp, 0); - if (error) - goto trouble; - return; - -trouble: - xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino); -} - STATIC void xfs_fs_evict_inode( struct inode *inode) { xfs_inode_t *ip = XFS_I(inode); + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + trace_xfs_evict_inode(ip); truncate_inode_pages(&inode->i_data, 0); @@ -937,22 +884,6 @@ xfs_fs_evict_inode( XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); - /* - * The iolock is used by the file system to coordinate reads, - * writes, and block truncates. Up to this point the lock - * protected concurrent accesses by users of the inode. But - * from here forward we're doing some final processing of the - * inode because we're done with it, and although we reuse the - * iolock for protection it is really a distinct lock class - * (in the lockdep sense) from before. To keep lockdep happy - * (and basically indicate what we are doing), we explicitly - * re-init the iolock here. - */ - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); - xfs_inactive(ip); } @@ -1436,7 +1367,6 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, - .dirty_inode = xfs_fs_dirty_inode, .evict_inode = xfs_fs_evict_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, @@ -1491,13 +1421,9 @@ xfs_init_zones(void) if (!xfs_da_state_zone) goto out_destroy_btree_cur_zone; - xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); - if (!xfs_dabuf_zone) - goto out_destroy_da_state_zone; - xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); if (!xfs_ifork_zone) - goto out_destroy_dabuf_zone; + goto out_destroy_da_state_zone; xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); if (!xfs_trans_zone) @@ -1514,9 +1440,8 @@ xfs_init_zones(void) * size possible under XFS. This wastes a little bit of memory, * but it is much faster. */ - xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / - NBWORD) * sizeof(int))), "xfs_buf_item"); + xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), + "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_log_item_desc_zone; @@ -1561,8 +1486,6 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_trans_zone); out_destroy_ifork_zone: kmem_zone_destroy(xfs_ifork_zone); - out_destroy_dabuf_zone: - kmem_zone_destroy(xfs_dabuf_zone); out_destroy_da_state_zone: kmem_zone_destroy(xfs_da_state_zone); out_destroy_btree_cur_zone: @@ -1590,7 +1513,6 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_log_item_desc_zone); kmem_zone_destroy(xfs_trans_zone); kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_dabuf_zone); kmem_zone_destroy(xfs_da_state_zone); kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 1e9ee064dbb2..96548176db80 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -359,6 +359,15 @@ xfs_quiesce_attr( * added an item to the AIL, thus flush it again. */ xfs_ail_push_all_sync(mp->m_ail); + + /* + * The superblock buffer is uncached and xfsaild_push() will lock and + * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() + * here but a lock on the superblock buffer will block until iodone() + * has completed. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); } static void @@ -394,7 +403,7 @@ xfs_sync_worker( if (!(mp->m_super->s_flags & MS_ACTIVE) && !(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) error = xfs_fs_log_dummy(mp); else @@ -712,8 +721,8 @@ restart: * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_itobp() to get the cluster buffer would result - * in an ABBA deadlock with xfs_ifree_cluster(). + * doing a blocking xfs_imap_to_bp() to get the cluster buffer would + * result in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index caf5dabfd553..e5795dd6013a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -578,8 +578,8 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); -DEFINE_INODE_EVENT(xfs_dirty_inode); DEFINE_INODE_EVENT(xfs_evict_inode); +DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); DEFINE_INODE_EVENT(xfs_dquot_dqdetach); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdf324508c5e..06ed520a767f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -576,8 +576,12 @@ xfs_trans_alloc( xfs_mount_t *mp, uint type) { - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); - return _xfs_trans_alloc(mp, type, KM_SLEEP); + xfs_trans_t *tp; + + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, type, KM_SLEEP); + tp->t_flags |= XFS_TRANS_FREEZE_PROT; + return tp; } xfs_trans_t * @@ -588,6 +592,7 @@ _xfs_trans_alloc( { xfs_trans_t *tp; + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, memflags); @@ -611,6 +616,8 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); + if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); } @@ -643,7 +650,11 @@ xfs_trans_dup( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); - ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | + (tp->t_flags & XFS_TRANS_RESERVE) | + (tp->t_flags & XFS_TRANS_FREEZE_PROT); + /* We gave our writer reference to the new transaction */ + tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7c37b533aa8e..db056544cbb5 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -179,6 +179,8 @@ struct xfs_log_item_desc { #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ /* * Values for call flags parameter. @@ -448,11 +450,51 @@ xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); -struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, - int, uint); -int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *, - struct xfs_buftarg *, xfs_daddr_t, int, uint, - struct xfs_buf **); + +struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + uint flags); + +static inline struct xfs_buf * +xfs_trans_get_buf( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int numblks, + uint flags) +{ + struct xfs_buf_map map = { + .bm_bn = blkno, + .bm_len = numblks, + }; + return xfs_trans_get_buf_map(tp, target, &map, 1, flags); +} + +int xfs_trans_read_buf_map(struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp); + +static inline int +xfs_trans_read_buf( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int numblks, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) +{ + struct xfs_buf_map map = { + .bm_bn = blkno, + .bm_len = numblks, + }; + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp); +} + struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9c514483e599..6011ee661339 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -383,6 +383,12 @@ xfsaild_push( } spin_lock(&ailp->xa_lock); + + /* barrier matches the xa_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->xa_target; + ailp->xa_target_prev = target; + lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); if (!lip) { /* @@ -397,7 +403,6 @@ xfsaild_push( XFS_STATS_INC(xs_push_ail); lsn = lip->li_lsn; - target = ailp->xa_target; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; @@ -527,8 +532,32 @@ xfsaild( __set_current_state(TASK_KILLABLE); else __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(tout ? - msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + + spin_lock(&ailp->xa_lock); + + /* + * Idle if the AIL is empty and we are not racing with a target + * update. We check the AIL after we set the task to a sleep + * state to guarantee that we either catch an xa_target update + * or that a wake_up resets the state to TASK_RUNNING. + * Otherwise, we run the risk of sleeping indefinitely. + * + * The barrier matches the xa_target update in xfs_ail_push(). + */ + smp_rmb(); + if (!xfs_ail_min(ailp) && + ailp->xa_target == ailp->xa_target_prev) { + spin_unlock(&ailp->xa_lock); + schedule(); + tout = 0; + continue; + } + spin_unlock(&ailp->xa_lock); + + if (tout) + schedule_timeout(msecs_to_jiffies(tout)); + + __set_current_state(TASK_RUNNING); try_to_freeze(); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 21c5a5e3700d..6311b99c267f 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -41,20 +41,26 @@ STATIC struct xfs_buf * xfs_trans_buf_item_match( struct xfs_trans *tp, struct xfs_buftarg *target, - xfs_daddr_t blkno, - int len) + struct xfs_buf_map *map, + int nmaps) { struct xfs_log_item_desc *lidp; struct xfs_buf_log_item *blip; + int len = 0; + int i; + + for (i = 0; i < nmaps; i++) + len += map[i].bm_len; - len = BBTOB(len); list_for_each_entry(lidp, &tp->t_items, lid_trans) { blip = (struct xfs_buf_log_item *)lidp->lid_item; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && - XFS_BUF_ADDR(blip->bli_buf) == blkno && - BBTOB(blip->bli_buf->b_length) == len) + XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && + blip->bli_buf->b_length == len) { + ASSERT(blip->bli_buf->b_map_count == nmaps); return blip->bli_buf; + } } return NULL; @@ -128,21 +134,19 @@ xfs_trans_bjoin( * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ -xfs_buf_t * -xfs_trans_get_buf(xfs_trans_t *tp, - xfs_buftarg_t *target_dev, - xfs_daddr_t blkno, - int len, - uint flags) +struct xfs_buf * +xfs_trans_get_buf_map( + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; - /* - * Default to a normal get_buf() call if the tp is NULL. - */ - if (tp == NULL) - return xfs_buf_get(target_dev, blkno, len, flags); + if (!tp) + return xfs_buf_get_map(target, map, nmaps, flags); /* * If we find the buffer in the cache with this transaction @@ -150,7 +154,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ - bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { @@ -167,7 +171,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, return (bp); } - bp = xfs_buf_get(target_dev, blkno, len, flags); + bp = xfs_buf_get_map(target, map, nmaps, flags); if (bp == NULL) { return NULL; } @@ -246,26 +250,22 @@ int xfs_error_mod = 33; * read_buf() call. */ int -xfs_trans_read_buf( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len, - uint flags, - xfs_buf_t **bpp) +xfs_trans_read_buf_map( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; int error; *bpp = NULL; - - /* - * Default to a normal get_buf() call if the tp is NULL. - */ - if (tp == NULL) { - bp = xfs_buf_read(target, blkno, len, flags); + if (!tp) { + bp = xfs_buf_read_map(target, map, nmaps, flags); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -303,7 +303,7 @@ xfs_trans_read_buf( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - bp = xfs_trans_buf_item_match(tp, target, blkno, len); + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); @@ -349,7 +349,7 @@ xfs_trans_read_buf( return 0; } - bp = xfs_buf_read(target, blkno, len, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index fb62377d1cbc..53b7c9b0f8f7 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -67,6 +67,7 @@ struct xfs_ail { struct task_struct *xa_task; struct list_head xa_ail; xfs_lsn_t xa_target; + xfs_lsn_t xa_target_prev; struct list_head xa_cursors; spinlock_t xa_lock; xfs_lsn_t xa_last_pushed_lsn; diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 398cf681d025..7a41874f4c20 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -133,6 +133,20 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ #define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ /* + * Minimum and maximum blocksize and sectorsize. + * The blocksize upper limit is pretty much arbitrary. + * The sectorsize upper limit is due to sizeof(sb_sectsize). + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ +#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) +#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) + +/* * Min numbers of data/attr fork btree root pointers. */ #define MINDBTPTRS 3 diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 4e5b9ad5cb97..0025c78ac03c 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -65,7 +65,6 @@ xfs_dir_ialloc( xfs_trans_t *ntp; xfs_inode_t *ip; xfs_buf_t *ialloc_context = NULL; - boolean_t call_again = B_FALSE; int code; uint log_res; uint log_count; @@ -91,7 +90,7 @@ xfs_dir_ialloc( * the inode(s) that we've just allocated. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &call_again, &ip); + &ialloc_context, &ip); /* * Return an error if we were unable to allocate a new inode. @@ -102,19 +101,18 @@ xfs_dir_ialloc( *ipp = NULL; return code; } - if (!call_again && (ip == NULL)) { + if (!ialloc_context && !ip) { *ipp = NULL; return XFS_ERROR(ENOSPC); } /* - * If call_again is set, then we were unable to get an + * If the AGI buffer is non-NULL, then we were unable to get an * inode in one operation. We need to commit the current * transaction and call xfs_ialloc() again. It is guaranteed * to succeed the second time. */ - if (call_again) { - + if (ialloc_context) { /* * Normally, xfs_trans_commit releases all the locks. * We call bhold to hang on to the ialloc_context across @@ -195,7 +193,7 @@ xfs_dir_ialloc( * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &call_again, &ip); + okalloc, &ialloc_context, &ip); /* * If we get an error at this point, return to the caller @@ -206,12 +204,11 @@ xfs_dir_ialloc( *ipp = NULL; return code; } - ASSERT ((!call_again) && (ip != NULL)); + ASSERT(!ialloc_context && ip); } else { - if (committed != NULL) { + if (committed != NULL) *committed = 0; - } } *ipp = ip; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index b6a82d817a82..2a5c637344b4 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -146,11 +146,6 @@ xfs_readlink( } /* - * Flags for xfs_free_eofblocks - */ -#define XFS_FREE_EOF_TRYLOCK (1<<0) - -/* * This is called by xfs_inactive to free any blocks beyond eof * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. @@ -159,7 +154,7 @@ STATIC int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, - int flags) + bool need_iolock) { xfs_trans_t *tp; int error; @@ -174,7 +169,7 @@ xfs_free_eofblocks( * of the file. If not, then there is nothing to do. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return 0; map_len = last_fsb - end_fsb; @@ -201,13 +196,11 @@ xfs_free_eofblocks( */ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (flags & XFS_FREE_EOF_TRYLOCK) { + if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { xfs_trans_cancel(tp, 0); return 0; } - } else { - xfs_ilock(ip, XFS_IOLOCK_EXCL); } error = xfs_trans_reserve(tp, 0, @@ -217,7 +210,8 @@ xfs_free_eofblocks( if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -244,7 +238,10 @@ xfs_free_eofblocks( error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); } - xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); } return error; } @@ -282,23 +279,15 @@ xfs_inactive_symlink_rmt( * free them all in one bunmapi call. */ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - *tpp = NULL; - return error; - } + /* * Lock the inode, fix the size, and join it to the transaction. * Hold it so in the normal path, we still have it locked for * the second transaction. In the error paths we need it * held so the cancel won't rele it, see below. */ - xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -385,114 +374,14 @@ xfs_inactive_symlink_rmt( ASSERT(XFS_FORCED_SHUTDOWN(mp)); goto error0; } - /* - * Return with the inode locked but not joined to the transaction. - */ + + xfs_trans_ijoin(tp, ip, 0); *tpp = tp; return 0; error1: xfs_bmap_cancel(&free_list); error0: - /* - * Have to come here with the inode locked and either - * (held and in the transaction) or (not in the transaction). - * If the inode isn't held then cancel would iput it, but - * that's wrong since this is inactive and the vnode ref - * count is 0 already. - * Cancel won't do anything to the inode if held, but it still - * needs to be locked until the cancel is done, if it was - * joined to the transaction. - */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - *tpp = NULL; - return error; - -} - -STATIC int -xfs_inactive_symlink_local( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - int error; - - ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip)); - /* - * We're freeing a symlink which fit into - * the inode. Just free the memory used - * to hold the old symlink. - */ - error = xfs_trans_reserve(*tpp, 0, - XFS_ITRUNCATE_LOG_RES(ip->i_mount), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - - if (error) { - xfs_trans_cancel(*tpp, 0); - *tpp = NULL; - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - /* - * Zero length symlinks _can_ exist. - */ - if (ip->i_df.if_bytes > 0) { - xfs_idata_realloc(ip, - -(ip->i_df.if_bytes), - XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - } - return 0; -} - -STATIC int -xfs_inactive_attrs( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - xfs_trans_t *tp; - int error; - xfs_mount_t *mp; - - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - tp = *tpp; - mp = ip->i_mount; - ASSERT(ip->i_d.di_forkoff != 0); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - goto error_unlock; - - error = xfs_attr_inactive(ip); - if (error) - goto error_unlock; - - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, 0, - XFS_IFREE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_INACTIVE_LOG_COUNT); - if (error) - goto error_cancel; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - ASSERT(ip->i_d.di_anextents == 0); - - *tpp = tp; - return 0; - -error_cancel: - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); -error_unlock: - *tpp = NULL; - xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -574,8 +463,7 @@ xfs_release( if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) return 0; - error = xfs_free_eofblocks(mp, ip, - XFS_FREE_EOF_TRYLOCK); + error = xfs_free_eofblocks(mp, ip, true); if (error) return error; @@ -604,7 +492,7 @@ xfs_inactive( xfs_trans_t *tp; xfs_mount_t *mp; int error; - int truncate; + int truncate = 0; /* * If the inode is already free, then there can be nothing @@ -616,17 +504,6 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - /* - * Only do a truncate if it's a regular file with - * some actual space in it. It's OK to look at the - * inode's fields without the lock because we're the - * only one with a reference to the inode. - */ - truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 || - (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && - S_ISREG(ip->i_d.di_mode)); - mp = ip->i_mount; error = 0; @@ -643,99 +520,100 @@ xfs_inactive( (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || ip->i_delayed_blks != 0))) { - error = xfs_free_eofblocks(mp, ip, 0); + error = xfs_free_eofblocks(mp, ip, false); if (error) return VN_INACTIVE_CACHE; } goto out; } - ASSERT(ip->i_d.di_nlink == 0); + if (S_ISREG(ip->i_d.di_mode) && + (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; error = xfs_qm_dqattach(ip, 0); if (error) return VN_INACTIVE_CACHE; tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (truncate) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - if (error) { - /* Don't call itruncate_cleanup */ - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return VN_INACTIVE_CACHE; - } + error = xfs_trans_reserve(tp, 0, + (truncate || S_ISLNK(ip->i_d.di_mode)) ? + XFS_ITRUNCATE_LOG_RES(mp) : + XFS_IFREE_LOG_RES(mp), + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return VN_INACTIVE_CACHE; + } - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + if (S_ISLNK(ip->i_d.di_mode)) { + /* + * Zero length symlinks _can_ exist. + */ + if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) { + error = xfs_inactive_symlink_rmt(ip, &tp); + if (error) + goto out_cancel; + } else if (ip->i_df.if_bytes > 0) { + xfs_idata_realloc(ip, -(ip->i_df.if_bytes), + XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + } + } else if (truncate) { ip->i_d.di_size = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); - if (error) { - xfs_trans_cancel(tp, - XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - return VN_INACTIVE_CACHE; - } + if (error) + goto out_cancel; ASSERT(ip->i_d.di_nextents == 0); - } else if (S_ISLNK(ip->i_d.di_mode)) { + } - /* - * If we get an error while cleaning up a - * symlink we bail out. - */ - error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ? - xfs_inactive_symlink_rmt(ip, &tp) : - xfs_inactive_symlink_local(ip, &tp); + /* + * If there are attributes associated with the file then blow them away + * now. The code calls a routine that recursively deconstructs the + * attribute fork. We need to just commit the current transaction + * because we can't use it for xfs_attr_inactive(). + */ + if (ip->i_d.di_anextents > 0) { + ASSERT(ip->i_d.di_forkoff != 0); - if (error) { - ASSERT(tp == NULL); - return VN_INACTIVE_CACHE; - } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_unlock; - xfs_trans_ijoin(tp, ip, 0); - } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + error = xfs_attr_inactive(ip); + if (error) + goto out; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_INACTIVE_LOG_COUNT); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); - return VN_INACTIVE_CACHE; + goto out; } - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); } - /* - * If there are attributes associated with the file - * then blow them away now. The code calls a routine - * that recursively deconstructs the attribute fork. - * We need to just commit the current transaction - * because we can't use it for xfs_attr_inactive(). - */ - if (ip->i_d.di_anextents > 0) { - error = xfs_inactive_attrs(ip, &tp); - /* - * If we got an error, the transaction is already - * cancelled, and the inode is unlocked. Just get out. - */ - if (error) - return VN_INACTIVE_CACHE; - } else if (ip->i_afp) { + if (ip->i_afp) xfs_idestroy_fork(ip, XFS_ATTR_FORK); - } + + ASSERT(ip->i_d.di_anextents == 0); /* * Free the inode. @@ -779,10 +657,13 @@ xfs_inactive( * Release the dquots held by inode, if any. */ xfs_qm_dqdetach(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - - out: +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: return VN_INACTIVE_CACHE; +out_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + goto out_unlock; } /* @@ -2262,10 +2143,10 @@ xfs_change_file_space( llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; - if ( (bf->l_start < 0) - || (bf->l_start > XFS_MAXIOFFSET(mp)) - || (bf->l_start + llen < 0) - || (bf->l_start + llen > XFS_MAXIOFFSET(mp))) + if (bf->l_start < 0 || + bf->l_start > mp->m_super->s_maxbytes || + bf->l_start + llen < 0 || + bf->l_start + llen > mp->m_super->s_maxbytes) return XFS_ERROR(EINVAL); bf->l_whence = 0; |