diff options
Diffstat (limited to 'fs')
238 files changed, 4571 insertions, 3075 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index ed835836e0dc..32ef4009d030 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,7 +40,9 @@ extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; +extern const struct file_operations v9fs_file_operations_dotl; extern const struct file_operations v9fs_dir_operations; +extern const struct file_operations v9fs_dir_operations_dotl; extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 0adfd64dfcee..d61e3b28ce37 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = { .open = v9fs_file_open, .release = v9fs_dir_release, }; + +const struct file_operations v9fs_dir_operations_dotl = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = v9fs_dir_readdir, + .open = v9fs_file_open, + .release = v9fs_dir_release, +}; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index df52d488d2a6..25b300e1c9d7 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -296,3 +296,14 @@ const struct file_operations v9fs_file_operations = { .mmap = generic_file_readonly_mmap, .fsync = v9fs_file_fsync, }; + +const struct file_operations v9fs_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_file_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index f2434fc9d2c4..4331b3b5ee1c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -44,9 +44,12 @@ #include "cache.h" static const struct inode_operations v9fs_dir_inode_operations; -static const struct inode_operations v9fs_dir_inode_operations_ext; +static const struct inode_operations v9fs_dir_inode_operations_dotu; +static const struct inode_operations v9fs_dir_inode_operations_dotl; static const struct inode_operations v9fs_file_inode_operations; +static const struct inode_operations v9fs_file_inode_operations_dotl; static const struct inode_operations v9fs_symlink_inode_operations; +static const struct inode_operations v9fs_symlink_inode_operations_dotl; /** * unixmode2p9mode - convert unix mode bits to plan 9 @@ -253,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) return ERR_PTR(-ENOMEM); } - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -275,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) init_special_inode(inode, inode->i_mode, inode->i_rdev); break; case S_IFREG: - inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; + } else { + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + } + break; + case S_IFLNK: - if (!v9fs_proto_dotu(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "extended modes used w/o 9P2000.u\n"); + if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " + "legacy protocol.\n"); err = -EINVAL; goto error; } - inode->i_op = &v9fs_symlink_inode_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_symlink_inode_operations_dotl; + else + inode->i_op = &v9fs_symlink_inode_operations; + break; case S_IFDIR: inc_nlink(inode); - if (v9fs_proto_dotu(v9ses)) - inode->i_op = &v9fs_dir_inode_operations_ext; + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotl; + else if (v9fs_proto_dotu(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotu; else inode->i_op = &v9fs_dir_inode_operations; - inode->i_fop = &v9fs_dir_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_fop = &v9fs_dir_operations_dotl; + else + inode->i_fop = &v9fs_dir_operations; + break; default: P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", @@ -434,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) { int retval; struct inode *file_inode; - struct v9fs_session_info *v9ses; struct p9_fid *v9fid; P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, rmdir); file_inode = file->d_inode; - v9ses = v9fs_inode2v9ses(file_inode); v9fid = v9fs_fid_clone(file); if (IS_ERR(v9fid)) return PTR_ERR(v9fid); @@ -484,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = NULL; fid = NULL; name = (char *) dentry->d_name.name; - dfid = v9fs_fid_clone(dentry->d_parent); + dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); - dfid = NULL; - goto error; + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return ERR_PTR(err); } /* clone a fid to use for creation */ @@ -497,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - ofid = NULL; - goto error; + return ERR_PTR(err); } err = p9_client_fcreate(ofid, name, perm, mode, extension); @@ -508,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, } /* now walk from the parent so we can get unopened fid */ - fid = p9_client_walk(dfid, 1, &name, 0); + fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; - } else - dfid = NULL; + } /* instantiate inode and assign the unopened fid to the dentry */ inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); @@ -538,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, return ofid; error: - if (dfid) - p9_client_clunk(dfid); - if (ofid) p9_client_clunk(ofid); @@ -675,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(fid)) { result = PTR_ERR(fid); if (result == -ENOENT) { - d_add(dentry, NULL); - return NULL; + inode = NULL; + goto inst_out; } return ERR_PTR(result); @@ -693,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (result < 0) goto error; - if ((fid->qid.version) && (v9ses->cache)) +inst_out: + if (v9ses->cache) dentry->d_op = &v9fs_cached_dentry_operations; else dentry->d_op = &v9fs_dentry_operations; @@ -772,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto clunk_olddir; } + if (v9fs_proto_dotl(v9ses)) { + retval = p9_client_rename(oldfid, newdirfid, + (char *) new_dentry->d_name.name); + if (retval != -ENOSYS) + goto clunk_newdir; + } + /* 9P can only handle file rename in the same directory */ if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { P9_DPRINTK(P9_DEBUG_ERROR, @@ -1197,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISFIFO(mode)) *name = 0; + else if (S_ISSOCK(mode)) + *name = 0; else { __putname(name); return -EINVAL; @@ -1208,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } -static const struct inode_operations v9fs_dir_inode_operations_ext = { +static const struct inode_operations v9fs_dir_inode_operations_dotu = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .symlink = v9fs_vfs_symlink, + .link = v9fs_vfs_link, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, .symlink = v9fs_vfs_symlink, @@ -1239,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = { .setattr = v9fs_vfs_setattr, }; +static const struct inode_operations v9fs_file_inode_operations_dotl = { + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, @@ -1246,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = { .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; + +static const struct inode_operations v9fs_symlink_inode_operations_dotl = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 806da5d3b3a0..be74d020436e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -38,6 +38,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/statfs.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -45,7 +46,7 @@ #include "v9fs_vfs.h" #include "fid.h" -static const struct super_operations v9fs_super_ops; +static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; /** * v9fs_set_super - set the superblock @@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; - sb->s_op = &v9fs_super_ops; + if (v9fs_proto_dotl(v9ses)) + sb->s_op = &v9fs_super_ops_dotl; + else + sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | @@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb) v9fs_session_begin_cancel(v9ses); } +static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_rstatfs rs; + int res; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + res = PTR_ERR(fid); + goto done; + } + + v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9fs_proto_dotl(v9ses)) { + res = p9_client_statfs(fid, &rs); + if (res == 0) { + buf->f_type = rs.type; + buf->f_bsize = rs.bsize; + buf->f_blocks = rs.blocks; + buf->f_bfree = rs.bfree; + buf->f_bavail = rs.bavail; + buf->f_files = rs.files; + buf->f_ffree = rs.ffree; + buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = rs.namelen; + } + if (res != -ENOSYS) + goto done; + } + res = simple_statfs(dentry, buf); +done: + return res; +} + static const struct super_operations v9fs_super_ops = { #ifdef CONFIG_9P_FSCACHE .alloc_inode = v9fs_alloc_inode, @@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = { .umount_begin = v9fs_umount_begin, }; +static const struct super_operations v9fs_super_ops_dotl = { +#ifdef CONFIG_9P_FSCACHE + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, +#endif + .statfs = v9fs_statfs, + .clear_inode = v9fs_clear_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, +}; + struct file_system_type v9fs_fs_type = { .name = "9p", .get_sb = v9fs_get_sb, diff --git a/fs/Makefile b/fs/Makefile index 97f340f14ba2..e6ec1d309b1d 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o fs_struct.o + stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/afs/dir.c b/fs/afs/dir.c index adc1cb771b57..b42d5cc1d6d2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, struct key *key) { struct page *page; - struct file file = { - .private_data = key, - }; - _enter("{%lu},%lu", dir->i_ino, index); - page = read_mapping_page(dir->i_mapping, index, &file); + page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); if (!IS_ERR(page)) { kmap(page); if (!PageChecked(page)) diff --git a/fs/afs/file.c b/fs/afs/file.c index 0df9bc2b724d..14d89fa58fee 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page, #endif /* - * AFS read page from file, directory or symlink + * read page from file, directory or symlink, given a key to use */ -static int afs_readpage(struct file *file, struct page *page) +int afs_page_filler(void *data, struct page *page) { - struct afs_vnode *vnode; - struct inode *inode; - struct key *key; + struct inode *inode = page->mapping->host; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key = data; size_t len; off_t offset; int ret; - inode = page->mapping->host; - - if (file) { - key = file->private_data; - ASSERT(key != NULL); - } else { - key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_nokey; - } - } - _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); - vnode = AFS_FS_I(inode); - BUG_ON(!PageLocked(page)); ret = -ESTALE; @@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page) unlock_page(page); } - if (!file) - key_put(key); _leave(" = 0"); return 0; error: SetPageError(page); unlock_page(page); - if (!file) - key_put(key); -error_nokey: _leave(" = %d", ret); return ret; } /* + * read page from file, directory or symlink, given a file to nominate the key + * to be used + */ +static int afs_readpage(struct file *file, struct page *page) +{ + struct key *key; + int ret; + + if (file) { + key = file->private_data; + ASSERT(key != NULL); + ret = afs_page_filler(key, page); + } else { + struct inode *inode = page->mapping->host; + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + } else { + ret = afs_page_filler(key, page); + key_put(key); + } + } + return ret; +} + +/* * read a set of pages */ static int afs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct key *key = file->private_data; struct afs_vnode *vnode; int ret = 0; - _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); + _enter("{%d},{%lu},,%d", + key_serial(key), mapping->host->i_ino, nr_pages); + + ASSERT(key != NULL); vnode = AFS_FS_I(mapping->host); if (vnode->flags & AFS_VNODE_DELETED) { @@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping, } /* load the missing pages from the network */ - ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); + ret = read_cache_pages(mapping, pages, afs_page_filler, key); _leave(" = %d [netting]", ret); return ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a10f2582844f..807f284cc75e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -494,6 +494,7 @@ extern const struct file_operations afs_file_operations; extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); +extern int afs_page_filler(void *, struct page *); /* * flock.c diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index b3feddc4f7d6..a9e23039ea34 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60; */ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) { - struct file file = { - .private_data = key, - }; struct page *page; size_t size; char *buf; @@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); /* read the contents of the symlink into the pagecache */ - page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); + page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, + afs_page_filler, key); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e4b75d6eda83..9bd4b3876c99 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void) * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index d29b7f6df862..d832062869f6 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -736,11 +736,14 @@ static const struct file_operations _dev_ioctl_fops = { }; static struct miscdevice _autofs_dev_ioctl_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, .fops = &_dev_ioctl_fops }; +MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); +MODULE_ALIAS("devname:autofs"); + /* Register/deregister misc character device */ int autofs_dev_ioctl_init(void) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e8e5e63ac950..db4117ed7803 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -18,13 +18,14 @@ #include <linux/slab.h> #include <linux/param.h> #include <linux/time.h> +#include <linux/smp_lock.h> #include "autofs_i.h" static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); -static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); +static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); @@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = { .read = generic_read_dir, .readdir = dcache_readdir, .llseek = dcache_dir_lseek, - .ioctl = autofs4_root_ioctl, + .unlocked_ioctl = autofs4_root_ioctl, }; const struct file_operations autofs4_dir_operations = { @@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry) * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ -static int autofs4_root_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) { struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); void __user *p = (void __user *)arg; @@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, return -ENOSYS; } } + +static long autofs4_root_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + long ret; + struct inode *inode = filp->f_dentry->d_inode; + + lock_kernel(); + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + unlock_kernel(); + + return ret; +} diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 1e41aadb1068..8f73841fc974 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, } set_bit(ino, info->si_imap); info->si_freei--; - inode->i_uid = current_fsuid(); - inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; - inode->i_mode = mode; inode->i_ino = ino; BFS_I(inode)->i_dsk_ino = ino; BFS_I(inode)->i_sblock = 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 6dcee88c2e5d..26e5f5026620 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb = get_active_super(bdev); if (!sb) goto out; - if (sb->s_flags & MS_RDONLY) { - sb->s_frozen = SB_FREEZE_TRANS; - up_write(&sb->s_umount); + error = freeze_super(sb); + if (error) { + deactivate_super(sb); + bdev->bd_fsfreeze_count--; mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } - - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - sync_filesystem(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - deactivate_locked_super(sb); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } + return ERR_PTR(error); } - up_write(&sb->s_umount); - + deactivate_super(sb); out: sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); @@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) mutex_lock(&bdev->bd_fsfreeze_mutex); if (!bdev->bd_fsfreeze_count) - goto out_unlock; + goto out; error = 0; if (--bdev->bd_fsfreeze_count > 0) - goto out_unlock; + goto out; if (!sb) - goto out_unlock; - - BUG_ON(sb->s_bdev != bdev); - down_write(&sb->s_umount); - if (sb->s_flags & MS_RDONLY) - goto out_unfrozen; - - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - -out_unfrozen: - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + goto out; - if (sb) - deactivate_locked_super(sb); -out_unlock: + error = thaw_super(sb); + if (error) { + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } +out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } @@ -417,7 +376,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) */ mutex_unlock(&bd_inode->i_mutex); - error = blkdev_issue_flush(bdev, NULL); + error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); if (error == -EOPNOTSUPP) error = 0; @@ -668,41 +627,209 @@ void bd_forget(struct inode *inode) iput(bdev->bd_inode); } -int bd_claim(struct block_device *bdev, void *holder) +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @whole: whole block device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Test whther @bdev can be claimed by @holder. + * + * CONTEXT: + * spin_lock(&bdev_lock). + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. + */ +static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, + void *holder) { - int res; - spin_lock(&bdev_lock); - - /* first decide result */ if (bdev->bd_holder == holder) - res = 0; /* already a holder */ + return true; /* already a holder */ else if (bdev->bd_holder != NULL) - res = -EBUSY; /* held by someone else */ + return false; /* held by someone else */ else if (bdev->bd_contains == bdev) - res = 0; /* is a whole device which isn't held */ + return true; /* is a whole device which isn't held */ - else if (bdev->bd_contains->bd_holder == bd_claim) - res = 0; /* is a partition of a device that is being partitioned */ - else if (bdev->bd_contains->bd_holder != NULL) - res = -EBUSY; /* is a partition of a held device */ + else if (whole->bd_holder == bd_claim) + return true; /* is a partition of a device that is being partitioned */ + else if (whole->bd_holder != NULL) + return false; /* is a partition of a held device */ else - res = 0; /* is a partition of an un-held device */ + return true; /* is a partition of an un-held device */ +} + +/** + * bd_prepare_to_claim - prepare to claim a block device + * @bdev: block device of interest + * @whole: the whole device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Prepare to claim @bdev. This function fails if @bdev is already + * claimed by another holder and waits if another claiming is in + * progress. This function doesn't actually claim. On successful + * return, the caller has ownership of bd_claiming and bd_holder[s]. + * + * CONTEXT: + * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab + * it multiple times. + * + * RETURNS: + * 0 if @bdev can be claimed, -EBUSY otherwise. + */ +static int bd_prepare_to_claim(struct block_device *bdev, + struct block_device *whole, void *holder) +{ +retry: + /* if someone else claimed, fail */ + if (!bd_may_claim(bdev, whole, holder)) + return -EBUSY; + + /* if someone else is claiming, wait for it to finish */ + if (whole->bd_claiming && whole->bd_claiming != holder) { + wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + DEFINE_WAIT(wait); + + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&bdev_lock); + schedule(); + finish_wait(wq, &wait); + spin_lock(&bdev_lock); + goto retry; + } + + /* yay, all mine */ + return 0; +} - /* now impose change */ - if (res==0) { +/** + * bd_start_claiming - start claiming a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * + * @bdev is about to be opened exclusively. Check @bdev can be opened + * exclusively and mark that an exclusive open is in progress. Each + * successful call to this function must be matched with a call to + * either bd_claim() or bd_abort_claiming(). If this function + * succeeds, the matching bd_claim() is guaranteed to succeed. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to the block device containing @bdev on success, ERR_PTR() + * value on failure. + */ +static struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder) +{ + struct gendisk *disk; + struct block_device *whole; + int partno, err; + + might_sleep(); + + /* + * @bdev might not have been initialized properly yet, look up + * and grab the outer block device the hard way. + */ + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + return ERR_PTR(-ENXIO); + + whole = bdget_disk(disk, 0); + put_disk(disk); + if (!whole) + return ERR_PTR(-ENOMEM); + + /* prepare to claim, if successful, mark claiming in progress */ + spin_lock(&bdev_lock); + + err = bd_prepare_to_claim(bdev, whole, holder); + if (err == 0) { + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); + return whole; + } else { + spin_unlock(&bdev_lock); + bdput(whole); + return ERR_PTR(err); + } +} + +/* releases bdev_lock */ +static void __bd_abort_claiming(struct block_device *whole, void *holder) +{ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + bdput(whole); +} + +/** + * bd_abort_claiming - abort claiming a block device + * @whole: whole block device returned by bd_start_claiming() + * @holder: holder trying to claim @bdev + * + * Abort a claiming block started by bd_start_claiming(). Note that + * @whole is not the block device to be claimed but the whole device + * returned by bd_start_claiming(). + * + * CONTEXT: + * Grabs and releases bdev_lock. + */ +static void bd_abort_claiming(struct block_device *whole, void *holder) +{ + spin_lock(&bdev_lock); + __bd_abort_claiming(whole, holder); /* releases bdev_lock */ +} + +/** + * bd_claim - claim a block device + * @bdev: block device to claim + * @holder: holder trying to claim @bdev + * + * Try to claim @bdev which must have been opened successfully. This + * function may be called with or without preceding + * blk_start_claiming(). In the former case, this function is always + * successful and terminates the claiming block. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 if successful, -EBUSY if @bdev is already claimed. + */ +int bd_claim(struct block_device *bdev, void *holder) +{ + struct block_device *whole = bdev->bd_contains; + int res; + + might_sleep(); + + spin_lock(&bdev_lock); + + res = bd_prepare_to_claim(bdev, whole, holder); + if (res == 0) { /* note that for a whole device bd_holders * will be incremented twice, and bd_holder will * be set to bd_claim before being set to holder */ - bdev->bd_contains->bd_holders ++; - bdev->bd_contains->bd_holder = bd_claim; + whole->bd_holders++; + whole->bd_holder = bd_claim; bdev->bd_holders++; bdev->bd_holder = holder; } - spin_unlock(&bdev_lock); + + if (whole->bd_claiming) + __bd_abort_claiming(whole, holder); /* releases bdev_lock */ + else + spin_unlock(&bdev_lock); + return res; } - EXPORT_SYMBOL(bd_claim); void bd_release(struct block_device *bdev) @@ -1316,6 +1443,7 @@ EXPORT_SYMBOL(blkdev_get); static int blkdev_open(struct inode * inode, struct file * filp) { + struct block_device *whole = NULL; struct block_device *bdev; int res; @@ -1338,22 +1466,25 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; + if (filp->f_mode & FMODE_EXCL) { + whole = bd_start_claiming(bdev, filp); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + filp->f_mapping = bdev->bd_inode->i_mapping; res = blkdev_get(bdev, filp->f_mode); - if (res) - return res; - if (filp->f_mode & FMODE_EXCL) { - res = bd_claim(bdev, filp); - if (res) - goto out_blkdev_put; + if (whole) { + if (res == 0) + BUG_ON(bd_claim(bdev, filp) != 0); + else + bd_abort_claiming(whole, filp); } - return 0; - - out_blkdev_put: - blkdev_put(bdev, filp->f_mode); return res; } @@ -1564,27 +1695,34 @@ EXPORT_SYMBOL(lookup_bdev); */ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) { - struct block_device *bdev; - int error = 0; + struct block_device *bdev, *whole; + int error; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return whole; + } + error = blkdev_get(bdev, mode); if (error) - return ERR_PTR(error); + goto out_abort_claiming; + error = -EACCES; if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto blkdev_put; - error = bd_claim(bdev, holder); - if (error) - goto blkdev_put; + goto out_blkdev_put; + BUG_ON(bd_claim(bdev, holder) != 0); return bdev; - -blkdev_put: + +out_blkdev_put: blkdev_put(bdev, mode); +out_abort_claiming: + bd_abort_claiming(whole, holder); return ERR_PTR(error); } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6ef7b26724ec..8d432cd9d580 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -282,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode) return ret; } -struct xattr_handler btrfs_xattr_acl_default_handler = { +const struct xattr_handler btrfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = btrfs_xattr_acl_get, .set = btrfs_xattr_acl_set, }; -struct xattr_handler btrfs_xattr_acl_access_handler = { +const struct xattr_handler btrfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = btrfs_xattr_acl_get, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b34d32fdaaec..c6a4f459ad76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2bfdc641d4e3..d601629b85d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4121,16 +4121,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail; - inode->i_uid = current_fsuid(); - - if (dir && (dir->i_mode & S_ISGID)) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = objectid; inode_set_bytes(inode, 0); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1866dff0538e..2909a03e5230 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -832,11 +832,14 @@ static const struct file_operations btrfs_ctl_fops = { }; static struct miscdevice btrfs_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = BTRFS_MINOR, .name = "btrfs-control", .fops = &btrfs_ctl_fops }; +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + static int btrfs_interface_init(void) { return misc_register(&btrfs_misc); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 193b58f7d3f3..59acd3eb288a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -282,7 +282,7 @@ err: * List of handlers for synthetic system.* attributes. All real ondisk * attributes are handled directly. */ -struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 721efa0346e0..7a43fd640bbb 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -21,9 +21,9 @@ #include <linux/xattr.h> -extern struct xattr_handler btrfs_xattr_acl_access_handler; -extern struct xattr_handler btrfs_xattr_acl_default_handler; -extern struct xattr_handler *btrfs_xattr_handlers[]; +extern const struct xattr_handler btrfs_xattr_acl_access_handler; +extern const struct xattr_handler btrfs_xattr_acl_default_handler; +extern const struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); diff --git a/fs/buffer.c b/fs/buffer.c index c9c266db0624..e8aa7081d25c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev) return; invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } EXPORT_SYMBOL(invalidate_bdev); @@ -560,26 +561,17 @@ repeat: return err; } -static void do_thaw_all(struct work_struct *work) +static void do_thaw_one(struct super_block *sb, void *unused) { - struct super_block *sb; char b[BDEVNAME_SIZE]; + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); +} - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) - printk(KERN_WARNING "Emergency Thaw on %s\n", - bdevname(sb->s_bdev, b)); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); +static void do_thaw_all(struct work_struct *work) +{ + iterate_supers(do_thaw_one, NULL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a9005d862ed4..d9c60b84949a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; int rc = 0; struct page **pages; - struct pagevec pvec; loff_t offset; u64 len; @@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc < 0) goto out; - /* set uptodate and add to lru in pagevec-sized chunks */ - pagevec_init(&pvec, 0); for (; !list_empty(page_list) && len > 0; rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { struct page *page = @@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, zero_user_segment(page, s, PAGE_CACHE_SIZE); } - if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { page_cache_release(page); dout("readpages %p add_to_page_cache failed %p\n", inode, page); @@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); - if (pagevec_add(&pvec, page) == 0) - pagevec_lru_add_file(&pvec); /* add to lru */ + page_cache_release(page); } - pagevec_lru_add_file(&pvec); rc = 0; out: @@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req, ceph_release_pages(req->r_pages, req->r_num_pages); if (req->r_pages_from_pool) mempool_free(req->r_pages, - ceph_client(inode->i_sb)->wb_pagevec_pool); + ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else kfree(req->r_pages); ceph_osdc_put_request(req); diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index 818afe72e6c7..9f46de2ba7a7 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac, ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building request\n", ret); + pr_err("error %d building auth method %s request\n", ret, + ac->ops->name); return ret; } dout(" built request %d bytes\n", ret); @@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, if (ac->protocol != protocol) { ret = ceph_auth_init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("error %d on auth method %s init\n", + ret, ac->ops->name); goto out; } } @@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, if (ret == -EAGAIN) { return ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { - pr_err("authentication error %d\n", ret); + pr_err("auth method '%s' error %d\n", ac->ops->name, ret); return ret; } return 0; diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h index ca4f57cfb267..4429a707c021 100644 --- a/fs/ceph/auth.h +++ b/fs/ceph/auth.h @@ -15,6 +15,8 @@ struct ceph_auth_client; struct ceph_authorizer; struct ceph_auth_client_ops { + const char *name; + /* * true if we are authenticated and can connect to * services. diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c index 8cd9e3af07f7..24407c119291 100644 --- a/fs/ceph/auth_none.c +++ b/fs/ceph/auth_none.c @@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, } static const struct ceph_auth_client_ops ceph_auth_none_ops = { + .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index fee5a08da881..7b206231566d 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, int ret; char *dbuf; char *ticket_buf; - u8 struct_v; + u8 reply_struct_v; dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!dbuf) @@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, goto out_dbuf; ceph_decode_need(&p, end, 1 + sizeof(u32), bad); - struct_v = ceph_decode_8(&p); - if (struct_v != 1) + reply_struct_v = ceph_decode_8(&p); + if (reply_struct_v != 1) goto bad; num = ceph_decode_32(&p); dout("%d tickets\n", num); while (num--) { int type; - u8 struct_v; + u8 tkt_struct_v, blob_struct_v; struct ceph_x_ticket_handler *th; void *dp, *dend; int dlen; @@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, type = ceph_decode_32(&p); dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - struct_v = ceph_decode_8(&p); - if (struct_v != 1) + tkt_struct_v = ceph_decode_8(&p); + if (tkt_struct_v != 1) goto bad; th = get_ticket_handler(ac, type); @@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, dend = dbuf + dlen; dp = dbuf; - struct_v = ceph_decode_8(&dp); - if (struct_v != 1) + tkt_struct_v = ceph_decode_8(&dp); + if (tkt_struct_v != 1) goto bad; memcpy(&old_key, &th->session_key, sizeof(old_key)); @@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, tpend = tp + dlen; dout(" ticket blob is %d bytes\n", dlen); ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); - struct_v = ceph_decode_8(&tp); + blob_struct_v = ceph_decode_8(&tp); new_secret_id = ceph_decode_64(&tp); ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); if (ret) @@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, static const struct ceph_auth_client_ops ceph_x_ops = { + .name = "x", .is_authenticated = ceph_x_is_authenticated, .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d9400534b279..0dd0b81e64f7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; - struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); @@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); - if (IS_ERR(msg)) - return PTR_ERR(msg); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + if (!msg) + return -ENOMEM; msg->hdr.tid = cpu_to_le64(flush_tid); @@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) */ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { - struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct inode *inode = &ci->vfs_inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing; @@ -1663,7 +1665,7 @@ ack: static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, unsigned *flush_tid) { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int unlock_session = session ? 0 : 1; int flushing = 0; @@ -1716,10 +1718,9 @@ out_unlocked: static int caps_are_flushed(struct inode *inode, unsigned tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int dirty, i, ret = 1; + int i, ret = 1; spin_lock(&inode->i_lock); - dirty = __ceph_caps_dirty(ci); for (i = 0; i < CEPH_CAP_BITS; i++) if ((ci->i_flushing_caps & (1 << i)) && ci->i_cap_flush_tid[i] <= tid) { @@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&inode->i_lock); if (__ceph_caps_dirty(ci)) @@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 0c2241ef3653..3b9eeed097b3 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -19,7 +19,7 @@ * Ceph release version */ #define CEPH_VERSION_MAJOR 0 -#define CEPH_VERSION_MINOR 19 +#define CEPH_VERSION_MINOR 20 #define CEPH_VERSION_PATCH 0 #define _CEPH_STRINGIFY(x) #x @@ -36,7 +36,7 @@ * client-facing protocol. */ #define CEPH_OSD_PROTOCOL 8 /* cluster internal */ -#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 12 /* cluster internal */ #define CEPH_MON_PROTOCOL 5 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 24 /* server/client */ #define CEPH_MDSC_PROTOCOL 32 /* server/client */ @@ -53,8 +53,18 @@ /* * feature bits */ -#define CEPH_FEATURE_SUPPORTED 0 -#define CEPH_FEATURE_REQUIRED 0 +#define CEPH_FEATURE_UID 1 +#define CEPH_FEATURE_NOSRCADDR 2 +#define CEPH_FEATURE_FLOCK 4 + +#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK +#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR /* @@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +#define CEPH_AUTH_UID_DEFAULT ((__u64) -1) + /********************************************* * message layer @@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_MSG_CLIENT_SNAP 0x312 #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 +/* pool ops */ +#define CEPH_MSG_POOLOP_REPLY 48 +#define CEPH_MSG_POOLOP 49 + + /* osd */ #define CEPH_MSG_OSD_MAP 41 #define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OPREPLY 43 +/* pool operations */ +enum { + POOL_OP_CREATE = 0x01, + POOL_OP_DELETE = 0x02, + POOL_OP_AUID_CHANGE = 0x03, + POOL_OP_CREATE_SNAP = 0x11, + POOL_OP_DELETE_SNAP = 0x12, + POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, + POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, +}; + struct ceph_mon_request_header { __le64 have_version; __le16 session_mon; @@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply { struct ceph_statfs st; } __attribute__ ((packed)); +const char *ceph_pool_op_name(int op); + +struct ceph_mon_poolop { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 pool; + __le32 op; + __le64 auid; + __le64 snapid; + __le32 name_len; +} __attribute__ ((packed)); + +struct ceph_mon_poolop_reply { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 reply_code; + __le32 epoch; + char has_data; + char data[0]; +} __attribute__ ((packed)); + +struct ceph_mon_unmanaged_snap { + __le64 snapid; +} __attribute__ ((packed)); + struct ceph_osd_getmap { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; @@ -308,6 +361,7 @@ union ceph_mds_request_args { struct { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; } __attribute__ ((packed)) readdir; struct { __le32 mode; diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c index 8e4be6a80c62..7503aee828ce 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/ceph_strings.c @@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type) case CEPH_ENTITY_TYPE_OSD: return "osd"; case CEPH_ENTITY_TYPE_MON: return "mon"; case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_ADMIN: return "admin"; case CEPH_ENTITY_TYPE_AUTH: return "auth"; default: return "unknown"; } @@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; case CEPH_OSD_OP_PULL: return "pull"; case CEPH_OSD_OP_PUSH: return "push"; @@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o) } return "???"; } + +const char *ceph_pool_op_name(int op) +{ + switch (op) { + case POOL_OP_CREATE: return "create"; + case POOL_OP_DELETE: return "delete"; + case POOL_OP_AUID_CHANGE: return "auid change"; + case POOL_OP_CREATE_SNAP: return "create snap"; + case POOL_OP_DELETE_SNAP: return "delete snap"; + case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; + case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; + } + return "???"; +} diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f7048da92acc..3be33fb066cc 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p) static int monc_show(struct seq_file *s, void *p) { struct ceph_client *client = s->private; - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct ceph_mon_client *monc = &client->monc; struct rb_node *rp; @@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p) if (monc->want_next_osdmap) seq_printf(s, "want next osdmap\n"); - for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { - req = rb_entry(rp, struct ceph_mon_statfs_request, node); - seq_printf(s, "%lld statfs\n", req->tid); + for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { + __u16 op; + req = rb_entry(rp, struct ceph_mon_generic_request, node); + op = le16_to_cpu(req->request->hdr.type); + if (op == CEPH_MSG_STATFS) + seq_printf(s, "%lld statfs\n", req->tid); + else + seq_printf(s, "%lld unknown\n", req->tid); } mutex_unlock(&monc->mutex); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 650d2db5ed26..4fd30900eff7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry) return -ENOMEM; /* oh well */ spin_lock(&dentry->d_lock); - if (dentry->d_fsdata) /* lost a race */ + if (dentry->d_fsdata) { + /* lost a race */ + kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; + } di->dentry = dentry; di->lease_session = NULL; dentry->d_fsdata = di; @@ -125,7 +128,8 @@ more: dentry = list_entry(p, struct dentry, d_u.d_child); di = ceph_dentry(dentry); while (1) { - dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, + dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, + d_unhashed(dentry) ? "!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { fi->at_end = 1; @@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; const int max_entries = client->mount_args->max_readdir; + const int max_bytes = client->mount_args->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); if (fi->at_end) @@ -312,6 +317,7 @@ more: req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.max_entries = cpu_to_le32(max_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { @@ -335,7 +341,7 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 0; + fi->next_offset = 2; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, @@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; /* .snap dir? */ @@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, !is_root_ceph_dentry(dir, dentry) && (ci->i_ceph_flags & CEPH_I_COMPLETE) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { - di->offset = ci->i_max_offset++; spin_unlock(&dir->i_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); @@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ - new_dentry->d_time = jiffies; - ceph_dentry(new_dentry)->lease_shared_gen = 0; + ceph_invalidate_dentry_lease(new_dentry); } ceph_mdsc_put_request(req); return err; } +/* + * Ensure a dentry lease will no longer revalidate. + */ +void ceph_invalidate_dentry_lease(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + ceph_dentry(dentry)->lease_shared_gen = 0; + spin_unlock(&dentry->d_lock); +} /* * Check if dentry lease is valid. If not, delete the lease. Try to @@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir = dentry->d_parent->d_inode; - dout("d_revalidate %p '%.*s' inode %p\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode, + ceph_dentry(dentry)->offset); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { @@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct ceph_inode_info *ci = ceph_inode(inode); int left; - if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { @@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_add_tail(&di->lru, &mdsc->dentry_lru); mdsc->num_dentry++; @@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, - dn->d_name.len, dn->d_name.name); + dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, + dn->d_name.len, dn->d_name.name, di->offset); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_move_tail(&di->lru, &mdsc->dentry_lru); spin_unlock(&mdsc->dentry_lru_lock); @@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_del_init(&di->lru); mdsc->num_dentry--; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9d67572fb328..17447644d675 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", fh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); @@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, static struct dentry *__cfh_to_dentry(struct super_block *sb, struct ceph_nfs_confh *cfh) { - struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; @@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, } dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", cfh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { @@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, return ERR_PTR(-ESTALE); dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", cfh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ed6f19721d6e..6512b6701b9e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages) /* * allocate a vector new pages */ -static struct page **alloc_page_vector(int num_pages) +struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) { struct page **pages; int i; - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + pages = kmalloc(sizeof(*pages) * num_pages, flags); if (!pages) return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); + pages[i] = __page_cache_alloc(flags); if (pages[i] == NULL) { ceph_release_page_vector(pages, i); return ERR_PTR(-ENOMEM); @@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, * in sequence. */ } else { - pages = alloc_page_vector(num_pages); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); } if (IS_ERR(pages)) return PTR_ERR(pages); @@ -649,8 +649,8 @@ more: do_sync, ci->i_truncate_seq, ci->i_truncate_size, &mtime, false, 2); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; num_pages = calc_pages_for(pos, len); @@ -668,7 +668,7 @@ more: truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_CACHE_SIZE-1)); } else { - pages = alloc_page_vector(num_pages); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; loff_t endoff = pos + iov->iov_len; int got = 0; int ret, err; @@ -844,8 +844,7 @@ retry_snap: if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, file->f_path.dentry, - pos, pos + ret - 1, 1); + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); if (err < 0) ret = err; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 85b4d2ffdeba..a81b8b662c7b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - &ceph_client(ci->vfs_inode.i_sb)->mdsc; + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); @@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + xattr_blob = NULL; } inode->i_mapping->a_ops = &ceph_aops; inode->i_mapping->backing_dev_info = - &ceph_client(inode->i_sb)->backing_dev_info; + &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode, /* set dir completion flag? */ if (ci->i_files == 0 && ci->i_subdirs == 0 && ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); ci->i_ceph_flags |= CEPH_I_COMPLETE; ci->i_max_offset = 2; } /* it may be better to set st_size in getattr instead? */ - if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) + if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) inode->i_size = ci->i_rbytes; break; default: @@ -802,6 +804,37 @@ out_unlock: } /* + * Set dentry's directory position based on the current dir's max, and + * order it in d_subdirs, so that dcache_readdir behaves. + */ +static void ceph_set_dentry_offset(struct dentry *dn) +{ + struct dentry *dir = dn->d_parent; + struct inode *inode = dn->d_parent->d_inode; + struct ceph_dentry_info *di; + + BUG_ON(!inode); + + di = ceph_dentry(dn); + + spin_lock(&inode->i_lock); + if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + spin_unlock(&inode->i_lock); + return; + } + di->offset = ceph_inode(inode)->i_max_offset++; + spin_unlock(&inode->i_lock); + + spin_lock(&dcache_lock); + spin_lock(&dn->d_lock); + list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); + dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, + dn->d_u.d_child.prev, dn->d_u.d_child.next); + spin_unlock(&dn->d_lock); + spin_unlock(&dcache_lock); +} + +/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, { struct dentry *realdn; + BUG_ON(dn->d_inode); + /* dn must be unhashed */ if (!d_unhashed(dn)) d_drop(dn); @@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, dn = realdn; } else { BUG_ON(!ceph_dentry(dn)); - dout("dn %p attached to %p ino %llx.%llx\n", dn, dn->d_inode, ceph_vinop(dn->d_inode)); } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); + ceph_set_dentry_offset(dn); out: return dn; } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - di = ceph_dentry(dn); - - spin_lock(&inode->i_lock); - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&inode->i_lock); - - spin_lock(&dcache_lock); - spin_lock(&dn->d_lock); - list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dcache_lock); -} - -/* * Incorporate results into the local cache. This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., * after a lookup). @@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, if (!rinfo->head->is_target && !rinfo->head->is_dentry) { dout("fill_trace reply is empty!\n"); - if (rinfo->head->result == 0 && req->r_locked_dir) { - struct ceph_inode_info *ci = - ceph_inode(req->r_locked_dir); - dout(" clearing %p complete (empty trace)\n", - req->r_locked_dir); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - } + if (rinfo->head->result == 0 && req->r_locked_dir) + ceph_invalidate_dir_request(req); return 0; } @@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, dn, dn->d_name.len, dn->d_name.name); + /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ - dn->d_time = jiffies; - ceph_dentry(dn)->lease_shared_gen = 0; + ceph_invalidate_dentry_lease(dn); + /* take overwritten dentry's readdir offset */ + dout("dn %p gets %p offset %lld (old offset %lld)\n", + req->r_old_dentry, dn, ceph_dentry(dn)->offset, + ceph_dentry(req->r_old_dentry)->offset); ceph_dentry(req->r_old_dentry)->offset = ceph_dentry(dn)->offset; + dn = req->r_old_dentry; /* use old_dentry */ in = dn->d_inode; } @@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - ceph_set_dentry_offset(dn); igrab(in); } else if (ceph_ino(in) == vino.ino && ceph_snap(in) == vino.snap) { @@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, err = PTR_ERR(dn); goto done; } - ceph_set_dentry_offset(dn); req->r_dentry = dn; /* may have spliced */ igrab(in); rinfo->head->is_dentry = 1; /* fool notrace handlers */ @@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); - if (queue_work(ceph_client(inode->i_sb)->trunc_wq, + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); igrab(inode); @@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) struct inode *parent_inode = dentry->d_parent->d_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; int issued; int release = 0, dirtied = 0; int mask = 0; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8a5bcae62846..d085f07756b4 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_ioctl_dataloc dl; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; u64 len = 1, olen; u64 tmp; struct ceph_object_layout ol; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 24561a557e01..885aa5710cfd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -40,7 +40,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); -const static struct ceph_connection_operations mds_con_ops; +static const struct ceph_connection_operations mds_con_ops; /* @@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); - if (IS_ERR(msg)) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } h = msg->front.iov_base; h->op = cpu_to_le32(op); @@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc, struct ceph_msg *msg; int mstate; int mds = session->s_mds; - int err = 0; /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); @@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc, /* send connect message */ msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); - if (IS_ERR(msg)) { - err = PTR_ERR(msg); - goto out; - } + if (!msg) + return -ENOMEM; ceph_con_send(&session->s_con, msg); - -out: return 0; } @@ -804,12 +799,49 @@ out: } static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) + void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + int drop = 0; + dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); - ceph_remove_cap(cap); + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + if (!__ceph_is_any_real_caps(ci)) { + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + pr_info(" dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + drop = 1; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_info(" dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + drop = 1; + } + if (drop && ci->i_wrbuffer_ref) { + pr_info(" dropping dirty data for %p %lld\n", + inode, ceph_ino(inode)); + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + drop++; + } + spin_unlock(&mdsc->cap_dirty_lock); + } + spin_unlock(&inode->i_lock); + while (drop--) + iput(inode); return 0; } @@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session) dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, NULL); BUG_ON(session->s_nr_caps > 0); + BUG_ON(!list_empty(&session->s_cap_flushing)); cleanup_cap_releases(session); } @@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, ceph_mds_state_name(state)); msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); - if (IS_ERR(msg)) - return PTR_ERR(msg); + if (!msg) + return -ENOMEM; ceph_con_send(&session->s_con, msg); return 0; } @@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *msg; - int err = 0; dout("request_close_session mds%d state %s seq %lld\n", session->s_mds, session_state_name(session->s_state), session->s_seq); msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); - if (IS_ERR(msg)) - err = PTR_ERR(msg); - else - ceph_con_send(&session->s_con, msg); - return err; + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; } /* @@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - 0, 0, NULL); + GFP_NOFS); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg; dout("send_cap_releases mds%d\n", session->s_mds); - while (1) { - spin_lock(&session->s_cap_lock); - if (list_empty(&session->s_cap_releases_done)) - break; + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases_done)) { msg = list_first_entry(&session->s_cap_releases_done, struct ceph_msg, list_head); list_del_init(&msg->list_head); @@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); + spin_lock(&session->s_cap_lock); } spin_unlock(&session->s_cap_lock); } +static void discard_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + unsigned num; + + dout("discard_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); + head->num = cpu_to_le32(0); + session->s_num_cap_releases += num; + + /* requeue completed messages */ + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, + num); + session->s_num_cap_releases += num; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + list_add(&msg->list_head, &session->s_cap_releases); + } + + spin_unlock(&session->s_cap_lock); +} + /* * requests */ @@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) if (!req) return ERR_PTR(-ENOMEM); + mutex_init(&req->r_fill_mutex); req->r_started = jiffies; req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); @@ -1251,7 +1320,7 @@ retry: len += 1 + temp->d_name.len; temp = temp->d_parent; if (temp == NULL) { - pr_err("build_path_dentry corrupt dentry %p\n", dentry); + pr_err("build_path corrupt dentry %p\n", dentry); return ERR_PTR(-EINVAL); } } @@ -1267,7 +1336,7 @@ retry: struct inode *inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { - dout("build_path_dentry path+%d: %p SNAPDIR\n", + dout("build_path path+%d: %p SNAPDIR\n", pos, temp); } else if (stop_on_nosnap && inode && ceph_snap(inode) == CEPH_NOSNAP) { @@ -1278,20 +1347,18 @@ retry: break; strncpy(path + pos, temp->d_name.name, temp->d_name.len); - dout("build_path_dentry path+%d: %p '%.*s'\n", - pos, temp, temp->d_name.len, path + pos); } if (pos) path[--pos] = '/'; temp = temp->d_parent; if (temp == NULL) { - pr_err("build_path_dentry corrupt dentry\n"); + pr_err("build_path corrupt dentry\n"); kfree(path); return ERR_PTR(-EINVAL); } } if (pos != 0) { - pr_err("build_path_dentry did not end path lookup where " + pr_err("build_path did not end path lookup where " "expected, namelen is %d, pos is %d\n", len, pos); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not @@ -1303,7 +1370,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; - dout("build_path_dentry on %p %d built %llx '%.*s'\n", + dout("build_path on %p %d built %llx '%.*s'\n", dentry, atomic_read(&dentry->d_count), *base, len, path); return path; } @@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); - if (IS_ERR(msg)) + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + if (!msg) { + msg = ERR_PTR(-ENOMEM); goto out_free2; + } msg->hdr.tid = cpu_to_le64(req->r_tid); @@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, } msg = create_request_message(mdsc, req, mds); if (IS_ERR(msg)) { - req->r_reply = ERR_PTR(PTR_ERR(msg)); + req->r_err = PTR_ERR(msg); complete_request(mdsc, req); - return -PTR_ERR(msg); + return PTR_ERR(msg); } req->r_request = msg; @@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_reply) + if (req->r_err || req->r_got_result) goto out; if (req->r_timeout && @@ -1609,7 +1678,7 @@ out: return err; finish: - req->r_reply = ERR_PTR(err); + req->r_err = err; complete_request(mdsc, req); goto out; } @@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc, /* * Wake up threads with requests pending for @mds, so that they can - * resubmit their requests to a possibly different mds. If @all is set, - * wake up if their requests has been forwarded to @mds, too. + * resubmit their requests to a possibly different mds. */ -static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) +static void kick_requests(struct ceph_mds_client *mdsc, int mds) { struct ceph_mds_request *req; struct rb_node *p; @@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, __register_request(mdsc, req, dir); __do_request(mdsc, req); - /* wait */ - if (!req->r_reply) { - mutex_unlock(&mdsc->mutex); - if (req->r_timeout) { - err = (long)wait_for_completion_interruptible_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - req->r_reply = ERR_PTR(-EIO); - else if (err < 0) - req->r_reply = ERR_PTR(err); - } else { - err = wait_for_completion_interruptible( - &req->r_completion); - if (err) - req->r_reply = ERR_PTR(err); - } - mutex_lock(&mdsc->mutex); + if (req->r_err) { + err = req->r_err; + __unregister_request(mdsc, req); + dout("do_request early error %d\n", err); + goto out; } - if (IS_ERR(req->r_reply)) { - err = PTR_ERR(req->r_reply); - req->r_reply = NULL; + /* wait */ + mutex_unlock(&mdsc->mutex); + dout("do_request waiting\n"); + if (req->r_timeout) { + err = (long)wait_for_completion_interruptible_timeout( + &req->r_completion, req->r_timeout); + if (err == 0) + err = -EIO; + } else { + err = wait_for_completion_interruptible(&req->r_completion); + } + dout("do_request waited, got %d\n", err); + mutex_lock(&mdsc->mutex); - if (err == -ERESTARTSYS) { - /* aborted */ - req->r_aborted = true; + /* only abort if we didn't race with a real reply */ + if (req->r_got_result) { + err = le32_to_cpu(req->r_reply_info.head->result); + } else if (err < 0) { + dout("aborted request %lld with %d\n", req->r_tid, err); - if (req->r_locked_dir && - (req->r_op & CEPH_MDS_OP_WRITE)) { - struct ceph_inode_info *ci = - ceph_inode(req->r_locked_dir); + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + req->r_aborted = true; + mutex_unlock(&req->r_fill_mutex); - dout("aborted, clearing I_COMPLETE on %p\n", - req->r_locked_dir); - spin_lock(&req->r_locked_dir->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - spin_unlock(&req->r_locked_dir->i_lock); - } - } else { - /* clean up this request */ - __unregister_request(mdsc, req); - if (!list_empty(&req->r_unsafe_item)) - list_del_init(&req->r_unsafe_item); - complete(&req->r_safe_completion); - } - } else if (req->r_err) { - err = req->r_err; + if (req->r_locked_dir && + (req->r_op & CEPH_MDS_OP_WRITE)) + ceph_invalidate_dir_request(req); } else { - err = le32_to_cpu(req->r_reply_info.head->result); + err = req->r_err; } - mutex_unlock(&mdsc->mutex); +out: + mutex_unlock(&mdsc->mutex); dout("do_request %p done, result %d\n", req, err); return err; } /* + * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * namespace request. + */ +void ceph_invalidate_dir_request(struct ceph_mds_request *req) +{ + struct inode *inode = req->r_locked_dir; + struct ceph_inode_info *ci = ceph_inode(inode); + + dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + spin_unlock(&inode->i_lock); + + if (req->r_dentry) + ceph_invalidate_dentry_lease(req->r_dentry); + if (req->r_old_dentry) + ceph_invalidate_dentry_lease(req->r_old_dentry); +} + +/* * Handle mds reply. * * We take the session mutex and parse and process the reply immediately. @@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } + if (req->r_got_safe && !head->safe) { + pr_warning("got unsafe after safe on %llu from mds%d\n", + tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } result = le32_to_cpu(head->result); @@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } - } - - BUG_ON(req->r_reply); - - if (!head->safe) { + } else { req->r_got_unsafe = true; list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); } @@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* insert trace into our cache */ + mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(&req->r_caps_reservation); } + mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); out_err: - if (err) { - req->r_err = err; + mutex_lock(&mdsc->mutex); + if (!req->r_aborted) { + if (err) { + req->r_err = err; + } else { + req->r_reply = msg; + ceph_msg_get(msg); + req->r_got_result = true; + } } else { - req->r_reply = msg; - ceph_msg_get(msg); + dout("reply arrived after request %lld was aborted\n", tid); } + mutex_unlock(&mdsc->mutex); add_cap_releases(mdsc, req->r_session, -1); mutex_unlock(&session->s_mutex); @@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session, switch (op) { case CEPH_SESSION_OPEN: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect success\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_OPEN; renewed_caps(mdsc, session, 0); wake = 1; @@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_CLOSE: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ complete(&mdsc->session_close_waiters); - kick_requests(mdsc, mds, 0); /* cur only */ + kick_requests(mdsc, mds); break; case CEPH_SESSION_STALE: @@ -2132,54 +2229,44 @@ out: * * called with mdsc->mutex held. */ -static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) +static void send_mds_reconnect(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { - struct ceph_mds_session *session = NULL; struct ceph_msg *reply; struct rb_node *p; + int mds = session->s_mds; int err = -ENOMEM; struct ceph_pagelist *pagelist; - pr_info("reconnect to recovering mds%d\n", mds); + pr_info("mds%d reconnect start\n", mds); pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); if (!pagelist) goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + if (!reply) goto fail_nomsg; - } - - /* find session */ - session = __ceph_lookup_mds_session(mdsc, mds); - mutex_unlock(&mdsc->mutex); /* drop lock for duration */ - if (session) { - mutex_lock(&session->s_mutex); + mutex_lock(&session->s_mutex); + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; - session->s_state = CEPH_MDS_SESSION_RECONNECTING; - session->s_seq = 0; + ceph_con_open(&session->s_con, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - ceph_con_open(&session->s_con, - ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - - /* replay unsafe requests */ - replay_unsafe_requests(mdsc, session); - } else { - dout("no session for mds%d, will send short reconnect\n", - mds); - } + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); down_read(&mdsc->snap_rwsem); - if (!session) - goto send; dout("session %p state %s\n", session, session_state_name(session->s_state)); + /* drop old cap expires; we're about to reestablish that state */ + discard_cap_releases(mdsc, session); + /* traverse this session's caps */ err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); if (err) @@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) goto fail; } -send: reply->pagelist = pagelist; reply->hdr.data_len = cpu_to_le32(pagelist->length); reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); - session->s_state = CEPH_MDS_SESSION_OPEN; mutex_unlock(&session->s_mutex); mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); - ceph_put_mds_session(session); - up_read(&mdsc->snap_rwsem); - mutex_lock(&mdsc->mutex); return; fail: ceph_msg_put(reply); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); fail_nomsg: ceph_pagelist_release(pagelist); kfree(pagelist); fail_nopagelist: pr_err("error %d preparing reconnect for mds%d\n", err, mds); - mutex_lock(&mdsc->mutex); return; } @@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } /* kick any requests waiting on the recovering mds */ - kick_requests(mdsc, i, 1); + kick_requests(mdsc, i); } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, * send reconnect? */ if (s->s_state == CEPH_MDS_SESSION_RESTARTING && - newstate >= CEPH_MDS_STATE_RECONNECT) - send_mds_reconnect(mdsc, i); + newstate >= CEPH_MDS_STATE_RECONNECT) { + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + mutex_lock(&mdsc->mutex); + } /* - * kick requests on any mds that has gone active. - * - * kick requests on cur or forwarder: we may have sent - * the request to mds1, mds1 told us it forwarded it - * to mds2, but then we learn mds1 failed and can't be - * sure it successfully forwarded our request before - * it died. + * kick request on any mds that has gone active. */ if (oldstate < CEPH_MDS_STATE_ACTIVE && newstate >= CEPH_MDS_STATE_ACTIVE) { - pr_info("mds%d reconnect completed\n", s->s_mds); - kick_requests(mdsc, i, 1); + if (oldstate != CEPH_MDS_STATE_CREATING && + oldstate != CEPH_MDS_STATE_STARTING) + pr_info("mds%d recovery completed\n", s->s_mds); + kick_requests(mdsc, i); ceph_kick_flushing_caps(mdsc, s); wake_up_session_caps(s, 1); } @@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); - if (IS_ERR(msg)) + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + if (!msg) return; lease = msg->front.iov_base; lease->action = action; @@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work) else ceph_con_keepalive(&s->s_con); add_cap_releases(mdsc, s, -1); - send_cap_releases(mdsc, s); + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG) + send_cap_releases(mdsc, s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); @@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) mdsc->client = client; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + if (mdsc->mdsmap == NULL) + return -ENOMEM; + init_completion(&mdsc->safe_umount_waiters); init_completion(&mdsc->session_close_waiters); INIT_LIST_HEAD(&mdsc->waiting_for_map); @@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) init_waitqueue_head(&mdsc->cap_flushing_wq); spin_lock_init(&mdsc->dentry_lru_lock); INIT_LIST_HEAD(&mdsc->dentry_lru); + return 0; } @@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; + if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + return; + dout("sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; @@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con) static void peer_reset(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; - pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", - s->s_mds); + pr_warning("mds%d closed our session\n", s->s_mds); + send_mds_reconnect(mdsc, s); } static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) @@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->client->monc); } -const static struct ceph_connection_operations mds_con_ops = { +static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, .dispatch = dispatch, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 961cc6f65878..d9936c4f1212 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -165,6 +165,8 @@ struct ceph_mds_request { struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ struct inode *r_target_inode; /* resulting inode */ + struct mutex r_fill_mutex; + union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ @@ -213,7 +215,7 @@ struct ceph_mds_request { struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; struct list_head r_unsafe_item; /* per-session unsafe list item */ - bool r_got_unsafe, r_got_safe; + bool r_got_unsafe, r_got_safe, r_got_result; bool r_did_prepopulate; u32 r_readdir_offset; @@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, struct dentry *dn, int mask); +extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); + extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index cd4fadb6491a..60b74839ebec 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); static void ceph_fault(struct ceph_connection *con); -const char *ceph_name_type_str(int t) -{ - switch (t) { - case CEPH_ENTITY_TYPE_MON: return "mon"; - case CEPH_ENTITY_TYPE_MDS: return "mds"; - case CEPH_ENTITY_TYPE_OSD: return "osd"; - case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_ADMIN: return "admin"; - default: return "???"; - } -} - /* * nicely render a sockaddr as a string. */ @@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } + con->out_keepalive_pending = false; con->in_seq = 0; con->in_seq_acked = 0; } @@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con) clear_bit(WRITE_PENDING, &con->state); mutex_lock(&con->mutex); reset_connection(con); + con->peer_global_seq = 0; cancel_delayed_work(&con->work); mutex_unlock(&con->mutex); queue_con(con); @@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = CEPH_FEATURE_SUPPORTED; + con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT; con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED; - u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; + u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con) clear_bit(CONNECTING, &con->state); con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; + con->peer_features = server_feat; dout("process_connect got READY gseq %d cseq %d (%d)\n", con->peer_global_seq, le32_to_cpu(con->in_reply.connect_seq), @@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con) con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (skip) { /* skip this message */ - dout("alloc_msg returned NULL, skipping message\n"); + dout("alloc_msg said skip message\n"); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; return 0; } - if (IS_ERR(con->in_msg)) { - ret = PTR_ERR(con->in_msg); - con->in_msg = NULL; + if (!con->in_msg) { con->error_msg = "error allocating memory for incoming message"; - return ret; + return -ENOMEM; } m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ @@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con) /* if first message, set peer_name */ if (con->peer_name.type == 0) - con->peer_name = msg->hdr.src.name; + con->peer_name = msg->hdr.src; con->in_seq++; mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), - ENTITY_NAME(msg->hdr.src.name), + ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), @@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con) dout("try_write start %p state %lu nref %d\n", con, con->state, atomic_read(&con->nref)); - mutex_lock(&con->mutex); more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); @@ -1639,7 +1627,6 @@ do_next: done: ret = 0; out: - mutex_unlock(&con->mutex); dout("try_write done on %p\n", con); return ret; } @@ -1651,7 +1638,6 @@ out: */ static int try_read(struct ceph_connection *con) { - struct ceph_messenger *msgr; int ret = -1; if (!con->sock) @@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con) return 0; dout("try_read start on %p\n", con); - msgr = con->msgr; - - mutex_lock(&con->mutex); more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, @@ -1758,7 +1741,6 @@ more: done: ret = 0; out: - mutex_unlock(&con->mutex); dout("try_read done on %p\n", con); return ret; @@ -1830,6 +1812,8 @@ more: dout("con_work %p start, clearing QUEUED\n", con); clear_bit(QUEUED, &con->state); + mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); con_close_socket(con); @@ -1844,11 +1828,16 @@ more: if (test_and_clear_bit(SOCK_CLOSED, &con->state) || try_read(con) < 0 || try_write(con) < 0) { + mutex_unlock(&con->mutex); backoff = 1; ceph_fault(con); /* error/fault path */ + goto done_unlocked; } done: + mutex_unlock(&con->mutex); + +done_unlocked: clear_bit(BUSY, &con->state); dout("con->state=%lu\n", con->state); if (test_bit(QUEUED, &con->state)) { @@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) /* the zero page is needed if a request is "canceled" while the message * is being written over the socket */ - msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); if (!msgr->zero_page) { kfree(msgr); return ERR_PTR(-ENOMEM); @@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } /* set src+dst */ - msg->hdr.src.name = con->msgr->inst.name; - msg->hdr.src.addr = con->msgr->my_enc_addr; - msg->hdr.orig_src = msg->hdr.src; + msg->hdr.src = con->msgr->inst.name; BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); @@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con) * construct a new message with given type, size * the new msg has a ref count of 1. */ -struct ceph_msg *ceph_msg_new(int type, int front_len, - int page_len, int page_off, struct page **pages) +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) { struct ceph_msg *m; - m = kmalloc(sizeof(*m), GFP_NOFS); + m = kmalloc(sizeof(*m), flags); if (m == NULL) goto out; kref_init(&m->kref); @@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); m->hdr.middle_len = 0; - m->hdr.data_len = cpu_to_le32(page_len); - m->hdr.data_off = cpu_to_le16(page_off); + m->hdr.data_len = 0; + m->hdr.data_off = 0; m->hdr.reserved = 0; m->footer.front_crc = 0; m->footer.middle_crc = 0; @@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, /* front */ if (front_len) { if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, GFP_NOFS, + m->front.iov_base = __vmalloc(front_len, flags, PAGE_KERNEL); m->front_is_vmalloc = true; } else { - m->front.iov_base = kmalloc(front_len, GFP_NOFS); + m->front.iov_base = kmalloc(front_len, flags); } if (m->front.iov_base == NULL) { pr_err("msg_new can't allocate %d bytes\n", @@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m->middle = NULL; /* data */ - m->nr_pages = calc_pages_for(page_off, page_len); - m->pages = pages; + m->nr_pages = 0; + m->pages = NULL; m->pagelist = NULL; - dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, - m->nr_pages); + dout("ceph_msg_new %p front %d\n", m, front_len); return m; out2: ceph_msg_put(m); out: - pr_err("msg_new can't create type %d len %d\n", type, front_len); - return ERR_PTR(-ENOMEM); + pr_err("msg_new can't create type %d front %d\n", type, front_len); + return NULL; } /* @@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (IS_ERR(msg)) - return msg; - - if (*skip) + if (!msg || *skip) return NULL; } if (!msg) { *skip = 0; - msg = ceph_msg_new(type, front_len, 0, 0, NULL); + msg = ceph_msg_new(type, front_len, GFP_NOFS); if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); - return ERR_PTR(-ENOMEM); + return NULL; } } memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); - if (middle_len) { + if (middle_len && !msg->middle) { ret = ceph_alloc_middle(con, msg); - if (ret < 0) { ceph_msg_put(msg); - return msg; + return NULL; } } diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index a5caf91cc971..00a9430b1ffc 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -49,10 +49,8 @@ struct ceph_connection_operations { int *skip); }; -extern const char *ceph_name_type_str(int t); - /* use format string %s%d */ -#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) +#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) struct ceph_messenger { struct ceph_entity_inst inst; /* my name+address */ @@ -144,6 +142,7 @@ struct ceph_connection { struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_name peer_name; /* peer name */ struct ceph_entity_addr peer_addr_for_me; + unsigned peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@ -158,7 +157,6 @@ struct ceph_connection { struct list_head out_queue; struct list_head out_sent; /* sending or sent but unacked */ u64 out_seq; /* last message queued for send */ - u64 out_seq_sent; /* last message sent */ bool out_keepalive_pending; u64 in_seq, in_seq_acked; /* last message received, acked */ @@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con); -extern struct ceph_msg *ceph_msg_new(int type, int front_len, - int page_len, int page_off, - struct page **pages); +extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); extern void ceph_msg_kfree(struct ceph_msg *m); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 8fdc011ca956..f6510a476e7e 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -28,7 +28,7 @@ * resend any outstanding requests. */ -const static struct ceph_connection_operations mon_con_ops; +static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); @@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) monc->pending_auth = 1; monc->m_auth->front.iov_len = len; monc->m_auth->hdr.front_len = cpu_to_le32(len); + ceph_con_revoke(monc->con, monc->m_auth); ceph_msg_get(monc->m_auth); /* keep our ref */ ceph_con_send(monc->con, monc->m_auth); } @@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc) monc->want_next_osdmap); if ((__sub_expired(monc) && !monc->sub_sent) || monc->want_next_osdmap == 1) { - struct ceph_msg *msg; + struct ceph_msg *msg = monc->m_subscribe; struct ceph_mon_subscribe_item *i; void *p, *end; - msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL); - if (!msg) - return; - p = msg->front.iov_base; - end = p + msg->front.iov_len; + end = p + msg->front_max; dout("__send_subscribe to 'mdsmap' %u+\n", (unsigned)monc->have_mdsmap); @@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_con_send(monc->con, msg); + ceph_con_revoke(monc->con, msg); + ceph_con_send(monc->con, ceph_msg_get(msg)); monc->sub_sent = jiffies | 1; /* never 0 */ } @@ -353,14 +351,14 @@ out: /* * statfs */ -static struct ceph_mon_statfs_request *__lookup_statfs( +static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) { - struct ceph_mon_statfs_request *req; - struct rb_node *n = monc->statfs_request_tree.rb_node; + struct ceph_mon_generic_request *req; + struct rb_node *n = monc->generic_request_tree.rb_node; while (n) { - req = rb_entry(n, struct ceph_mon_statfs_request, node); + req = rb_entry(n, struct ceph_mon_generic_request, node); if (tid < req->tid) n = n->rb_left; else if (tid > req->tid) @@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs( return NULL; } -static void __insert_statfs(struct ceph_mon_client *monc, - struct ceph_mon_statfs_request *new) +static void __insert_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *new) { - struct rb_node **p = &monc->statfs_request_tree.rb_node; + struct rb_node **p = &monc->generic_request_tree.rb_node; struct rb_node *parent = NULL; - struct ceph_mon_statfs_request *req = NULL; + struct ceph_mon_generic_request *req = NULL; while (*p) { parent = *p; - req = rb_entry(parent, struct ceph_mon_statfs_request, node); + req = rb_entry(parent, struct ceph_mon_generic_request, node); if (new->tid < req->tid) p = &(*p)->rb_left; else if (new->tid > req->tid) @@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc, } rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->statfs_request_tree); + rb_insert_color(&new->node, &monc->generic_request_tree); +} + +static void release_generic_request(struct kref *kref) +{ + struct ceph_mon_generic_request *req = + container_of(kref, struct ceph_mon_generic_request, kref); + + if (req->reply) + ceph_msg_put(req->reply); + if (req->request) + ceph_msg_put(req->request); +} + +static void put_generic_request(struct ceph_mon_generic_request *req) +{ + kref_put(&req->kref, release_generic_request); +} + +static void get_generic_request(struct ceph_mon_generic_request *req) +{ + kref_get(&req->kref); +} + +static struct ceph_msg *get_generic_reply(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_mon_client *monc = con->private; + struct ceph_mon_generic_request *req; + u64 tid = le64_to_cpu(hdr->tid); + struct ceph_msg *m; + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, tid); + if (!req) { + dout("get_generic_reply %lld dne\n", tid); + *skip = 1; + m = NULL; + } else { + dout("get_generic_reply %lld got %p\n", tid, req->reply); + m = ceph_msg_get(req->reply); + /* + * we don't need to track the connection reading into + * this reply because we only have one open connection + * at a time, ever. + */ + } + mutex_unlock(&monc->mutex); + return m; } static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct ceph_mon_statfs_reply *reply = msg->front.iov_base; - u64 tid; + u64 tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len != sizeof(*reply)) goto bad; - tid = le64_to_cpu(msg->hdr.tid); dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = __lookup_statfs(monc, tid); + req = __lookup_generic_req(monc, tid); if (req) { - *req->buf = reply->st; + *(struct ceph_statfs *)req->buf = reply->st; req->result = 0; + get_generic_request(req); } mutex_unlock(&monc->mutex); - if (req) + if (req) { complete(&req->completion); + put_generic_request(req); + } return; bad: - pr_err("corrupt statfs reply, no tid\n"); + pr_err("corrupt generic reply, no tid\n"); ceph_msg_dump(msg); } /* - * (re)send a statfs request + * Do a synchronous statfs(). */ -static int send_statfs(struct ceph_mon_client *monc, - struct ceph_mon_statfs_request *req) +int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) { - struct ceph_msg *msg; + struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; + int err; - dout("send_statfs tid %llu\n", req->tid); - msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); - if (IS_ERR(msg)) - return PTR_ERR(msg); - req->request = msg; - msg->hdr.tid = cpu_to_le64(req->tid); - h = msg->front.iov_base; + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = buf; + init_completion(&req->completion); + + err = -ENOMEM; + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); + if (!req->request) + goto out; + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); + if (!req->reply) + goto out; + + /* fill out request */ + h = req->request->front.iov_base; h->monhdr.have_version = 0; h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; - ceph_con_send(monc->con, msg); - return 0; -} - -/* - * Do a synchronous statfs(). - */ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) -{ - struct ceph_mon_statfs_request req; - int err; - - req.buf = buf; - init_completion(&req.completion); - - /* allocate memory for reply */ - err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1); - if (err) - return err; /* register request */ mutex_lock(&monc->mutex); - req.tid = ++monc->last_tid; - req.last_attempt = jiffies; - req.delay = BASE_DELAY_INTERVAL; - __insert_statfs(monc, &req); - monc->num_statfs_requests++; + req->tid = ++monc->last_tid; + req->request->hdr.tid = cpu_to_le64(req->tid); + __insert_generic_request(monc, req); + monc->num_generic_requests++; mutex_unlock(&monc->mutex); /* send request and wait */ - err = send_statfs(monc, &req); - if (!err) - err = wait_for_completion_interruptible(&req.completion); + ceph_con_send(monc->con, ceph_msg_get(req->request)); + err = wait_for_completion_interruptible(&req->completion); mutex_lock(&monc->mutex); - rb_erase(&req.node, &monc->statfs_request_tree); - monc->num_statfs_requests--; - ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1); + rb_erase(&req->node, &monc->generic_request_tree); + monc->num_generic_requests--; mutex_unlock(&monc->mutex); if (!err) - err = req.result; + err = req->result; + +out: + kref_put(&req->kref, release_generic_request); return err; } /* * Resend pending statfs requests. */ -static void __resend_statfs(struct ceph_mon_client *monc) +static void __resend_generic_request(struct ceph_mon_client *monc) { - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct rb_node *p; - for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_mon_statfs_request, node); - send_statfs(monc, req); + for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mon_generic_request, node); + ceph_con_revoke(monc->con, req->request); + ceph_con_send(monc->con, ceph_msg_get(req->request)); } } @@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; - /* msg pools */ - err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, - sizeof(struct ceph_mon_subscribe_ack), 1, false); - if (err < 0) + /* msgs */ + err = -ENOMEM; + monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, + sizeof(struct ceph_mon_subscribe_ack), + GFP_NOFS); + if (!monc->m_subscribe_ack) goto out_monmap; - err = ceph_msgpool_init(&monc->msgpool_statfs_reply, - sizeof(struct ceph_mon_statfs_reply), 0, false); - if (err < 0) - goto out_pool1; - err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); - if (err < 0) - goto out_pool2; - - monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); + + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); + if (!monc->m_subscribe) + goto out_subscribe_ack; + + monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); + if (!monc->m_auth_reply) + goto out_subscribe; + + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); monc->pending_auth = 0; - if (IS_ERR(monc->m_auth)) { - err = PTR_ERR(monc->m_auth); - monc->m_auth = NULL; - goto out_pool3; - } + if (!monc->m_auth) + goto out_auth_reply; monc->cur_mon = -1; monc->hunting = true; @@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->sub_sent = 0; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); - monc->statfs_request_tree = RB_ROOT; - monc->num_statfs_requests = 0; + monc->generic_request_tree = RB_ROOT; + monc->num_generic_requests = 0; monc->last_tid = 0; monc->have_mdsmap = 0; @@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->want_next_osdmap = 1; return 0; -out_pool3: - ceph_msgpool_destroy(&monc->msgpool_auth_reply); -out_pool2: - ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); -out_pool1: - ceph_msgpool_destroy(&monc->msgpool_statfs_reply); +out_auth_reply: + ceph_msg_put(monc->m_auth_reply); +out_subscribe: + ceph_msg_put(monc->m_subscribe); +out_subscribe_ack: + ceph_msg_put(monc->m_subscribe_ack); out_monmap: kfree(monc->monmap); out: @@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ceph_auth_destroy(monc->auth); ceph_msg_put(monc->m_auth); - ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); - ceph_msgpool_destroy(&monc->msgpool_statfs_reply); - ceph_msgpool_destroy(&monc->msgpool_auth_reply); + ceph_msg_put(monc->m_auth_reply); + ceph_msg_put(monc->m_subscribe); + ceph_msg_put(monc->m_subscribe_ack); kfree(monc->monmap); } @@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, monc->client->msgr->inst.name.num = monc->auth->global_id; __send_subscribe(monc); - __resend_statfs(monc); + __resend_generic_request(monc); } mutex_unlock(&monc->mutex); } @@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_MON_SUBSCRIBE_ACK: - m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); + m = ceph_msg_get(monc->m_subscribe_ack); break; case CEPH_MSG_STATFS_REPLY: - m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); - break; + return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: - m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); + m = ceph_msg_get(monc->m_auth_reply); break; case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: - m = ceph_msg_new(type, front_len, 0, 0, NULL); + m = ceph_msg_new(type, front_len, GFP_NOFS); break; } @@ -826,7 +867,7 @@ out: mutex_unlock(&monc->mutex); } -const static struct ceph_connection_operations mon_con_ops = { +static const struct ceph_connection_operations mon_con_ops = { .get = ceph_con_get, .put = ceph_con_put, .dispatch = dispatch, diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index b958ad5afa06..174d794321d0 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -2,10 +2,10 @@ #define _FS_CEPH_MON_CLIENT_H #include <linux/completion.h> +#include <linux/kref.h> #include <linux/rbtree.h> #include "messenger.h" -#include "msgpool.h" struct ceph_client; struct ceph_mount_args; @@ -22,7 +22,7 @@ struct ceph_monmap { }; struct ceph_mon_client; -struct ceph_mon_statfs_request; +struct ceph_mon_generic_request; /* @@ -40,17 +40,19 @@ struct ceph_mon_request { }; /* - * statfs() is done a bit differently because we need to get data back + * ceph_mon_generic_request is being used for the statfs and poolop requests + * which are bening done a bit differently because we need to get data back * to the caller */ -struct ceph_mon_statfs_request { +struct ceph_mon_generic_request { + struct kref kref; u64 tid; struct rb_node node; int result; - struct ceph_statfs *buf; + void *buf; struct completion completion; - unsigned long last_attempt, delay; /* jiffies */ struct ceph_msg *request; /* original request */ + struct ceph_msg *reply; /* and reply */ }; struct ceph_mon_client { @@ -61,7 +63,7 @@ struct ceph_mon_client { struct delayed_work delayed_work; struct ceph_auth_client *auth; - struct ceph_msg *m_auth; + struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack; int pending_auth; bool hunting; @@ -70,14 +72,9 @@ struct ceph_mon_client { struct ceph_connection *con; bool have_fsid; - /* msg pools */ - struct ceph_msgpool msgpool_subscribe_ack; - struct ceph_msgpool msgpool_statfs_reply; - struct ceph_msgpool msgpool_auth_reply; - - /* pending statfs requests */ - struct rb_root statfs_request_tree; - int num_statfs_requests; + /* pending generic requests */ + struct rb_root generic_request_tree; + int num_generic_requests; u64 last_tid; /* mds/osd map */ diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c index ca3b44a89f2d..dd65a6438131 100644 --- a/fs/ceph/msgpool.c +++ b/fs/ceph/msgpool.c @@ -7,180 +7,58 @@ #include "msgpool.h" -/* - * We use msg pools to preallocate memory for messages we expect to - * receive over the wire, to avoid getting ourselves into OOM - * conditions at unexpected times. We take use a few different - * strategies: - * - * - for request/response type interactions, we preallocate the - * memory needed for the response when we generate the request. - * - * - for messages we can receive at any time from the MDS, we preallocate - * a pool of messages we can re-use. - * - * - for writeback, we preallocate some number of messages to use for - * requests and their replies, so that we always make forward - * progress. - * - * The msgpool behaves like a mempool_t, but keeps preallocated - * ceph_msgs strung together on a list_head instead of using a pointer - * vector. This avoids vector reallocation when we adjust the number - * of preallocated items (which happens frequently). - */ +static void *alloc_fn(gfp_t gfp_mask, void *arg) +{ + struct ceph_msgpool *pool = arg; + void *p; + p = ceph_msg_new(0, pool->front_len, gfp_mask); + if (!p) + pr_err("msgpool %s alloc failed\n", pool->name); + return p; +} -/* - * Allocate or release as necessary to meet our target pool size. - */ -static int __fill_msgpool(struct ceph_msgpool *pool) +static void free_fn(void *element, void *arg) { - struct ceph_msg *msg; - - while (pool->num < pool->min) { - dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num, - pool->min); - spin_unlock(&pool->lock); - msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL); - spin_lock(&pool->lock); - if (IS_ERR(msg)) - return PTR_ERR(msg); - msg->pool = pool; - list_add(&msg->list_head, &pool->msgs); - pool->num++; - } - while (pool->num > pool->min) { - msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head); - dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num, - pool->min, msg); - list_del_init(&msg->list_head); - pool->num--; - ceph_msg_kfree(msg); - } - return 0; + ceph_msg_put(element); } int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int min, bool blocking) + int front_len, int size, bool blocking, const char *name) { - int ret; - - dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min); - spin_lock_init(&pool->lock); pool->front_len = front_len; - INIT_LIST_HEAD(&pool->msgs); - pool->num = 0; - pool->min = min; - pool->blocking = blocking; - init_waitqueue_head(&pool->wait); - - spin_lock(&pool->lock); - ret = __fill_msgpool(pool); - spin_unlock(&pool->lock); - return ret; + pool->pool = mempool_create(size, alloc_fn, free_fn, pool); + if (!pool->pool) + return -ENOMEM; + pool->name = name; + return 0; } void ceph_msgpool_destroy(struct ceph_msgpool *pool) { - dout("msgpool_destroy %p\n", pool); - spin_lock(&pool->lock); - pool->min = 0; - __fill_msgpool(pool); - spin_unlock(&pool->lock); + mempool_destroy(pool->pool); } -int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, + int front_len) { - int ret; - - spin_lock(&pool->lock); - dout("msgpool_resv %p delta %d\n", pool, delta); - pool->min += delta; - ret = __fill_msgpool(pool); - spin_unlock(&pool->lock); - return ret; -} - -struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len) -{ - wait_queue_t wait; - struct ceph_msg *msg; - - if (front_len && front_len > pool->front_len) { - pr_err("msgpool_get pool %p need front %d, pool size is %d\n", - pool, front_len, pool->front_len); + if (front_len > pool->front_len) { + pr_err("msgpool_get pool %s need front %d, pool size is %d\n", + pool->name, front_len, pool->front_len); WARN_ON(1); /* try to alloc a fresh message */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; - } - - if (!front_len) - front_len = pool->front_len; - - if (pool->blocking) { - /* mempool_t behavior; first try to alloc */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; + return ceph_msg_new(0, front_len, GFP_NOFS); } - while (1) { - spin_lock(&pool->lock); - if (likely(pool->num)) { - msg = list_entry(pool->msgs.next, struct ceph_msg, - list_head); - list_del_init(&msg->list_head); - pool->num--; - dout("msgpool_get %p got %p, now %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - return msg; - } - pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num, - pool->min, pool->blocking ? "waiting" : "may fail"); - spin_unlock(&pool->lock); - - if (!pool->blocking) { - WARN_ON(1); - - /* maybe we can allocate it now? */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; - - pr_err("msgpool_get %p empty + alloc failed\n", pool); - return ERR_PTR(-ENOMEM); - } - - init_wait(&wait); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - schedule(); - finish_wait(&pool->wait, &wait); - } + return mempool_alloc(pool->pool, GFP_NOFS); } void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) { - spin_lock(&pool->lock); - if (pool->num < pool->min) { - /* reset msg front_len; user may have changed it */ - msg->front.iov_len = pool->front_len; - msg->hdr.front_len = cpu_to_le32(pool->front_len); + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); - kref_set(&msg->kref, 1); /* retake a single ref */ - list_add(&msg->list_head, &pool->msgs); - pool->num++; - dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - wake_up(&pool->wait); - } else { - dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - ceph_msg_kfree(msg); - } + kref_init(&msg->kref); /* retake single ref */ } diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h index bc834bfcd720..a362605f9368 100644 --- a/fs/ceph/msgpool.h +++ b/fs/ceph/msgpool.h @@ -1,6 +1,7 @@ #ifndef _FS_CEPH_MSGPOOL #define _FS_CEPH_MSGPOOL +#include <linux/mempool.h> #include "messenger.h" /* @@ -8,18 +9,15 @@ * avoid unexpected OOM conditions. */ struct ceph_msgpool { - spinlock_t lock; + const char *name; + mempool_t *pool; int front_len; /* preallocated payload size */ - struct list_head msgs; /* msgs in the pool; each has 1 ref */ - int num, min; /* cur, min # msgs in the pool */ - bool blocking; - wait_queue_head_t wait; }; extern int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int size, bool blocking); + int front_len, int size, bool blocking, + const char *name); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); -extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta); extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, int front_len); extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 8aaab414f3f8..892a0298dfdf 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -50,7 +50,6 @@ struct ceph_entity_name { #define CEPH_ENTITY_TYPE_MDS 0x02 #define CEPH_ENTITY_TYPE_OSD 0x04 #define CEPH_ENTITY_TYPE_CLIENT 0x08 -#define CEPH_ENTITY_TYPE_ADMIN 0x10 #define CEPH_ENTITY_TYPE_AUTH 0x20 #define CEPH_ENTITY_TYPE_ANY 0xFF @@ -120,7 +119,7 @@ struct ceph_msg_connect_reply { /* * message header */ -struct ceph_msg_header { +struct ceph_msg_header_old { __le64 seq; /* message seq# for this session */ __le64 tid; /* transaction id */ __le16 type; /* message type */ @@ -138,6 +137,24 @@ struct ceph_msg_header { __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_name src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 3514f71ff85f..afa7bb3895c4 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -16,7 +16,7 @@ #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 -const static struct ceph_connection_operations osd_con_ops; +static const struct ceph_connection_operations osd_con_ops; static int __kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *kickosd); @@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req = kzalloc(sizeof(*req), GFP_NOFS); } if (req == NULL) - return ERR_PTR(-ENOMEM); + return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; @@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); - if (IS_ERR(msg)) { + OSD_OPREPLY_FRONT_LEN, GFP_NOFS); + if (!msg) { ceph_osdc_put_request(req); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } req->r_reply = msg; @@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); - if (IS_ERR(msg)) { + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); + if (!msg) { ceph_osdc_put_request(req); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); @@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work) * should mark the osd as failed and we should find out about * it from an updated osd map. */ - while (!list_empty(&osdc->req_lru)) { + while (timeout && !list_empty(&osdc->req_lru)) { req = list_entry(osdc->req_lru.next, struct ceph_osd_request, r_req_lru_item); @@ -1078,6 +1078,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); + wake_up(&osdc->client->auth_wq); return; bad: @@ -1087,45 +1088,6 @@ bad: return; } - -/* - * A read request prepares specific pages that data is to be read into. - * When a message is being read off the wire, we call prepare_pages to - * find those pages. - * 0 = success, -1 failure. - */ -static int __prepare_pages(struct ceph_connection *con, - struct ceph_msg_header *hdr, - struct ceph_osd_request *req, - u64 tid, - struct ceph_msg *m) -{ - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; - int ret = -1; - int data_len = le32_to_cpu(hdr->data_len); - unsigned data_off = le16_to_cpu(hdr->data_off); - - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); - - if (!osd) - return -1; - - osdc = osd->o_osdc; - - dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m, - tid, req->r_num_pages, want); - if (unlikely(req->r_num_pages < want)) - goto out; - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - ret = 0; /* success */ -out: - BUG_ON(ret < 0 || m->nr_pages < want); - - return ret; -} - /* * Register request, send initial attempt. */ @@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->req_mempool) goto out; - err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); + err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, + "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, - OSD_OPREPLY_FRONT_LEN, 10, true); + OSD_OPREPLY_FRONT_LEN, 10, true, + "osd_op_reply"); if (err < 0) goto out_msgpool; return 0; @@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, false, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; /* it may be a short read due to an object boundary */ req->r_pages = pages; @@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, snapc, do_sync, truncate_seq, truncate_size, mtime, nofail, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; /* it may be a short write due to an object boundary */ req->r_pages = pages; @@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) } /* - * lookup and return message for incoming reply + * lookup and return message for incoming reply. set up reply message + * pages. */ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, @@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, int front = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid; - int err; tid = le64_to_cpu(hdr->tid); mutex_lock(&osdc->request_mutex); @@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply, req->r_con_filling_msg); ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); ceph_con_put(req->r_con_filling_msg); + req->r_con_filling_msg = NULL; } if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", front, (int)req->r_reply->front.iov_len); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); - if (IS_ERR(m)) + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); + if (!m) goto out; ceph_msg_put(req->r_reply); req->r_reply = m; @@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - err = __prepare_pages(con, hdr, req, tid, m); - if (err < 0) { + unsigned data_off = le16_to_cpu(hdr->data_off); + int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + + if (unlikely(req->r_num_pages < want)) { + pr_warning("tid %lld reply %d > expected %d pages\n", + tid, want, m->nr_pages); *skip = 1; ceph_msg_put(m); - m = ERR_PTR(err); + m = NULL; + goto out; } + m->pages = req->r_pages; + m->nr_pages = req->r_num_pages; } *skip = 0; req->r_con_filling_msg = ceph_con_get(con); @@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: - return ceph_msg_new(type, front, 0, 0, NULL); + return ceph_msg_new(type, front, GFP_NOFS); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: @@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } -const static struct ceph_connection_operations osd_con_ops = { +static const struct ceph_connection_operations osd_con_ops = { .get = get_osd_con, .put = put_osd_con, .dispatch = dispatch, diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c index 5f8dbf7c745a..b6859f47d364 100644 --- a/fs/ceph/pagelist.c +++ b/fs/ceph/pagelist.c @@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl) static int ceph_pagelist_addpage(struct ceph_pagelist *pl) { - struct page *page = alloc_page(GFP_NOFS); + struct page *page = __page_cache_alloc(GFP_NOFS); if (!page) return -ENOMEM; pl->room += PAGE_SIZE; diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index fd56451a871f..8fcc023056c7 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -101,8 +101,8 @@ struct ceph_pg_pool { __le64 snap_seq; /* seq for per-pool snapshot */ __le32 snap_epoch; /* epoch of last snap */ __le32 num_snaps; - __le32 num_removed_snap_intervals; - __le64 uid; + __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ + __le64 auid; /* who owns the pg */ } __attribute__ ((packed)); /* @@ -208,6 +208,7 @@ enum { /* read */ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, /* write */ CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, @@ -305,6 +306,22 @@ enum { #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ #define EBLACKLISTED ESHUTDOWN /* blacklisted */ +/* xattr comparison */ +enum { + CEPH_OSD_CMPXATTR_OP_NOP = 0, + CEPH_OSD_CMPXATTR_OP_EQ = 1, + CEPH_OSD_CMPXATTR_OP_NE = 2, + CEPH_OSD_CMPXATTR_OP_GT = 3, + CEPH_OSD_CMPXATTR_OP_GTE = 4, + CEPH_OSD_CMPXATTR_OP_LT = 5, + CEPH_OSD_CMPXATTR_OP_LTE = 6 +}; + +enum { + CEPH_OSD_CMPXATTR_MODE_STRING = 1, + CEPH_OSD_CMPXATTR_MODE_U64 = 2 +}; + /* * an individual object operation. each may be accompanied by some data * payload @@ -321,6 +338,8 @@ struct ceph_osd_op { struct { __le32 name_len; __le32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ } __attribute__ ((packed)) xattr; struct { __u8 class_len; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index d5114db70453..c0b26b6badba 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; BUG_ON(capsnap->writing); capsnap->size = inode->i_size; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 110857ba9269..7c663d9b9f81 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -8,14 +8,11 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/parser.h> -#include <linux/rwsem.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> -#include <linux/version.h> -#include <linux/vmalloc.h> #include "decode.h" #include "super.h" @@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) static int ceph_syncfs(struct super_block *sb, int wait) { dout("sync_fs %d\n", wait); - ceph_osdc_sync(&ceph_client(sb)->osdc); - ceph_mdsc_sync(&ceph_client(sb)->mdsc); + ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); + ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); dout("sync_fs %d done\n", wait); return 0; } +static int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} /** * ceph_show_options - Show mount options in /proc/mounts @@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",nocrc"); if (args->flags & CEPH_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); + + if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", args->mount_timeout); + if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); + if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", args->osd_timeout); + if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + args->osd_keepalive_timeout); + if (args->wsize) + seq_printf(m, ",wsize=%d", args->wsize); + if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", args->rsize); + if (args->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); + if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + args->caps_wanted_delay_min); + if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + args->caps_wanted_delay_max); + if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + args->cap_release_safety); + if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); + if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", args->snapdir_name); if (args->name) @@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo) inode_init_once(&ci->vfs_inode); } -static int default_congestion_kb(void) -{ - int congestion_kb; - - /* - * Copied from NFS - * - * congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. - * Limit the default to 256M - */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); - if (congestion_kb > 256*1024) - congestion_kb = 256*1024; - - return congestion_kb; -} - static int __init init_caches(void) { ceph_inode_cachep = kmem_cache_create("ceph_inode_info", @@ -308,7 +333,9 @@ enum { Opt_osd_idle_ttl, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, + Opt_cap_release_safety, Opt_readdir_max_entries, + Opt_readdir_max_bytes, Opt_congestion_kb, Opt_last_int, /* int args above */ @@ -339,7 +366,9 @@ static match_table_t arg_tokens = { {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_cap_release_safety, "cap_release_safety=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, @@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; - args->max_readdir = 1024; + args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + args->max_readdir = CEPH_MAX_READDIR_DEFAULT; + args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; args->congestion_kb = default_congestion_kb(); /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ @@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, case Opt_readdir_max_entries: args->max_readdir = intval; break; + case Opt_readdir_max_bytes: + args->max_readdir_bytes = intval; + break; case Opt_congestion_kb: args->congestion_kb = intval; break; @@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) /* * true if we have the mon map (and have thus joined the cluster) */ -static int have_mon_map(struct ceph_client *client) +static int have_mon_and_osd_map(struct ceph_client *client) { - return client->monc.monmap && client->monc.monmap->epoch; + return client->monc.monmap && client->monc.monmap->epoch && + client->osdc.osdmap && client->osdc.osdmap->epoch; } /* @@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, if (err < 0) goto out; - while (!have_mon_map(client)) { + while (!have_mon_and_osd_map(client)) { err = -EIO; if (timeout && time_after_eq(jiffies, started + timeout)) goto out; @@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, /* wait */ dout("mount waiting for mon_map\n"); err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_map(client) || (client->auth_err < 0), - timeout); + have_mon_and_osd_map(client) || (client->auth_err < 0), + timeout); if (err == -EINTR || err == -ERESTARTSYS) goto out; if (client->auth_err < 0) { @@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data) /* * construct our own bdi so we can control readahead, etc. */ +static atomic_long_t bdi_seq = ATOMIC_INIT(0); + static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) { int err; @@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) client->backing_dev_info.ra_pages = (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); + err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", + atomic_long_inc_return(&bdi_seq)); if (!err) sb->s_bdi = &client->backing_dev_info; return err; @@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type, goto out; } - if (ceph_client(sb) != client) { + if (ceph_sb_to_client(sb) != client) { ceph_destroy_client(client); - client = ceph_client(sb); + client = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", client); } else { dout("get_sb using new client %p\n", client); @@ -952,8 +989,7 @@ static int ceph_get_sb(struct file_system_type *fs_type, out_splat: ceph_mdsc_close_sessions(&client->mdsc); - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); goto out_final; out: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 13513b80d87f..3725c9ee9d08 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -52,24 +52,25 @@ struct ceph_mount_args { int sb_flags; + int flags; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; int num_mon; struct ceph_entity_addr *mon_addr; - int flags; int mount_timeout; int osd_idle_ttl; - int caps_wanted_delay_min, caps_wanted_delay_max; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int wsize; - int rsize; /* max readahead */ - int max_readdir; /* max readdir size */ - int congestion_kb; /* max readdir size */ int osd_timeout; int osd_keepalive_timeout; + int wsize; + int rsize; /* max readahead */ + int congestion_kb; /* max writeback in flight */ + int caps_wanted_delay_min, caps_wanted_delay_max; + int cap_release_safety; + int max_readdir; /* max readdir result (entires) */ + int max_readdir_bytes; /* max readdir result (bytes) */ char *snapdir_name; /* default ".snap" */ char *name; char *secret; - int cap_release_safety; }; /* @@ -80,13 +81,14 @@ struct ceph_mount_args { #define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" #define CEPH_AUTH_NAME_DEFAULT "guest" - /* * Delay telling the MDS we no longer want caps, in case we reopen * the file. Delay a minimum amount of time, even if we send a cap @@ -96,6 +98,7 @@ struct ceph_mount_args { #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ +#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) /* mount state */ enum { @@ -160,12 +163,6 @@ struct ceph_client { #endif }; -static inline struct ceph_client *ceph_client(struct super_block *sb) -{ - return sb->s_fs_info; -} - - /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); +extern void ceph_invalidate_dentry_lease(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2845422907fc..68aeebc69681 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -7,7 +7,8 @@ static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_SECURITY_PREFIX, + return !strncmp(name, "ceph.", 5) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); @@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "user.ceph.dir.files", ceph_vxattrcb_files}, - { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, + { true, "ceph.dir.entries", ceph_vxattrcb_entries}, + { true, "ceph.dir.files", ceph_vxattrcb_files}, + { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, + { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, + { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, + { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, + { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, + { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, { true, NULL, NULL } }; @@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "user.ceph.layout", ceph_vxattrcb_layout}, + { true, "ceph.layout", ceph_vxattrcb_layout}, { NULL, NULL } }; @@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci, ci->i_xattrs.names_size -= xattr->name_len; ci->i_xattrs.vals_size -= xattr->val_len; } - if (!xattr) { - pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n", - &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name, - xattr->val); - return -ENOMEM; - } ci->i_xattrs.names_size += name_len; ci->i_xattrs.vals_size += val_len; if (val) @@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ci->i_xattrs.version, ci->i_xattrs.index_version); if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && - (ci->i_xattrs.index_version > ci->i_xattrs.version)) { + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto list_xattr; } else { spin_unlock(&inode->i_lock); @@ -622,7 +617,7 @@ out: static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct inode *parent_inode = dentry->d_parent->d_inode; @@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; err = -ENOMEM; for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); + pages[i] = __page_cache_alloc(GFP_NOFS); if (!pages[i]) { nr_pages = i; goto out; @@ -779,7 +774,7 @@ out: static int ceph_send_removexattr(struct dentry *dentry, const char *name) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = &client->mdsc; struct inode *inode = dentry->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode; diff --git a/fs/coda/file.c b/fs/coda/file.c index 4c813f2cdc52..7196077b1688 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); + err = vfs_fsync(host_file, datasync); if ( !err && !datasync ) { lock_kernel(); err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 773f2ce9aa06..ca25d96d45c9 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -1,6 +1,6 @@ /* * Pioctl operations for Coda. - * Original version: (C) 1996 Peter Braam + * Original version: (C) 1996 Peter Braam * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University * * Carnegie Mellon encourages users of this code to contribute improvements @@ -23,21 +23,22 @@ #include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include <linux/smp_lock.h> + /* pioctl ops */ static int coda_ioctl_permission(struct inode *inode, int mask); -static int coda_pioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long user_data); +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data); /* exported from this file */ -const struct inode_operations coda_ioctl_inode_operations = -{ +const struct inode_operations coda_ioctl_inode_operations = { .permission = coda_ioctl_permission, .setattr = coda_setattr, }; const struct file_operations coda_ioctl_operations = { .owner = THIS_MODULE, - .ioctl = coda_pioctl, + .unlocked_ioctl = coda_pioctl, }; /* the coda pioctl inode ops */ @@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask) return (mask & MAY_EXEC) ? -EACCES : 0; } -static int coda_pioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long user_data) +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data) { struct path path; - int error; + int error; struct PioctlData data; - struct inode *target_inode = NULL; - struct coda_inode_info *cnp; + struct inode *inode = filp->f_dentry->d_inode; + struct inode *target_inode = NULL; + struct coda_inode_info *cnp; - /* get the Pioctl data arguments from user space */ - if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { - return -EINVAL; - } - - /* - * Look up the pathname. Note that the pathname is in - * user memory, and namei takes care of this - */ - if (data.follow) { - error = user_path(data.path, &path); - } else { - error = user_lpath(data.path, &path); + lock_kernel(); + + /* get the Pioctl data arguments from user space */ + if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { + error = -EINVAL; + goto out; } - - if ( error ) { - return error; - } else { + + /* + * Look up the pathname. Note that the pathname is in + * user memory, and namei takes care of this + */ + if (data.follow) + error = user_path(data.path, &path); + else + error = user_lpath(data.path, &path); + + if (error) + goto out; + else target_inode = path.dentry->d_inode; - } - + /* return if it is not a Coda inode */ - if ( target_inode->i_sb != inode->i_sb ) { + if (target_inode->i_sb != inode->i_sb) { path_put(&path); - return -EINVAL; + error = -EINVAL; + goto out; } /* now proceed to make the upcall */ - cnp = ITOC(target_inode); + cnp = ITOC(target_inode); error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); path_put(&path); - return error; -} +out: + unlock_kernel(); + return error; +} diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index be4392ca2098..66b9cf79c5ba 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait) return mask; } -static int coda_psdev_ioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long arg) +static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg) { unsigned int data; @@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = { .read = coda_psdev_read, .write = coda_psdev_write, .poll = coda_psdev_poll, - .ioctl = coda_psdev_ioctl, + .unlocked_ioctl = coda_psdev_ioctl, .open = coda_psdev_open, .release = coda_psdev_release, }; diff --git a/fs/dcache.c b/fs/dcache.c index f1358e5c3a59..d96047b4a633 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -536,7 +536,7 @@ restart: */ static void prune_dcache(int count) { - struct super_block *sb; + struct super_block *sb, *n; int w_count; int unused = dentry_stat.nr_unused; int prune_ratio; @@ -545,13 +545,14 @@ static void prune_dcache(int count) if (unused == 0 || count == 0) return; spin_lock(&dcache_lock); -restart: if (count >= unused) prune_ratio = 1; else prune_ratio = unused / count; spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_nr_dentry_unused == 0) continue; sb->s_count++; @@ -590,14 +591,10 @@ restart: } spin_lock(&sb_lock); count -= pruned; - /* - * restart only when sb is no longer on the list and - * we have more work to do. - */ - if (__put_super_and_need_restart(sb) && count > 0) { - spin_unlock(&sb_lock); - goto restart; - } + __put_super(sb); + /* more work left to do? */ + if (count <= 0) + break; } spin_unlock(&sb_lock); spin_unlock(&dcache_lock); @@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry) spin_lock(&dentry->d_lock); isdir = S_ISDIR(dentry->d_inode->i_mode); if (atomic_read(&dentry->d_count) == 1) { + dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_iput(dentry); fsnotify_nameremove(dentry, isdir); return; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0120247b41c0..8b3ffd5b5235 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; } - simple_set_mnt(mnt, s); - memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); error = mknod_ptmx(s); if (error) - goto out_dput; + goto out_undo_sget; - return 0; + simple_set_mnt(mnt, s); -out_dput: - dput(s->s_root); /* undo dget() in simple_set_mnt() */ + return 0; out_undo_sget: deactivate_locked_super(s); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 31f4b0e6d72c..83c4f600786a 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -12,7 +12,7 @@ /* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; -static void drop_pagecache_sb(struct super_block *sb) +static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; @@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb) iput(toput_inode); } -static void drop_pagecache(void) -{ - struct super_block *sb; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - drop_pagecache_sb(sb); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); -} - static void drop_slab(void) { int nr_objects; @@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, proc_dointvec_minmax(table, write, buffer, length, ppos); if (write) { if (sysctl_drop_caches & 1) - drop_pagecache(); + iterate_supers(drop_pagecache_sb, NULL); if (sysctl_drop_caches & 2) drop_slab(); } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index bfc2e0f78f00..0032a9f5a3a9 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, struct page *page_for_lower, size_t offset_in_page, size_t size); -int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, - size_t size); +int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode); int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, pgoff_t page_index, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); -struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, struct user_namespace *user_ns); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index e7440a6f5ebf..3bdddbcc785f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -276,9 +276,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - return vfs_fsync(ecryptfs_file_to_lower(file), - ecryptfs_dentry_to_lower(dentry), - datasync); + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } static int ecryptfs_fasync(int fd, struct file *file, int flag) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e2d4418affac..65dee2f336ae 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -142,19 +142,10 @@ out: static int grow_file(struct dentry *ecryptfs_dentry) { struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; - struct file fake_file; - struct ecryptfs_file_info tmp_file_info; char zero_virt[] = { 0x00 }; int rc = 0; - memset(&fake_file, 0, sizeof(fake_file)); - fake_file.f_path.dentry = ecryptfs_dentry; - memset(&tmp_file_info, 0, sizeof(tmp_file_info)); - ecryptfs_set_file_private(&fake_file, &tmp_file_info); - ecryptfs_set_file_lower( - &fake_file, - ecryptfs_inode_to_private(ecryptfs_inode)->lower_file); - rc = ecryptfs_write(&fake_file, zero_virt, 0, 1); + rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1); i_size_write(ecryptfs_inode, 0); rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= @@ -784,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, { int rc = 0; struct inode *inode = dentry->d_inode; - struct dentry *lower_dentry; - struct file fake_ecryptfs_file; struct ecryptfs_crypt_stat *crypt_stat; loff_t i_size = i_size_read(inode); loff_t lower_size_before_truncate; @@ -796,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, goto out; } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; - /* Set up a fake ecryptfs file, this is used to interface with - * the file in the underlying filesystem so that the - * truncation has an effect there as well. */ - memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file)); - fake_ecryptfs_file.f_path.dentry = dentry; - /* Released at out_free: label */ - ecryptfs_set_file_private(&fake_ecryptfs_file, - kmem_cache_alloc(ecryptfs_file_info_cache, - GFP_KERNEL)); - if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) { - rc = -ENOMEM; - goto out; - } - lower_dentry = ecryptfs_dentry_to_lower(dentry); - ecryptfs_set_file_lower( - &fake_ecryptfs_file, - ecryptfs_inode_to_private(dentry->d_inode)->lower_file); /* Switch on growing or shrinking file */ if (ia->ia_size > i_size) { char zero[] = { 0x00 }; @@ -822,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, * this triggers code that will fill in 0's throughout * the intermediate portion of the previous end of the * file and the new and of the file */ - rc = ecryptfs_write(&fake_ecryptfs_file, zero, + rc = ecryptfs_write(inode, zero, (ia->ia_size - 1), 1); } else { /* ia->ia_size < i_size_read(inode) */ /* We're chopping off all the pages down to the page @@ -835,10 +807,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = vmtruncate(inode, ia->ia_size); if (rc) - goto out_free; + goto out; lower_ia->ia_size = ia->ia_size; lower_ia->ia_valid |= ATTR_SIZE; - goto out_free; + goto out; } if (num_zeros) { char *zeros_virt; @@ -846,16 +818,16 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, zeros_virt = kzalloc(num_zeros, GFP_KERNEL); if (!zeros_virt) { rc = -ENOMEM; - goto out_free; + goto out; } - rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, + rc = ecryptfs_write(inode, zeros_virt, ia->ia_size, num_zeros); kfree(zeros_virt); if (rc) { printk(KERN_ERR "Error attempting to zero out " "the remainder of the end page on " "reducing truncate; rc = [%d]\n", rc); - goto out_free; + goto out; } } vmtruncate(inode, ia->ia_size); @@ -864,7 +836,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, printk(KERN_ERR "Problem with " "ecryptfs_write_inode_size_to_metadata; " "rc = [%d]\n", rc); - goto out_free; + goto out; } /* We are reducing the size of the ecryptfs file, and need to * know if we need to reduce the size of the lower file. */ @@ -878,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, } else lower_ia->ia_valid &= ~ATTR_SIZE; } -out_free: - if (ecryptfs_file_to_private(&fake_ecryptfs_file)) - kmem_cache_free(ecryptfs_file_info_cache, - ecryptfs_file_to_private(&fake_ecryptfs_file)); out: return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 760983d0f25e..cbd4e18adb20 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat( * * Returns zero on success; non-zero on error */ -static int ecryptfs_parse_options(struct super_block *sb, char *options) +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) { char *p; int rc = 0; @@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int fn_cipher_key_bytes; int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; + &sbi->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; int token; char *sig_src; @@ -483,68 +483,7 @@ out: } struct kmem_cache *ecryptfs_sb_info_cache; - -/** - * ecryptfs_fill_super - * @sb: The ecryptfs super block - * @raw_data: The options passed to mount - * @silent: Not used but required by function prototype - * - * Sets up what we can of the sb, rest is done in ecryptfs_read_super - * - * Returns zero on success; non-zero otherwise - */ -static int -ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) -{ - struct ecryptfs_sb_info *esi; - int rc = 0; - - /* Released in ecryptfs_put_super() */ - ecryptfs_set_superblock_private(sb, - kmem_cache_zalloc(ecryptfs_sb_info_cache, - GFP_KERNEL)); - esi = ecryptfs_superblock_to_private(sb); - if (!esi) { - ecryptfs_printk(KERN_WARNING, "Out of memory\n"); - rc = -ENOMEM; - goto out; - } - - rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); - if (rc) - goto out; - - sb->s_bdi = &esi->bdi; - sb->s_op = &ecryptfs_sops; - /* Released through deactivate_super(sb) from get_sb_nodev */ - sb->s_root = d_alloc(NULL, &(const struct qstr) { - .hash = 0,.name = "/",.len = 1}); - if (!sb->s_root) { - ecryptfs_printk(KERN_ERR, "d_alloc failed\n"); - rc = -ENOMEM; - goto out; - } - sb->s_root->d_op = &ecryptfs_dops; - sb->s_root->d_sb = sb; - sb->s_root->d_parent = sb->s_root; - /* Released in d_release when dput(sb->s_root) is called */ - /* through deactivate_super(sb) from get_sb_nodev() */ - ecryptfs_set_dentry_private(sb->s_root, - kmem_cache_zalloc(ecryptfs_dentry_info_cache, - GFP_KERNEL)); - if (!ecryptfs_dentry_to_private(sb->s_root)) { - ecryptfs_printk(KERN_ERR, - "dentry_info_cache alloc failed\n"); - rc = -ENOMEM; - goto out; - } - rc = 0; -out: - /* Should be able to rely on deactivate_super called from - * get_sb_nodev */ - return rc; -} +static struct file_system_type ecryptfs_fs_type; /** * ecryptfs_read_super @@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name) ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); goto out; } + if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { + rc = -EINVAL; + printk(KERN_ERR "Mount on filesystem of type " + "eCryptfs explicitly disallowed due to " + "known incompatibilities\n"); + goto out_free; + } ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; sb->s_blocksize = path.dentry->d_sb->s_blocksize; @@ -588,11 +534,8 @@ out: * @dev_name: The path to mount over * @raw_data: The options passed into the kernel * - * The whole ecryptfs_get_sb process is broken into 4 functions: + * The whole ecryptfs_get_sb process is broken into 3 functions: * ecryptfs_parse_options(): handle options passed to ecryptfs, if any - * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block - * with as much information as it can before needing - * the lower filesystem. * ecryptfs_read_super(): this accesses the lower filesystem and uses * ecryptfs_interpose to perform most of the linking * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) @@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { + struct super_block *s; + struct ecryptfs_sb_info *sbi; + struct ecryptfs_dentry_info *root_info; + const char *err = "Getting sb failed"; int rc; - struct super_block *sb; - rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); - if (rc < 0) { - printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); + sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); + if (!sbi) { + rc = -ENOMEM; goto out; } - sb = mnt->mnt_sb; - rc = ecryptfs_parse_options(sb, raw_data); + + rc = ecryptfs_parse_options(sbi, raw_data); if (rc) { - printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); - goto out_abort; + err = "Error parsing options"; + goto out; + } + + s = sget(fs_type, NULL, set_anon_super, NULL); + if (IS_ERR(s)) { + rc = PTR_ERR(s); + goto out; } - rc = ecryptfs_read_super(sb, dev_name); + + s->s_flags = flags; + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); if (rc) { - printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); - goto out_abort; + deactivate_locked_super(s); + goto out; } - goto out; -out_abort: - dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ - deactivate_locked_super(sb); + + ecryptfs_set_superblock_private(s, sbi); + s->s_bdi = &sbi->bdi; + + /* ->kill_sb() will take care of sbi after that point */ + sbi = NULL; + s->s_op = &ecryptfs_sops; + + rc = -ENOMEM; + s->s_root = d_alloc(NULL, &(const struct qstr) { + .hash = 0,.name = "/",.len = 1}); + if (!s->s_root) { + deactivate_locked_super(s); + goto out; + } + s->s_root->d_op = &ecryptfs_dops; + s->s_root->d_sb = s; + s->s_root->d_parent = s->s_root; + + root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + if (!root_info) { + deactivate_locked_super(s); + goto out; + } + /* ->kill_sb() will take care of root_info */ + ecryptfs_set_dentry_private(s->s_root, root_info); + s->s_flags |= MS_ACTIVE; + rc = ecryptfs_read_super(s, dev_name); + if (rc) { + deactivate_locked_super(s); + err = "Reading sb failed"; + goto out; + } + simple_set_mnt(mnt, s); + return 0; + out: + if (sbi) { + ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); + kmem_cache_free(ecryptfs_sb_info_cache, sbi); + } + printk(KERN_ERR "%s; rc = [%d]\n", err, rc); return rc; } @@ -633,11 +624,16 @@ out: * @sb: The ecryptfs super block * * Used to bring the superblock down and free the private data. - * Private data is free'd in ecryptfs_put_super() */ static void ecryptfs_kill_block_super(struct super_block *sb) { - generic_shutdown_super(sb); + struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); + kill_anon_super(sb); + if (!sb_info) + return; + ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); + bdi_destroy(&sb_info->bdi); + kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } static struct file_system_type ecryptfs_fs_type = { diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 2ee9a3a7b68c..b1d82756544b 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -44,17 +44,9 @@ * Returns locked and up-to-date page (if ok), with increased * refcnt. */ -struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) { - struct dentry *dentry; - struct inode *inode; - struct address_space *mapping; - struct page *page; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, index, (void *)file); + struct page *page = read_mapping_page(inode->i_mapping, index, NULL); if (!IS_ERR(page)) lock_page(page); return page; @@ -198,7 +190,7 @@ out: static int ecryptfs_readpage(struct file *file, struct page *page) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; int rc = 0; if (!crypt_stat @@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file, if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private( - file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(mapping->host)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { @@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file, unsigned to = from + copied; struct inode *ecryptfs_inode = mapping->host; struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc; if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 0cc4fafd6552..db184ef15d3d 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, /** * ecryptfs_write - * @ecryptfs_file: The eCryptfs file into which to write + * @ecryptfs_inode: The eCryptfs file into which to write * @data: Virtual address where data to write is located * @offset: Offset in the eCryptfs file at which to begin writing the * data from @data @@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, * * Returns zero on success; non-zero otherwise */ -int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, +int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { struct page *ecryptfs_page; struct ecryptfs_crypt_stat *crypt_stat; - struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; char *ecryptfs_page_virt; loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; @@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, if (num_bytes > total_remaining_zeros) num_bytes = total_remaining_zeros; } - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, ecryptfs_page_idx); if (IS_ERR(ecryptfs_page)) { rc = PTR_ERR(ecryptfs_page); @@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int ecryptfs_read(char *data, loff_t offset, size_t size, struct file *ecryptfs_file) { + struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; struct page *ecryptfs_page; char *ecryptfs_page_virt; - loff_t ecryptfs_file_size = - i_size_read(ecryptfs_file->f_dentry->d_inode); + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; loff_t pos; int rc = 0; @@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, if (num_bytes > total_remaining_bytes) num_bytes = total_remaining_bytes; - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, ecryptfs_page_idx); if (IS_ERR(ecryptfs_page)) { rc = PTR_ERR(ecryptfs_page); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 0c0ae491d231..0435886e4a9f 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode) } /** - * ecryptfs_put_super - * @sb: Pointer to the ecryptfs super block - * - * Final actions when unmounting a file system. - * This will handle deallocation and release of our private data. - */ -static void ecryptfs_put_super(struct super_block *sb) -{ - struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); - - lock_kernel(); - - ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); - bdi_destroy(&sb_info->bdi); - kmem_cache_free(ecryptfs_sb_info_cache, sb_info); - ecryptfs_set_superblock_private(sb, NULL); - - unlock_kernel(); -} - -/** * ecryptfs_statfs * @sb: The ecryptfs super block * @buf: The struct kstatfs to fill in with stats @@ -203,7 +182,6 @@ const struct super_operations ecryptfs_sops = { .alloc_inode = ecryptfs_alloc_inode, .destroy_inode = ecryptfs_destroy_inode, .drop_inode = generic_delete_inode, - .put_super = ecryptfs_put_super, .statfs = ecryptfs_statfs, .remount_fs = NULL, .clear_inode = ecryptfs_clear_inode, diff --git a/fs/exec.c b/fs/exec.c index e6e94c626c2c..9badbc0bfb1d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; + vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); @@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags); @@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 4cfab1cc75c0..d91e9d829bc1 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) de->inode_no = cpu_to_le64(parent->i_ino); memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); exofs_set_de_type(de, inode); - kunmap_atomic(page, KM_USER0); + kunmap_atomic(kaddr, KM_USER0); err = exofs_commit_chunk(page, 0, chunk_size); fail: page_cache_release(page); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 76d2a79ef93e..4bb6ef822e46 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping, return ret; } +static int exofs_releasepage(struct page *page, gfp_t gfp) +{ + EXOFS_DBGMSG("page 0x%lx\n", page->index); + WARN_ON(1); + return try_to_free_buffers(page); +} + +static void exofs_invalidatepage(struct page *page, unsigned long offset) +{ + EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page)); + WARN_ON(1); + + block_invalidatepage(page, offset); +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, @@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = { .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, + .releasepage = exofs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .invalidatepage = exofs_invalidatepage, + + /* Not implemented Yet */ + .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ + .direct_IO = NULL, /* TODO: Should be trivial to do */ + + /* With these NULL has special meaning or default is not exported */ + .sync_page = NULL, + .get_xip_mem = NULL, + .migratepage = NULL, + .launder_page = NULL, + .is_partially_uptodate = NULL, + .error_remove_page = NULL, }; /****************************************************************************** @@ -1123,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) sbi = sb->s_fs_info; sb->s_dirt = 1; - inode->i_uid = current->cred->fsuid; - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current->cred->fsgid; - } - inode->i_mode = mode; - + inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index a99e54318c3d..ca7e2a0ed98a 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -420,7 +420,7 @@ release_and_out: return error; } -struct xattr_handler ext2_xattr_acl_access_handler = { +const struct xattr_handler ext2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext2_xattr_list_acl_access, @@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = { .set = ext2_xattr_set_acl, }; -struct xattr_handler ext2_xattr_acl_default_handler = { +const struct xattr_handler ext2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext2_xattr_list_acl_default, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index f0c5286f9342..938dbc739d00 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -549,16 +549,12 @@ got: sb->s_dirt = 1; mark_buffer_dirty(bh2); - inode->i_uid = current_fsuid(); - if (test_opt (sb, GRPID)) + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 3b96045a00ce..7c3915780b19 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *, static struct mb_cache *ext2_xattr_cache; -static struct xattr_handler *ext2_xattr_handler_map[] = { +static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, @@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext2_xattr_handlers[] = { +const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext2_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) handler = ext2_xattr_handler_map[name_index]; @@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", /* list the attribute names */ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); entry = EXT2_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext2_xattr_handler(entry->e_name_index); if (handler) { diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index bf8175b2ced9..a1a1c2184616 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -55,11 +55,11 @@ struct ext2_xattr_entry { # ifdef CONFIG_EXT2_FS_XATTR -extern struct xattr_handler ext2_xattr_user_handler; -extern struct xattr_handler ext2_xattr_trusted_handler; -extern struct xattr_handler ext2_xattr_acl_access_handler; -extern struct xattr_handler ext2_xattr_acl_default_handler; -extern struct xattr_handler ext2_xattr_security_handler; +extern const struct xattr_handler ext2_xattr_user_handler; +extern const struct xattr_handler ext2_xattr_trusted_handler; +extern const struct xattr_handler ext2_xattr_acl_access_handler; +extern const struct xattr_handler ext2_xattr_acl_default_handler; +extern const struct xattr_handler ext2_xattr_security_handler; extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); @@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *); extern int init_ext2_xattr(void); extern void exit_ext2_xattr(void); -extern struct xattr_handler *ext2_xattr_handlers[]; +extern const struct xattr_handler *ext2_xattr_handlers[]; # else /* CONFIG_EXT2_FS_XATTR */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index b118c6383c6d..3004e15d5da5 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext2_xattr_security_handler = { +const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext2_xattr_security_list, .get = ext2_xattr_security_get, diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 2a26d71f4771..667e46a8d62d 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name, value, size, flags); } -struct xattr_handler ext2_xattr_trusted_handler = { +const struct xattr_handler ext2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext2_xattr_trusted_list, .get = ext2_xattr_trusted_get, diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 3f6caf3684b4..099d20f47163 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext2_xattr_user_handler = { +const struct xattr_handler ext2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext2_xattr_user_list, .get = ext2_xattr_user_get, diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 82ba34158661..01552abbca3c 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -456,7 +456,7 @@ release_and_out: return error; } -struct xattr_handler ext3_xattr_acl_access_handler = { +const struct xattr_handler ext3_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext3_xattr_list_acl_access, @@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = { .set = ext3_xattr_set_acl, }; -struct xattr_handler ext3_xattr_acl_default_handler = { +const struct xattr_handler ext3_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext3_xattr_list_acl_default, diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 26289e8f4163..fcf7487734b6 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -90,6 +90,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) * storage */ if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 0d0e97ed3ff6..498021eb88fb 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -538,16 +538,13 @@ got: if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - inode->i_uid = current_fsuid(); - if (test_opt (sb, GRPID)) - inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; /* This is the optimal IO size (for stat), not the fs block size */ diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 534a94c3a933..71fb8d65e54c 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer, static struct mb_cache *ext3_xattr_cache; -static struct xattr_handler *ext3_xattr_handler_map[] = { +static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, @@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext3_xattr_handlers[] = { +const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL @@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext3_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) handler = ext3_xattr_handler_map[name_index]; @@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext3_xattr_handler(entry->e_name_index); if (handler) { diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 148a4dfc82ab..377fe7201169 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -58,11 +58,11 @@ struct ext3_xattr_entry { # ifdef CONFIG_EXT3_FS_XATTR -extern struct xattr_handler ext3_xattr_user_handler; -extern struct xattr_handler ext3_xattr_trusted_handler; -extern struct xattr_handler ext3_xattr_acl_access_handler; -extern struct xattr_handler ext3_xattr_acl_default_handler; -extern struct xattr_handler ext3_xattr_security_handler; +extern const struct xattr_handler ext3_xattr_user_handler; +extern const struct xattr_handler ext3_xattr_trusted_handler; +extern const struct xattr_handler ext3_xattr_acl_access_handler; +extern const struct xattr_handler ext3_xattr_acl_default_handler; +extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); @@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *); extern int init_ext3_xattr(void); extern void exit_ext3_xattr(void); -extern struct xattr_handler *ext3_xattr_handlers[]; +extern const struct xattr_handler *ext3_xattr_handlers[]; # else /* CONFIG_EXT3_FS_XATTR */ diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 3af91f476dff..03a99bfc59f9 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext3_xattr_security_handler = { +const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, .get = ext3_xattr_security_get, diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index e5562845ed96..dc8edda9ffe0 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name, value, size, flags); } -struct xattr_handler ext3_xattr_trusted_handler = { +const struct xattr_handler ext3_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext3_xattr_trusted_list, .get = ext3_xattr_trusted_get, diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 3bcfe9ee0a68..7a321974d584 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext3_xattr_user_handler = { +const struct xattr_handler ext3_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext3_xattr_user_list, .get = ext3_xattr_user_get, diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 8a2a29d35a6f..feaf498feaa6 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -454,7 +454,7 @@ release_and_out: return error; } -struct xattr_handler ext4_xattr_acl_access_handler = { +const struct xattr_handler ext4_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext4_xattr_list_acl_access, @@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = { .set = ext4_xattr_set_acl, }; -struct xattr_handler ext4_xattr_acl_default_handler = { +const struct xattr_handler ext4_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext4_xattr_list_acl_default, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0d0c3239c1cd..ef3d980e67cb 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ext4_should_writeback_data(inode) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 57f6eef6ccd6..1a0e183a2f04 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -979,16 +979,12 @@ got: atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - inode->i_uid = current_fsuid(); - if (test_opt(sb, GRPID)) + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b4c5aa8489d8..2de0e9515089 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer, static struct mb_cache *ext4_xattr_cache; -static struct xattr_handler *ext4_xattr_handler_map[] = { +static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, @@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext4_xattr_handlers[] = { +const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; @@ -332,7 +332,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); if (handler) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 8ede88b18c29..518e96e43905 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -65,11 +65,11 @@ struct ext4_xattr_entry { # ifdef CONFIG_EXT4_FS_XATTR -extern struct xattr_handler ext4_xattr_user_handler; -extern struct xattr_handler ext4_xattr_trusted_handler; -extern struct xattr_handler ext4_xattr_acl_access_handler; -extern struct xattr_handler ext4_xattr_acl_default_handler; -extern struct xattr_handler ext4_xattr_security_handler; +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_acl_access_handler; +extern const struct xattr_handler ext4_xattr_acl_default_handler; +extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); @@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, extern int init_ext4_xattr(void); extern void exit_ext4_xattr(void); -extern struct xattr_handler *ext4_xattr_handlers[]; +extern const struct xattr_handler *ext4_xattr_handlers[]; # else /* CONFIG_EXT4_FS_XATTR */ diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 8b145e98df07..9b21268e121c 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext4_xattr_security_handler = { +const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext4_xattr_security_list, .get = ext4_xattr_security_get, diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 15b50edc6587..37e6ebca2cc3 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext4_xattr_trusted_handler = { +const struct xattr_handler ext4_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext4_xattr_trusted_list, .get = ext4_xattr_trusted_get, diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index c4ce05746ce1..98c375352d0e 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext4_xattr_user_handler = { +const struct xattr_handler ext4_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext4_xattr_user_list, .get = ext4_xattr_user_get, diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 113f0a1e565d..ae8200f84e39 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) while (*fclus < cluster) { /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { - fat_fs_error(sb, "%s: detected the cluster chain loop" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + fat_fs_error_ratelimit(sb, + "%s: detected the cluster chain loop" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } @@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) if (nr < 0) goto out; else if (nr == FAT_ENT_FREE) { - fat_fs_error(sb, "%s: invalid cluster chain" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + fat_fs_error_ratelimit(sb, "%s: invalid cluster chain" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } else if (nr == FAT_ENT_EOF) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 530b4ca01510..ee42b9e0b16a 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -19,6 +19,7 @@ #include <linux/buffer_head.h> #include <linux/compat.h> #include <asm/uaccess.h> +#include <linux/kernel.h> #include "fat.h" /* @@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, { const wchar_t *ip; wchar_t ec; - unsigned char *op, nc; + unsigned char *op; int charlen; - int k; ip = uni; op = ascii; while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { ec = *ip++; - if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { + if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { op += charlen; len -= charlen; } else { if (uni_xlate == 1) { - *op = ':'; - for (k = 4; k > 0; k--) { - nc = ec & 0xF; - op[k] = nc > 9 ? nc + ('a' - 10) - : nc + '0'; - ec >>= 4; - } - op += 5; + *op++ = ':'; + op = pack_hex_byte(op, ec >> 8); + op = pack_hex_byte(op, ec); len -= 5; } else { *op++ = '?'; @@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp, return ret; } -static int fat_dir_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long fat_dir_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; int short_only, both; @@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp, both = 1; break; default: - return fat_generic_ioctl(inode, filp, cmd, arg); + return fat_generic_ioctl(filp, cmd, arg); } if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) @@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, both = 1; break; default: - return -ENOIOCTLCMD; + return fat_generic_ioctl(filp, cmd, (unsigned long)arg); } if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) @@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = fat_readdir, - .ioctl = fat_dir_ioctl, + .unlocked_ioctl = fat_dir_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, #endif diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e6efdfa0f6db..53dba57b49a1 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -6,6 +6,7 @@ #include <linux/nls.h> #include <linux/fs.h> #include <linux/mutex.h> +#include <linux/ratelimit.h> #include <linux/msdos_fs.h> /* @@ -82,6 +83,8 @@ struct msdos_sb_info { struct fatent_operations *fatent_ops; struct inode *fat_inode; + struct ratelimit_state ratelimit; + spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; }; @@ -298,8 +301,8 @@ extern int fat_free_clusters(struct inode *inode, int cluster); extern int fat_count_free_clusters(struct super_block *sb); /* fat/file.c */ -extern int fat_generic_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); +extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; extern int fat_setattr(struct dentry * dentry, struct iattr * attr); @@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ -extern void fat_fs_error(struct super_block *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))) __cold; +extern void +__fat_fs_error(struct super_block *s, int report, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +#define fat_fs_error(s, fmt, args...) \ + __fat_fs_error(s, 1, fmt , ## args) +#define fat_fs_error_ratelimit(s, fmt, args...) \ + __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/fat/file.c b/fs/fat/file.c index e8c159de236b..a14c2f6a489e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -8,6 +8,7 @@ #include <linux/capability.h> #include <linux/module.h> +#include <linux/compat.h> #include <linux/mount.h> #include <linux/time.h> #include <linux/buffer_head.h> @@ -114,9 +115,9 @@ out: return err; } -int fat_generic_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; u32 __user *user_attr = (u32 __user *)arg; switch (cmd) { @@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, } } +#ifdef CONFIG_COMPAT +static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) + +{ + return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && @@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .release = fat_file_release, - .ioctl = fat_generic_ioctl, + .unlocked_ioctl = fat_generic_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_generic_compat_ioctl, +#endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 0ce143bd7d56..ed33904926ee 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1250,6 +1250,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; sbi->dir_ops = fs_dir_inode_ops; + ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); error = parse_options(data, isvfat, silent, &debug, &sbi->options); if (error) @@ -1497,10 +1499,8 @@ out_fail: iput(fat_inode); if (root_inode) iput(root_inode); - if (sbi->nls_io) - unload_nls(sbi->nls_io); - if (sbi->nls_disk) - unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + unload_nls(sbi->nls_disk); if (sbi->options.iocharset != fat_default_iocharset) kfree(sbi->options.iocharset); sb->s_fs_info = NULL; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index d3da05f26465..1fa23f6ffba5 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -20,27 +20,29 @@ * In case the file system is remounted read-only, it can be made writable * again by remounting it. */ -void fat_fs_error(struct super_block *s, const char *fmt, ...) +void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...) { struct fat_mount_options *opts = &MSDOS_SB(s)->options; va_list args; - printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); + if (report) { + printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); - printk(KERN_ERR " "); - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - printk("\n"); + printk(KERN_ERR " "); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + printk("\n"); + } if (opts->errors == FAT_ERRORS_PANIC) - panic(" FAT fs panic from previous error\n"); + panic("FAT: fs panic from previous error\n"); else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { s->s_flags |= MS_RDONLY; - printk(KERN_ERR " File system has been set read-only\n"); + printk(KERN_ERR "FAT: Filesystem has been set read-only\n"); } } -EXPORT_SYMBOL_GPL(fat_fs_error); +EXPORT_SYMBOL_GPL(__fat_fs_error); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ diff --git a/fs/fcntl.c b/fs/fcntl.c index 0a140741b39e..f74d270ba155 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -14,6 +14,7 @@ #include <linux/dnotify.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/pipe_fs_i.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/signal.h> @@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_NOTIFY: err = fcntl_dirnotify(fd, filp, arg); break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b37f7cea4dd..ea8592b90696 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -42,9 +42,10 @@ struct wb_writeback_args { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; - int for_kupdate:1; - int range_cyclic:1; - int for_background:1; + unsigned int for_kupdate:1; + unsigned int range_cyclic:1; + unsigned int for_background:1; + unsigned int sb_pinned:1; }; /* @@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) } static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args) + struct wb_writeback_args *args, + int wait) { struct bdi_work *work; @@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, if (work) { bdi_work_init(work, args); bdi_queue_work(bdi, work); + if (wait) + bdi_wait_on_work_clear(work); } else { struct bdi_writeback *wb = &bdi->wb; @@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, + /* + * Setting sb_pinned is not necessary for WB_SYNC_ALL, but + * lets make it explicitly clear. + */ + .sb_pinned = 1, }; struct bdi_work work; @@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, * @bdi: the backing device to write from * @sb: write inodes from this super_block * @nr_pages: the number of pages to write + * @sb_locked: caller already holds sb umount sem. * * Description: * This does WB_SYNC_NONE opportunistic writeback. The IO is only * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * completion. Caller specifies whether sb umount sem is held already or not. * */ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages) + long nr_pages, int sb_locked) { struct wb_writeback_args args = { .sb = sb, .sync_mode = WB_SYNC_NONE, .nr_pages = nr_pages, .range_cyclic = 1, + .sb_pinned = sb_locked, }; /* @@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, args.for_background = 1; } - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, sb_locked); } /* @@ -398,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode) wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - do { + while (inode->i_state & I_SYNC) { spin_unlock(&inode_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode_lock); - } while (inode->i_state & I_SYNC); + } } /* @@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) BUG_ON(inode->i_state & I_SYNC); - /* Set I_SYNC, reset I_DIRTY */ - dirty = inode->i_state & I_DIRTY; + /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; - inode->i_state &= ~I_DIRTY; - + inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode_lock); + dirty = inode->i_state & I_DIRTY; + inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, /* * Caller must already hold the ref for this */ - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { WARN_ON(!rwsem_is_locked(&sb->s_umount)); return SB_NOT_PINNED; } @@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb, .for_kupdate = args->for_kupdate, .for_background = args->for_background, .range_cyclic = args->range_cyclic, + .sb_pinned = args->sb_pinned, }; unsigned long oldest_jif; long wrote = 0; @@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) unsigned long expired; long nr_pages; + /* + * When set to zero, disable periodic writeback + */ + if (!dirty_writeback_interval) + return 0; + expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) @@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) while ((work = get_next_work_item(bdi, wb)) != NULL) { struct wb_writeback_args args = work->args; + int post_clear; /* * Override sync mode, in case we must wait for completion @@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; + post_clear = WB_SYNC_ALL || args.sb_pinned; + /* * If this isn't a data integrity operation, just notify * that we have seen this work and we are now starting it. */ - if (args.sync_mode == WB_SYNC_NONE) + if (!post_clear) wb_clear_pending(wb, work); wrote += wb_writeback(wb, &args); @@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * This is a data integrity writeback, so only do the * notification when we have completed the work. */ - if (args.sync_mode == WB_SYNC_ALL) + if (post_clear) wb_clear_pending(wb, work); } @@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb) break; } - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); + if (dirty_writeback_interval) { + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout_interruptible(wait_jiffies); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty_careful(&wb->bdi->work_list) && + !kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + try_to_freeze(); } @@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages) if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, 0); } rcu_read_unlock(); @@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) +{ + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; + + nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); +} + /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock @@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb) */ void writeback_inodes_sb(struct super_block *sb) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; - - nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - - bdi_start_writeback(sb->s_bdi, sb, nr_to_write); + __writeback_inodes_sb(sb, 0); } EXPORT_SYMBOL(writeback_inodes_sb); /** + * writeback_inodes_sb_locked - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Like writeback_inodes_sb(), except the caller already holds the + * sb umount sem. + */ +void writeback_inodes_sb_locked(struct super_block *sb) +{ + __writeback_inodes_sb(sb, 1); +} + +/** * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock * diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 1e1f286dd70e..4a8eb31c5338 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) /* banners (can't represent line 0 by pos 0 as that would involve * returning a NULL pointer) */ if (pos == 0) - return (struct fscache_object *) ++(*_pos); + return (struct fscache_object *)(long)++(*_pos); if (pos < 3) return (struct fscache_object *)pos; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eb7e9423691f..e53df5ebb2b8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -18,6 +18,7 @@ #include <linux/slab.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; diff --git a/fs/generic_acl.c b/fs/generic_acl.c index fe5df5457656..99800e564157 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -201,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask) return -EAGAIN; } -struct xattr_handler generic_acl_access_handler = { +const struct xattr_handler generic_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = generic_acl_list, @@ -209,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = { .set = generic_acl_set, }; -struct xattr_handler generic_acl_default_handler = { +const struct xattr_handler generic_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = generic_acl_list, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 87ee309d4c24..48171f4c943d 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int xtype) { struct inode *inode = dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct posix_acl *acl; int type; int error; + if (!sdp->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + type = gfs2_acl_type(name); if (type < 0) return type; @@ -335,7 +339,7 @@ out: return error; } -struct xattr_handler gfs2_xattr_system_handler = { +const struct xattr_handler gfs2_xattr_system_handler = { .prefix = XATTR_SYSTEM_PREFIX, .flags = GFS2_EATYPE_SYS, .get = gfs2_xattr_system_get, diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 9306a2e6620c..b522b0cb39ea 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -19,6 +19,6 @@ extern int gfs2_check_acl(struct inode *inode, int mask); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); -extern struct xattr_handler gfs2_xattr_system_handler; +extern const struct xattr_handler gfs2_xattr_system_handler; #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e6dd2aec6f82..b20bfcc9fa2d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out_drop_write; + error = -EACCES; + if (!is_owner_or_cap(inode)) + goto out; + + error = 0; flags = ip->i_diskflags; new_flags = (flags & ~mask) | (reqflags & mask); if ((new_flags ^ flags) == 0) @@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) { struct inode *inode = filp->f_path.dentry->d_inode; u32 fsflags, gfsflags; + if (get_user(fsflags, ptr)) return -EFAULT; + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); if (!S_ISDIR(inode->i_mode)) { if (gfsflags & GFS2_DIF_INHERIT_JDATA) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 51d8061fa07a..b5612cbb62a5 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -242,34 +242,38 @@ fail: } /** - * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation + * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation + * and try to reclaim it by doing iput. + * + * This function assumes no rgrp locks are currently held. + * * @sb: The super block * no_addr: The inode number - * @@inode: A pointer to the inode found, if any * - * Returns: 0 and *inode if no errors occurred. If an error occurs, - * the resulting *inode may or may not be NULL. */ -int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, - struct inode **inode) +void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) { struct gfs2_sbd *sdp; struct gfs2_inode *ip; struct gfs2_glock *io_gl; int error; struct gfs2_holder gh; + struct inode *inode; - *inode = gfs2_iget_skip(sb, no_addr); + inode = gfs2_iget_skip(sb, no_addr); - if (!(*inode)) - return -ENOBUFS; + if (!inode) + return; - if (!((*inode)->i_state & I_NEW)) - return -ENOBUFS; + /* If it's not a new inode, someone's using it, so leave it alone. */ + if (!(inode->i_state & I_NEW)) { + iput(inode); + return; + } - ip = GFS2_I(*inode); - sdp = GFS2_SB(*inode); + ip = GFS2_I(inode); + sdp = GFS2_SB(inode); ip->i_no_formal_ino = -1; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); @@ -284,15 +288,13 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, &ip->i_iopen_gh); - if (unlikely(error)) { - if (error == GLR_TRYFAILED) - error = 0; + if (unlikely(error)) goto fail_iopen; - } + ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); - (*inode)->i_mode = DT2IF(DT_UNKNOWN); + inode->i_mode = DT2IF(DT_UNKNOWN); /* * We must read the inode in order to work out its type in @@ -303,16 +305,17 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, &gh); - if (unlikely(error)) { - if (error == GLR_TRYFAILED) - error = 0; + if (unlikely(error)) goto fail_glock; - } + /* Inode is now uptodate */ gfs2_glock_dq_uninit(&gh); - gfs2_set_iop(*inode); + gfs2_set_iop(inode); + + /* The iput will cause it to be deleted. */ + iput(inode); + return; - return 0; fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: @@ -321,7 +324,8 @@ fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); fail: - return error; + iget_failed(inode); + return; } static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index e161461d4c57..300ada3f21de 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -84,8 +84,7 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); -extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, - struct inode **inode); +extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index b593f0e28f25..6a857e24f947 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -696,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) * */ -void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) { struct gfs2_ail *ai; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index eb570b4ad443..0d007f920234 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -47,28 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, sdp->sd_log_head = sdp->sd_log_tail = value; } -unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); -int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -void gfs2_log_incr_head(struct gfs2_sbd *sdp); +extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); -struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); -struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, +extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); +extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, struct buffer_head *real); -void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); -static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) -{ - if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) - __gfs2_log_flush(sbd, gl); -} - -void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); -void gfs2_remove_from_ail(struct gfs2_bufdata *bd); - -void gfs2_log_shutdown(struct gfs2_sbd *sdp); -void gfs2_meta_syncfs(struct gfs2_sbd *sdp); -int gfs2_logd(void *data); +extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); +extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); +extern int gfs2_logd(void *data); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8bce73ed4d8e..171a744f8e45 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | + BLKDEV_IFL_BARRIER); if (rv) goto fail; nr_sects = 0; @@ -869,7 +870,7 @@ start_new_extent: } if (nr_sects) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); if (rv) goto fail; } @@ -1191,7 +1192,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; - struct inode *inode; int error = 0; u64 last_unlinked = NO_BLOCK, unlinked; @@ -1209,22 +1209,27 @@ try_again: if (error) return error; + /* Find an rgrp suitable for allocation. If it encounters any unlinked + dinodes along the way, error will equal -EAGAIN and unlinked will + contains it block address. We then need to look up that inode and + try to free it, and try the allocation again. */ error = get_local_rgrp(ip, &unlinked, &last_unlinked); if (error) { if (ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); if (error != -EAGAIN) return error; - error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb, - unlinked, &inode); - if (inode) - iput(inode); + + gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked); + /* regardless of whether or not gfs2_process_unlinked_inode + was successful, we don't want to repeat it again. */ + last_unlinked = unlinked; gfs2_log_flush(sdp, NULL); - if (error == GLR_TRYFAILED) - error = 0; + error = 0; + goto try_again; } - + /* no error, so we have the rgrp set in the inode's allocation. */ al->al_file = file; al->al_line = line; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 3df60f2d84e3..a0464680af0b 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; -extern struct xattr_handler *gfs2_xattr_handlers[]; +extern const struct xattr_handler *gfs2_xattr_handlers[]; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index c2ebdf2c01d4..82f93da00d1b 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1535,21 +1535,21 @@ out_alloc: return error; } -static struct xattr_handler gfs2_xattr_user_handler = { +static const struct xattr_handler gfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = GFS2_EATYPE_USR, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; -static struct xattr_handler gfs2_xattr_security_handler = { +static const struct xattr_handler gfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = GFS2_EATYPE_SECURITY, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; -struct xattr_handler *gfs2_xattr_handlers[] = { +const struct xattr_handler *gfs2_xattr_handlers[] = { &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, &gfs2_xattr_system_handler, diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 5f4023678251..764fd1bdca88 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { const struct file_operations hfsplus_dir_operations = { .read = generic_read_dir, .readdir = hfsplus_readdir, - .ioctl = hfsplus_ioctl, + .unlocked_ioctl = hfsplus_ioctl, .llseek = generic_file_llseek, .release = hfsplus_dir_release, }; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 5c10d803d9df..6505c30ad965 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int); void hfsplus_delete_inode(struct inode *); /* ioctl.c */ -int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg); +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); int hfsplus_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 1bcf597c0562..9bbb82924a22 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = { .fsync = file_fsync, .open = hfsplus_file_open, .release = hfsplus_file_release, - .ioctl = hfsplus_ioctl, + .unlocked_ioctl = hfsplus_ioctl, }; struct inode *hfsplus_new_inode(struct super_block *sb, int mode) diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index f457d2ca51ab..ac405f099026 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -17,14 +17,16 @@ #include <linux/mount.h> #include <linux/sched.h> #include <linux/xattr.h> +#include <linux/smp_lock.h> #include <asm/uaccess.h> #include "hfsplus_fs.h" -int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg) +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; unsigned int flags; + lock_kernel(); switch (cmd) { case HFSPLUS_IOC_EXT2_GETFLAGS: flags = 0; @@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, case HFSPLUS_IOC_EXT2_SETFLAGS: { int err = 0; err = mnt_want_write(filp->f_path.mnt); - if (err) + if (err) { + unlock_kernel(); return err; + } if (!is_owner_or_cap(inode)) { err = -EACCES; @@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, mark_inode_dirty(inode); setflags_out: mnt_drop_write(filp->f_path.mnt); + unlock_kernel(); return err; } default: + unlock_kernel(); return -ENOTTY; } } diff --git a/fs/inode.c b/fs/inode.c index 258ec22bb298..2bee20ae3d65 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -286,11 +286,9 @@ static void init_once(void *foo) */ void __iget(struct inode *inode) { - if (atomic_read(&inode->i_count)) { - atomic_inc(&inode->i_count); + if (atomic_inc_return(&inode->i_count) != 1) return; - } - atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_SYNC))) list_move(&inode->i_list, &inode_in_use); inodes_stat.nr_unused--; @@ -1608,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_ino); } EXPORT_SYMBOL(init_special_inode); + +/** + * Init uid,gid,mode for new inode according to posix standards + * @inode: New inode + * @dir: Directory inode + * @mode: mode of the new inode + */ +void inode_init_owner(struct inode *inode, const struct inode *dir, + mode_t mode) +{ + inode->i_uid = current_fsuid(); + if (dir && dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +} +EXPORT_SYMBOL(inode_init_owner); diff --git a/fs/internal.h b/fs/internal.h index 8a03a5447bdf..6b706bc60a66 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); +extern void __put_super(struct super_block *sb); +extern void put_super(struct super_block *sb); /* * open.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 7faefb4da939..2d140a713861 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -525,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp) if (sb->s_op->freeze_fs == NULL) return -EOPNOTSUPP; - /* If a blockdevice-backed filesystem isn't specified, return. */ - if (sb->s_bdev == NULL) - return -EINVAL; - /* Freeze */ - sb = freeze_bdev(sb->s_bdev); - if (IS_ERR(sb)) - return PTR_ERR(sb); - return 0; + return freeze_super(sb); } static int ioctl_fsthaw(struct file *filp) @@ -543,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ - if (sb->s_bdev == NULL) - return -EINVAL; - /* Thaw */ - return thaw_bdev(sb->s_bdev, sb); + return thaw_super(sb); } /* diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 30beb11ef928..076d1cc44f95 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) */ if ((journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 671da7fb7ffd..75716d3d2be0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -717,7 +717,8 @@ start_journal_io: if (commit_transaction->t_flushed_data_blocks && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, @@ -727,7 +728,8 @@ start_journal_io: if (err) __jbd2_journal_abort_hard(journal); if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_dev, NULL); + blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } err = journal_finish_inode_data_buffers(journal, commit_transaction); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 7cdc3196476a..a33aab6b5e68 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, return rc; } -struct xattr_handler jffs2_acl_access_xattr_handler = { +const struct xattr_handler jffs2_acl_access_xattr_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_access_listxattr, @@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = { .set = jffs2_acl_setxattr, }; -struct xattr_handler jffs2_acl_default_xattr_handler = { +const struct xattr_handler jffs2_acl_default_xattr_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_default_listxattr, diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index f0ba63e3c36b..5e42de8d9541 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); -extern struct xattr_handler jffs2_acl_access_xattr_handler; -extern struct xattr_handler jffs2_acl_default_xattr_handler; +extern const struct xattr_handler jffs2_acl_access_xattr_handler; +extern const struct xattr_handler jffs2_acl_default_xattr_handler; #else diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index eaccee058583..239f51216a68 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_security_xattr_handler = { +const struct xattr_handler jffs2_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = jffs2_security_listxattr, .set = jffs2_security_setxattr, diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 9e75c62c85d6..a2d58c96f1b4 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) * is an implementation of setxattr handler on jffs2. * -------------------------------------------------- */ -struct xattr_handler *jffs2_xattr_handlers[] = { +const struct xattr_handler *jffs2_xattr_handlers[] = { &jffs2_user_xattr_handler, #ifdef CONFIG_JFFS2_FS_SECURITY &jffs2_security_xattr_handler, @@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = { NULL }; -static struct xattr_handler *xprefix_to_handler(int xprefix) { - struct xattr_handler *ret; +static const struct xattr_handler *xprefix_to_handler(int xprefix) { + const struct xattr_handler *ret; switch (xprefix) { case JFFS2_XPREFIX_USER: @@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) struct jffs2_inode_cache *ic = f->inocache; struct jffs2_xattr_ref *ref, **pref; struct jffs2_xattr_datum *xd; - struct xattr_handler *xhandle; + const struct xattr_handler *xhandle; ssize_t len, rc; int retry = 0; diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 6e3b5ddfb7ab..cf4f5759b42b 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, const char *buffer, size_t size, int flags); -extern struct xattr_handler *jffs2_xattr_handlers[]; -extern struct xattr_handler jffs2_user_xattr_handler; -extern struct xattr_handler jffs2_trusted_xattr_handler; +extern const struct xattr_handler *jffs2_xattr_handlers[]; +extern const struct xattr_handler jffs2_user_xattr_handler; +extern const struct xattr_handler jffs2_trusted_xattr_handler; extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #define jffs2_getxattr generic_getxattr @@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #ifdef CONFIG_JFFS2_FS_SECURITY extern int jffs2_init_security(struct inode *inode, struct inode *dir); -extern struct xattr_handler jffs2_security_xattr_handler; +extern const struct xattr_handler jffs2_security_xattr_handler; #else #define jffs2_init_security(inode,dir) (0) #endif /* CONFIG_JFFS2_FS_SECURITY */ diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 3e5a5e356e05..1c868194c504 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_trusted_xattr_handler = { +const struct xattr_handler jffs2_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = jffs2_trusted_listxattr, .set = jffs2_trusted_setxattr, diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 8544af67dffe..916b5c966039 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_user_xattr_handler = { +const struct xattr_handler jffs2_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .list = jffs2_user_listxattr, .set = jffs2_user_setxattr, diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 829921b67765..2686531e235a 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) goto fail_unlock; } - inode->i_uid = current_fsuid(); - if (parent->i_mode & S_ISGID) { - inode->i_gid = parent->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - + inode_init_owner(inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used @@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (rc) goto fail_drop; - inode->i_mode = mode; /* inherit flags from parent */ jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; @@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (S_ISLNK(mode)) jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); } - jfs_inode->mode2 |= mode; + jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 755a92e8daa7..f602e230e162 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -358,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode) inode->i_mode = mode; logfs_set_ino_generation(sb, inode); - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } - + inode_init_owner(inode, dir, mode); logfs_inode_setops(inode); insert_inode_hash(inode); diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 6ac693faae49..482779fe4e7c 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode) clear_inode(inode); /* clear in-memory copy */ } -struct inode * minix_new_inode(const struct inode * dir, int * error) +struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) { struct super_block *sb = dir->i_sb; struct minix_sb_info *sbi = minix_sb(sb); @@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) iput(inode); return NULL; } - inode->i_uid = current_fsuid(); - inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 9dcf95b42116..111f34ee9e3b 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -46,7 +46,7 @@ struct minix_sb_info { extern struct inode *minix_iget(struct super_block *, unsigned long); extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct inode * minix_new_inode(const struct inode * dir, int * error); +extern struct inode * minix_new_inode(const struct inode *, int, int *); extern void minix_free_inode(struct inode * inode); extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); extern int minix_new_block(struct inode * inode); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 32b131cd6121..e20ee85955d1 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_ if (!old_valid_dev(rdev)) return -EINVAL; - inode = minix_new_inode(dir, &error); + inode = minix_new_inode(dir, mode, &error); if (inode) { - inode->i_mode = mode; minix_set_inode(inode, rdev); mark_inode_dirty(inode); error = add_nondir(dentry, inode); @@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry, if (i > dir->i_sb->s_blocksize) goto out; - inode = minix_new_inode(dir, &err); + inode = minix_new_inode(dir, S_IFLNK | 0777, &err); if (!inode) goto out; - inode->i_mode = S_IFLNK | 0777; minix_set_inode(inode, 0); err = page_symlink(inode, symname, i); if (err) @@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) inode_inc_link_count(dir); - inode = minix_new_inode(dir, &err); + inode = minix_new_inode(dir, mode, &err); if (!inode) goto out_dir; - inode->i_mode = S_IFDIR | mode; - if (dir->i_mode & S_ISGID) - inode->i_mode |= S_ISGID; minix_set_inode(inode, 0); inode_inc_link_count(inode); diff --git a/fs/namei.c b/fs/namei.c index b86b96fe1dc3..48e1f60520ea 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) static inline void path_to_nameidata(struct path *path, struct nameidata *nd) { dput(nd->path.dentry); - if (nd->path.mnt != path->mnt) + if (nd->path.mnt != path->mnt) { mntput(nd->path.mnt); - nd->path.mnt = path->mnt; + nd->path.mnt = path->mnt; + } nd->path.dentry = path->dentry; } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 7edfcd4d5e52..92dde6f8d893 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -51,7 +51,7 @@ const struct file_operations ncp_dir_operations = { .read = generic_read_dir, .readdir = ncp_readdir, - .ioctl = ncp_ioctl, + .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, #endif diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 1daabb90e0a5..b93870892892 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations = .llseek = ncp_remote_llseek, .read = ncp_file_read, .write = ncp_file_write, - .ioctl = ncp_ioctl, + .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, #endif diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 60a5e2864ea8..023c03d02070 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,6 +20,7 @@ #include <linux/smp_lock.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/ncp_fs.h> @@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) } #endif /* CONFIG_NCPFS_NLS */ -static int __ncp_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ncp_server *server = NCP_SERVER(inode); int result; struct ncp_ioctl_request request; @@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd) } } -int ncp_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - int ret; + long ret; + lock_kernel(); if (ncp_ioctl_need_write(cmd)) { /* * inside the ioctl(), any failures which @@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp, * -EACCESS, so it seems consistent to keep * that here. */ - if (mnt_want_write(filp->f_path.mnt)) - return -EACCES; + if (mnt_want_write(filp->f_path.mnt)) { + ret = -EACCES; + goto out; + } } - ret = __ncp_ioctl(inode, filp, cmd, arg); + ret = __ncp_ioctl(filp, cmd, arg); if (ncp_ioctl_need_write(cmd)) mnt_drop_write(filp->f_path.mnt); + +out: + unlock_kernel(); return ret; } #ifdef CONFIG_COMPAT long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; + long ret; lock_kernel(); arg = (unsigned long) compat_ptr(arg); - ret = ncp_ioctl(inode, file, cmd, arg); + ret = ncp_ioctl(file, cmd, arg); unlock_kernel(); return ret; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f8b1157daa2..04214fc5c304 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; rc = strict_strtoul(string, 10, &option); kfree(string); - if (rc != 0 || option > USHORT_MAX) + if (rc != 0 || option > USHRT_MAX) goto out_invalid_value; mnt->nfs_server.port = option; break; @@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; rc = strict_strtoul(string, 10, &option); kfree(string); - if (rc != 0 || option > USHORT_MAX) + if (rc != 0 || option > USHRT_MAX) goto out_invalid_value; mnt->mount_server.port = option; break; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 7a9ae3254a4b..7e26caab2a26 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -44,8 +44,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ -static struct path rec_dir; -static int rec_dir_init = 0; +static struct file *rec_file; static int nfs4_save_creds(const struct cred **original_creds) @@ -117,33 +116,28 @@ out_no_tfm: return status; } -static void -nfsd4_sync_rec_dir(void) -{ - vfs_fsync(NULL, rec_dir.dentry, 0); -} - int nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char *dname = clp->cl_recdir; - struct dentry *dentry; + struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); - if (!rec_dir_init || clp->cl_firststate) + if (!rec_file || clp->cl_firststate) return 0; status = nfs4_save_creds(&original_cred); if (status < 0) return status; + dir = rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&rec_dir.dentry->d_inode->i_mutex); + mutex_lock(&dir->d_inode->i_mutex); - dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); + dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); goto out_put; } - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out_put; - status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU); - mnt_drop_write(rec_dir.mnt); + status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); + mnt_drop_write(rec_file->f_path.mnt); out_put: dput(dentry); out_unlock: - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); + mutex_unlock(&dir->d_inode->i_mutex); if (status == 0) { clp->cl_firststate = 1; - nfsd4_sync_rec_dir(); + vfs_fsync(rec_file, 0); } nfs4_reset_creds(original_cred); dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); @@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) struct dentry *dentry; int status; - if (!rec_dir_init) + if (!rec_file) return 0; status = nfs4_save_creds(&original_cred); if (status < 0) return status; - filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, + filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY, current_cred()); status = PTR_ERR(filp); if (IS_ERR(filp)) @@ -250,13 +244,14 @@ out: static int nfsd4_unlink_clid_dir(char *name, int namlen) { - struct dentry *dentry; + struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); - mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(name, rec_dir.dentry, namlen); + dir = rec_file->f_path.dentry; + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen) status = -ENOENT; if (!dentry->d_inode) goto out; - status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); + status = vfs_rmdir(dir->d_inode, dentry); out: dput(dentry); out_unlock: - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); + mutex_unlock(&dir->d_inode->i_mutex); return status; } @@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) const struct cred *original_cred; int status; - if (!rec_dir_init || !clp->cl_firststate) + if (!rec_file || !clp->cl_firststate) return; - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out; clp->cl_firststate = 0; @@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); nfs4_reset_creds(original_cred); if (status == 0) - nfsd4_sync_rec_dir(); - mnt_drop_write(rec_dir.mnt); + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); out: if (status) printk("NFSD: Failed to remove expired client state directory" @@ -323,19 +318,19 @@ void nfsd4_recdir_purge_old(void) { int status; - if (!rec_dir_init) + if (!rec_file) return; - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out; - status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old); if (status == 0) - nfsd4_sync_rec_dir(); - mnt_drop_write(rec_dir.mnt); + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); out: if (status) printk("nfsd4: failed to purge old clients from recovery" - " directory %s\n", rec_dir.dentry->d_name.name); + " directory %s\n", rec_file->f_path.dentry->d_name.name); } static int @@ -355,10 +350,13 @@ int nfsd4_recdir_load(void) { int status; - status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); + if (!rec_file) + return 0; + + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir); if (status) printk("nfsd4: failed loading clients from recovery" - " directory %s\n", rec_dir.dentry->d_name.name); + " directory %s\n", rec_file->f_path.dentry->d_name.name); return status; } @@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname) printk("NFSD: Using %s as the NFSv4 state recovery directory\n", rec_dirname); - BUG_ON(rec_dir_init); + BUG_ON(rec_file); status = nfs4_save_creds(&original_cred); if (status < 0) { @@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname) return; } - status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, - &rec_dir); - if (status) + rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", rec_dirname); + rec_file = NULL; + } - if (!status) - rec_dir_init = 1; nfs4_reset_creds(original_cred); } void nfsd4_shutdown_recdir(void) { - if (!rec_dir_init) + if (!rec_file) return; - rec_dir_init = 0; - path_put(&rec_dir); + fput(rec_file); + rec_file = NULL; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc3194ea01f5..508941c23af7 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf) if (sscanf(buf, "%15s %4u", transport, &port) != 2) return -EINVAL; - if (port < 1 || port > USHORT_MAX) + if (port < 1 || port > USHRT_MAX) return -EINVAL; err = nfsd_create_serv(); @@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf) if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) return -EINVAL; - if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) + if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) return -EINVAL; xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23c06f77f4ca..ebbf3b6b2457 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -999,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file) if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); - err = vfs_fsync(file, file->f_path.dentry, 0); + err = vfs_fsync(file, 0); } last_ino = inode->i_ino; last_dev = inode->i_sb->s_dev; @@ -1175,8 +1175,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2 = vfs_fsync_range(file, file->f_path.dentry, - offset, end, 0); + int err2 = vfs_fsync_range(file, offset, end, 0); if (err2 != -EINVAL) err = nfserrno(err2); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 5e226d4b41d3..39e038ac8fcb 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -280,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) /* reference count of i_bh inherits from nilfs_mdt_read_block() */ atomic_inc(&sbi->s_inodes_count); - - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index a756168a21c2..8c1097327abc 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_BARRIER); if (ret < 0) return ret; nblocks = 0; @@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, DISCARD_FL_BARRIER); + GFP_NOFS, BLKDEV_IFL_BARRIER); return ret; } diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index 40b1cf914ccb..27b75ebc7460 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch); int pin_inotify_watch(struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; - spin_lock(&sb_lock); - if (sb->s_count >= S_BIAS) { - atomic_inc(&sb->s_active); - spin_unlock(&sb_lock); + if (atomic_inc_not_zero(&sb->s_active)) { atomic_inc(&watch->count); return 1; } - spin_unlock(&sb_lock); return 0; } @@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch); * done. Cleanup is just deactivate_super(). However, that leaves a messy * case - what if we *are* racing with umount() and active references to * superblock can't be acquired anymore? We can bump ->s_count, grab - * ->s_umount, which will almost certainly wait until the superblock is shut - * down and the watch in question is pining for fjords. That's fine, but - * there is a problem - we might have hit the window between ->s_active - * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock - * is past the point of no return and is heading for shutdown) and the - * moment when deactivate_super() acquires ->s_umount. We could just do - * drop_super() yield() and retry, but that's rather antisocial and this - * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having - * found that we'd got there first (i.e. that ->s_root is non-NULL) we know - * that we won't race with inotify_umount_inodes(). So we could grab a - * reference to watch and do the rest as above, just with drop_super() instead - * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we - * could grab ->s_umount. So the watch could've been gone already. - * - * That still can be dealt with - we need to save watch->wd, do idr_find() - * and compare its result with our pointer. If they match, we either have - * the damn thing still alive or we'd lost not one but two races at once, - * the watch had been killed and a new one got created with the same ->wd - * at the same address. That couldn't have happened in inotify_destroy(), - * but inotify_rm_wd() could run into that. Still, "new one got created" - * is not a problem - we have every right to kill it or leave it alone, - * whatever's more convenient. - * - * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as - * "grab it and kill it" check. If it's been our original watch, we are - * fine, if it's a newcomer - nevermind, just pretend that we'd won the - * race and kill the fscker anyway; we are safe since we know that its - * superblock won't be going away. + * ->s_umount, which will wait until the superblock is shut down and the + * watch in question is pining for fjords. * * And yes, this is far beyond mere "not very pretty"; so's the entire * concept of inotify to start with. @@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch); * Called with ih->mutex held, drops it. Possible return values: * 0 - nothing to do, it has died * 1 - remove it, drop the reference and deactivate_super() - * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid - * that variant, since it involved a lot of PITA, but that's the best that - * could've been done. */ static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; - s32 wd = watch->wd; - spin_lock(&sb_lock); - if (sb->s_count >= S_BIAS) { - atomic_inc(&sb->s_active); - spin_unlock(&sb_lock); + if (atomic_inc_not_zero(&sb->s_active)) { get_inotify_watch(watch); mutex_unlock(&ih->mutex); return 1; /* the best outcome */ } + spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ down_read(&sb->s_umount); - if (likely(!sb->s_root)) { - /* fs is already shut down; the watch is dead */ - drop_super(sb); - return 0; - } - /* raced with the final deactivate_super() */ - mutex_lock(&ih->mutex); - if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) { - /* the watch is dead */ - mutex_unlock(&ih->mutex); - drop_super(sb); - return 0; - } - /* still alive or freed and reused with the same sb and wd; kill */ - get_inotify_watch(watch); - mutex_unlock(&ih->mutex); - return 2; + /* fs is already shut down; the watch is dead */ + drop_super(sb); + return 0; } -static void unpin_and_kill(struct inotify_watch *watch, int how) +static void unpin_and_kill(struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; put_inotify_watch(watch); - switch (how) { - case 1: - deactivate_super(sb); - break; - case 2: - drop_super(sb); - } + deactivate_super(sb); } /** @@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih) struct list_head *watches; struct super_block *sb; struct inode *inode; - int how; mutex_lock(&ih->mutex); watches = &ih->watches; @@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih) } watch = list_first_entry(watches, struct inotify_watch, h_list); sb = watch->inode->i_sb; - how = pin_to_kill(ih, watch); - if (!how) + if (!pin_to_kill(ih, watch)) continue; inode = watch->inode; @@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - unpin_and_kill(watch, how); + unpin_and_kill(watch); } /* free this handle: the put matching the get in inotify_init() */ @@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) struct inotify_watch *watch; struct super_block *sb; struct inode *inode; - int how; mutex_lock(&ih->mutex); watch = idr_find(&ih->idr, wd); @@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) return -EINVAL; } sb = watch->inode->i_sb; - how = pin_to_kill(ih, watch); - if (!how) + if (!pin_to_kill(ih, watch)) return 0; inode = watch->inode; @@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - unpin_and_kill(watch, how); + unpin_and_kill(watch); return 0; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 8804f093ba75..a1924a0d2ab0 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * the page at all. For a more detailed explanation see ntfs_truncate() in * fs/ntfs/inode.c. * - * @cached_page and @lru_pvec are just optimizations for dealing with multiple - * pages. - * * Return 0 on success and -errno on error. In the case that an error is * encountered it is possible that the initialized size will already have been * incremented some way towards @new_init_size but it is guaranteed that if @@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be * held by the caller. */ -static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, - struct page **cached_page, struct pagevec *lru_pvec) +static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) { s64 old_init_size; loff_t old_i_size; @@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, * Obtain @nr_pages locked page cache pages from the mapping @mapping and * starting at index @index. * - * If a page is newly created, increment its refcount and add it to the - * caller's lru-buffering pagevec @lru_pvec. - * - * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages - * are obtained at once instead of just one page and that 0 is returned on - * success and -errno on error. + * If a page is newly created, add it to lru list * * Note, the page locks are obtained in ascending page index order. */ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, pgoff_t index, const unsigned nr_pages, struct page **pages, - struct page **cached_page, struct pagevec *lru_pvec) + struct page **cached_page) { int err, nr; @@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, goto err_out; } } - err = add_to_page_cache(*cached_page, mapping, index, + err = add_to_page_cache_lru(*cached_page, mapping, index, GFP_KERNEL); if (unlikely(err)) { if (err == -EEXIST) @@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, goto err_out; } pages[nr] = *cached_page; - page_cache_get(*cached_page); - if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add_file(lru_pvec); *cached_page = NULL; } index++; @@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ssize_t status, written; unsigned nr_pages; int err; - struct pagevec lru_pvec; ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " "pos 0x%llx, count 0x%lx.", @@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, } } } - pagevec_init(&lru_pvec, 0); written = 0; /* * If the write starts beyond the initialized size, extend it up to the @@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ll = ni->initialized_size; read_unlock_irqrestore(&ni->size_lock, flags); if (pos > ll) { - err = ntfs_attr_extend_initialized(ni, pos, &cached_page, - &lru_pvec); + err = ntfs_attr_extend_initialized(ni, pos); if (err < 0) { ntfs_error(vol->sb, "Cannot perform write to inode " "0x%lx, attribute type 0x%x, because " @@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); /* Get and lock @do_pages starting at index @start_idx. */ status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, - pages, &cached_page, &lru_pvec); + pages, &cached_page); if (unlikely(status)) break; /* @@ -2077,7 +2062,6 @@ err_out: *ppos = pos; if (cached_page) page_cache_release(cached_page); - pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, (long)status); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e13fc9e8fcdc..da702294d7e7 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -489,7 +489,7 @@ cleanup: return ret; } -struct xattr_handler ocfs2_xattr_acl_access_handler = { +const struct xattr_handler ocfs2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ocfs2_xattr_list_acl_access, @@ -497,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = { .set = ocfs2_xattr_set_acl, }; -struct xattr_handler ocfs2_xattr_acl_default_handler = { +const struct xattr_handler ocfs2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ocfs2_xattr_list_acl_default, diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index b7428c5d0d3b..ec6d12339593 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize, * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no * larger than 16 bits. */ - BUG_ON(ecc > USHORT_MAX); + BUG_ON(ecc > USHRT_MAX); bc->bc_crc32e = cpu_to_le32(crc); bc->bc_ecc = cpu_to_le16((u16)ecc); @@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no * larger than 16 bits. */ - BUG_ON(ecc > USHORT_MAX); + BUG_ON(ecc > USHRT_MAX); bc->bc_crc32e = cpu_to_le32(crc); bc->bc_ecc = cpu_to_le16((u16)ecc); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index db5dd3ed4df4..f171b51a74f7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) inode->i_nlink = 2; else inode->i_nlink = 1; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 98ee6c44102d..e97b34842cfe 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -97,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), }; -struct xattr_handler *ocfs2_xattr_handlers[] = { +const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, &ocfs2_xattr_acl_access_handler, &ocfs2_xattr_acl_default_handler, @@ -106,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = { NULL }; -static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { +static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ocfs2_xattr_acl_access_handler, @@ -540,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, static inline const char *ocfs2_xattr_prefix(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < OCFS2_XATTR_MAX) handler = ocfs2_xattr_handler_map[name_index]; @@ -7213,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle, xattr_ac, data_ac); } -struct xattr_handler ocfs2_xattr_security_handler = { +const struct xattr_handler ocfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ocfs2_xattr_security_list, .get = ocfs2_xattr_security_get, @@ -7257,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ocfs2_xattr_trusted_handler = { +const struct xattr_handler ocfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ocfs2_xattr_trusted_list, .get = ocfs2_xattr_trusted_get, @@ -7313,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ocfs2_xattr_user_handler = { +const struct xattr_handler ocfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ocfs2_xattr_user_list, .get = ocfs2_xattr_user_get, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index abd72a47f520..aa64bb37a65b 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info { size_t value_len; }; -extern struct xattr_handler ocfs2_xattr_user_handler; -extern struct xattr_handler ocfs2_xattr_trusted_handler; -extern struct xattr_handler ocfs2_xattr_security_handler; -extern struct xattr_handler ocfs2_xattr_acl_access_handler; -extern struct xattr_handler ocfs2_xattr_acl_default_handler; -extern struct xattr_handler *ocfs2_xattr_handlers[]; +extern const struct xattr_handler ocfs2_xattr_user_handler; +extern const struct xattr_handler ocfs2_xattr_trusted_handler; +extern const struct xattr_handler ocfs2_xattr_security_handler; +extern const struct xattr_handler ocfs2_xattr_acl_access_handler; +extern const struct xattr_handler ocfs2_xattr_acl_default_handler; +extern const struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index b44bb835e8ea..089839a6cc64 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode) goto fail; inode->i_ino = new_block; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/open.c b/fs/open.c index 74e5cd9f718e..5463266db9e6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -17,7 +17,6 @@ #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> -#include <linux/vfs.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <asm/uaccess.h> @@ -33,171 +32,6 @@ #include "internal.h" -int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - int retval = -ENODEV; - - if (dentry) { - retval = -ENOSYS; - if (dentry->d_sb->s_op->statfs) { - memset(buf, 0, sizeof(*buf)); - retval = security_sb_statfs(dentry); - if (retval) - return retval; - retval = dentry->d_sb->s_op->statfs(dentry, buf); - if (retval == 0 && buf->f_frsize == 0) - buf->f_frsize = buf->f_bsize; - } - } - return retval; -} - -EXPORT_SYMBOL(vfs_statfs); - -static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) -{ - struct kstatfs st; - int retval; - - retval = vfs_statfs(dentry, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); - else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & - 0xffffffff00000000ULL) - return -EOVERFLOW; - /* - * f_files and f_ffree may be -1; it's okay to stuff - * that into 32 bits - */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); - } - return 0; -} - -static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) -{ - struct kstatfs st; - int retval; - - retval = vfs_statfs(dentry, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); - else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); - } - return 0; -} - -SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) -{ - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = vfs_statfs_native(path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } - return error; -} - -SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) -{ - struct path path; - long error; - - if (sz != sizeof(*buf)) - return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = vfs_statfs64(path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } - return error; -} - -SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) -{ - struct file * file; - struct statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs_native(file->f_path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: - return error; -} - -SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) -{ - struct file * file; - struct statfs64 tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs64(file->f_path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: - return error; -} - int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index a97b477ac0fc..6921e7890be6 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -70,14 +70,14 @@ struct riscix_record { #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) -static int -riscix_partition(struct parsed_partitions *state, struct block_device *bdev, - unsigned long first_sect, int slot, unsigned long nr_sects) +static int riscix_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) { Sector sect; struct riscix_record *rr; - rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, §); + rr = read_part_sector(state, first_sect, §); if (!rr) return -1; @@ -123,9 +123,9 @@ struct linux_part { #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) -static int -linux_partition(struct parsed_partitions *state, struct block_device *bdev, - unsigned long first_sect, int slot, unsigned long nr_sects) +static int linux_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) { Sector sect; struct linux_part *linuxp; @@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, put_partition(state, slot++, first_sect, size); - linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, §); + linuxp = read_part_sector(state, first_sect, §); if (!linuxp) return -1; @@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, #endif #ifdef CONFIG_ACORN_PARTITION_CUMANA -int -adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_CUMANA(struct parsed_partitions *state) { unsigned long first_sector = 0; unsigned int start_blk = 0; @@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev struct adfs_discrecord *dr; unsigned int nr_sects; - data = read_dev_sector(bdev, start_blk * 2 + 6, §); + data = read_part_sector(state, start_blk * 2 + 6, §); if (!data) return -1; @@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: /* RISCiX - we don't know how to find the next one. */ - slot = riscix_partition(state, bdev, first_sector, - slot, nr_sects); + slot = riscix_partition(state, first_sector, slot, + nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, bdev, first_sector, - slot, nr_sects); + slot = linux_partition(state, first_sector, slot, + nr_sects); break; } put_dev_sector(sect); @@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev * hda1 = ADFS partition on first drive. * hda2 = non-ADFS partition. */ -int -adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_ADFS(struct parsed_partitions *state) { unsigned long start_sect, nr_sects, sectscyl, heads; Sector sect; @@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) unsigned char id; int slot = 1; - data = read_dev_sector(bdev, 6, §); + data = read_part_sector(state, 6, §); if (!data) return -1; @@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) /* * Work out start of non-adfs partition. */ - nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; if (start_sect) { switch (id) { #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, bdev, start_sect, - slot, nr_sects); + slot = riscix_partition(state, start_sect, slot, + nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, bdev, start_sect, - slot, nr_sects); + slot = linux_partition(state, start_sect, slot, + nr_sects); break; } } @@ -308,10 +306,11 @@ struct ics_part { __le32 size; }; -static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) +static int adfspart_check_ICSLinux(struct parsed_partitions *state, + unsigned long block) { Sector sect; - unsigned char *data = read_dev_sector(bdev, block, §); + unsigned char *data = read_part_sector(state, block, §); int result = 0; if (data) { @@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data) * hda2 = ADFS partition 1 on first drive. * ..etc.. */ -int -adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_ICS(struct parsed_partitions *state) { const unsigned char *data; const struct ics_part *p; @@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) /* * Try ICS style partitions - sector 0 contains partition info. */ - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) * partition is. We must not make this visible * to the filesystem. */ - if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { + if (size > 1 && adfspart_check_ICSLinux(state, start)) { start += 1; size -= 1; } @@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data) * hda2 = ADFS partition 1 on first drive. * ..etc.. */ -int -adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_POWERTEC(struct parsed_partitions *state) { Sector sect; const unsigned char *data; @@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd int slot = 1; int i; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -508,8 +505,7 @@ static const char eesox_name[] = { * 1. The individual ADFS boot block entries that are placed on the disk. * 2. The start address of the next entry. */ -int -adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_EESOX(struct parsed_partitions *state) { Sector sect; const unsigned char *data; @@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) sector_t start = 0; int i, slot = 1; - data = read_dev_sector(bdev, 7, §); + data = read_part_sector(state, 7, §); if (!data) return -1; @@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) if (i != 0) { sector_t size; - size = get_capacity(bdev->bd_disk); + size = get_capacity(state->bdev->bd_disk); put_partition(state, slot++, start, size - start); printk("\n"); } diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h index 81fd50ecc080..ede828529692 100644 --- a/fs/partitions/acorn.h +++ b/fs/partitions/acorn.h @@ -7,8 +7,8 @@ * format, and everyone stick to it? */ -int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c index 9917a8c360f2..ba443d4229f8 100644 --- a/fs/partitions/amiga.c +++ b/fs/partitions/amiga.c @@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size) return sum; } -int -amiga_partition(struct parsed_partitions *state, struct block_device *bdev) +int amiga_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; @@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) goto rdb_done; - data = read_dev_sector(bdev, blk, §); + data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) printk("Dev %s: unable to read RDB block %d\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) } printk("Dev %s: RDB in block %d has bad checksum\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); } /* blksize is blocks per 512 byte standard block */ @@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) put_dev_sector(sect); for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { blk *= blksize; /* Read in terms partition table understands */ - data = read_dev_sector(bdev, blk, §); + data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) printk("Dev %s: unable to read partition block %d\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h index 2f3e9ce22d53..d094585cadaa 100644 --- a/fs/partitions/amiga.h +++ b/fs/partitions/amiga.h @@ -2,5 +2,5 @@ * fs/partitions/amiga.h */ -int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); +int amiga_partition(struct parsed_partitions *state); diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c index 1f3572d5b755..4439ff1b6cec 100644 --- a/fs/partitions/atari.c +++ b/fs/partitions/atari.c @@ -30,7 +30,7 @@ static inline int OK_id(char *s) memcmp (s, "RAW", 3) == 0 ; } -int atari_partition(struct parsed_partitions *state, struct block_device *bdev) +int atari_partition(struct parsed_partitions *state) { Sector sect; struct rootsector *rs; @@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif - rs = (struct rootsector *) read_dev_sector(bdev, 0, §); + rs = read_part_sector(state, 0, §); if (!rs) return -1; /* Verify this is an Atari rootsector: */ - hd_size = bdev->bd_inode->i_size >> 9; + hd_size = state->bdev->bd_inode->i_size >> 9; if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && @@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) printk(" XGM<"); partsect = extensect = be32_to_cpu(pi->st); while (1) { - xrs = (struct rootsector *)read_dev_sector(bdev, partsect, §2); + xrs = read_part_sector(state, partsect, §2); if (!xrs) { printk (" block %ld read failed\n", partsect); put_dev_sector(sect); diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h index 63186b00e135..fe2d32a89f36 100644 --- a/fs/partitions/atari.h +++ b/fs/partitions/atari.h @@ -31,4 +31,4 @@ struct rootsector u16 checksum; /* checksum for bootable disks */ } __attribute__((__packed__)); -int atari_partition(struct parsed_partitions *state, struct block_device *bdev); +int atari_partition(struct parsed_partitions *state); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index e238ab23a9e7..5dcd4b0c5533 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev); int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ -static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { +static int (*check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. @@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev) struct parsed_partitions *state; int i, res, err; - state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); + state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); if (!state) return NULL; + state->bdev = bdev; disk_name(hd, 0, state->name); printk(KERN_INFO " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) i = res = err = 0; while (!res && check_part[i]) { memset(&state->parts, 0, sizeof(state->parts)); - res = check_part[i++](state, bdev); + res = check_part[i++](state); if (res < 0) { /* We have hit an I/O error which we don't report now. * But record it, and let the others do their job. @@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev) } if (res > 0) return state; + if (state->access_beyond_eod) + err = -ENOSPC; if (err) /* The partition is unrecognized. So report I/O errors if there were any */ res = err; @@ -538,12 +541,33 @@ exit: disk_part_iter_exit(&piter); } +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { + struct parsed_partitions *state = NULL; struct disk_part_iter piter; struct hd_struct *part; - struct parsed_partitions *state; int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } if (bdev->bd_part_count) return -EBUSY; @@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) return 0; - if (IS_ERR(state)) /* I/O error reading the partition table */ + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If any + * partition code tried to read beyond EOD, retry + * after unlocking native capacity. + */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } return -EIO; + } + /* + * If any partition code tried to read beyond EOD, try + * unlocking native capacity even if partition table is + * sucessfully read as we could be missing some partitions. + */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); @@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size, from; -try_scan: + size = state->parts[p].size; if (!size) continue; @@ -589,30 +637,21 @@ try_scan: from = state->parts[p].from; if (from >= get_capacity(disk)) { printk(KERN_WARNING - "%s: p%d ignored, start %llu is behind the end of the disk\n", + "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + goto rescan; continue; } if (from + size > get_capacity(disk)) { - const struct block_device_operations *bdops = disk->fops; - unsigned long long capacity; - printk(KERN_WARNING - "%s: p%d size %llu exceeds device capacity, ", + "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); - if (bdops->set_capacity && - (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { - printk(KERN_CONT "enabling native capacity\n"); - capacity = bdops->set_capacity(disk, ~0ULL); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - if (capacity > get_capacity(disk)) { - set_capacity(disk, capacity); - check_disk_size_change(disk, bdev); - bdev->bd_invalidated = 0; - } - goto try_scan; + if (disk_unlock_native_capacity(disk)) { + /* free state and restart */ + goto rescan; } else { /* * we can not ignore partitions of broken tables @@ -620,7 +659,6 @@ try_scan: * we limit them to the end of the disk to avoid * creating invalid block devices */ - printk(KERN_CONT "limited to end of disk\n"); size = get_capacity(disk) - from; } } diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 98dbe1a84528..52f8bd399396 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -6,6 +6,7 @@ * description. */ struct parsed_partitions { + struct block_device *bdev; char name[BDEVNAME_SIZE]; struct { sector_t from; @@ -14,8 +15,19 @@ struct parsed_partitions { } parts[DISK_MAX_PARTS]; int next; int limit; + bool access_beyond_eod; }; +static inline void *read_part_sector(struct parsed_partitions *state, + sector_t n, Sector *p) +{ + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } + return read_dev_sector(state->bdev, n, p); +} + static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 91babdae7587..9efb2cfe2410 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len) * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ -static u64 -last_lba(struct block_device *bdev) +static u64 last_lba(struct block_device *bdev) { if (!bdev || !bdev->bd_inode) return 0; @@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr) /** * read_lba(): Read bytes from disk, starting at given LBA - * @bdev + * @state * @lba * @buffer * @size_t * - * Description: Reads @count bytes from @bdev into @buffer. + * Description: Reads @count bytes from @state->bdev into @buffer. * Returns number of bytes read on success, 0 on error. */ -static size_t -read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) +static size_t read_lba(struct parsed_partitions *state, + u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; + struct block_device *bdev = state->bdev; sector_t n = lba * (bdev_logical_block_size(bdev) / 512); - if (!bdev || !buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(bdev)) return 0; while (count) { int copied = 512; Sector sect; - unsigned char *data = read_dev_sector(bdev, n++, §); + unsigned char *data = read_part_sector(state, n++, §); if (!data) break; if (copied > count) @@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) /** * alloc_read_gpt_entries(): reads partition entries from disk - * @bdev + * @state * @gpt - GPT header * * Description: Returns ptes on success, NULL on error. * Allocates space for PTEs based on information found in @gpt. * Notes: remember to free pte when you're done! */ -static gpt_entry * -alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) +static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, + gpt_header *gpt) { size_t count; gpt_entry *pte; - if (!bdev || !gpt) + + if (!gpt) return NULL; count = le32_to_cpu(gpt->num_partition_entries) * @@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) if (!pte) return NULL; - if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), + if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), (u8 *) pte, count) < count) { kfree(pte); @@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) /** * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk - * @bdev + * @state * @lba is the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @bdev. + * and fills a GPT header starting at @ from @state->bdev. * Note: remember to free gpt when finished with it. */ -static gpt_header * -alloc_read_gpt_header(struct block_device *bdev, u64 lba) +static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, + u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(bdev); - - if (!bdev) - return NULL; + unsigned ssz = bdev_logical_block_size(state->bdev); gpt = kzalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; - if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { + if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; @@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) /** * is_gpt_valid() - tests one GPT header and PTEs for validity - * @bdev + * @state * @lba is the logical block address of the GPT header to test * @gpt is a GPT header ptr, filled on return. * @ptes is a PTEs ptr, filled on return. @@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) * Description: returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. */ -static int -is_gpt_valid(struct block_device *bdev, u64 lba, - gpt_header **gpt, gpt_entry **ptes) +static int is_gpt_valid(struct parsed_partitions *state, u64 lba, + gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; u64 lastlba; - if (!bdev || !gpt || !ptes) + if (!ptes) return 0; - if (!(*gpt = alloc_read_gpt_header(bdev, lba))) + if (!(*gpt = alloc_read_gpt_header(state, lba))) return 0; /* Check the GUID Partition Table signature */ @@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(bdev); + lastlba = last_lba(state->bdev); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, goto fail; } - if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ @@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) /** * find_valid_gpt() - Search disk for valid GPT headers and PTEs - * @bdev + * @state * @gpt is a GPT header ptr, filled on return. * @ptes is a PTEs ptr, filled on return. * Description: Returns 1 if valid, 0 on error. @@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) * This protects against devices which misreport their size, and forces * the user to decide to use the Alternate GPT. */ -static int -find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) +static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, + gpt_entry **ptes) { int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; u64 lastlba; - if (!bdev || !gpt || !ptes) + + if (!ptes) return 0; - lastlba = last_lba(bdev); + lastlba = last_lba(state->bdev); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); if (legacymbr) { - read_lba(bdev, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); + read_lba(state, 0, (u8 *) legacymbr, + sizeof (*legacymbr)); good_pmbr = is_pmbr_valid(legacymbr); kfree(legacymbr); } @@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) goto fail; } - good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, + good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); if (good_pgpt) - good_agpt = is_gpt_valid(bdev, + good_agpt = is_gpt_valid(state, le64_to_cpu(pgpt->alternate_lba), &agpt, &aptes); if (!good_agpt && force_gpt) - good_agpt = is_gpt_valid(bdev, lastlba, - &agpt, &aptes); + good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) @@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) } /** - * efi_partition(struct parsed_partitions *state, struct block_device *bdev) + * efi_partition(struct parsed_partitions *state) * @state - * @bdev * * Description: called from check.c, if the disk contains GPT * partitions, sets up partition entries in the kernel. @@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) * 1 if successful * */ -int -efi_partition(struct parsed_partitions *state, struct block_device *bdev) +int efi_partition(struct parsed_partitions *state) { gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(bdev) / 512; + unsigned ssz = bdev_logical_block_size(state->bdev) / 512; - if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { + if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); kfree(ptes); return 0; @@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) continue; put_partition(state, i+1, start * ssz, size * ssz); @@ -631,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) - state->parts[i+1].flags = 1; + state->parts[i + 1].flags = ADDPART_FLAG_RAID; } kfree(ptes); kfree(gpt); diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h index 6998b589abf9..b69ab729558f 100644 --- a/fs/partitions/efi.h +++ b/fs/partitions/efi.h @@ -110,7 +110,7 @@ typedef struct _legacy_mbr { } __attribute__ ((packed)) legacy_mbr; /* Functions */ -extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int efi_partition(struct parsed_partitions *state); #endif diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index fc71aab08460..3e73de5967ff 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { /* */ -int -ibm_partition(struct parsed_partitions *state, struct block_device *bdev) +int ibm_partition(struct parsed_partitions *state) { + struct block_device *bdev = state->bdev; int blocksize, res; loff_t i_size, offset, size, fmt_size; dasd_information2_t *info; @@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) /* * Get volume label, extract name and type. */ - data = read_dev_sector(bdev, info->label_block*(blocksize/512), §); + data = read_part_sector(state, info->label_block*(blocksize/512), + §); if (data == NULL) goto out_readerr; @@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) */ blk = cchhb2blk(&label->vol.vtoc, geo) + 1; counter = 0; - data = read_dev_sector(bdev, blk * (blocksize/512), - §); + data = read_part_sector(state, blk * (blocksize/512), + §); while (data != NULL) { struct vtoc_format1_label f1; @@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) || f1.DS1FMTID == _ascebc['7'] || f1.DS1FMTID == _ascebc['9']) { blk++; - data = read_dev_sector(bdev, blk * - (blocksize/512), - §); + data = read_part_sector(state, + blk * (blocksize/512), §); continue; } @@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) size * (blocksize >> 9)); counter++; blk++; - data = read_dev_sector(bdev, - blk * (blocksize/512), - §); + data = read_part_sector(state, + blk * (blocksize/512), §); } if (!data) diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h index 31f85a6ac459..08fb0804a812 100644 --- a/fs/partitions/ibm.h +++ b/fs/partitions/ibm.h @@ -1 +1 @@ -int ibm_partition(struct parsed_partitions *, struct block_device *); +int ibm_partition(struct parsed_partitions *); diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c index 176d89bcf123..1cc928bb762f 100644 --- a/fs/partitions/karma.c +++ b/fs/partitions/karma.c @@ -9,7 +9,7 @@ #include "check.h" #include "karma.h" -int karma_partition(struct parsed_partitions *state, struct block_device *bdev) +int karma_partition(struct parsed_partitions *state) { int i; int slot = 1; @@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev) } __attribute__((packed)) *label; struct d_partition *p; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h index ecf7d3f2a3d8..c764b2e9df21 100644 --- a/fs/partitions/karma.h +++ b/fs/partitions/karma.h @@ -4,5 +4,5 @@ #define KARMA_LABEL_MAGIC 0xAB56 -int karma_partition(struct parsed_partitions *state, struct block_device *bdev); +int karma_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 8652fb99e962..648c9d8f3357 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/stringify.h> +#include <linux/kernel.h> #include "ldm.h" #include "check.h" #include "msdos.h" @@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src) int h; /* high part */ - if ((x = src[0] - '0') <= '9'-'0') h = x; - else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; - else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; - else return -1; - h <<= 4; + x = h = hex_to_bin(src[0]); + if (h < 0) + return -1; /* low part */ - if ((x = src[1] - '0') <= '9'-'0') return h | x; - if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); - if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); - return -1; + h = hex_to_bin(src[1]); + if (h < 0) + return -1; + + return (x << 4) + h; } /** @@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, /** * ldm_validate_privheads - Compare the primary privhead with its backups - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * @ph1: Memory struct to fill with ph contents * * Read and compare all three privheads from disk. @@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, * Return: 'true' Success * 'false' Error */ -static bool ldm_validate_privheads (struct block_device *bdev, - struct privhead *ph1) +static bool ldm_validate_privheads(struct parsed_partitions *state, + struct privhead *ph1) { static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; struct privhead *ph[3] = { ph1 }; @@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, long num_sects; int i; - BUG_ON (!bdev || !ph1); + BUG_ON (!state || !ph1); ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); @@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev, /* Read and parse privheads */ for (i = 0; i < 3; i++) { - data = read_dev_sector (bdev, - ph[0]->config_start + off[i], §); + data = read_part_sector(state, ph[0]->config_start + off[i], + §); if (!data) { ldm_crit ("Disk read failed."); goto out; @@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, } } - num_sects = bdev->bd_inode->i_size >> 9; + num_sects = state->bdev->bd_inode->i_size >> 9; if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -397,20 +397,20 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups - * @bdev: Device holding the LDM Database - * @base: Offset, into @bdev, of the database + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @bdev and return the parsed information into @toc1. + * @state->bdev and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * * Return: 'true' @toc1 contains validated TOCBLOCK info * 'false' @toc1 contents are undefined */ -static bool ldm_validate_tocblocks(struct block_device *bdev, - unsigned long base, struct ldmdb *ldb) +static bool ldm_validate_tocblocks(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) { static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; struct tocblock *tb[4]; @@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, int i, nr_tbs; bool result = false; - BUG_ON(!bdev || !ldb); + BUG_ON(!state || !ldb); ph = &ldb->ph; tb[0] = &ldb->toc; tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); @@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, * skip any that fail as long as we get at least one valid TOCBLOCK. */ for (nr_tbs = i = 0; i < 4; i++) { - data = read_dev_sector(bdev, base + off[i], §); + data = read_part_sector(state, base + off[i], §); if (!data) { ldm_error("Disk read failed for TOCBLOCK %d.", i); continue; @@ -473,7 +473,7 @@ err: /** * ldm_validate_vmdb - Read the VMDB and validate it - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * @base: Offset, into @bdev, of the database * @ldb: Cache of the database structures * @@ -483,8 +483,8 @@ err: * Return: 'true' @ldb contains validated VBDB info * 'false' @ldb contents are undefined */ -static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, - struct ldmdb *ldb) +static bool ldm_validate_vmdb(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) { Sector sect; u8 *data; @@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, struct vmdb *vm; struct tocblock *toc; - BUG_ON (!bdev || !ldb); + BUG_ON (!state || !ldb); vm = &ldb->vm; toc = &ldb->toc; - data = read_dev_sector (bdev, base + OFF_VMDB, §); + data = read_part_sector(state, base + OFF_VMDB, §); if (!data) { ldm_crit ("Disk read failed."); return false; @@ -534,21 +534,21 @@ out: /** * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * * This function provides a weak test to decide whether the device is a dynamic * disk or not. It looks for an MS-DOS-style partition table containing at * least one partition of type 0x42 (formerly SFS, now used by Windows for * dynamic disks). * - * N.B. The only possible error can come from the read_dev_sector and that is + * N.B. The only possible error can come from the read_part_sector and that is * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * - * Return: 'true' @bdev is a dynamic disk - * 'false' @bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->bdev is a dynamic disk + * 'false' @state->bdev is not a dynamic disk, or an error occurred */ -static bool ldm_validate_partition_table (struct block_device *bdev) +static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; @@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev) int i; bool result = false; - BUG_ON (!bdev); + BUG_ON(!state); - data = read_dev_sector (bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) { ldm_crit ("Disk read failed."); return false; @@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory - * @bdev: Device holding the LDM Database - * @base: Offset, into @bdev, of the database + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) * Return: 'true' All the VBLKs were read successfully * 'false' An error occurred */ -static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, - struct ldmdb *ldb) +static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, + struct ldmdb *ldb) { int size, perbuf, skip, finish, s, v, recs; u8 *data = NULL; @@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, bool result = false; LIST_HEAD (frags); - BUG_ON (!bdev || !ldb); + BUG_ON(!state || !ldb); size = ldb->vm.vblk_size; perbuf = 512 / size; @@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, finish = (size * ldb->vm.last_vblk_seq) >> 9; for (s = skip; s < finish; s++) { /* For each sector */ - data = read_dev_sector (bdev, base + OFF_VMDB + s, §); + data = read_part_sector(state, base + OFF_VMDB + s, §); if (!data) { ldm_crit ("Disk read failed."); goto out; @@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh) /** * ldm_partition - Find out whether a device is a dynamic disk and handle it - * @pp: List of the partitions parsed so far - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * * This determines whether the device @bdev is a dynamic disk and if so creates * the partitions necessary in the gendisk structure pointed to by @hd. @@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * - * Return: 1 Success, @bdev is a dynamic disk and we handled it - * 0 Success, @bdev is not a dynamic disk + * Return: 1 Success, @state->bdev is a dynamic disk and we handled it + * 0 Success, @state->bdev is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @bdev is a dynamic disk, but it may be corrupted + * Or @state->bdev is a dynamic disk, but it may be corrupted */ -int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) +int ldm_partition(struct parsed_partitions *state) { struct ldmdb *ldb; unsigned long base; int result = -1; - BUG_ON (!pp || !bdev); + BUG_ON(!state); /* Look for signs of a Dynamic Disk */ - if (!ldm_validate_partition_table (bdev)) + if (!ldm_validate_partition_table(state)) return 0; ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); @@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) } /* Parse and check privheads. */ - if (!ldm_validate_privheads (bdev, &ldb->ph)) + if (!ldm_validate_privheads(state, &ldb->ph)) goto out; /* Already logged */ /* All further references are relative to base (database start). */ base = ldb->ph.config_start; /* Parse and check tocs and vmdb. */ - if (!ldm_validate_tocblocks (bdev, base, ldb) || - !ldm_validate_vmdb (bdev, base, ldb)) + if (!ldm_validate_tocblocks(state, base, ldb) || + !ldm_validate_vmdb(state, base, ldb)) goto out; /* Already logged */ /* Initialize vblk lists in ldmdb struct */ @@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) INIT_LIST_HEAD (&ldb->v_comp); INIT_LIST_HEAD (&ldb->v_part); - if (!ldm_get_vblks (bdev, base, ldb)) { + if (!ldm_get_vblks(state, base, ldb)) { ldm_crit ("Failed to read the VBLKs from the database."); goto cleanup; } /* Finally, create the data partition devices. */ - if (ldm_create_data_partitions (pp, ldb)) { + if (ldm_create_data_partitions(state, ldb)) { ldm_debug ("Parsed LDM database successfully."); result = 1; } diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 30e08e809c1d..d1fb50b28d86 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h @@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */ struct list_head v_part; }; -int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); +int ldm_partition(struct parsed_partitions *state); #endif /* _FS_PT_LDM_H_ */ diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index d4a0fad3563b..74465ff7c263 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len) stg[i] = 0; } -int mac_partition(struct parsed_partitions *state, struct block_device *bdev) +int mac_partition(struct parsed_partitions *state) { int slot = 1; Sector sect; @@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) struct mac_driver_desc *md; /* Get 0th block and look at the first partition map entry. */ - md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, §); + md = read_part_sector(state, 0, §); if (!md) return -1; if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { @@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); - data = read_dev_sector(bdev, secsize/512, §); + data = read_part_sector(state, secsize/512, §); if (!data) return -1; part = (struct mac_partition *) (data + secsize%512); @@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) for (blk = 1; blk <= blocks_in_map; ++blk) { int pos = blk * secsize; put_dev_sector(sect); - data = read_dev_sector(bdev, pos/512, §); + data = read_part_sector(state, pos/512, §); if (!data) return -1; part = (struct mac_partition *) (data + pos%512); @@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) be32_to_cpu(part->block_count) * (secsize/512)); if (!strnicmp(part->type, "Linux_RAID", 10)) - state->parts[slot].flags = 1; + state->parts[slot].flags = ADDPART_FLAG_RAID; #ifdef CONFIG_PPC_PMAC /* * If this is the first bootable partition, tell the @@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); + note_bootable_part(state->bdev->bd_dev, found_root, + found_root_goodness); #endif put_dev_sector(sect); diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h index bbf26e1386fa..3c7d98436380 100644 --- a/fs/partitions/mac.h +++ b/fs/partitions/mac.h @@ -41,4 +41,4 @@ struct mac_driver_desc { /* ... more stuff */ }; -int mac_partition(struct parsed_partitions *state, struct block_device *bdev); +int mac_partition(struct parsed_partitions *state); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 90be97f1f5a8..15bfb7b1e044 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC2 0xC2 #define AIX_LABEL_MAGIC3 0xD4 #define AIX_LABEL_MAGIC4 0xC1 -static int aix_magic_present(unsigned char *p, struct block_device *bdev) +static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { struct partition *pt = (struct partition *) (p + 0x1be); Sector sect; @@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) is_extended_partition(pt)) return 0; } - d = read_dev_sector(bdev, 7, §); + d = read_part_sector(state, 7, §); if (d) { if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; @@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) * only for the actual data partitions. */ -static void -parse_extended(struct parsed_partitions *state, struct block_device *bdev, - sector_t first_sector, sector_t first_size) +static void parse_extended(struct parsed_partitions *state, + sector_t first_sector, sector_t first_size) { struct partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(bdev) / 512; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; int loopct = 0; /* number of links followed without finding a data partition */ int i; @@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, return; if (state->next == state->limit) return; - data = read_dev_sector(bdev, this_sector, §); + data = read_part_sector(state, this_sector, §); if (!data) return; @@ -198,9 +197,8 @@ done: /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ -static void -parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_solaris_x86(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_SOLARIS_X86_PARTITION Sector sect; @@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, int i; short max_nparts; - v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, §); + v = read_part_sector(state, offset + 1, §); if (!v) return; if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { @@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ -static void -parse_bsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin, char *flavour, - int max_partitions) +static void parse_bsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin, char *flavour, + int max_partitions) { Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; - l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, §); + l = read_part_sector(state, offset + 1, §); if (!l) return; if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { @@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev, } #endif -static void -parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_freebsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "bsd", BSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); #endif } -static void -parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_netbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "netbsd", BSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); #endif } -static void -parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_openbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "openbsd", OPENBSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "openbsd", + OPENBSD_MAXPARTITIONS); #endif } @@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ -static void -parse_unixware(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_unixware(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_UNIXWARE_DISKLABEL Sector sect; struct unixware_disklabel *l; struct unixware_slice *p; - l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, §); + l = read_part_sector(state, offset + 29, §); if (!l) return; if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || @@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev, * Anand Krishnamurthy <anandk@wiproge.med.ge.com> * Rajeev V. Pillai <rajeevvp@yahoo.com> */ -static void -parse_minix(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_minix(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; @@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, struct partition *p; int i; - data = read_dev_sector(bdev, offset, §); + data = read_part_sector(state, offset, §); if (!data) return; @@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, static struct { unsigned char id; - void (*parse)(struct parsed_partitions *, struct block_device *, - sector_t, sector_t, int); + void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); } subtypes[] = { {FREEBSD_PARTITION, parse_freebsd}, {NETBSD_PARTITION, parse_netbsd}, @@ -417,16 +406,16 @@ static struct { {0, NULL}, }; -int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) +int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(bdev) / 512; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; Sector sect; unsigned char *data; struct partition *p; struct fat_boot_sector *fb; int slot; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; if (!msdos_magic_present(data + 510)) { @@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) return 0; } - if (aix_magic_present(data, bdev)) { + if (aix_magic_present(state, data)) { put_dev_sector(sect); printk( " [AIX]"); return 0; @@ -503,13 +492,13 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) put_partition(state, slot, start, n); printk(" <"); - parse_extended(state, bdev, start, size); + parse_extended(state, start, size); printk(" >"); continue; } put_partition(state, slot, start, size); if (SYS_IND(p) == LINUX_RAID_PARTITION) - state->parts[slot].flags = 1; + state->parts[slot].flags = ADDPART_FLAG_RAID; if (SYS_IND(p) == DM6_PARTITION) printk("[DM]"); if (SYS_IND(p) == EZD_PARTITION) @@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) if (!subtypes[n].parse) continue; - subtypes[n].parse(state, bdev, start_sect(p)*sector_size, - nr_sects(p)*sector_size, slot); + subtypes[n].parse(state, start_sect(p) * sector_size, + nr_sects(p) * sector_size, slot); } put_dev_sector(sect); return 1; diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h index 01e5e0b6902d..38c781c490b3 100644 --- a/fs/partitions/msdos.h +++ b/fs/partitions/msdos.h @@ -4,5 +4,5 @@ #define MSDOS_LABEL_MAGIC 0xAA55 -int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); +int msdos_partition(struct parsed_partitions *state); diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index c05c17bc5df3..fc22b85d436a 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,7 +10,7 @@ #include "check.h" #include "osf.h" -int osf_partition(struct parsed_partitions *state, struct block_device *bdev) +int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; @@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev) } * label; struct d_partition * partition; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h index 427b8eab314b..20ed2315ec16 100644 --- a/fs/partitions/osf.h +++ b/fs/partitions/osf.h @@ -4,4 +4,4 @@ #define DISKLABELMAGIC (0x82564557UL) -int osf_partition(struct parsed_partitions *state, struct block_device *bdev); +int osf_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c index ed5ac83fe83a..43b1df9aa16c 100644 --- a/fs/partitions/sgi.c +++ b/fs/partitions/sgi.c @@ -27,7 +27,7 @@ struct sgi_disklabel { __be32 _unused1; /* Padding */ }; -int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) +int sgi_partition(struct parsed_partitions *state) { int i, csum; __be32 magic; @@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) struct sgi_partition *p; char b[BDEVNAME_SIZE]; - label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, §); + label = read_part_sector(state, 0, §); if (!label) return -1; p = &label->partitions[0]; @@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(bdev, b)); + bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h index 5d5595c09928..b9553ebdd5a9 100644 --- a/fs/partitions/sgi.h +++ b/fs/partitions/sgi.h @@ -2,7 +2,7 @@ * fs/partitions/sgi.h */ -extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int sgi_partition(struct parsed_partitions *state); #define SGI_LABEL_MAGIC 0x0be5a941 diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index c95e6a62c01d..a32660e25f7f 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -10,7 +10,7 @@ #include "check.h" #include "sun.h" -int sun_partition(struct parsed_partitions *state, struct block_device *bdev) +int sun_partition(struct parsed_partitions *state) { int i; __be16 csum; @@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) int use_vtoc; int nparts; - label = (struct sun_disklabel *)read_dev_sector(bdev, 0, §); + label = read_part_sector(state, 0, §); if (!label) return -1; @@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(bdev, b)); + bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h index 7f864d1f86d4..2424baa8319f 100644 --- a/fs/partitions/sun.h +++ b/fs/partitions/sun.h @@ -5,4 +5,4 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE -int sun_partition(struct parsed_partitions *state, struct block_device *bdev); +int sun_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c index 4eba27b78643..9030c864428e 100644 --- a/fs/partitions/sysv68.c +++ b/fs/partitions/sysv68.c @@ -46,7 +46,7 @@ struct slice { }; -int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) +int sysv68_partition(struct parsed_partitions *state) { int i, slices; int slot = 1; @@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) struct dkblk0 *b; struct slice *slice; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) i = be32_to_cpu(b->dk_ios.ios_slcblk); put_dev_sector(sect); - data = read_dev_sector(bdev, i, §); + data = read_part_sector(state, i, §); if (!data) return -1; diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h index fa733f68431b..bf2f5ffa97ac 100644 --- a/fs/partitions/sysv68.h +++ b/fs/partitions/sysv68.h @@ -1 +1 @@ -extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int sysv68_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c index ec852c11dce4..db9eef260364 100644 --- a/fs/partitions/ultrix.c +++ b/fs/partitions/ultrix.c @@ -9,7 +9,7 @@ #include "check.h" #include "ultrix.h" -int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) +int ultrix_partition(struct parsed_partitions *state) { int i; Sector sect; @@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) #define PT_MAGIC 0x032957 /* Partition magic number */ #define PT_VALID 1 /* Indicates if struct is valid */ - data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, §); + data = read_part_sector(state, (16384 - sizeof(*label))/512, §); if (!data) return -1; diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h index a74bf8e2d370..a3cc00b2bded 100644 --- a/fs/partitions/ultrix.h +++ b/fs/partitions/ultrix.h @@ -2,4 +2,4 @@ * fs/partitions/ultrix.h */ -int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); +int ultrix_partition(struct parsed_partitions *state); diff --git a/fs/pipe.c b/fs/pipe.c index 37ba29ff3158..d79872eba09a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/log2.h> #include <linux/mount.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> @@ -18,11 +19,18 @@ #include <linux/pagemap.h> #include <linux/audit.h> #include <linux/syscalls.h> +#include <linux/fcntl.h> #include <asm/uaccess.h> #include <asm/ioctls.h> /* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-pages + */ +unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; + +/* * We use a start+len construction, which provides full use of the * allocated memory. * -- Florian Coosmann (FGC) @@ -390,7 +398,7 @@ redo: if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); + curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; do_wakeup = 1; @@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & - (PIPE_BUFFERS-1); + (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; @@ -518,8 +526,8 @@ redo1: break; } bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; char *src; @@ -580,7 +588,7 @@ redo2: if (!total_len) break; } - if (bufs < PIPE_BUFFERS) + if (bufs < pipe->buffers) continue; if (filp->f_flags & O_NONBLOCK) { if (!ret) @@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { count += pipe->bufs[buf].len; - buf = (buf+1) & (PIPE_BUFFERS-1); + buf = (buf+1) & (pipe->buffers - 1); } mutex_unlock(&inode->i_mutex); @@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait) } if (filp->f_mode & FMODE_WRITE) { - mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; /* * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). @@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + pipe->buffers = PIPE_DEF_BUFFERS; + return pipe; + } + kfree(pipe); } - return pipe; + return NULL; } void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) buf->ops->release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); + kfree(pipe->bufs); kfree(pipe); } @@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) } /* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + struct pipe_buffer *bufs; + + /* + * Must be a power-of-2 currently + */ + if (!is_power_of_2(arg)) + return -EINVAL; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. + */ + if (arg < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); + const unsigned int head = pipe->nrbufs - tail; + + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = arg; + return arg; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = file->f_path.dentry->d_inode->i_pipe; + if (!pipe) + return -EBADF; + + mutex_lock(&pipe->inode->i_mutex); + + switch (cmd) { + case F_SETPIPE_SZ: + if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) + return -EINVAL; + /* + * The pipe needs to be at least 2 pages large to + * guarantee POSIX behaviour. + */ + if (arg < 2) + return -EINVAL; + ret = pipe_set_size(pipe, arg); + break; + case F_GETPIPE_SZ: + ret = pipe->buffers; + break; + default: + ret = -EINVAL; + break; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +/* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47f5b145f56e..aea1d3f1ffb5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } +#ifdef CONFIG_HUGETLB_PAGE static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) { u64 pme = 0; @@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, return err; } +#endif /* HUGETLB_PAGE */ /* * /proc/pid/pagemap - an array mapping virtual pages to pfns @@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; +#ifdef CONFIG_HUGETLB_PAGE pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; +#endif pagemap_walk.mm = mm; pagemap_walk.private = ± diff --git a/fs/quota/quota.c b/fs/quota/quota.c index cfc78826da90..ce3dfd066f59 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, return security_quotactl(cmd, type, id, sb); } +static void quota_sync_one(struct super_block *sb, void *arg) +{ + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, *(int *)arg, 1); +} + static int quota_sync_all(int type) { - struct super_block *sb; int ret; if (type >= MAXQUOTAS) return -EINVAL; ret = security_quotactl(Q_SYNC, type, 0, NULL); - if (ret) - return ret; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_qcop || !sb->s_qcop->quota_sync) - continue; - - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - sb->s_qcop->quota_sync(sb, type, 1); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - - return 0; + if (!ret) + iterate_supers(quota_sync_one, &type); + return ret; } static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f47cd212dee1..a5ebae70dc6d 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = { BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, }; -struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) +struct inode *ramfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) { struct inode * inode = new_inode(sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); @@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) static int ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); + struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; if (inode) { - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; @@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * struct inode *inode; int error = -ENOSPC; - inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); if (!error) { - if (dir->i_mode & S_ISGID) - inode->i_gid = dir->i_gid; d_instantiate(dentry, inode); dget(dentry); dir->i_mtime = dir->i_ctime = CURRENT_TIME; @@ -241,7 +233,7 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ramfs_ops; sb->s_time_gran = 1; - inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); + inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { err = -ENOMEM; goto fail; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1d9c12714c5c..9977df9f3a54 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index d0c43cb99ffc..ee78d4a0086a 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode) */ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) { - - /* the quota init calls have to know who to charge the quota to, so - ** we have to set uid and gid here - */ - inode->i_uid = current_fsuid(); - inode->i_mode = mode; /* Make inode invalid - just in case we are going to drop it before * the initialization happens */ INODE_PKEY(inode)->k_objectid = 0; - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + /* the quota init calls have to know who to charge the quota to, so + ** we have to set uid and gid here + */ + inode_init_owner(inode, dir, mode); dquot_initialize(inode); return 0; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e7cc00e636dc..8c4cf273c672 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -723,11 +723,11 @@ out: (handler) = *(handlers)++) /* This is the implementation for the xattr plugin infrastructure */ -static inline struct xattr_handler * -find_xattr_handler_prefix(struct xattr_handler **handlers, +static inline const struct xattr_handler * +find_xattr_handler_prefix(const struct xattr_handler **handlers, const char *name) { - struct xattr_handler *xah; + const struct xattr_handler *xah; if (!handlers) return NULL; @@ -748,7 +748,7 @@ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, size_t size) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); @@ -767,7 +767,7 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); @@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) @@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen, size_t size; if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ @@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; } #endif /* Actual operations that are exported to VFS-land */ -struct xattr_handler *reiserfs_xattr_handlers[] = { +const struct xattr_handler *reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_XATTR &reiserfs_xattr_user_handler, &reiserfs_xattr_trusted_handler, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 9cdb759645a9..536d697a8a28 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list, return size; } -struct xattr_handler reiserfs_posix_acl_access_handler = { +const struct xattr_handler reiserfs_posix_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = posix_acl_get, @@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list, return size; } -struct xattr_handler reiserfs_posix_acl_default_handler = { +const struct xattr_handler reiserfs_posix_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = posix_acl_get, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 7271a477c041..237c6928d3c6 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec) sec->value = NULL; } -struct xattr_handler reiserfs_xattr_security_handler = { +const struct xattr_handler reiserfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = security_get, .set = security_set, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 5b08aaca3daf..9883736ce3ec 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, return len; } -struct xattr_handler reiserfs_xattr_trusted_handler = { +const struct xattr_handler reiserfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = trusted_get, .set = trusted_set, diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 75d59c49b911..45ae1a00013a 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size, return len; } -struct xattr_handler reiserfs_xattr_user_handler = { +const struct xattr_handler reiserfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = user_get, .set = user_set, diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 3e4803b4427e..6c978428892d 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -39,7 +39,7 @@ const struct file_operations smb_dir_operations = { .read = generic_read_dir, .readdir = smb_readdir, - .ioctl = smb_ioctl, + .unlocked_ioctl = smb_ioctl, .open = smb_dir_open, }; diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index dbf6548bbf06..84ecf0e43f91 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -437,7 +437,7 @@ const struct file_operations smb_file_operations = .aio_read = smb_file_aio_read, .write = do_sync_write, .aio_write = smb_file_aio_write, - .ioctl = smb_ioctl, + .unlocked_ioctl = smb_ioctl, .mmap = smb_file_mmap, .open = smb_file_open, .release = smb_file_release, diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c index dbae1f8ea26f..07215312ad39 100644 --- a/fs/smbfs/ioctl.c +++ b/fs/smbfs/ioctl.c @@ -13,6 +13,7 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/highuid.h> +#include <linux/smp_lock.h> #include <linux/net.h> #include <linux/smb_fs.h> @@ -22,14 +23,14 @@ #include "proto.h" -int -smb_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long +smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct smb_sb_info *server = server_from_inode(inode); + struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode); struct smb_conn_opt opt; int result = -EINVAL; + lock_kernel(); switch (cmd) { uid16_t uid16; uid_t uid32; @@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp, default: break; } + unlock_kernel(); return result; } diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h index 03f456c1b7d4..05939a6f43e6 100644 --- a/fs/smbfs/proto.h +++ b/fs/smbfs/proto.h @@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops; extern const struct file_operations smb_file_operations; extern const struct inode_operations smb_file_inode_operations; /* ioctl.c */ -extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); +extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* smbiod.c */ extern void smbiod_wake_up(void); extern int smbiod_register_server(struct smb_sb_info *server); diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c index 54350b59046b..00b2909bd469 100644 --- a/fs/smbfs/symlink.c +++ b/fs/smbfs/symlink.c @@ -15,7 +15,6 @@ #include <linux/pagemap.h> #include <linux/net.h> #include <linux/namei.h> -#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> diff --git a/fs/splice.c b/fs/splice.c index 9313b6124a2e..ac22b00d86c3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, break; } - if (pipe->nrbufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; buf->page = spd->pages[page_nr]; @@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, if (!--spd->nr_pages) break; - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) continue; break; @@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) page_cache_release(spd->pages[i]); } +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, { struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; @@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); + nr_pages = min(req_pages, pipe->buffers); /* * Lookup the (hopefully) full range of pages we need. */ - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; /* @@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unlock_page(page); } - pages[spd.nr_pages++] = page; + spd.pages[spd.nr_pages++] = page; index++; } @@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * this_len is the max we'll use from this page */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); - page = pages[page_nr]; + page = spd.pages[page_nr]; if (PageReadahead(page)) page_cache_async_readahead(mapping, &in->f_ra, in, @@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = -ENOMEM; break; } - page_cache_release(pages[page_nr]); - pages[page_nr] = page; + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -451,8 +484,8 @@ fill_it: len = this_len; } - partial[page_nr].offset = loff; - partial[page_nr].len = this_len; + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; spd.nr_pages++; @@ -464,12 +497,13 @@ fill_it: * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) - page_cache_release(pages[page_nr++]); + page_cache_release(spd.pages[page_nr++]); in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) - return splice_to_pipe(pipe, &spd); + error = splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); return error; } @@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, unsigned int nr_pages; unsigned int nr_freed; size_t offset; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; pgoff_t index; ssize_t res; size_t this_len; @@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (pipe->buffers > PIPE_DEF_BUFFERS) { + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } + index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { struct page *page; page = alloc_page(GFP_USER); @@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; - pages[i] = page; + spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; @@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { this_len = min_t(size_t, vec[i].iov_len, res); - partial[i].offset = 0; - partial[i].len = this_len; + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; if (!this_len) { - __free_page(pages[i]); - pages[i] = NULL; + __free_page(spd.pages[i]); + spd.pages[i] = NULL; nr_freed++; } res -= this_len; @@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, if (res > 0) *ppos += res; +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(pipe, &spd); return res; err: for (i = 0; i < spd.nr_pages; i++) - __free_page(pages[i]); + __free_page(spd.pages[i]); - return error; + res = error; + goto shrink_ret; } EXPORT_SYMBOL(default_file_splice_read); @@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->inode) sd->need_wakeup = true; @@ -1211,7 +1261,7 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) { @@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, int aligned) + struct partial_page *partial, int aligned, + unsigned int pipe_buffers) { int buffers = 0, error = 0; @@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, break; npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > PIPE_BUFFERS - buffers) - npages = PIPE_BUFFERS - buffers; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; error = get_user_pages_fast((unsigned long)base, npages, 0, &pages[buffers]); @@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, * or if we mapped the max number of pages that we have * room for. */ - if (error < npages || buffers == PIPE_BUFFERS) + if (error < npages || buffers == pipe_buffers) break; nr_vecs--; @@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, }; + long ret; pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, - flags & SPLICE_F_GIFT); + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, + pipe->buffers); if (spd.nr_pages <= 0) - return spd.nr_pages; + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); - return splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); + return ret; } /* @@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check ->nrbufs without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) return 0; ret = 0; pipe_lock(pipe); - while (pipe->nrbufs >= PIPE_BUFFERS) { + while (pipe->nrbufs >= pipe->buffers) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1810,7 +1869,7 @@ retry: * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { /* Already processed some buffers, break */ if (ret) break; @@ -1831,7 +1890,7 @@ retry: } ibuf = ipipe->bufs + ipipe->curbuf; - nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); obuf = opipe->bufs + nbuf; if (len >= ibuf->len) { @@ -1841,7 +1900,7 @@ retry: *obuf = *ibuf; ibuf->ops = NULL; opipe->nrbufs++; - ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); ipipe->nrbufs--; input_wakeup = true; } else { @@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * If we have iterated all input buffers or ran out of * output room, break. */ - if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) break; - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); /* * Get a reference to this pipe buffer, diff --git a/fs/statfs.c b/fs/statfs.c new file mode 100644 index 000000000000..4ef021f3b612 --- /dev/null +++ b/fs/statfs.c @@ -0,0 +1,196 @@ +#include <linux/syscalls.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/statfs.h> +#include <linux/security.h> +#include <linux/uaccess.h> + +int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int retval = -ENODEV; + + if (dentry) { + retval = -ENOSYS; + if (dentry->d_sb->s_op->statfs) { + memset(buf, 0, sizeof(*buf)); + retval = security_sb_statfs(dentry); + if (retval) + return retval; + retval = dentry->d_sb->s_op->statfs(dentry, buf); + if (retval == 0 && buf->f_frsize == 0) + buf->f_frsize = buf->f_bsize; + } + } + return retval; +} + +EXPORT_SYMBOL(vfs_statfs); + +static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) +{ + struct kstatfs st; + int retval; + + retval = vfs_statfs(dentry, &st); + if (retval) + return retval; + + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { + if (sizeof buf->f_blocks == 4) { + if ((st.f_blocks | st.f_bfree | st.f_bavail | + st.f_bsize | st.f_frsize) & + 0xffffffff00000000ULL) + return -EOVERFLOW; + /* + * f_files and f_ffree may be -1; it's okay to stuff + * that into 32 bits + */ + if (st.f_files != -1 && + (st.f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (st.f_ffree != -1 && + (st.f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + + buf->f_type = st.f_type; + buf->f_bsize = st.f_bsize; + buf->f_blocks = st.f_blocks; + buf->f_bfree = st.f_bfree; + buf->f_bavail = st.f_bavail; + buf->f_files = st.f_files; + buf->f_ffree = st.f_ffree; + buf->f_fsid = st.f_fsid; + buf->f_namelen = st.f_namelen; + buf->f_frsize = st.f_frsize; + memset(buf->f_spare, 0, sizeof(buf->f_spare)); + } + return 0; +} + +static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) +{ + struct kstatfs st; + int retval; + + retval = vfs_statfs(dentry, &st); + if (retval) + return retval; + + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { + buf->f_type = st.f_type; + buf->f_bsize = st.f_bsize; + buf->f_blocks = st.f_blocks; + buf->f_bfree = st.f_bfree; + buf->f_bavail = st.f_bavail; + buf->f_files = st.f_files; + buf->f_ffree = st.f_ffree; + buf->f_fsid = st.f_fsid; + buf->f_namelen = st.f_namelen; + buf->f_frsize = st.f_frsize; + memset(buf->f_spare, 0, sizeof(buf->f_spare)); + } + return 0; +} + +SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) +{ + struct path path; + int error; + + error = user_path(pathname, &path); + if (!error) { + struct statfs tmp; + error = vfs_statfs_native(path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_put(&path); + } + return error; +} + +SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) +{ + struct path path; + long error; + + if (sz != sizeof(*buf)) + return -EINVAL; + error = user_path(pathname, &path); + if (!error) { + struct statfs64 tmp; + error = vfs_statfs64(path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_put(&path); + } + return error; +} + +SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) +{ + struct file *file; + struct statfs tmp; + int error; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + error = vfs_statfs_native(file->f_path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +out: + return error; +} + +SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) +{ + struct file *file; + struct statfs64 tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + error = vfs_statfs64(file->f_path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +out: + return error; +} + +SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +{ + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; + int err; + + s = user_get_super(new_decode_dev(dev)); + if (!s) + return -EINVAL; + + err = vfs_statfs(s->s_root, &sbuf); + drop_super(s); + if (err) + return err; + + memset(&tmp,0,sizeof(struct ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + + return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; +} diff --git a/fs/super.c b/fs/super.c index 1527e6a0ee35..69688b15f1fa 100644 --- a/fs/super.c +++ b/fs/super.c @@ -22,23 +22,15 @@ #include <linux/module.h> #include <linux/slab.h> -#include <linux/init.h> -#include <linux/smp_lock.h> #include <linux/acct.h> #include <linux/blkdev.h> #include <linux/quotaops.h> -#include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/vfs.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> -#include <linux/kobject.h> #include <linux/mutex.h> -#include <linux/file.h> #include <linux/backing-dev.h> -#include <asm/uaccess.h> #include "internal.h" @@ -93,9 +85,10 @@ static struct super_block *alloc_super(struct file_system_type *type) * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); - s->s_count = S_BIAS; + s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); + lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); init_rwsem(&s->s_dquot.dqptr_sem); @@ -127,39 +120,14 @@ static inline void destroy_super(struct super_block *s) /* Superblock refcounting */ /* - * Drop a superblock's refcount. Returns non-zero if the superblock was - * destroyed. The caller must hold sb_lock. + * Drop a superblock's refcount. The caller must hold sb_lock. */ -static int __put_super(struct super_block *sb) +void __put_super(struct super_block *sb) { - int ret = 0; - if (!--sb->s_count) { + list_del_init(&sb->s_list); destroy_super(sb); - ret = 1; } - return ret; -} - -/* - * Drop a superblock's refcount. - * Returns non-zero if the superblock is about to be destroyed and - * at least is already removed from super_blocks list, so if we are - * making a loop through super blocks then we need to restart. - * The caller must hold sb_lock. - */ -int __put_super_and_need_restart(struct super_block *sb) -{ - /* check for race with generic_shutdown_super() */ - if (list_empty(&sb->s_list)) { - /* super block is removed, need to restart... */ - __put_super(sb); - return 1; - } - /* can't be the last, since s_list is still in use */ - sb->s_count--; - BUG_ON(sb->s_count == 0); - return 0; } /** @@ -178,57 +146,48 @@ void put_super(struct super_block *sb) /** - * deactivate_super - drop an active reference to superblock + * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * - * Drops an active reference to superblock, acquiring a temprory one if - * there is no active references left. In that case we lock superblock, + * Drops an active reference to superblock, converting it into a temprory + * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. + * + * Caller holds exclusive lock on superblock; that lock is released. */ -void deactivate_super(struct super_block *s) +void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; - if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { - s->s_count -= S_BIAS-1; - spin_unlock(&sb_lock); + if (atomic_dec_and_test(&s->s_active)) { vfs_dq_off(s, 0); - down_write(&s->s_umount); fs->kill_sb(s); put_filesystem(fs); put_super(s); + } else { + up_write(&s->s_umount); } } -EXPORT_SYMBOL(deactivate_super); +EXPORT_SYMBOL(deactivate_locked_super); /** - * deactivate_locked_super - drop an active reference to superblock + * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * - * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that - * it does not unlock it until it's all over. As the result, it's safe to - * use to dispose of new superblock on ->get_sb() failure exits - nobody - * will see the sucker until it's all over. Equivalent using up_write + - * deactivate_super is safe for that purpose only if superblock is either - * safe to use or has NULL ->s_root when we unlock. + * Variant of deactivate_locked_super(), except that superblock is *not* + * locked by caller. If we are going to drop the final active reference, + * lock will be acquired prior to that. */ -void deactivate_locked_super(struct super_block *s) +void deactivate_super(struct super_block *s) { - struct file_system_type *fs = s->s_type; - if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { - s->s_count -= S_BIAS-1; - spin_unlock(&sb_lock); - vfs_dq_off(s, 0); - fs->kill_sb(s); - put_filesystem(fs); - put_super(s); - } else { - up_write(&s->s_umount); + if (!atomic_add_unless(&s->s_active, -1, 1)) { + down_write(&s->s_umount); + deactivate_locked_super(s); } } -EXPORT_SYMBOL(deactivate_locked_super); +EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference @@ -243,22 +202,17 @@ EXPORT_SYMBOL(deactivate_locked_super); */ static int grab_super(struct super_block *s) __releases(sb_lock) { + if (atomic_inc_not_zero(&s->s_active)) { + spin_unlock(&sb_lock); + return 1; + } + /* it's going away */ s->s_count++; spin_unlock(&sb_lock); + /* wait for it to die */ down_write(&s->s_umount); - if (s->s_root) { - spin_lock(&sb_lock); - if (s->s_count > S_BIAS) { - atomic_inc(&s->s_active); - s->s_count--; - spin_unlock(&sb_lock); - return 1; - } - spin_unlock(&sb_lock); - } up_write(&s->s_umount); put_super(s); - yield(); return 0; } @@ -321,8 +275,7 @@ void generic_shutdown_super(struct super_block *sb) } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ - list_del_init(&sb->s_list); - list_del(&sb->s_instances); + list_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); } @@ -357,6 +310,7 @@ retry: up_write(&s->s_umount); destroy_super(s); } + down_write(&old->s_umount); return old; } } @@ -408,11 +362,12 @@ EXPORT_SYMBOL(drop_super); */ void sync_supers(void) { - struct super_block *sb; + struct super_block *sb, *n; spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_op->write_super && sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); @@ -423,14 +378,43 @@ restart: up_read(&sb->s_umount); spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; + __put_super(sb); } } spin_unlock(&sb_lock); } /** + * iterate_supers - call function for all active superblocks + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. + */ +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *n; + + spin_lock(&sb_lock); + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + __put_super(sb); + } + spin_unlock(&sb_lock); +} + +/** * get_super - get the superblock of a device * @bdev: device to get the superblock for * @@ -438,7 +422,7 @@ restart: * mounted on the device given. %NULL is returned if no match is found. */ -struct super_block * get_super(struct block_device *bdev) +struct super_block *get_super(struct block_device *bdev) { struct super_block *sb; @@ -448,17 +432,20 @@ struct super_block * get_super(struct block_device *bdev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); + /* still alive? */ if (sb->s_root) return sb; up_read(&sb->s_umount); - /* restart only when sb is no longer on the list */ + /* nope, got unmounted */ spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto rescan; + __put_super(sb); + goto rescan; } } spin_unlock(&sb_lock); @@ -473,7 +460,7 @@ EXPORT_SYMBOL(get_super); * * Scans the superblock list and finds the superblock of the file system * mounted on the device given. Returns the superblock with an active - * reference and s_umount held exclusively or %NULL if none was found. + * reference or %NULL if none was found. */ struct super_block *get_active_super(struct block_device *bdev) { @@ -482,81 +469,49 @@ struct super_block *get_active_super(struct block_device *bdev) if (!bdev) return NULL; +restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdev != bdev) + if (list_empty(&sb->s_instances)) continue; - - sb->s_count++; - spin_unlock(&sb_lock); - down_write(&sb->s_umount); - if (sb->s_root) { - spin_lock(&sb_lock); - if (sb->s_count > S_BIAS) { - atomic_inc(&sb->s_active); - sb->s_count--; - spin_unlock(&sb_lock); + if (sb->s_bdev == bdev) { + if (grab_super(sb)) /* drops sb_lock */ return sb; - } - spin_unlock(&sb_lock); + else + goto restart; } - up_write(&sb->s_umount); - put_super(sb); - yield(); - spin_lock(&sb_lock); } spin_unlock(&sb_lock); return NULL; } -struct super_block * user_get_super(dev_t dev) +struct super_block *user_get_super(dev_t dev) { struct super_block *sb; spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); + /* still alive? */ if (sb->s_root) return sb; up_read(&sb->s_umount); - /* restart only when sb is no longer on the list */ + /* nope, got unmounted */ spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto rescan; + __put_super(sb); + goto rescan; } } spin_unlock(&sb_lock); return NULL; } -SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) -{ - struct super_block *s; - struct ustat tmp; - struct kstatfs sbuf; - int err = -EINVAL; - - s = user_get_super(new_decode_dev(dev)); - if (s == NULL) - goto out; - err = vfs_statfs(s->s_root, &sbuf); - drop_super(s); - if (err) - goto out; - - memset(&tmp,0,sizeof(struct ustat)); - tmp.f_tfree = sbuf.f_bfree; - tmp.f_tinode = sbuf.f_ffree; - - err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0; -out: - return err; -} - /** * do_remount_sb - asks filesystem to change mount options. * @sb: superblock in question @@ -622,24 +577,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) static void do_emergency_remount(struct work_struct *work) { - struct super_block *sb; + struct super_block *sb, *n; spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; sb->s_count++; spin_unlock(&sb_lock); down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { /* - * ->remount_fs needs lock_kernel(). - * * What lock protects sb->s_flags?? */ do_remount_sb(sb, MS_RDONLY, NULL, 1); } up_write(&sb->s_umount); - put_super(sb); spin_lock(&sb_lock); + __put_super(sb); } spin_unlock(&sb_lock); kfree(work); @@ -990,6 +945,96 @@ out: EXPORT_SYMBOL_GPL(vfs_kern_mount); +/** + * freeze_super -- lock the filesystem and force it into a consistent state + * @super: the super to lock + * + * Syncs the super to make sure the filesystem is consistent and calls the fs's + * freeze_fs. Subsequent calls to this without first thawing the fs will return + * -EBUSY. + */ +int freeze_super(struct super_block *sb) +{ + int ret; + + atomic_inc(&sb->s_active); + down_write(&sb->s_umount); + if (sb->s_frozen) { + deactivate_locked_super(sb); + return -EBUSY; + } + + if (sb->s_flags & MS_RDONLY) { + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + up_write(&sb->s_umount); + return 0; + } + + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + sync_filesystem(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + if (sb->s_op->freeze_fs) { + ret = sb->s_op->freeze_fs(sb); + if (ret) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + deactivate_locked_super(sb); + return ret; + } + } + up_write(&sb->s_umount); + return 0; +} +EXPORT_SYMBOL(freeze_super); + +/** + * thaw_super -- unlock filesystem + * @sb: the super to thaw + * + * Unlocks the filesystem and marks it writeable again after freeze_super(). + */ +int thaw_super(struct super_block *sb) +{ + int error; + + down_write(&sb->s_umount); + if (sb->s_frozen == SB_UNFROZEN) { + up_write(&sb->s_umount); + return -EINVAL; + } + + if (sb->s_flags & MS_RDONLY) + goto out; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + up_write(&sb->s_umount); + return error; + } + } + +out: + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + deactivate_locked_super(sb); + + return 0; +} +EXPORT_SYMBOL(thaw_super); + static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) { int err; diff --git a/fs/sync.c b/fs/sync.c index 92b228176f7c..e8cbd415e50a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb_locked(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb) } EXPORT_SYMBOL_GPL(sync_filesystem); +static void sync_one_sb(struct super_block *sb, void *arg) +{ + if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) + __sync_filesystem(sb, *(int *)arg); +} /* * Sync all the data for all the filesystems (called by sys_sync() and * emergency sync) - * - * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync - * is used only here. We set it against all filesystems and then clear it as - * we sync them. So redirtied filesystems are skipped. - * - * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync - * flags again, which will cause process A to resync everything. Fix that with - * a local mutex. */ static void sync_filesystems(int wait) { - struct super_block *sb; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); /* Could be down_interruptible */ - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) - sb->s_need_sync = 1; - -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync) - continue; - sb->s_need_sync = 0; - sb->s_count++; - spin_unlock(&sb_lock); - - down_read(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi) - __sync_filesystem(sb, wait); - up_read(&sb->s_umount); - - /* restart only when sb is no longer on the list */ - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - mutex_unlock(&mutex); + iterate_supers(sync_one_sb, &wait); } /* @@ -190,7 +158,6 @@ EXPORT_SYMBOL(file_fsync); /** * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync - * @dentry: dentry of @file * @start: offset in bytes of the beginning of data range to sync * @end: offset in bytes of the end of data range (inclusive) * @datasync: perform only datasync @@ -198,32 +165,13 @@ EXPORT_SYMBOL(file_fsync); * Write back data in range @start..@end and metadata for @file to disk. If * @datasync is set only metadata needed to access modified file data is * written. - * - * In case this function is called from nfsd @file may be %NULL and - * only @dentry is set. This can only happen when the filesystem - * implements the export_operations API. */ -int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, - loff_t end, int datasync) +int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { - const struct file_operations *fop; - struct address_space *mapping; + struct address_space *mapping = file->f_mapping; int err, ret; - /* - * Get mapping and operations from the file in case we have - * as file, or get the default values for them in case we - * don't have a struct file available. Damn nfsd.. - */ - if (file) { - mapping = file->f_mapping; - fop = file->f_op; - } else { - mapping = dentry->d_inode->i_mapping; - fop = dentry->d_inode->i_fop; - } - - if (!fop || !fop->fsync) { + if (!file->f_op || !file->f_op->fsync) { ret = -EINVAL; goto out; } @@ -235,7 +183,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, * livelocks in fsync_buffers_list(). */ mutex_lock(&mapping->host->i_mutex); - err = fop->fsync(file, dentry, datasync); + err = file->f_op->fsync(file, file->f_path.dentry, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); @@ -248,19 +196,14 @@ EXPORT_SYMBOL(vfs_fsync_range); /** * vfs_fsync - perform a fsync or fdatasync on a file * @file: file to sync - * @dentry: dentry of @file * @datasync: only perform a fdatasync operation * * Write back data and metadata for @file to disk. If @datasync is * set only metadata needed to access modified file data is written. - * - * In case this function is called from nfsd @file may be %NULL and - * only @dentry is set. This can only happen when the filesystem - * implements the export_operations API. */ -int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int vfs_fsync(struct file *file, int datasync) { - return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); + return vfs_fsync_range(file, 0, LLONG_MAX, datasync); } EXPORT_SYMBOL(vfs_fsync); @@ -271,7 +214,7 @@ static int do_fsync(unsigned int fd, int datasync) file = fget(fd); if (file) { - ret = vfs_fsync(file, file->f_path.dentry, datasync); + ret = vfs_fsync(file, datasync); fput(file); } return ret; @@ -299,8 +242,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count) { if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) return 0; - return vfs_fsync_range(file, file->f_path.dentry, pos, - pos + count - 1, + return vfs_fsync_range(file, pos, pos + count - 1, (file->f_flags & __O_SYNC) ? 0 : 1); } EXPORT_SYMBOL(generic_write_sync); diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 241e9765cfad..bbd69bdb0fa8 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_uid = current_fsuid(); + inode_init_owner(inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; @@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) insert_inode_hash(inode); mark_inode_dirty(inode); - inode->i_mode = mode; /* for sysv_write_inode() */ sysv_write_inode(inode, 0); /* ensure inode not allocated again */ mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ /* That's it. */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 401e503d44a1..87ebcce72213 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, */ inode->i_flags |= (S_NOCMTIME); - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index fb68c9cd0c3e..2b5586c7f02a 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -124,15 +124,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) udf_updated_lvid(sb); } mutex_unlock(&sbi->s_alloc_mutex); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + + inode_init_owner(inode, dir, mode); iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 75816025f95f..585f733615dc 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -579,7 +579,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; - inode->i_mode = mode; mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); @@ -627,7 +626,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, goto out; iinfo = UDF_I(inode); - inode->i_uid = current_fsuid(); init_special_inode(inode, mode, rdev); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { @@ -674,7 +672,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; err = -EIO; - inode = udf_new_inode(dir, S_IFDIR, &err); + inode = udf_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out; @@ -697,9 +695,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); brelse(fibh.sbh); - inode->i_mode = S_IFDIR | mode; - if (dir->i_mode & S_ISGID) - inode->i_mode |= S_ISGID; mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); @@ -912,7 +907,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, dquot_initialize(dir); lock_kernel(); - inode = udf_new_inode(dir, S_IFLNK, &err); + inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); if (!inode) goto out; @@ -923,7 +918,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } iinfo = UDF_I(inode); - inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 230ecf608026..3a959d55084d 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -303,15 +303,7 @@ cg_found: sb->s_dirt = 1; inode->i_ino = cg * uspi->s_ipg + bit; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - + inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; diff --git a/fs/xattr.c b/fs/xattr.c index 46f87e828b48..01bb8135e14a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix) /* * Find the xattr_handler with the matching prefix. */ -static struct xattr_handler * -xattr_resolve_name(struct xattr_handler **handlers, const char **name) +static const struct xattr_handler * +xattr_resolve_name(const struct xattr_handler **handlers, const char **name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; if (!*name) return NULL; @@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name) ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) @@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; + const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; unsigned int size = 0; if (!buffer) { @@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct xattr_handler *handler; + const struct xattr_handler *handler; if (size == 0) value = ""; /* empty EA, do not remove */ @@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz int generic_removexattr(struct dentry *dentry, const char *name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b4769e40e8bc..c8fb13f83b3f 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \ xfs_itable.o \ xfs_dfrag.o \ xfs_log.o \ + xfs_log_cil.o \ xfs_log_recover.o \ xfs_mount.o \ xfs_mru_cache.o \ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index a7bc925c4d60..9f769b5b38fc 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, return error; } -struct xattr_handler xfs_xattr_acl_access_handler = { +const struct xattr_handler xfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = xfs_xattr_acl_get, .set = xfs_xattr_acl_set, }; -struct xattr_handler xfs_xattr_acl_default_handler = { +const struct xattr_handler xfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = xfs_xattr_acl_get, diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index f01de3c55c43..649ade8ef598 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -37,6 +37,7 @@ #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -850,6 +851,12 @@ xfs_buf_lock_value( * Note that this in no way locks the underlying pages, so it is only * useful for synchronizing concurrent use of buffer objects, not for * synchronizing independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. */ void xfs_buf_lock( @@ -857,6 +864,8 @@ xfs_buf_lock( { trace_xfs_buf_lock(bp, _RET_IP_); + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_mount, 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index e31bf21fe5d3..9ac8aea91529 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -19,6 +19,7 @@ #include "xfs_dmapi.h" #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index e9002513e08f..f2d1718c9165 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ /* * Table driven mount option parser. @@ -374,6 +376,13 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DMI)) { mp->m_flags |= XFS_MOUNT_DMAPI; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + mp->m_flags |= XFS_MOUNT_DELAYLOG; + cmn_err(CE_WARN, + "Enabling EXPERIMENTAL delayed logging feature " + "- use at your own risk.\n"); + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); @@ -535,6 +544,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -725,7 +735,8 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } STATIC void @@ -1754,7 +1765,7 @@ xfs_init_zones(void) * but it is much faster. */ xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) * sizeof(int))), "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_trans_zone; diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 233d4b9881b1..519618e9279e 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; -extern struct xattr_handler *xfs_xattr_handlers[]; +extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 8a319cfd2901..ff6bc797baf2 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap, ); +#define XFS_BUSY_SYNC \ + { 0, "async" }, \ + { 1, "sync" } + TRACE_EVENT(xfs_alloc_busy, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, int slot), - TP_ARGS(mp, agno, agbno, len, slot), + TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int sync), + TP_ARGS(trans, agno, agbno, len, sync), TP_STRUCT__entry( __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(int, tid) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, slot) + __field(int, sync) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->tid = trans->t_ticket->t_tid; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->slot = slot; + __entry->sync = sync; ), - TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", + TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->tid, __entry->agno, __entry->agbno, __entry->len, - __entry->slot) + __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) ); -#define XFS_BUSY_STATES \ - { 0, "found" }, \ - { 1, "missing" } - TRACE_EVENT(xfs_alloc_unbusy, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int slot, int found), - TP_ARGS(mp, agno, slot, found), + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(int, slot) - __field(int, found) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->slot = slot; - __entry->found = found; + __entry->agbno = agbno; + __entry->len = len; ), - TP_printk("dev %d:%d agno %u slot %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->slot, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->agbno, + __entry->len) ); +#define XFS_BUSY_STATES \ + { 0, "missing" }, \ + { 1, "found" } + TRACE_EVENT(xfs_alloc_busysearch, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, xfs_lsn_t lsn), - TP_ARGS(mp, agno, agbno, len, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int found), + TP_ARGS(mp, agno, agbno, len, found), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_lsn_t, lsn) + __field(int, found) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->lsn = lsn; + __entry->found = found; ), - TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", + TP_printk("dev %d:%d agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, + __print_symbolic(__entry->found, XFS_BUSY_STATES)) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, __entry->lsn) ); diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index fa01b9daba6b..87d3e03878c8 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, (void *)value, size, xflags); } -static struct xattr_handler xfs_xattr_user_handler = { +static const struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = 0, /* no flags implies user namespace */ .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_trusted_handler = { +static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_security_handler = { +static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -struct xattr_handler *xfs_xattr_handlers[] = { +const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index b89ec5df0129..585e7633dfc7 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk( for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) xfs_qm_dqinit_core(curid, type, d); xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : - XFS_BLI_GDQUOT_BUF))); + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index d13eeba2c8f8..0135e2a669d7 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); -extern struct xattr_handler xfs_xattr_acl_access_handler; -extern struct xattr_handler xfs_xattr_acl_default_handler; +extern const struct xattr_handler xfs_xattr_acl_access_handler; +extern const struct xattr_handler xfs_xattr_acl_default_handler; #else # define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index abb8222b88c9..401f364ad36c 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,14 +175,20 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Used in perag to mark blocks that have been freed - * but whose transactions aren't committed to disk yet. + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_alloc_busy_insert() for details. */ -typedef struct xfs_perag_busy { - xfs_agblock_t busy_start; - xfs_extlen_t busy_length; - struct xfs_trans *busy_tp; /* transaction that did the free */ -} xfs_perag_busy_t; +struct xfs_busy_extent { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + xlog_tid_t tid; /* transaction that created this */ +}; /* * Per-ag incore structure, copies of information in agf and agi, @@ -216,7 +222,8 @@ typedef struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; #ifdef __KERNEL__ - spinlock_t pagb_lock; /* lock for pagb_list */ + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */ @@ -226,7 +233,6 @@ typedef struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ #endif int pagb_count; /* pagb slots in use */ - xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ } xfs_perag_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 94cddbfb2560..a7fbe8a99b12 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -46,11 +46,9 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +static int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); /* * Prototypes for per-ag allocation routines @@ -540,9 +538,16 @@ xfs_alloc_ag_vextent( be32_to_cpu(agf->agf_length)); xfs_alloc_log_agf(args->tp, args->agbp, XFS_AGF_FREEBLKS); - /* search the busylist for these blocks */ - xfs_alloc_search_busy(args->tp, args->agno, - args->agbno, args->len); + /* + * Search the busylist for these blocks and mark the + * transaction as synchronous if blocks are found. This + * avoids the need to block due to a synchronous log + * force to ensure correct ordering as the synchronous + * transaction will guarantee that for us. + */ + if (xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)) + xfs_trans_set_sync(args->tp); } if (!args->isfl) xfs_trans_mod_sb(args->tp, @@ -1693,7 +1698,7 @@ xfs_free_ag_extent( * when the iclog commits to disk. If a busy block is allocated, * the iclog is pushed up to the LSN that freed the block. */ - xfs_alloc_mark_busy(tp, agno, bno, len); + xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist( *bnop = bno; /* - * As blocks are freed, they are added to the per-ag busy list - * and remain there until the freeing transaction is committed to - * disk. Now that we have allocated blocks, this list must be - * searched to see if a block is being reused. If one is, then - * the freeing transaction must be pushed to disk NOW by forcing - * to disk all iclogs up that transaction's LSN. + * As blocks are freed, they are added to the per-ag busy list and + * remain there until the freeing transaction is committed to disk. + * Now that we have allocated blocks, this list must be searched to see + * if a block is being reused. If one is, then the freeing transaction + * must be pushed to disk before this transaction. + * + * We do this by setting the current transaction to a sync transaction + * which guarantees that the freeing transaction is on disk before this + * transaction. This is done instead of a synchronous log force here so + * that we don't sit and wait with the AGF locked in the transaction + * during the log force. */ - xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); + if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) + xfs_trans_set_sync(tp); return 0; } @@ -2201,7 +2212,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; - memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); + pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; } #ifdef DEBUG @@ -2479,127 +2490,263 @@ error0: * list is reused, the transaction that freed it must be forced to disk * before continuing to use the block. * - * xfs_alloc_mark_busy - add to the per-ag busy list - * xfs_alloc_clear_busy - remove an item from the per-ag busy list + * xfs_alloc_busy_insert - add to the per-ag busy list + * xfs_alloc_busy_clear - remove an item from the per-ag busy list + * xfs_alloc_busy_search - search for a busy extent + */ + +/* + * Insert a new extent into the busy tree. + * + * The busy extent tree is indexed by the start block of the busy extent. + * there can be multiple overlapping ranges in the busy extent tree but only + * ever one entry at a given start block. The reason for this is that + * multi-block extents can be freed, then smaller chunks of that extent + * allocated and freed again before the first transaction commit is on disk. + * If the exact same start block is freed a second time, we have to wait for + * that busy extent to pass out of the tree before the new extent is inserted. + * There are two main cases we have to handle here. + * + * The first case is a transaction that triggers a "free - allocate - free" + * cycle. This can occur during btree manipulations as a btree block is freed + * to the freelist, then allocated from the free list, then freed again. In + * this case, the second extxpnet free is what triggers the duplicate and as + * such the transaction IDs should match. Because the extent was allocated in + * this transaction, the transaction must be marked as synchronous. This is + * true for all cases where the free/alloc/free occurs in the one transaction, + * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. + * This serves to catch violations of the second case quite effectively. + * + * The second case is where the free/alloc/free occur in different + * transactions. In this case, the thread freeing the extent the second time + * can't mark the extent busy immediately because it is already tracked in a + * transaction that may be committing. When the log commit for the existing + * busy extent completes, the busy extent will be removed from the tree. If we + * allow the second busy insert to continue using that busy extent structure, + * it can be freed before this transaction is safely in the log. Hence our + * only option in this case is to force the log to remove the existing busy + * extent from the list before we insert the new one with the current + * transaction ID. + * + * The problem we are trying to avoid in the free-alloc-free in separate + * transactions is most easily described with a timeline: + * + * Thread 1 Thread 2 Thread 3 xfslogd + * xact alloc + * free X + * mark busy + * commit xact + * free xact + * xact alloc + * alloc X + * busy search + * mark xact sync + * commit xact + * free xact + * force log + * checkpoint starts + * .... + * xact alloc + * free X + * mark busy + * finds match + * *** KABOOM! *** + * .... + * log IO completes + * unbusy X + * checkpoint completes + * + * By issuing a log force in thread 3 @ "KABOOM", the thread will block until + * the checkpoint completes, and the busy extent it matched will have been + * removed from the tree when it is woken. Hence it can then continue safely. + * + * However, to ensure this matching process is robust, we need to use the + * transaction ID for identifying transaction, as delayed logging results in + * the busy extent and transaction lifecycles being different. i.e. the busy + * extent is active for a lot longer than the transaction. Hence the + * transaction structure can be freed and reallocated, then mark the same + * extent busy again in the new transaction. In this case the new transaction + * will have a different tid but can have the same address, and hence we need + * to check against the tid. + * + * Future: for delayed logging, we could avoid the log force if the extent was + * first freed in the current checkpoint sequence. This, however, requires the + * ability to pin the current checkpoint in memory until this transaction + * commits to ensure that both the original free and the current one combine + * logically into the one checkpoint. If the checkpoint sequences are + * different, however, we still need to wait on a log force. */ void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_perag_busy_t *bsy; + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; struct xfs_perag *pag; - int n; + struct rb_node **rbp; + struct rb_node *parent; + int match; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - /* search pagb_list for an open slot */ - for (bsy = pag->pagb_list, n = 0; - n < XFS_PAGB_NUM_SLOTS; - bsy++, n++) { - if (bsy->busy_tp == NULL) { - break; - } + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_alloc_busy(tp, agno, bno, len, 1); + xfs_trans_set_sync(tp); + return; } - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); + new->agno = agno; + new->bno = bno; + new->length = len; + new->tid = xfs_log_get_trans_ident(tp); - if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &pag->pagb_list[n]; - pag->pagb_count++; - bsy->busy_start = bno; - bsy->busy_length = len; - bsy->busy_tp = tp; - xfs_trans_add_busy(tp, agno, n); - } else { + INIT_LIST_HEAD(&new->list); + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp, agno, bno, len, 0); + + pag = xfs_perag_get(tp->t_mountp, new->agno); +restart: + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + parent = NULL; + busyp = NULL; + match = 0; + while (*rbp && match >= 0) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + rbp = &(*rbp)->rb_left; + if (new->bno + new->length > busyp->bno) + match = busyp->tid == new->tid ? 1 : -1; + } else if (new->bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + rbp = &(*rbp)->rb_right; + if (bno < busyp->bno + busyp->length) + match = busyp->tid == new->tid ? 1 : -1; + } else { + match = busyp->tid == new->tid ? 1 : -1; + break; + } + } + if (match < 0) { + /* overlap marked busy in different transaction */ + spin_unlock(&pag->pagb_lock); + xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); + goto restart; + } + if (match > 0) { /* - * The busy list is full! Since it is now not possible to - * track the free block, make this a synchronous transaction - * to insure that the block is not reused before this - * transaction commits. + * overlap marked busy in same transaction. Update if exact + * start block match, otherwise combine the busy extents into + * a single range. */ - xfs_trans_set_sync(tp); - } + if (busyp->bno == new->bno) { + busyp->length = max(busyp->length, new->length); + spin_unlock(&pag->pagb_lock); + ASSERT(tp->t_flags & XFS_TRANS_SYNC); + xfs_perag_put(pag); + kmem_free(new); + return; + } + rb_erase(&busyp->rb_node, &pag->pagb_tree); + new->length = max(busyp->bno + busyp->length, + new->bno + new->length) - + min(busyp->bno, new->bno); + new->bno = min(busyp->bno, new->bno); + } else + busyp = NULL; + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); + kmem_free(busyp); } -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - int idx) +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +static int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { struct xfs_perag *pag; - xfs_perag_busy_t *list; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; - ASSERT(idx < XFS_PAGB_NUM_SLOTS); - pag = xfs_perag_get(tp->t_mountp, agno); + pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); - list = pag->pagb_list; - trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); - - if (list[idx].busy_tp == tp) { - list[idx].busy_tp = NULL; - pag->pagb_count--; + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } } - spin_unlock(&pag->pagb_lock); + trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); xfs_perag_put(pag); + return match; } - -/* - * If we find the extent in the busy list, force the log out to get the - * extent out of the busy list so the caller can use it straight away. - */ -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct xfs_busy_extent *busyp) { struct xfs_perag *pag; - xfs_perag_busy_t *bsy; - xfs_agblock_t uend, bend; - xfs_lsn_t lsn = 0; - int cnt; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - cnt = pag->pagb_count; + trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, + busyp->length); - /* - * search pagb_list for this slot, skipping open slots. We have to - * search the entire array as there may be multiple overlaps and - * we have to get the most recent LSN for the log force to push out - * all the transactions that span the range. - */ - uend = bno + len - 1; - for (cnt = 0; cnt < pag->pagb_count; cnt++) { - bsy = &pag->pagb_list[cnt]; - if (!bsy->busy_tp) - continue; + ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, + busyp->length) == 1); - bend = bsy->busy_start + bsy->busy_length - 1; - if (bno > bend || uend < bsy->busy_start) - continue; + list_del_init(&busyp->list); - /* (start1,length1) within (start2, length2) */ - if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) - lsn = bsy->busy_tp->t_commit_lsn; - } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + rb_erase(&busyp->rb_node, &pag->pagb_tree); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); - trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); - /* - * If a block was found, force the log through the LSN of the - * transaction that freed the block - */ - if (lsn) - xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC); + kmem_free(busyp); } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 599bffa39784..6d05199b667c 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -22,6 +22,7 @@ struct xfs_buf; struct xfs_mount; struct xfs_perag; struct xfs_trans; +struct xfs_busy_extent; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. @@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void -xfs_alloc_mark_busy(xfs_trans_t *tp, +xfs_alloc_busy_insert(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - int idx); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b726e10d2c1c..83f494218759 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -134,7 +134,7 @@ xfs_allocbt_free_block( * disk. If a busy block is allocated, the iclog is pushed up to the * LSN that freed the block. */ - xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 240340a4727b..02a80984aa05 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -64,7 +64,7 @@ xfs_buf_item_log_debug( nbytes = last - first + 1; bfset(bip->bli_logged, first, nbytes); for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLI_SHIFT; + chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); wordp = &(bip->bli_format.blf_data_map[word_num]); @@ -166,7 +166,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); return 1; } @@ -197,9 +197,9 @@ xfs_buf_item_size( } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; - } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != - (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + - XFS_BLI_CHUNK)) { + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { last_bit = next_bit; nvecs++; } else { @@ -254,6 +254,20 @@ xfs_buf_item_format( vecp++; nvecs = 1; + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(&bip->bli_item))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -261,7 +275,7 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); bip->bli_format.blf_size = nvecs; return; } @@ -294,28 +308,28 @@ xfs_buf_item_format( * keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + - XFS_BLI_CHUNK)) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + XFS_BLF_CHUNK)) { + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; /* You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary @@ -341,10 +355,15 @@ xfs_buf_item_format( } /* - * This is called to pin the buffer associated with the buf log - * item in memory so it cannot be written out. Simply call bpin() - * on the buffer to do this. + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. Simply call bpin() on the buffer to do this. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. */ + STATIC void xfs_buf_item_pin( xfs_buf_log_item_t *bip) @@ -356,6 +375,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); + atomic_inc(&bip->bli_refcount); trace_xfs_buf_item_pin(bip); xfs_bpin(bp); } @@ -393,7 +413,7 @@ xfs_buf_item_unpin( ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); /* @@ -489,20 +509,23 @@ xfs_buf_item_trylock( } /* - * Release the buffer associated with the buf log item. - * If there is no dirty logged data associated with the - * buffer recorded in the buf log item, then free the - * buf log item and remove the reference to it in the - * buffer. + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. + * + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. * - * This call ignores the recursion count. It is only called - * when the buffer should REALLY be unlocked, regardless - * of the recursion count. + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. * - * If the XFS_BLI_HOLD flag is set in the buf log item, then - * free the log item if necessary but do not unlock the buffer. - * This is for support of xfs_trans_bhold(). Make sure the - * XFS_BLI_HOLD field is cleared if we don't free the item. + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. */ STATIC void xfs_buf_item_unlock( @@ -514,73 +537,54 @@ xfs_buf_item_unlock( bp = bip->bli_buf; - /* - * Clear the buffer's association with this transaction. - */ + /* Clear the buffer's association with this transaction. */ XFS_BUF_SET_FSPRIVATE2(bp, NULL); /* - * If this is a transaction abort, don't return early. - * Instead, allow the brelse to happen. - * Normally it would be done for stale (cancelled) buffers - * at unpin time, but we'll never go through the pin/unpin - * cycle if we abort inside commit. + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. */ aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; /* - * If the buf item is marked stale, then don't do anything. - * We'll unlock the buffer and free the buf item when the - * buffer is unpinned for the last time. + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. */ - if (bip->bli_flags & XFS_BLI_STALE) { - bip->bli_flags &= ~XFS_BLI_LOGGED; - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - if (!aborted) - return; - } + hold = bip->bli_flags & XFS_BLI_HOLD; + + /* Clear the per transaction state. */ + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); /* - * Drop the transaction's reference to the log item if - * it was not logged as part of the transaction. Otherwise - * we'll drop the reference in xfs_buf_item_unpin() when - * the transaction is really through with the buffer. + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. */ - if (!(bip->bli_flags & XFS_BLI_LOGGED)) { - atomic_dec(&bip->bli_refcount); - } else { - /* - * Clear the logged flag since this is per - * transaction state. - */ - bip->bli_flags &= ~XFS_BLI_LOGGED; + if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); + return; + } } - /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. - */ - hold = bip->bli_flags & XFS_BLI_HOLD; trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it. - * Otherwise, if XFS_BLI_HOLD is set clear it. + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. */ if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) { + bip->bli_format.blf_map_size)) xfs_buf_item_relse(bp); - } else if (hold) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } + else + atomic_dec(&bip->bli_refcount); - /* - * Release the buffer if XFS_BLI_HOLD was not set. - */ - if (!hold) { + if (!hold) xfs_buf_relse(bp); - } } /* @@ -717,12 +721,12 @@ xfs_buf_item_init( } /* - * chunks is the number of XFS_BLI_CHUNK size pieces + * chunks is the number of XFS_BLF_CHUNK size pieces * the buffer can be divided into. Make sure not to * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, @@ -790,8 +794,8 @@ xfs_buf_item_log( /* * Convert byte offsets to bit numbers. */ - first_bit = first >> XFS_BLI_SHIFT; - last_bit = last >> XFS_BLI_SHIFT; + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; /* * Calculate the total number of bits to be set. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index df4454511f73..f20bb472d582 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format { * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLI_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF 0x1 /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLI_CANCEL 0x2 +#define XFS_BLF_CANCEL 0x2 /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ -#define XFS_BLI_UDQUOT_BUF 0x4 -#define XFS_BLI_PDQUOT_BUF 0x8 -#define XFS_BLI_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF 0x4 +#define XFS_BLF_PDQUOT_BUF 0x8 +#define XFS_BLF_GDQUOT_BUF 0x10 -#define XFS_BLI_CHUNK 128 -#define XFS_BLI_SHIFT 7 +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 #define BIT_TO_WORD_SHIFT 5 #define NBWORD (NBBY * sizeof(unsigned int)) @@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format { #define XFS_BLI_LOGGED 0x08 #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format { { XFS_BLI_STALE, "STALE" }, \ { XFS_BLI_LOGGED, "LOGGED" }, \ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ - { XFS_BLI_STALE_INODE, "STALE_INODE" } + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" } #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ef96175c0744..047b8a8e5c29 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) va_list ap; #ifdef DEBUG - xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); #endif if (xfs_panic_mask && (xfs_panic_mask & panic_tag) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3038dd52c72a..5215abc8023a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); -STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, uint flags); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); @@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log, STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket); - -/* local ticket functions */ -STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int count, - char clientid, - uint flags); - #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); STATIC void xlog_verify_grant_head(xlog_t *log, int equals); @@ -360,6 +349,15 @@ xfs_log_reserve( ASSERT(flags & XFS_LOG_PERM_RESERV); internal_ticket = *ticket; + /* + * this is a new transaction on the ticket, so we need to + * change the transaction ID so that the next transaction has a + * different TID in the log. Just add one to the existing tid + * so that we can see chains of rolling transactions in the log + * easily. + */ + internal_ticket->t_tid++; + trace_xfs_log_reserve(log, internal_ticket); xlog_grant_push_ail(mp, internal_ticket->t_unit_res); @@ -367,7 +365,8 @@ xfs_log_reserve( } else { /* may sleep if need to allocate more tickets */ internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, - client, flags); + client, flags, + KM_SLEEP|KM_MAYFAIL); if (!internal_ticket) return XFS_ERROR(ENOMEM); internal_ticket->t_trans_type = t_type; @@ -452,6 +451,13 @@ xfs_log_mount( /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + return 0; out_destroy_ail: @@ -658,6 +664,10 @@ xfs_log_item_init( item->li_ailp = mp->m_ail; item->li_type = type; item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); } /* @@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp, *iclogp = log->l_iclog; /* complete ring */ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; return log; out_free_iclog: @@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log) xlog_in_core_t *iclog, *next_iclog; int i; + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { sv_destroy(&iclog->ic_force_wait); @@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log, * print out info relating to regions written which consume * the reservation */ -STATIC void -xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) { uint i; uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); @@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); } + + xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, + "xfs_log_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } /* @@ -1865,7 +1886,7 @@ xlog_write_copy_finish( * we don't update ic_offset until the end when we know exactly how many * bytes have been written out. */ -STATIC int +int xlog_write( struct log *log, struct xfs_log_vec *log_vector, @@ -1889,22 +1910,26 @@ xlog_write( *start_lsn = 0; len = xlog_write_calc_vec_length(ticket, log_vector); - if (ticket->t_curr_res < len) { - xlog_print_tic_res(log->l_mp, ticket); -#ifdef DEBUG - xlog_panic( - "xfs_log_write: reservation ran out. Need to up reservation"); -#else - /* Customer configurable panic */ - xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp, - "xfs_log_write: reservation ran out. Need to up reservation"); + if (log->l_cilp) { + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); - /* If we did not panic, shutdown the filesystem */ - xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE); -#endif - } + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + } else + ticket->t_curr_res -= len; - ticket->t_curr_res -= len; + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); index = 0; lv = log_vector; @@ -3000,6 +3025,8 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); + xlog_cil_push(log, 1); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -3149,6 +3176,12 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); + if (log->l_cilp) { + lsn = xlog_cil_push_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + } + try_again: spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -3313,22 +3346,30 @@ xfs_log_ticket_get( return ticket; } +xlog_tid_t +xfs_log_get_trans_ident( + struct xfs_trans *tp) +{ + return tp->t_ticket->t_tid; +} + /* * Allocate and initialise a new log ticket. */ -STATIC xlog_ticket_t * +xlog_ticket_t * xlog_ticket_alloc( struct log *log, int unit_bytes, int cnt, char client, - uint xflags) + uint xflags, + int alloc_flags) { struct xlog_ticket *tic; uint num_headers; int iclog_space; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); if (!tic) return NULL; @@ -3647,6 +3688,11 @@ xlog_state_ioerror( * c. nothing new gets queued up after (a) and (b) are done. * d. if !logerror, flush the iclogs to disk, then seal them off * for business. + * + * Note: for delayed logging the !logerror case needs to flush the regions + * held in memory out to the iclogs before flushing them to disk. This needs + * to be done before the log is marked as shutdown, otherwise the flush to the + * iclogs will fail. */ int xfs_log_force_umount( @@ -3680,6 +3726,16 @@ xfs_log_force_umount( return 1; } retval = 0; + + /* + * Flush the in memory commit item list before marking the log as + * being shut down. We need to do it in this order to ensure all the + * completed transactions are flushed to disk with the xfs_log_force() + * call below. + */ + if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + xlog_cil_push(log, 1); + /* * We must hold both the GRANT lock and the LOG lock, * before we mark the filesystem SHUTDOWN and wake diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 229d1f36ba9a..04c78e642cc8 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -19,7 +19,6 @@ #define __XFS_LOG_H__ /* get lsn fields */ - #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) #define BLOCK_LSN(lsn) ((uint)(lsn)) @@ -114,6 +113,9 @@ struct xfs_log_vec { struct xfs_log_vec *lv_next; /* next lv in build list */ int lv_niovecs; /* number of iovecs in lv */ struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_buf_len; /* size of formatted buffer */ }; /* @@ -134,6 +136,7 @@ struct xlog_in_core; struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; +struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, @@ -187,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp); void xlog_iodone(struct xfs_buf *); -struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); +struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); +xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); + +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, int flags); +bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); + #endif diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c new file mode 100644 index 000000000000..bb17cc044bf3 --- /dev/null +++ b/fs/xfs/xfs_log_cil.c @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_alloc.h" + +/* + * Perform initial CIL structure initialisation. If the CIL is not + * enabled in this filesystem, ensure the log->l_cilp is null so + * we can check this conditional to determine if we are doing delayed + * logging or not. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + log->l_cilp = NULL; + if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) + return 0; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (!log->l_cilp) + return; + + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + +/* + * Allocate a new ticket. Failing to get a new ticket makes it really hard to + * recover, so we don't allow failure here. Also, we allocate in a context that + * we don't want to be issuing transactions from, so we need to tell the + * allocation code this as well. + * + * We don't reserve any space for the ticket - we are going to steal whatever + * space we require from transactions as they commit. To ensure we reserve all + * the space required, we need to set the current reservation of the ticket to + * zero so that we know to steal the initial transaction overhead from the + * first transaction commit. + */ +static struct xlog_ticket * +xlog_cil_ticket_alloc( + struct log *log) +{ + struct xlog_ticket *tic; + + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, + KM_SLEEP|KM_NOFS); + tic->t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct log *log) +{ + if (!log->l_cilp) + return; + + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; + log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, + log->l_curr_block); +} + +/* + * Insert the log item into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we addded to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + * + * If this is the first time the item is being placed into the CIL in this + * context, pin it so it can't be written to disk until the CIL is flushed to + * the iclog and the iclog written to disk. + */ +static void +xlog_cil_insert( + struct log *log, + struct xlog_ticket *ticket, + struct xfs_log_item *item, + struct xfs_log_vec *lv) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *old = lv->lv_item->li_lv; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + int len; + int diff_iovecs; + int iclog_space; + + if (old) { + /* existing lv on log item, space used is a delta */ + ASSERT(!list_empty(&item->li_cil)); + ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + + len = lv->lv_buf_len - old->lv_buf_len; + diff_iovecs = lv->lv_niovecs - old->lv_niovecs; + kmem_free(old->lv_buf); + kmem_free(old); + } else { + /* new lv, must pin the log item */ + ASSERT(!lv->lv_item->li_lv); + ASSERT(list_empty(&item->li_cil)); + + len = lv->lv_buf_len; + diff_iovecs = lv->lv_niovecs; + IOP_PIN(lv->lv_item); + + } + len += diff_iovecs * sizeof(xlog_op_header_t); + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + spin_lock(&cil->xc_cil_lock); + list_move_tail(&item->li_cil, &cil->xc_cil); + ctx->nvecs += diff_iovecs; + + /* + * If this is the first time the item is being committed to the CIL, + * store the sequence number on the log item so we can tell + * in future commits whether this is the first checkpoint the item is + * being committed into. + */ + if (!item->li_seq) + item->li_seq = ctx->sequence; + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + /* first commit in checkpoint, steal the header reservation */ + ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + ticket->t_curr_res -= hdrs; + ASSERT(ticket->t_curr_res >= len); + } + ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +/* + * Format log item into a flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer. + * The buffer is then attached to the log item are then inserted into the + * Committed Item List for tracking until the next checkpoint is written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundares without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is change the rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. + */ +static void +xlog_cil_format_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn) +{ + struct xfs_log_vec *lv; + + if (start_lsn) + *start_lsn = log->l_cilp->xc_ctx->sequence; + + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) { + void *ptr; + int index; + int len = 0; + + /* build the vector array and calculate it's length */ + IOP_FORMAT(lv->lv_item, lv->lv_iovecp); + for (index = 0; index < lv->lv_niovecs; index++) + len += lv->lv_iovecp[index].i_len; + + lv->lv_buf_len = len; + lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); + ptr = lv->lv_buf; + + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + + memcpy(ptr, vec->i_addr, vec->i_len); + vec->i_addr = ptr; + ptr += vec->i_len; + } + ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + + xlog_cil_insert(log, ticket, lv->lv_item, lv); + } +} + +static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv->lv_buf); + kmem_free(lv); + lv = next; + } +} + +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + if (XLOG_FORCED_SHUTDOWN(log)) { + xlog_cil_free_logvec(log_vector); + return XFS_ERROR(EIO); + } + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); + return 0; +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_log_vec *lv; + int abortflag = abort ? XFS_LI_ABORTED : 0; + struct xfs_busy_extent *busyp, *n; + + /* unpin all the log items */ + for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { + xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, + abortflag); + } + + list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) + xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); + + spin_lock(&ctx->cil->xc_cil_lock); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_cil_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If the push_now flag is not set, + * then it is a background flush and so we can chose to ignore it. + */ +int +xlog_cil_push( + struct log *log, + int push_now) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_lv; + int num_iovecs; + int len; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + + if (!cil) + return 0; + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + /* lock out transaction commit, but don't block on background push */ + if (!down_write_trylock(&cil->xc_ctx_lock)) { + if (!push_now) + goto out_free_ticket; + down_write(&cil->xc_ctx_lock); + } + ctx = cil->xc_ctx; + + /* check if we've anything to push */ + if (list_empty(&cil->xc_cil)) + goto out_skip; + + /* check for spurious background flush */ + if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_lv = 0; + num_iovecs = 0; + len = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + int i; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + + num_lv++; + num_iovecs += lv->lv_niovecs; + for (i = 0; i < lv->lv_niovecs; i++) + len += lv->lv_iovecp[i].i_len; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + */ + spin_lock(&cil->xc_cil_lock); + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = (xfs_caddr_t)&thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for own own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + } + spin_unlock(&cil->xc_cil_lock); + + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (error || commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_cil_lock); + ctx->commit_lsn = commit_lsn; + sv_broadcast(&cil->xc_commit_wait); + spin_unlock(&cil->xc_cil_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); +out_free_ticket: + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return XFS_ERROR(EIO); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. + * + * XXX: Initially, just push the CIL unconditionally and return whatever + * commit lsn is there. It'll be empty, so this is broken for now. + */ +xfs_lsn_t +xlog_cil_push_lsn( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + +restart: + down_write(&cil->xc_ctx_lock); + ASSERT(push_seq <= cil->xc_ctx->sequence); + + /* check to see if we need to force out the current context */ + if (push_seq == cil->xc_ctx->sequence) { + up_write(&cil->xc_ctx_lock); + xlog_cil_push(log, 1); + goto restart; + } + + /* + * See if we can find a previous sequence still committing. + * We can drop the flush lock as soon as we have the cil lock + * because we are now only comparing contexts protected by + * the cil lock. + * + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ + spin_lock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + if (ctx->sequence > push_seq) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + if (ctx->sequence != push_seq) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + spin_unlock(&cil->xc_cil_lock); + return commit_lsn; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) + return false; + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 9cf695154451..8c072618965c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being shutdown */ -typedef __uint32_t xlog_tid_t; - #ifdef __KERNEL__ /* @@ -379,6 +377,99 @@ typedef struct xlog_in_core { } xlog_in_core_t; /* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + xfs_log_callback_t log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct log *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + struct xfs_cil_ctx *xc_ctx; + struct rw_semaphore xc_ctx_lock; + struct list_head xc_committing; + sv_t xc_commit_wait; +}; + +/* + * The amount of log space we should the CIL to aggregate is difficult to size. + * Whatever we chose we have to make we can get a reservation for the log space + * effectively, that it is large enough to capture sufficient relogging to + * reduce log buffer IO significantly, but it is not too large for the log or + * induces too much latency when writing out through the iclogs. We track both + * space consumed and the number of vectors in the checkpoint context, so we + * need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can basically make up arbitrary limits for the + * checkpoint size so long as they don't violate any other size rules. Hence + * the initial maximum size for the checkpoint transaction will be set to a + * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit + * right now based on the latency of writing out a large amount of data through + * the circular iclog buffers. + */ + +#define XLOG_CIL_SPACE_LIMIT(log) \ + (min((log->l_logsize >> 2), (8 * 1024 * 1024))) + +/* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean @@ -388,6 +479,7 @@ typedef struct log { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ @@ -438,14 +530,17 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) - /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern kmem_zone_t *xfs_log_ticket_zone; +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, + int count, char client, uint xflags, + int alloc_flags); + static inline void xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) @@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) *off += bytes; } +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int xlog_write(struct log *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, uint flags); + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct log *log); +void xlog_cil_init_post_recovery(struct log *log); +void xlog_cil_destroy(struct log *log); + +int xlog_cil_push(struct log *log, int push_now); +xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); + /* * Unmount record type is used as a pseudo transaction type for the ticket. * It's value must be outside the range of XFS_TRANS_* values. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0de08e366315..14a69aec2c0b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1576,7 +1576,7 @@ xlog_recover_reorder_trans( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); list_move(&item->ri_list, &trans->r_itemq); @@ -1638,7 +1638,7 @@ xlog_recover_do_buffer_pass1( /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLI_CANCEL)) { + if (!(flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_buf_not_cancel(log, buf_f); return; } @@ -1696,7 +1696,7 @@ xlog_recover_do_buffer_pass1( * Check to see whether the buffer being recovered has a corresponding * entry in the buffer cancel record table. If it does then return 1 * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement + * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement * the refcount on the entry in the table and remove it from the table * if this is the last reference. * @@ -1721,7 +1721,7 @@ xlog_check_buffer_cancelled( * There is nothing in the table built in pass one, * so this buffer must not be cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1733,7 +1733,7 @@ xlog_check_buffer_cancelled( * There is no corresponding entry in the table built * in pass one, so this buffer has not been cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1752,7 +1752,7 @@ xlog_check_buffer_cancelled( * one in the table and remove it if this is the * last reference. */ - if (flags & XFS_BLI_CANCEL) { + if (flags & XFS_BLF_CANCEL) { bcp->bc_refcount--; if (bcp->bc_refcount == 0) { if (prevp == NULL) { @@ -1772,7 +1772,7 @@ xlog_check_buffer_cancelled( * We didn't find a corresponding entry in the table, so * return 0 so that the buffer is NOT cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1874,8 +1874,8 @@ xlog_recover_do_inode_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLI_SHIFT; - reg_buf_bytes = nbits << XFS_BLI_SHIFT; + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; item_index++; } @@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer( } ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); /* @@ -1955,9 +1955,9 @@ xlog_recover_do_reg_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); + ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1966,7 +1966,7 @@ xlog_recover_do_reg_buffer( */ error = 0; if (buf_f->blf_flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { cmn_err(CE_ALERT, "XFS: NULL dquot in %s.", __func__); @@ -1987,9 +1987,9 @@ xlog_recover_do_reg_buffer( } memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLI_SHIFT), /* dest */ + (uint)bit << XFS_BLF_SHIFT), /* dest */ item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLI_SHIFT); /* length */ + nbits<<XFS_BLF_SHIFT); /* length */ next: i++; bit += nbits; @@ -2148,11 +2148,11 @@ xlog_recover_do_dquot_buffer( } type = 0; - if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* * This type of quotas was turned off, so ignore this buffer @@ -2173,7 +2173,7 @@ xlog_recover_do_dquot_buffer( * here which overlaps that may be stale. * * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLI_CANCEL bit set to indicate that previous copies + * with the XFS_BLF_CANCEL bit set to indicate that previous copies * of the buffer in the log should not be replayed at recovery time. * This is so that if the blocks covered by the buffer are reused for * file data before we crash we don't end up replaying old, freed @@ -2207,7 +2207,7 @@ xlog_recover_do_buffer_trans( if (pass == XLOG_RECOVER_PASS1) { /* * In this pass we're only looking for buf items - * with the XFS_BLI_CANCEL bit set. + * with the XFS_BLF_CANCEL bit set. */ xlog_recover_do_buffer_pass1(log, buf_f); return 0; @@ -2244,7 +2244,7 @@ xlog_recover_do_buffer_trans( mp = log->l_mp; buf_flags = XBF_LOCK; - if (!(flags & XFS_BLI_INODE_BUF)) + if (!(flags & XFS_BLF_INODE_BUF)) buf_flags |= XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); @@ -2257,10 +2257,10 @@ xlog_recover_do_buffer_trans( } error = 0; - if (flags & XFS_BLI_INODE_BUF) { + if (flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f); diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h index 75d749207258..1c55ccbb379d 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/xfs_log_recover.h @@ -28,7 +28,7 @@ #define XLOG_RHASH(tid) \ ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) -#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9ff48a16a7ee..1d2c7eed4eda 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -268,6 +268,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index be578ecb4af2..ce558efa2ea0 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -44,6 +44,7 @@ #include "xfs_trans_priv.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" +#include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; @@ -243,9 +244,8 @@ _xfs_trans_alloc( tp->t_type = type; tp->t_mountp = mp; tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(tp->t_items)); - XFS_LBC_INIT(&(tp->t_busy)); + INIT_LIST_HEAD(&tp->t_busy); return tp; } @@ -255,8 +255,13 @@ _xfs_trans_alloc( */ STATIC void xfs_trans_free( - xfs_trans_t *tp) + struct xfs_trans *tp) { + struct xfs_busy_extent *busyp, *n; + + list_for_each_entry_safe(busyp, n, &tp->t_busy, list) + xfs_alloc_busy_clear(tp->t_mountp, busyp); + atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); @@ -285,9 +290,8 @@ xfs_trans_dup( ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; ntp->t_items_free = XFS_LIC_NUM_SLOTS; - ntp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(ntp->t_items)); - XFS_LBC_INIT(&(ntp->t_busy)); + INIT_LIST_HEAD(&ntp->t_busy); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); @@ -423,7 +427,6 @@ undo_blocks: return error; } - /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. @@ -652,7 +655,7 @@ xfs_trans_apply_sb_deltas( * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. */ -STATIC void +void xfs_trans_unreserve_and_mod_sb( xfs_trans_t *tp) { @@ -880,7 +883,7 @@ xfs_trans_fill_vecs( * they could be immediately flushed and we'd have to race with the flusher * trying to pull the item from the AIL as we add it. */ -static void +void xfs_trans_item_committed( struct xfs_log_item *lip, xfs_lsn_t commit_lsn, @@ -930,26 +933,6 @@ xfs_trans_item_committed( IOP_UNPIN(lip); } -/* Clear all the per-AG busy list items listed in this transaction */ -static void -xfs_trans_clear_busy_extents( - struct xfs_trans *tp) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i; - - for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) { - i = 0; - for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { - if (XFS_LBC_ISFREE(lbcp, i)) - continue; - xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx); - } - } - xfs_trans_free_busy(tp); -} - /* * This is typically called by the LM when a transaction has been fully * committed to disk. It needs to unpin the items which have @@ -984,7 +967,6 @@ xfs_trans_committed( kmem_free(licp); } - xfs_trans_clear_busy_extents(tp); xfs_trans_free(tp); } @@ -1012,8 +994,7 @@ xfs_trans_uncommit( xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); xfs_trans_free(tp); } @@ -1075,6 +1056,8 @@ xfs_trans_commit_iclog( *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); tp->t_commit_lsn = *commit_lsn; + trace_xfs_trans_commit_lsn(tp); + if (nvec > XFS_TRANS_LOGVEC_COUNT) kmem_free(log_vector); @@ -1161,6 +1144,93 @@ xfs_trans_commit_iclog( return xfs_log_release_iclog(mp, commit_iclog); } +/* + * Walk the log items and allocate log vector structures for + * each item large enough to fit all the vectors they require. + * Note that this format differs from the old log vector format in + * that there is no transaction header in these log vectors. + */ +STATIC struct xfs_log_vec * +xfs_trans_alloc_log_vecs( + xfs_trans_t *tp) +{ + xfs_log_item_desc_t *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; + + lidp = xfs_trans_first_item(tp); + + /* Bail out if we didn't find a log item. */ + if (!lidp) { + ASSERT(0); + return NULL; + } + + while (lidp != NULL) { + struct xfs_log_vec *new_lv; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + /* Skip items that do not have any vectors for writing */ + lidp->lid_size = IOP_SIZE(lidp->lid_item); + if (!lidp->lid_size) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + new_lv = kmem_zalloc(sizeof(*new_lv) + + lidp->lid_size * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = lidp->lid_size; + new_lv->lv_item = lidp->lid_item; + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; + lidp = xfs_trans_next_item(tp, lidp); + } + + return ret_lv; +} + +static int +xfs_trans_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct xfs_log_vec *log_vector; + int error; + + /* + * Get each log item to allocate a vector structure for + * the log item to to pass to the log write code. The + * CIL commit code will format the vector and save it away. + */ + log_vector = xfs_trans_alloc_log_vecs(tp); + if (!log_vector) + return ENOMEM; + + error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); + if (error) + return error; + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* xfs_trans_free_items() unlocks them first */ + xfs_trans_free_items(tp, *commit_lsn, 0); + xfs_trans_free(tp); + return 0; +} /* * xfs_trans_commit @@ -1221,7 +1291,11 @@ _xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + if (mp->m_flags & XFS_MOUNT_DELAYLOG) + error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); + else + error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + if (error == ENOMEM) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); error = XFS_ERROR(EIO); @@ -1259,8 +1333,7 @@ out_unreserve: error = XFS_ERROR(EIO); } current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); xfs_trans_free(tp); XFS_STATS_INC(xs_trans_empty); @@ -1338,8 +1411,7 @@ xfs_trans_cancel( /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); xfs_trans_free(tp); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c62beee0921e..8c69e7824f68 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -106,7 +106,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFSRT_FREE 39 #define XFS_TRANS_SWAPEXT 40 #define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_TYPE_MAX 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_TYPE_MAX 42 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -148,6 +149,7 @@ typedef struct xfs_trans_header { { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ { XFS_TRANS_DUMMY1, "DUMMY1" }, \ { XFS_TRANS_DUMMY2, "DUMMY2" }, \ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } @@ -813,6 +815,7 @@ struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_dquot_acct; +struct xfs_busy_extent; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -828,6 +831,11 @@ typedef struct xfs_log_item { /* buffer item iodone */ /* callback func */ struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 @@ -872,34 +880,6 @@ typedef struct xfs_item_ops { #define XFS_ITEM_PUSHBUF 3 /* - * This structure is used to maintain a list of block ranges that have been - * freed in the transaction. The ranges are listed in the perag[] busy list - * between when they're freed and the transaction is committed to disk. - */ - -typedef struct xfs_log_busy_slot { - xfs_agnumber_t lbc_ag; - ushort lbc_idx; /* index in perag.busy[] */ -} xfs_log_busy_slot_t; - -#define XFS_LBC_NUM_SLOTS 31 -typedef struct xfs_log_busy_chunk { - struct xfs_log_busy_chunk *lbc_next; - uint lbc_free; /* free slots bitmask */ - ushort lbc_unused; /* first unused */ - xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; -} xfs_log_busy_chunk_t; - -#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) -#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) - -#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) -#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) -#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) -#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) -#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) - -/* * This is the type of function which can be given to xfs_trans_callback() * to be called upon the transaction's commit to disk. */ @@ -950,8 +930,7 @@ typedef struct xfs_trans { unsigned int t_items_free; /* log item descs free */ xfs_log_item_chunk_t t_items; /* first log item desc chunk */ xfs_trans_header_t t_header; /* header for in-log trans */ - unsigned int t_busy_free; /* busy descs free */ - xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ + struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ } xfs_trans_t; @@ -1025,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *, void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); extern kmem_zone_t *xfs_trans_zone; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 9cd809025f3a..63d81a22f4fd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -114,7 +114,7 @@ _xfs_trans_bjoin( xfs_buf_item_init(bp, tp->t_mountp); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; @@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); /* @@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); @@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); bip->bli_flags &= ~XFS_BLI_HOLD; @@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; + bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; } lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); @@ -762,8 +762,8 @@ xfs_trans_binval( ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(lidp->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; @@ -774,7 +774,7 @@ xfs_trans_binval( * in the buf log item. The STALE flag will be used in * xfs_buf_item_unpin() to determine if it should clean up * when the last reference to the buf item is given up. - * We set the XFS_BLI_CANCEL flag in the buf log format structure + * We set the XFS_BLF_CANCEL flag in the buf log format structure * and log the buf item. This will be used at recovery time * to determine that copies of the buffer in the log before * this should not be replayed. @@ -792,9 +792,9 @@ xfs_trans_binval( XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_STALE(bp); bip->bli_flags |= XFS_BLI_STALE; - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLI_CANCEL; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->bli_format.blf_flags |= XFS_BLF_CANCEL; memset((char *)(bip->bli_format.blf_data_map), 0, (bip->bli_format.blf_map_size * sizeof(uint))); lidp->lid_flags |= XFS_LID_DIRTY; @@ -802,16 +802,16 @@ xfs_trans_binval( } /* - * This call is used to indicate that the buffer contains on-disk - * inodes which must be handled specially during recovery. They - * require special handling because only the di_next_unlinked from - * the inodes in the buffer should be recovered. The rest of the - * data in the buffer is logged via the inodes themselves. + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. * - * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log - * format structure so that we'll know what to do at recovery time. + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. */ -/* ARGSUSED */ void xfs_trans_inode_buf( xfs_trans_t *tp, @@ -826,7 +826,7 @@ xfs_trans_inode_buf( bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; + bip->bli_flags |= XFS_BLI_INODE_BUF; } /* @@ -908,9 +908,9 @@ xfs_trans_dquot_buf( ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT(type == XFS_BLI_UDQUOT_BUF || - type == XFS_BLI_PDQUOT_BUF || - type == XFS_BLI_GDQUOT_BUF); + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index eb3fc57f9eef..f11d37d06dcc 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) void xfs_trans_free_items( xfs_trans_t *tp, + xfs_lsn_t commit_lsn, int flags) { xfs_log_item_chunk_t *licp; @@ -311,7 +312,7 @@ xfs_trans_free_items( * Special case the embedded chunk so we don't free it below. */ if (!xfs_lic_are_all_free(licp)) { - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); xfs_lic_all_free(licp); licp->lic_unused = 0; } @@ -322,7 +323,7 @@ xfs_trans_free_items( */ while (licp != NULL) { ASSERT(!xfs_lic_are_all_free(licp)); - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); next_licp = licp->lic_next; kmem_free(licp); licp = next_licp; @@ -438,112 +439,3 @@ xfs_trans_unlock_chunk( return freed; } - - -/* - * This is called to add the given busy item to the transaction's - * list of busy items. It must find a free busy item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to busy descriptor used to point - * to the new busy entry. The log busy entry will now point to its new - * descriptor with its ???? field. - */ -xfs_log_busy_slot_t * -xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_busy_free == 0) { - lbcp = (xfs_log_busy_chunk_t*) - kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP); - ASSERT(lbcp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - XFS_LBC_INIT(lbcp); - XFS_LBC_CLAIM(lbcp, 0); - lbcp->lbc_unused = 1; - lbsp = XFS_LBC_SLOT(lbcp, 0); - - /* - * Link in the new chunk and update the free count. - */ - lbcp->lbc_next = tp->t_busy.lbc_next; - tp->t_busy.lbc_next = lbcp; - tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - if (XFS_LBC_VACANCY(lbcp)) { - if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) { - i = lbcp->lbc_unused; - break; - } else { - /* out-of-order vacancy */ - cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp); - ASSERT(0); - } - } - lbcp = lbcp->lbc_next; - } - ASSERT(lbcp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - XFS_LBC_CLAIM(lbcp, i); - if (lbcp->lbc_unused <= i) { - lbcp->lbc_unused = i + 1; - } - lbsp = XFS_LBC_SLOT(lbcp, i); - tp->t_busy_free--; - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; -} - - -/* - * xfs_trans_free_busy - * Free all of the busy lists from a transaction - */ -void -xfs_trans_free_busy(xfs_trans_t *tp) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_chunk_t *lbcq; - - lbcp = tp->t_busy.lbc_next; - while (lbcp != NULL) { - lbcq = lbcp->lbc_next; - kmem_free(lbcp); - lbcp = lbcq; - } - - XFS_LBC_INIT(&tp->t_busy); - tp->t_busy.lbc_unused = 0; -} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 73e2ad397432..c6e4f2c8de6e 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, struct xfs_log_item_desc *); -void xfs_trans_free_items(struct xfs_trans *, int); -void xfs_trans_unlock_items(struct xfs_trans *, - xfs_lsn_t); -void xfs_trans_free_busy(xfs_trans_t *tp); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); + +void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); + +void xfs_trans_item_committed(struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, int aborted); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); /* * AIL traversal cursor. diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index b09904555d07..320775295e32 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ +typedef __uint32_t xlog_tid_t; /* transaction ID type */ + /* * These types are 64 bits on disk but are either 32 or 64 bits in memory. * Disk based types: |