diff options
Diffstat (limited to 'fs')
121 files changed, 5100 insertions, 1669 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 945aa5f02f9b..a9ea73d6dcf3 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, uint16_t klen = 0; v9ses = (struct v9fs_session_info *)cookie_netfs_data; - P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses, - buffer, bufmax); + p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n", + v9ses, buffer, bufmax); if (v9ses->cachetag) klen = strlen(v9ses->cachetag); @@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, return 0; memcpy(buffer, v9ses->cachetag, klen); - P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag); + p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag); return klen; } @@ -91,14 +91,14 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, v9ses); - P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses, - v9ses->fscache); + p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", + v9ses, v9ses->fscache); } void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses, - v9ses->fscache); + p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", + v9ses, v9ses->fscache); fscache_relinquish_cookie(v9ses->fscache, 0); v9ses->fscache = NULL; } @@ -109,8 +109,8 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, - v9inode->qid.path); + p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n", + &v9inode->vfs_inode, v9inode->qid.path); return sizeof(v9inode->qid.path); } @@ -120,8 +120,8 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, const struct v9fs_inode *v9inode = cookie_netfs_data; *size = i_size_read(&v9inode->vfs_inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode, - *size); + p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n", + &v9inode->vfs_inode, *size); } static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, @@ -129,8 +129,8 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, - v9inode->qid.version); + p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n", + &v9inode->vfs_inode, v9inode->qid.version); return sizeof(v9inode->qid.version); } @@ -206,8 +206,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &v9fs_cache_inode_index_def, v9inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", + inode, v9inode->fscache); } void v9fs_cache_inode_put_cookie(struct inode *inode) @@ -216,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode) if (!v9inode->fscache) return; - P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", + inode, v9inode->fscache); fscache_relinquish_cookie(v9inode->fscache, 0); v9inode->fscache = NULL; @@ -229,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) if (!v9inode->fscache) return; - P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", + inode, v9inode->fscache); fscache_relinquish_cookie(v9inode->fscache, 1); v9inode->fscache = NULL; @@ -272,8 +272,8 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, v9inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", - inode, old, v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", + inode, old, v9inode->fscache); spin_unlock(&v9inode->fscache_lock); } @@ -323,7 +323,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); if (!v9inode->fscache) return -ENOBUFS; @@ -335,13 +335,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case -ENOBUFS: case -ENODATA: - P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret); + p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret); return 1; case 0: - P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); return ret; default: - P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); return ret; } } @@ -361,7 +361,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode, int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); + p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages); if (!v9inode->fscache) return -ENOBUFS; @@ -373,15 +373,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode, switch (ret) { case -ENOBUFS: case -ENODATA: - P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret); + p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret); return 1; case 0: BUG_ON(!list_empty(pages)); BUG_ON(*nr_pages != 0); - P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); return ret; default: - P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); return ret; } } @@ -396,9 +396,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); - P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); + p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret); if (ret != 0) v9fs_uncache_page(inode, page); } @@ -409,7 +409,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) { const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); if (PageFsCache(page)) fscache_wait_on_page_write(v9inode->fscache, page); } diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 85b67ffa2a43..da8eefbe830d 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -45,8 +45,8 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { struct v9fs_dentry *dent; - P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n", - fid->fid, dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n", + fid->fid, dentry->d_name.name); dent = dentry->d_fsdata; if (!dent) { @@ -79,8 +79,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) struct v9fs_dentry *dent; struct p9_fid *fid, *ret; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", - dentry->d_name.name, dentry, uid, any); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", + dentry->d_name.name, dentry, uid, any); dent = (struct v9fs_dentry *) dentry->d_fsdata; ret = NULL; if (dent) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 2b78014a124a..1964f98e74be 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> @@ -85,15 +87,15 @@ static int get_cache_mode(char *s) if (!strcmp(s, "loose")) { version = CACHE_LOOSE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { version = CACHE_FSCACHE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "none")) { version = CACHE_NONE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); } else - printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s); + pr_info("Unknown Cache mode %s\n", s); return version; } @@ -140,8 +142,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_debug: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -154,8 +156,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_dfltuid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -164,8 +166,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_dfltgid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -174,8 +176,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_afid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -205,8 +207,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of cache arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of cache arg\n"); goto free_and_return; } ret = get_cache_mode(s); @@ -223,8 +225,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of access arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of access arg\n"); goto free_and_return; } @@ -240,8 +242,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->uid = simple_strtoul(s, &e, 10); if (*e != '\0') { ret = -EINVAL; - printk(KERN_INFO "9p: Unknown access " - "argument %s.\n", s); + pr_info("Unknown access argument %s\n", + s); kfree(s); goto free_and_return; } @@ -254,9 +256,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FS_POSIX_ACL v9ses->flags |= V9FS_POSIX_ACL; #else - P9_DPRINTK(P9_DEBUG_ERROR, - "Not defined CONFIG_9P_FS_POSIX_ACL. " - "Ignoring posixacl option\n"); + p9_debug(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n"); #endif break; @@ -318,7 +319,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); v9ses->clnt = NULL; - P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n"); + p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); goto error; } @@ -371,7 +372,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(fid)) { retval = PTR_ERR(fid); fid = NULL; - P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n"); + p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); goto error; } @@ -429,7 +430,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); + p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); } @@ -442,7 +443,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); + p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); p9_client_begin_disconnect(v9ses->clnt); } @@ -591,23 +592,23 @@ static void v9fs_cache_unregister(void) static int __init init_v9fs(void) { int err; - printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); + pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ err = register_filesystem(&v9fs_fs_type); if (err < 0) { - printk(KERN_ERR "Failed to register filesystem\n"); + pr_err("Failed to register filesystem\n"); return err; } err = v9fs_cache_register(); if (err < 0) { - printk(KERN_ERR "Failed to register v9fs for caching\n"); + pr_err("Failed to register v9fs for caching\n"); goto out_fs_unreg; } err = v9fs_sysfs_init(); if (err < 0) { - printk(KERN_ERR "Failed to register with sysfs\n"); + pr_err("Failed to register with sysfs\n"); goto out_sysfs_cleanup; } diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 2524e4cbb8ea..0ad61c6a65a5 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -56,7 +56,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) struct inode *inode; inode = page->mapping->host; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); BUG_ON(!PageLocked(page)); @@ -116,14 +116,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, struct inode *inode; inode = mapping->host; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); if (ret == 0) return ret; ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); - P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret); + p9_debug(P9_DEBUG_VFS, " = %d\n", ret); return ret; } @@ -263,10 +263,9 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * Now that we do caching with cache mode enabled, We need * to support direct IO */ - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " - "off/no(%lld/%lu) EINVAL\n", - iocb->ki_filp->f_path.dentry->d_name.name, - (long long) pos, nr_segs); + p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long)pos, nr_segs); return -EINVAL; } diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index e022890c6f40..d529437ff442 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -53,8 +53,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry) { - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); return 1; } @@ -66,8 +66,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry) */ static int v9fs_cached_dentry_delete(const struct dentry *dentry) { - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); /* Don't cache negative dentries */ if (!dentry->d_inode) @@ -86,8 +86,8 @@ static void v9fs_dentry_release(struct dentry *dentry) struct v9fs_dentry *dent; struct p9_fid *temp, *current_fid; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); dent = dentry->d_fsdata; if (dent) { list_for_each_entry_safe(current_fid, temp, &dent->fidlist, diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 598fff1a54e5..ff911e779651 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -140,7 +140,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) int reclen = 0; struct p9_rdir *rdir; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; @@ -168,7 +168,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; p9stat_free(&st); goto unlock_and_exit; @@ -213,7 +213,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, struct p9_dirent curdirent; u64 oldoffset = 0; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_READDIRHDRSZ; @@ -244,7 +244,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, rdir->tail - rdir->head, &curdirent); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; goto unlock_and_exit; } @@ -290,9 +290,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) struct p9_fid *fid; fid = filp->private_data; - P9_DPRINTK(P9_DEBUG_VFS, - "v9fs_dir_release: inode: %p filp: %p fid: %d\n", - inode, filp, fid ? fid->fid : -1); + p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); if (fid) p9_client_clunk(fid); return 0; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 62857a810a79..fc06fd27065e 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) struct p9_fid *fid; int omode; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) @@ -135,7 +135,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) int res = 0; struct inode *inode = filp->f_path.dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); + p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -204,7 +204,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - schedule_timeout_interruptible(P9_LOCK_TIMEOUT); + if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + break; } /* map 9p status to VFS status */ @@ -304,8 +305,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) struct inode *inode = filp->f_path.dentry->d_inode; int ret = -ENOLCK; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, - cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", + filp, cmd, fl, filp->f_path.dentry->d_name.name); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -340,8 +341,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, struct inode *inode = filp->f_path.dentry->d_inode; int ret = -ENOLCK; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, - cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", + filp, cmd, fl, filp->f_path.dentry->d_name.name); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -384,8 +385,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count, { int n, total, size; - P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, - (long long unsigned) offset, count); + p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", + fid->fid, (long long unsigned)offset, count); n = 0; total = 0; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -443,7 +444,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, struct p9_fid *fid; size_t size; - P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); + p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); fid = filp->private_data; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -470,8 +471,8 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, loff_t origin = *offset; unsigned long pg_start, pg_end; - P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, - (int)count, (int)*offset); + p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", + data, (int)count, (int)*offset); clnt = fid->clnt; do { @@ -552,7 +553,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, return retval; mutex_lock(&inode->i_mutex); - P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); @@ -575,8 +576,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, return retval; mutex_lock(&inode->i_mutex); - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n", - filp, datasync); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; @@ -607,8 +607,8 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = filp->f_path.dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n", - page, (unsigned long)filp->private_data); + p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e0f20de6aa2b..014c8dd62962 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> @@ -88,6 +90,32 @@ static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode) } /** + * p9mode2perm- convert plan9 mode bits to unix permission bits + * @v9ses: v9fs session information + * @stat: p9_wstat from which mode need to be derived + * + */ +static int p9mode2perm(struct v9fs_session_info *v9ses, + struct p9_wstat *stat) +{ + int res; + int mode = stat->mode; + + res = mode & S_IALLUGO; + if (v9fs_proto_dotu(v9ses)) { + if ((mode & P9_DMSETUID) == P9_DMSETUID) + res |= S_ISUID; + + if ((mode & P9_DMSETGID) == P9_DMSETGID) + res |= S_ISGID; + + if ((mode & P9_DMSETVTX) == P9_DMSETVTX) + res |= S_ISVTX; + } + return res; +} + +/** * p9mode2unixmode- convert plan9 mode bits to unix mode bits * @v9ses: v9fs session information * @stat: p9_wstat from which mode need to be derived @@ -100,8 +128,8 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, int res; u32 mode = stat->mode; - res = mode & S_IALLUGO; *rdev = 0; + res = p9mode2perm(v9ses, stat); if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; @@ -128,24 +156,13 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, res |= S_IFBLK; break; default: - P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c %s\n", type, - stat->extension); + p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n", + type, stat->extension); }; *rdev = MKDEV(major, minor); } else res |= S_IFREG; - if (v9fs_proto_dotu(v9ses)) { - if ((mode & P9_DMSETUID) == P9_DMSETUID) - res |= S_ISUID; - - if ((mode & P9_DMSETGID) == P9_DMSETGID) - res |= S_ISGID; - - if ((mode & P9_DMSETVTX) == P9_DMSETVTX) - res |= S_ISVTX; - } return res; } @@ -275,8 +292,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; } else { - P9_DPRINTK(P9_DEBUG_ERROR, - "special files without extended mode\n"); + p9_debug(P9_DEBUG_ERROR, + "special files without extended mode\n"); err = -EINVAL; goto error; } @@ -301,8 +318,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; case S_IFLNK: if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " - "legacy protocol.\n"); + p9_debug(P9_DEBUG_ERROR, + "extended modes used with legacy protocol\n"); err = -EINVAL; goto error; } @@ -329,8 +346,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; default: - P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n", - mode, mode & S_IFMT); + p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n", + mode, mode & S_IFMT); err = -EINVAL; goto error; } @@ -352,11 +369,12 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode); + p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode); inode = new_inode(sb); if (!inode) { - P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + pr_warn("%s (%d): Problem allocating inode\n", + __func__, task_pid_nr(current)); return ERR_PTR(-ENOMEM); } err = v9fs_init_inode(v9ses, inode, mode, rdev); @@ -573,15 +591,15 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) struct p9_fid *v9fid, *dfid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", - dir, dentry, flags); + p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", + dir, dentry, flags); v9ses = v9fs_inode2v9ses(dir); inode = dentry->d_inode; dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { retval = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); return retval; } if (v9fs_proto_dotl(v9ses)) @@ -630,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, struct p9_fid *dfid, *ofid, *fid; struct inode *inode; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; ofid = NULL; @@ -639,7 +657,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return ERR_PTR(err); } @@ -647,36 +665,41 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); return ERR_PTR(err); } err = p9_client_fcreate(ofid, name, perm, mode, extension); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); - goto error; - } - - /* now walk from the parent so we can get unopened fid */ - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; + p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); goto error; } - /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); - goto error; + if (!(perm & P9_DMLINK)) { + /* now walk from the parent so we can get unopened fid */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, + "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + /* + * instantiate inode and assign the unopened fid to the dentry + */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, + "inode creation failed %d\n", err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - d_instantiate(dentry, inode); return ofid; error: if (ofid) @@ -788,7 +811,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct p9_fid *fid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode | S_IFDIR); @@ -826,8 +849,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, char *name; int result = 0; - P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_name.name, dentry, nameidata); + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", + dir, dentry->d_name.name, dentry, nameidata); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -933,7 +956,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; @@ -969,8 +992,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, * 9P .u can only handle file rename in the same directory */ - P9_DPRINTK(P9_DEBUG_ERROR, - "old dir and new dir are different\n"); + p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n"); retval = -EXDEV; goto clunk_newdir; } @@ -1026,7 +1048,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct p9_fid *fid; struct p9_wstat *st; - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { @@ -1063,7 +1085,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_fid *fid; struct p9_wstat wstat; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = inode_change_ok(dentry->d_inode, iattr); if (retval) return retval; @@ -1162,7 +1184,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, set_nlink(inode, i_nlink); } } - mode = stat->mode & S_IALLUGO; + mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; i_size_write(inode, stat->length); @@ -1208,7 +1230,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) struct p9_fid *fid; struct p9_wstat *st; - P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); retval = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); @@ -1230,8 +1252,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) /* copy extension buffer into buffer */ strncpy(buffer, st->extension, buflen); - P9_DPRINTK(P9_DEBUG_VFS, - "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); + p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n", + dentry->d_name.name, st->extension, buffer); retval = strnlen(buffer, buflen); done: @@ -1252,7 +1274,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) int len = 0; char *link = __getname(); - P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); if (!link) link = ERR_PTR(-ENOMEM); @@ -1283,8 +1305,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); - P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, - IS_ERR(s) ? "<error>" : s); + p9_debug(P9_DEBUG_VFS, " %s %s\n", + dentry->d_name.name, IS_ERR(s) ? "<error>" : s); if (!IS_ERR(s)) __putname(s); } @@ -1306,7 +1328,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, v9ses = v9fs_inode2v9ses(dir); if (!v9fs_proto_dotu(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); + p9_debug(P9_DEBUG_ERROR, "not extended\n"); return -EPERM; } @@ -1333,8 +1355,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, static int v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, - dentry->d_name.name, symname); + p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", + dir->i_ino, dentry->d_name.name, symname); return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); } @@ -1355,9 +1377,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, char *name; struct p9_fid *oldfid; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - old_dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", + dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); oldfid = v9fs_fid_clone(old_dentry); if (IS_ERR(oldfid)) @@ -1398,9 +1419,9 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde char *name; u32 perm; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry->d_name.name, mode, + MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 8ef152ac6a16..a1e6c990cd41 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -283,13 +283,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, } name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " - "mode:0x%hx\n", name, flags, omode); + p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", + name, flags, omode); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } @@ -297,7 +297,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); return err; } @@ -307,16 +307,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in creat %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", + err); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), mode, gid, &qid); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_open_dotl failed in creat %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", + err); goto error; } v9fs_invalidate_inode_attr(dir); @@ -325,14 +324,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -408,7 +407,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); @@ -420,7 +419,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } @@ -430,8 +429,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mkdir %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n", + err); goto error; } name = (char *) dentry->d_name.name; @@ -444,8 +443,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -453,8 +452,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -495,7 +494,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, struct p9_fid *fid; struct p9_stat_dotl *st; - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { @@ -523,6 +522,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, return 0; } +/* + * Attribute flags. + */ +#define P9_ATTR_MODE (1 << 0) +#define P9_ATTR_UID (1 << 1) +#define P9_ATTR_GID (1 << 2) +#define P9_ATTR_SIZE (1 << 3) +#define P9_ATTR_ATIME (1 << 4) +#define P9_ATTR_MTIME (1 << 5) +#define P9_ATTR_CTIME (1 << 6) +#define P9_ATTR_ATIME_SET (1 << 7) +#define P9_ATTR_MTIME_SET (1 << 8) + +struct dotl_iattr_map { + int iattr_valid; + int p9_iattr_valid; +}; + +static int v9fs_mapped_iattr_valid(int iattr_valid) +{ + int i; + int p9_iattr_valid = 0; + struct dotl_iattr_map dotl_iattr_map[] = { + { ATTR_MODE, P9_ATTR_MODE }, + { ATTR_UID, P9_ATTR_UID }, + { ATTR_GID, P9_ATTR_GID }, + { ATTR_SIZE, P9_ATTR_SIZE }, + { ATTR_ATIME, P9_ATTR_ATIME }, + { ATTR_MTIME, P9_ATTR_MTIME }, + { ATTR_CTIME, P9_ATTR_CTIME }, + { ATTR_ATIME_SET, P9_ATTR_ATIME_SET }, + { ATTR_MTIME_SET, P9_ATTR_MTIME_SET }, + }; + for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) { + if (iattr_valid & dotl_iattr_map[i].iattr_valid) + p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid; + } + return p9_iattr_valid; +} + /** * v9fs_vfs_setattr_dotl - set file metadata * @dentry: file whose metadata to set @@ -537,13 +576,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) struct p9_fid *fid; struct p9_iattr_dotl p9attr; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = inode_change_ok(dentry->d_inode, iattr); if (retval) return retval; - p9attr.valid = iattr->ia_valid; + p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); p9attr.mode = iattr->ia_mode; p9attr.uid = iattr->ia_uid; p9attr.gid = iattr->ia_gid; @@ -670,14 +709,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, struct v9fs_session_info *v9ses; name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", - dir->i_ino, name, symname); + p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } @@ -687,7 +725,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); goto error; } @@ -697,8 +735,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -707,8 +745,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -751,9 +789,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", - dir->i_ino, old_dentry->d_name.name, - dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, dentry->d_name.name); v9ses = v9fs_inode2v9ses(dir); dir_dentry = v9fs_dentry_from_dir_inode(dir); @@ -770,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); return err; } @@ -813,9 +850,9 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); + p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry->d_name.name, omode, + MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; @@ -825,7 +862,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } @@ -835,8 +872,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mknod %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n", + err); goto error; } name = (char *) dentry->d_name.name; @@ -851,8 +888,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -860,8 +897,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -905,7 +942,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) char *link = __getname(); char *target; - P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); if (!link) { link = ERR_PTR(-ENOMEM); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index f68ff65a32a5..7b0cd87b07c2 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -121,7 +121,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, struct p9_fid *fid; int retval = 0; - P9_DPRINTK(P9_DEBUG_VFS, " \n"); + p9_debug(P9_DEBUG_VFS, "\n"); v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) @@ -191,7 +191,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; v9fs_fid_add(root, fid); - P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); clunk_fid: @@ -223,7 +223,7 @@ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); + p9_debug(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); @@ -231,7 +231,7 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); s->s_fs_info = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); + p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); } static void @@ -303,7 +303,7 @@ static int v9fs_write_inode(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; @@ -326,7 +326,7 @@ static int v9fs_write_inode_dotl(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index d288773871b3..29653b70a9c3 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { retval = PTR_ERR(attr_fid); - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_attrwalk failed %zd\n", retval); + p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n", + retval); attr_fid = NULL; goto error; } @@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, { struct p9_fid *fid; - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", - __func__, name, buffer_size); + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n", + name, buffer_size); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -115,8 +115,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, int retval, msize, write_count; struct p9_fid *fid = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", - __func__, name, value_len, flags); + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", + name, value_len, flags); fid = v9fs_fid_clone(dentry); if (IS_ERR(fid)) { @@ -129,8 +129,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, */ retval = p9_client_xattrcreate(fid, name, value_len, flags); if (retval < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_xattrcreate failed %d\n", retval); + p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", + retval); goto error; } msize = fid->clnt->msize; diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 79e2ca7973b7..e95d1b64082c 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF +config ARCH_BINFMT_ELF_RANDOMIZE_PIE + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y @@ -476,14 +476,21 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total) batch->count = total; } -static void kiocb_batch_free(struct kiocb_batch *batch) +static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) { struct kiocb *req, *n; + if (list_empty(&batch->head)) + return; + + spin_lock_irq(&ctx->ctx_lock); list_for_each_entry_safe(req, n, &batch->head, ki_batch) { list_del(&req->ki_batch); + list_del(&req->ki_list); kmem_cache_free(kiocb_cachep, req); + ctx->reqs_active--; } + spin_unlock_irq(&ctx->ctx_lock); } /* @@ -1742,7 +1749,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, } blk_finish_plug(&plug); - kiocb_batch_free(&batch); + kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 5869d4e974a9..d8d8e7ba6a1e 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -116,6 +116,7 @@ struct autofs_sb_info { int needs_reghost; struct super_block *sb; struct mutex wq_mutex; + struct mutex pipe_mutex; spinlock_t fs_lock; struct autofs_wait_queue *queues; /* Wait queue pointer */ spinlock_t lookup_lock; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 2ba44c79d548..e16980b00b8d 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -225,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); + mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; spin_lock_init(&sbi->lookup_lock); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e1fbdeef85db..da8876d38a7b 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) mutex_unlock(&sbi->wq_mutex); } -static int autofs4_write(struct file *file, const void *addr, int bytes) +static int autofs4_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; mm_segment_t fs; const char *data = (const char *)addr; ssize_t wr = 0; - /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/ - sigpipe = sigismember(¤t->pending.signal, SIGPIPE); /* Save pointer to user space and point back to kernel space */ fs = get_fs(); set_fs(KERNEL_DS); + mutex_lock(&sbi->pipe_mutex); while (bytes && (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) { data += wr; bytes -= wr; } + mutex_unlock(&sbi->pipe_mutex); set_fs(fs); @@ -110,6 +111,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; + mutex_lock(&sbi->wq_mutex); + + /* Check if we have become catatonic */ + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } switch (type) { /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: @@ -163,22 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } default: printk("autofs4_notify_daemon: bad type %d!\n", type); + mutex_unlock(&sbi->wq_mutex); return; } - /* Check if we have become catatonic */ - mutex_lock(&sbi->wq_mutex); - if (!sbi->catatonic) { - pipe = sbi->pipe; - get_file(pipe); - } + pipe = sbi->pipe; + get_file(pipe); + mutex_unlock(&sbi->wq_mutex); - if (pipe) { - if (autofs4_write(pipe, &pkt, pktsz)) - autofs4_catatonic_mode(sbi); - fput(pipe); - } + if (autofs4_write(sbi, pipe, &pkt, pktsz)) + autofs4_catatonic_mode(sbi); + fput(pipe); } static int autofs4_getpath(struct autofs_sb_info *sbi, @@ -257,6 +261,9 @@ static int validate_request(struct autofs_wait_queue **wait, struct autofs_wait_queue *wq; struct autofs_info *ino; + if (sbi->catatonic) + return -ENOENT; + /* Wait in progress, continue; */ wq = autofs4_find_wait(sbi, qstr); if (wq) { @@ -289,6 +296,9 @@ static int validate_request(struct autofs_wait_queue **wait, if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; + if (sbi->catatonic) + return -ENOENT; + wq = autofs4_find_wait(sbi, qstr); if (wq) { *wait = wq; @@ -389,7 +399,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, ret = validate_request(&wq, sbi, &qstr, dentry, notify); if (ret <= 0) { - if (ret == 0) + if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); return ret; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 21ac5ee4b43f..bcb884e2d613 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#if defined(CONFIG_X86) || defined(CONFIG_ARM) +#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off * in runtime via sysctl. * If that is the case, retain the original non-zero diff --git a/fs/block_dev.c b/fs/block_dev.c index 69a5b6fbee2b..0e575d1304b4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -25,7 +25,6 @@ #include <linux/uio.h> #include <linux/namei.h> #include <linux/log2.h> -#include <linux/kmemleak.h> #include <linux/cleancache.h> #include <asm/uaccess.h> #include "internal.h" @@ -521,7 +520,7 @@ static struct super_block *blockdev_superblock __read_mostly; void __init bdev_cache_init(void) { int err; - struct vfsmount *bd_mnt; + static struct vfsmount *bd_mnt; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -533,12 +532,7 @@ void __init bdev_cache_init(void) bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); - /* - * This vfsmount structure is only used to obtain the - * blockdev_superblock, so tell kmemleak not to report it. - */ - kmemleak_not_leak(bd_mnt); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } /* @@ -1145,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; @@ -1165,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; + bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk(disk); @@ -1238,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; + bdev->bd_queue = NULL; bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f99a099a7747..d8525662ca7a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { /* * we can't safely write a btree page from here, @@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe939c050..034d98503229 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask); + mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; err = -ENOMEM; @@ -1136,7 +1136,8 @@ again: GFP_NOFS); } for (i = 0; i < num_pages; i++) { - clear_page_dirty_for_io(pages[i]); + if (clear_page_dirty_for_io(pages[i])) + account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 74fd74719dc2..618246bc2196 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -973,7 +973,7 @@ static int dentry_lease_is_valid(struct dentry *dentry) spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di && di->lease_session) { + if (di->lease_session) { s = di->lease_session; spin_lock(&s->s_cap_lock); gen = s->s_cap_gen; @@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); dout("d_release %p\n", dentry); - if (di) { - ceph_dentry_lru_del(dentry); - if (di->lease_session) - ceph_put_mds_session(di->lease_session); - kmem_cache_free(ceph_dentry_cachep, di); - dentry->d_fsdata = NULL; - } + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; } static int ceph_snapdir_d_revalidate(struct dentry *dentry, @@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, */ void ceph_dir_set_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry) && + ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); } void ceph_dir_clear_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); } bool ceph_dir_test_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) NOT complete\n", inode, dentry); + clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); return false; } @@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, do { ceph_mdsc_get_request(req); spin_unlock(&ci->i_unsafe_lock); + dout("dir_fsync %p wait on tid %llu (until %llu)\n", inode, req->r_tid, last_tid); if (req->r_timeout) { @@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, } else { wait_for_completion(&req->r_safe_completion); } - spin_lock(&ci->i_unsafe_lock); ceph_mdsc_put_request(req); + spin_lock(&ci->i_unsafe_lock); if (ret || list_empty(head)) break; req = list_entry(head->next, @@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_touch(struct dentry *dn) @@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, dn->d_name.len, dn->d_name.name, di->offset); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, &mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_del(struct dentry *dn) @@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); } /* diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9fbcdecaaccd..fbb2a643ef10 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, return -EINVAL; spin_lock(&dentry->d_lock); - parent = dget(dentry->d_parent); - spin_unlock(&dentry->d_lock); - + parent = dentry->d_parent; if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); @@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } - dput(parent); + spin_unlock(&dentry->d_lock); return type; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 25283e7a37f8..2c489378b4cd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -850,11 +850,12 @@ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_info *ci; struct ceph_dentry_info *di; BUG_ON(!inode); + ci = ceph_inode(inode); di = ceph_dentry(dn); spin_lock(&ci->i_ceph_lock); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6203d805eb45..23ab6a3f1825 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2772,7 +2772,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di = ceph_dentry(dentry); switch (h->action) { case CEPH_MDS_LEASE_REVOKE: - if (di && di->lease_session == session) { + if (di->lease_session == session) { if (ceph_seq_cmp(di->lease_seq, seq) > 0) h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); @@ -2781,7 +2781,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, break; case CEPH_MDS_LEASE_RENEW: - if (di && di->lease_session == session && + if (di->lease_session == session && di->lease_gen == session->s_cap_gen && di->lease_renew_from && di->lease_renew_after == 0) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 48f61a12af66..00de2c9568cd 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -131,6 +131,8 @@ enum { Opt_rbytes, Opt_norbytes, Opt_noasyncreaddir, + Opt_dcache, + Opt_nodcache, Opt_ino32, }; @@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = { {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_dcache, "dcache"}, + {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, {-1, NULL} }; @@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; + case Opt_dcache: + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_nodcache: + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + break; case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; @@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",norbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); + if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) + seq_puts(m, ",dcache"); + else + seq_puts(m, ",nodcache"); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); @@ -647,10 +661,10 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, root = ERR_PTR(-ENOMEM); goto out; } - ceph_init_dentry(root); } else { root = d_obtain_alias(inode); } + ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); } else { root = ERR_PTR(err); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cb3652b37271..1421f3d875a2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -28,6 +28,7 @@ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ +#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a5e36e4488a7..857214ae8c08 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -818,6 +818,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; + int required_blob_size; int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -833,14 +834,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; } + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); __build_xattrs(inode); +retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + required_blob_size = __get_required_blob_size(ci, 0, 0); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob; + + spin_unlock(&ci->i_ceph_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + err = __remove_xattr_by_name(ceph_inode(inode), name); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; @@ -853,6 +874,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) do_sync: spin_unlock(&ci->i_ceph_lock); err = ceph_send_removexattr(dentry, name); +out: return err; } diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 6475877b0763..911cf30d057d 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, - link the two up if this is needed - fill in the attributes */ -int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb) +struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb) { struct coda_vattr attr; + struct inode *inode; int error; /* We get inode numbers from Venus -- see venus source */ error = venus_getattr(sb, fid, &attr); - if ( error ) { - *inode = NULL; - return error; - } + if (error) + return ERR_PTR(error); - *inode = coda_iget(sb, fid, &attr); - if ( IS_ERR(*inode) ) { + inode = coda_iget(sb, fid, &attr); + if (IS_ERR(inode)) printk("coda_cnode_make: coda_iget failed\n"); - return PTR_ERR(*inode); - } - return 0; + return inode; } @@ -156,19 +153,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) } /* the CONTROL inode is made without asking attributes from Venus */ -int coda_cnode_makectl(struct inode **inode, struct super_block *sb) +struct inode *coda_cnode_makectl(struct super_block *sb) { - int error = -ENOMEM; - - *inode = new_inode(sb); - if (*inode) { - (*inode)->i_ino = CTL_INO; - (*inode)->i_op = &coda_ioctl_inode_operations; - (*inode)->i_fop = &coda_ioctl_operations; - (*inode)->i_mode = 0444; - error = 0; + struct inode *inode = new_inode(sb); + if (inode) { + inode->i_ino = CTL_INO; + inode->i_op = &coda_ioctl_inode_operations; + inode->i_fop = &coda_ioctl_operations; + inode->i_mode = 0444; + return inode; } - - return error; + return ERR_PTR(-ENOMEM); } diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index e35071b1de0e..b24fdfd8a3f0 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -49,9 +49,9 @@ struct coda_file_info { #define C_DYING 0x4 /* from venus (which died) */ #define C_PURGE 0x8 -int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); +struct inode *coda_cnode_make(struct CodaFid *, struct super_block *); struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); -int coda_cnode_makectl(struct inode **inode, struct super_block *sb); +struct inode *coda_cnode_makectl(struct super_block *sb); struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 83d2fd8ec24b..177515829062 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -96,12 +96,11 @@ const struct file_operations coda_dir_operations = { /* access routines: lookup, readlink, permission */ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) { - struct inode *inode = NULL; - struct CodaFid resfid = { { 0, } }; - int type = 0; - int error = 0; + struct super_block *sb = dir->i_sb; const char *name = entry->d_name.name; size_t length = entry->d_name.len; + struct inode *inode; + int type = 0; if (length > CODA_MAXNAMLEN) { printk(KERN_ERR "name too long: lookup, %s (%*s)\n", @@ -111,23 +110,21 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc /* control object, create inode on the fly */ if (coda_isroot(dir) && coda_iscontrol(name, length)) { - error = coda_cnode_makectl(&inode, dir->i_sb); + inode = coda_cnode_makectl(sb); type = CODA_NOCACHE; - goto exit; + } else { + struct CodaFid fid = { { 0, } }; + int error = venus_lookup(sb, coda_i2f(dir), name, length, + &type, &fid); + inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error); } - error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, - &type, &resfid); - if (!error) - error = coda_cnode_make(&inode, &resfid, dir->i_sb); - - if (error && error != -ENOENT) - return ERR_PTR(error); - -exit: - if (inode && (type & CODA_NOCACHE)) + if (!IS_ERR(inode) && (type & CODA_NOCACHE)) coda_flag_inode(inode, C_VATTR | C_PURGE); + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; + return d_splice_alias(inode, entry); } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 1c08a8cd673a..5e2e1b3f068d 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -204,10 +204,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid)); /* make root inode */ - error = coda_cnode_make(&root, &fid, sb); - if ( error || !root ) { - printk("Failure of coda_cnode_make for root: error %d\n", error); - goto error; + root = coda_cnode_make(&fid, sb); + if (IS_ERR(root)) { + error = PTR_ERR(root); + printk("Failure of coda_cnode_make for root: error %d\n", error); + root = NULL; + goto error; } printk("coda_read_super: rootinode is %ld dev %s\n", diff --git a/fs/dcache.c b/fs/dcache.c index 3c6d3113a255..16a53cc2cc02 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -243,6 +243,7 @@ static void dentry_lru_add(struct dentry *dentry) static void __dentry_lru_del(struct dentry *dentry) { list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; dentry->d_sb->s_nr_dentry_unused--; dentry_stat.nr_unused--; } @@ -806,6 +807,7 @@ relock: spin_unlock(&dentry->d_lock); } else { list_move_tail(&dentry->d_lru, &tmp); + dentry->d_flags |= DCACHE_SHRINK_LIST; spin_unlock(&dentry->d_lock); if (!--count) break; @@ -1097,14 +1099,19 @@ resume: /* * move only zero ref count dentries to the dispose list. + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. */ - if (!dentry->d_count) { + if (dentry->d_count) { + dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { dentry_lru_move_list(dentry, dispose); + dentry->d_flags |= DCACHE_SHRINK_LIST; found++; - } else { - dentry_lru_del(dentry); } - /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find @@ -1468,7 +1475,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode) return alias; } -static struct dentry * d_find_any_alias(struct inode *inode) +/** + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for + * + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. + */ +struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; @@ -1477,7 +1491,7 @@ static struct dentry * d_find_any_alias(struct inode *inode) spin_unlock(&inode->i_lock); return de; } - +EXPORT_SYMBOL(d_find_any_alias); /** * d_obtain_alias - find or allocate a dentry for a given inode diff --git a/fs/direct-io.c b/fs/direct-io.c index d740ab67ff6e..4a588dbd11bf 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -36,6 +36,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <linux/atomic.h> +#include <linux/prefetch.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, { int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ - unsigned long dio_count;/* Number of dio_block-sized blocks */ - unsigned long blkmask; int create; /* @@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, if (ret == 0) { BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); fs_startblk = sdio->block_in_file >> sdio->blkfactor; - dio_count = sdio->final_block_in_request - sdio->block_in_file; - fs_count = dio_count >> sdio->blkfactor; - blkmask = (1 << sdio->blkfactor) - 1; - if (dio_count & blkmask) - fs_count++; + fs_endblk = (sdio->final_block_in_request - 1) >> + sdio->blkfactor; + fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; @@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +static inline ssize_t +do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, size_t size; unsigned long addr; unsigned blkbits = inode->i_blkbits; - unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw & WRITE) rw = WRITE_ODIRECT; - if (bdev) - bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + /* + * Avoid references to bdev if not absolutely needed to give + * the early prefetch in the caller enough time. + */ if (offset & blocksize_mask) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) goto out; @@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (unlikely((addr & blocksize_mask) || + (size & blocksize_mask))) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits( + bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; } } @@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: return retval; } + +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + /* + * The block device state is needed in the end to finally + * submit everything. Since it's likely to be cache cold + * prefetch it here as first thing to hide some of the + * latency. + * + * Attempt to prefetch the pieces we likely need later. + */ + prefetch(&bdev->bd_disk->part_tbl); + prefetch(bdev->bd_queue); + prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, + submit_io, flags); +} + EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 6cf72fcc0d0c..e7e327d43fa5 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/in.h> #include <linux/in6.h> +#include <linux/dlmconstants.h> #include <net/ipv6.h> #include <net/sock.h> @@ -36,6 +37,7 @@ static struct config_group *space_list; static struct config_group *comm_list; static struct dlm_comm *local_comm; +static uint32_t dlm_comm_count; struct dlm_clusters; struct dlm_cluster; @@ -103,6 +105,8 @@ struct dlm_cluster { unsigned int cl_timewarn_cs; unsigned int cl_waitwarn_us; unsigned int cl_new_rsb_count; + unsigned int cl_recover_callbacks; + char cl_cluster_name[DLM_LOCKSPACE_LEN]; }; enum { @@ -118,6 +122,8 @@ enum { CLUSTER_ATTR_TIMEWARN_CS, CLUSTER_ATTR_WAITWARN_US, CLUSTER_ATTR_NEW_RSB_COUNT, + CLUSTER_ATTR_RECOVER_CALLBACKS, + CLUSTER_ATTR_CLUSTER_NAME, }; struct cluster_attribute { @@ -126,6 +132,27 @@ struct cluster_attribute { ssize_t (*store)(struct dlm_cluster *, const char *, size_t); }; +static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) +{ + return sprintf(buf, "%s\n", cl->cl_cluster_name); +} + +static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, + const char *buf, size_t len) +{ + strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); + strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); + return len; +} + +static struct cluster_attribute cluster_attr_cluster_name = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "cluster_name", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = cluster_cluster_name_read, + .store = cluster_cluster_name_write, +}; + static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, int *info_field, int check_zero, const char *buf, size_t len) @@ -171,6 +198,7 @@ CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); CLUSTER_ATTR(waitwarn_us, 0); CLUSTER_ATTR(new_rsb_count, 0); +CLUSTER_ATTR(recover_callbacks, 0); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, @@ -185,6 +213,8 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, + [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr, + [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr, NULL, }; @@ -293,6 +323,7 @@ struct dlm_comms { struct dlm_comm { struct config_item item; + int seq; int nodeid; int local; int addr_count; @@ -309,6 +340,7 @@ struct dlm_node { int nodeid; int weight; int new; + int comm_seq; /* copy of cm->seq when nd->nodeid is set */ }; static struct configfs_group_operations clusters_ops = { @@ -455,6 +487,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; + cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; + memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, + DLM_LOCKSPACE_LEN); space_list = &sps->ss_group; comm_list = &cms->cs_group; @@ -558,6 +593,11 @@ static struct config_item *make_comm(struct config_group *g, const char *name) return ERR_PTR(-ENOMEM); config_item_init_type_name(&cm->item, name, &comm_type); + + cm->seq = dlm_comm_count++; + if (!cm->seq) + cm->seq = dlm_comm_count++; + cm->nodeid = -1; cm->local = 0; cm->addr_count = 0; @@ -801,7 +841,10 @@ static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, size_t len) { + uint32_t seq = 0; nd->nodeid = simple_strtol(buf, NULL, 0); + dlm_comm_seq(nd->nodeid, &seq); + nd->comm_seq = seq; return len; } @@ -908,13 +951,13 @@ static void put_comm(struct dlm_comm *cm) } /* caller must free mem */ -int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, - int **new_out, int *new_count_out) +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out) { struct dlm_space *sp; struct dlm_node *nd; - int i = 0, rv = 0, ids_count = 0, new_count = 0; - int *ids, *new; + struct dlm_config_node *nodes, *node; + int rv, count; sp = get_space(lsname); if (!sp) @@ -927,73 +970,42 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, goto out; } - ids_count = sp->members_count; + count = sp->members_count; - ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); - if (!ids) { + nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); + if (!nodes) { rv = -ENOMEM; goto out; } + node = nodes; list_for_each_entry(nd, &sp->members, list) { - ids[i++] = nd->nodeid; - if (nd->new) - new_count++; - } - - if (ids_count != i) - printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i); - - if (!new_count) - goto out_ids; + node->nodeid = nd->nodeid; + node->weight = nd->weight; + node->new = nd->new; + node->comm_seq = nd->comm_seq; + node++; - new = kcalloc(new_count, sizeof(int), GFP_NOFS); - if (!new) { - kfree(ids); - rv = -ENOMEM; - goto out; + nd->new = 0; } - i = 0; - list_for_each_entry(nd, &sp->members, list) { - if (nd->new) { - new[i++] = nd->nodeid; - nd->new = 0; - } - } - *new_count_out = new_count; - *new_out = new; - - out_ids: - *ids_count_out = ids_count; - *ids_out = ids; + *count_out = count; + *nodes_out = nodes; + rv = 0; out: mutex_unlock(&sp->members_lock); put_space(sp); return rv; } -int dlm_node_weight(char *lsname, int nodeid) +int dlm_comm_seq(int nodeid, uint32_t *seq) { - struct dlm_space *sp; - struct dlm_node *nd; - int w = -EEXIST; - - sp = get_space(lsname); - if (!sp) - goto out; - - mutex_lock(&sp->members_lock); - list_for_each_entry(nd, &sp->members, list) { - if (nd->nodeid != nodeid) - continue; - w = nd->weight; - break; - } - mutex_unlock(&sp->members_lock); - put_space(sp); - out: - return w; + struct dlm_comm *cm = get_comm(nodeid, NULL); + if (!cm) + return -EEXIST; + *seq = cm->seq; + put_comm(cm); + return 0; } int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) @@ -1047,6 +1059,8 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 #define DEFAULT_NEW_RSB_COUNT 128 +#define DEFAULT_RECOVER_CALLBACKS 0 +#define DEFAULT_CLUSTER_NAME "" struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, @@ -1060,6 +1074,8 @@ struct dlm_config_info dlm_config = { .ci_protocol = DEFAULT_PROTOCOL, .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, .ci_waitwarn_us = DEFAULT_WAITWARN_US, - .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT + .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, + .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, + .ci_cluster_name = DEFAULT_CLUSTER_NAME }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 3099d0dd26c0..9f5e3663bb0c 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -14,6 +14,13 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ +struct dlm_config_node { + int nodeid; + int weight; + int new; + uint32_t comm_seq; +}; + #define DLM_MAX_ADDR_COUNT 3 struct dlm_config_info { @@ -29,15 +36,17 @@ struct dlm_config_info { int ci_timewarn_cs; int ci_waitwarn_us; int ci_new_rsb_count; + int ci_recover_callbacks; + char ci_cluster_name[DLM_LOCKSPACE_LEN]; }; extern struct dlm_config_info dlm_config; int dlm_config_init(void); void dlm_config_exit(void); -int dlm_node_weight(char *lsname, int nodeid); -int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, - int **new_out, int *new_count_out); +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out); +int dlm_comm_seq(int nodeid, uint32_t *seq); int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); int dlm_our_nodeid(void); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 59779237e2b4..3dca2b39e83f 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -393,6 +393,7 @@ static const struct seq_operations format3_seq_ops; static void *table_seq_start(struct seq_file *seq, loff_t *pos) { + struct rb_node *node; struct dlm_ls *ls = seq->private; struct rsbtbl_iter *ri; struct dlm_rsb *r; @@ -418,9 +419,10 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) ri->format = 3; spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, - res_hashchain) { + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + for (node = rb_first(&ls->ls_rsbtbl[bucket].keep); node; + node = rb_next(node)) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); if (!entry--) { dlm_hold_rsb(r); ri->rsb = r; @@ -449,9 +451,9 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) } spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - r = list_first_entry(&ls->ls_rsbtbl[bucket].list, - struct dlm_rsb, res_hashchain); + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + node = rb_first(&ls->ls_rsbtbl[bucket].keep); + r = rb_entry(node, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; ri->bucket = bucket; @@ -467,7 +469,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct dlm_ls *ls = seq->private; struct rsbtbl_iter *ri = iter_ptr; - struct list_head *next; + struct rb_node *next; struct dlm_rsb *r, *rp; loff_t n = *pos; unsigned bucket; @@ -480,10 +482,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) spin_lock(&ls->ls_rsbtbl[bucket].lock); rp = ri->rsb; - next = rp->res_hashchain.next; + next = rb_next(&rp->res_hashnode); - if (next != &ls->ls_rsbtbl[bucket].list) { - r = list_entry(next, struct dlm_rsb, res_hashchain); + if (next) { + r = rb_entry(next, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; spin_unlock(&ls->ls_rsbtbl[bucket].lock); @@ -511,9 +513,9 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) } spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - r = list_first_entry(&ls->ls_rsbtbl[bucket].list, - struct dlm_rsb, res_hashchain); + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + next = rb_first(&ls->ls_rsbtbl[bucket].keep); + r = rb_entry(next, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; ri->bucket = bucket; diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 7b84c1dbc82e..83641574b016 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -290,7 +290,6 @@ int dlm_recover_directory(struct dlm_ls *ls) out_status: error = 0; - dlm_set_recover_status(ls, DLM_RS_DIR); log_debug(ls, "dlm_recover_directory %d entries", count); out_free: kfree(last_name); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index fe2860c02449..3a564d197e99 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -103,8 +103,8 @@ struct dlm_dirtable { }; struct dlm_rsbtable { - struct list_head list; - struct list_head toss; + struct rb_root keep; + struct rb_root toss; spinlock_t lock; }; @@ -117,6 +117,10 @@ struct dlm_member { struct list_head list; int nodeid; int weight; + int slot; + int slot_prev; + int comm_seq; + uint32_t generation; }; /* @@ -125,10 +129,8 @@ struct dlm_member { struct dlm_recover { struct list_head list; - int *nodeids; /* nodeids of all members */ - int node_count; - int *new; /* nodeids of new members */ - int new_count; + struct dlm_config_node *nodes; + int nodes_count; uint64_t seq; }; @@ -285,7 +287,10 @@ struct dlm_rsb { unsigned long res_toss_time; uint32_t res_first_lkid; struct list_head res_lookup; /* lkbs waiting on first */ - struct list_head res_hashchain; /* rsbtbl */ + union { + struct list_head res_hashchain; + struct rb_node res_hashnode; /* rsbtbl */ + }; struct list_head res_grantqueue; struct list_head res_convertqueue; struct list_head res_waitqueue; @@ -334,7 +339,9 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) /* dlm_header is first element of all structs sent between nodes */ #define DLM_HEADER_MAJOR 0x00030000 -#define DLM_HEADER_MINOR 0x00000000 +#define DLM_HEADER_MINOR 0x00000001 + +#define DLM_HEADER_SLOTS 0x00000001 #define DLM_MSG 1 #define DLM_RCOM 2 @@ -422,10 +429,34 @@ union dlm_packet { struct dlm_rcom rcom; }; +#define DLM_RSF_NEED_SLOTS 0x00000001 + +/* RCOM_STATUS data */ +struct rcom_status { + __le32 rs_flags; + __le32 rs_unused1; + __le64 rs_unused2; +}; + +/* RCOM_STATUS_REPLY data */ struct rcom_config { __le32 rf_lvblen; __le32 rf_lsflags; - __le64 rf_unused; + + /* DLM_HEADER_SLOTS adds: */ + __le32 rf_flags; + __le16 rf_our_slot; + __le16 rf_num_slots; + __le32 rf_generation; + __le32 rf_unused1; + __le64 rf_unused2; +}; + +struct rcom_slot { + __le32 ro_nodeid; + __le16 ro_slot; + __le16 ro_unused1; + __le64 ro_unused2; }; struct rcom_lock { @@ -452,6 +483,7 @@ struct dlm_ls { struct list_head ls_list; /* list of lockspaces */ dlm_lockspace_t *ls_local_handle; uint32_t ls_global_id; /* global unique lockspace ID */ + uint32_t ls_generation; uint32_t ls_exflags; int ls_lvblen; int ls_count; /* refcount of processes in @@ -490,6 +522,11 @@ struct dlm_ls { int ls_total_weight; int *ls_node_array; + int ls_slot; + int ls_num_slots; + int ls_slots_size; + struct dlm_slot *ls_slots; + struct dlm_rsb ls_stub_rsb; /* for returning errors */ struct dlm_lkb ls_stub_lkb; /* for returning errors */ struct dlm_message ls_stub_ms; /* for faking a reply */ @@ -537,6 +574,9 @@ struct dlm_ls { struct list_head ls_root_list; /* root resources */ struct rw_semaphore ls_root_sem; /* protect root_list */ + const struct dlm_lockspace_ops *ls_ops; + void *ls_ops_arg; + int ls_namelen; char ls_name[1]; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 83b5e32514e1..d47183043c59 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -56,6 +56,7 @@ L: receive_xxxx_reply() <- R: send_xxxx_reply() */ #include <linux/types.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include "dlm_internal.h" #include <linux/dlm_device.h> @@ -380,6 +381,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); list_del(&r->res_hashchain); + /* Convert the empty list_head to a NULL rb_node for tree usage: */ + memset(&r->res_hashnode, 0, sizeof(struct rb_node)); ls->ls_new_rsb_count--; spin_unlock(&ls->ls_new_rsb_spin); @@ -388,7 +391,6 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, memcpy(r->res_name, name, len); mutex_init(&r->res_mutex); - INIT_LIST_HEAD(&r->res_hashchain); INIT_LIST_HEAD(&r->res_lookup); INIT_LIST_HEAD(&r->res_grantqueue); INIT_LIST_HEAD(&r->res_convertqueue); @@ -400,14 +402,31 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, return 0; } -static int search_rsb_list(struct list_head *head, char *name, int len, +static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) +{ + char maxname[DLM_RESNAME_MAXLEN]; + + memset(maxname, 0, DLM_RESNAME_MAXLEN); + memcpy(maxname, name, nlen); + return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); +} + +static int search_rsb_tree(struct rb_root *tree, char *name, int len, unsigned int flags, struct dlm_rsb **r_ret) { + struct rb_node *node = tree->rb_node; struct dlm_rsb *r; int error = 0; - - list_for_each_entry(r, head, res_hashchain) { - if (len == r->res_length && !memcmp(name, r->res_name, len)) + int rc; + + while (node) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); + rc = rsb_cmp(r, name, len); + if (rc < 0) + node = node->rb_left; + else if (rc > 0) + node = node->rb_right; + else goto found; } *r_ret = NULL; @@ -420,22 +439,54 @@ static int search_rsb_list(struct list_head *head, char *name, int len, return error; } +static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) +{ + struct rb_node **newn = &tree->rb_node; + struct rb_node *parent = NULL; + int rc; + + while (*newn) { + struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb, + res_hashnode); + + parent = *newn; + rc = rsb_cmp(cur, rsb->res_name, rsb->res_length); + if (rc < 0) + newn = &parent->rb_left; + else if (rc > 0) + newn = &parent->rb_right; + else { + log_print("rsb_insert match"); + dlm_dump_rsb(rsb); + dlm_dump_rsb(cur); + return -EEXIST; + } + } + + rb_link_node(&rsb->res_hashnode, parent, newn); + rb_insert_color(&rsb->res_hashnode, tree); + return 0; +} + static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, unsigned int flags, struct dlm_rsb **r_ret) { struct dlm_rsb *r; int error; - error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r); + error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r); if (!error) { kref_get(&r->res_ref); goto out; } - error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); + error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); if (error) goto out; - list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + if (error) + return error; if (dlm_no_directory(ls)) goto out; @@ -527,8 +578,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, nodeid = 0; r->res_nodeid = nodeid; } - list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); - error = 0; + error = rsb_insert(r, &ls->ls_rsbtbl[bucket].keep); out_unlock: spin_unlock(&ls->ls_rsbtbl[bucket].lock); out: @@ -556,7 +606,8 @@ static void toss_rsb(struct kref *kref) DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); kref_init(&r->res_ref); - list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); + rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; if (r->res_lvbptr) { dlm_free_lvb(r->res_lvbptr); @@ -1082,19 +1133,19 @@ static void dir_remove(struct dlm_rsb *r) r->res_name, r->res_length); } -/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is - found since they are in order of newest to oldest? */ +/* FIXME: make this more efficient */ static int shrink_bucket(struct dlm_ls *ls, int b) { + struct rb_node *n; struct dlm_rsb *r; int count = 0, found; for (;;) { found = 0; spin_lock(&ls->ls_rsbtbl[b].lock); - list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, - res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); if (!time_after_eq(jiffies, r->res_toss_time + dlm_config.ci_toss_secs * HZ)) continue; @@ -1108,7 +1159,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) } if (kref_put(&r->res_ref, kill_rsb)) { - list_del(&r->res_hashchain); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); spin_unlock(&ls->ls_rsbtbl[b].lock); if (is_master(r)) @@ -4441,10 +4492,12 @@ int dlm_purge_locks(struct dlm_ls *ls) static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) { + struct rb_node *n; struct dlm_rsb *r, *r_ret = NULL; spin_lock(&ls->ls_rsbtbl[bucket].lock); - list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); if (!rsb_flag(r, RSB_LOCKS_PURGED)) continue; hold_rsb(r); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index a1d8f1af144b..a1ea25face82 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -386,12 +386,15 @@ static void threads_stop(void) dlm_lowcomms_stop(); } -static int new_lockspace(const char *name, int namelen, void **lockspace, - uint32_t flags, int lvblen) +static int new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) { struct dlm_ls *ls; int i, size, error; int do_unreg = 0; + int namelen = strlen(name); if (namelen > DLM_LOCKSPACE_LEN) return -EINVAL; @@ -403,8 +406,24 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, return -EINVAL; if (!dlm_user_daemon_available()) { - module_put(THIS_MODULE); - return -EUNATCH; + log_print("dlm user daemon not available"); + error = -EUNATCH; + goto out; + } + + if (ops && ops_result) { + if (!dlm_config.ci_recover_callbacks) + *ops_result = -EOPNOTSUPP; + else + *ops_result = 0; + } + + if (dlm_config.ci_recover_callbacks && cluster && + strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) { + log_print("dlm cluster name %s mismatch %s", + dlm_config.ci_cluster_name, cluster); + error = -EBADR; + goto out; } error = 0; @@ -442,6 +461,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, ls->ls_flags = 0; ls->ls_scan_time = jiffies; + if (ops && dlm_config.ci_recover_callbacks) { + ls->ls_ops = ops; + ls->ls_ops_arg = ops_arg; + } + if (flags & DLM_LSFL_TIMEWARN) set_bit(LSFL_TIMEWARN, &ls->ls_flags); @@ -457,8 +481,8 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, if (!ls->ls_rsbtbl) goto out_lsfree; for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); - INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); + ls->ls_rsbtbl[i].keep.rb_node = NULL; + ls->ls_rsbtbl[i].toss.rb_node = NULL; spin_lock_init(&ls->ls_rsbtbl[i].lock); } @@ -525,6 +549,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, if (!ls->ls_recover_buf) goto out_dirfree; + ls->ls_slot = 0; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; + ls->ls_slots = NULL; + INIT_LIST_HEAD(&ls->ls_recover_list); spin_lock_init(&ls->ls_recover_list_lock); ls->ls_recover_list_count = 0; @@ -614,8 +643,10 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, return error; } -int dlm_new_lockspace(const char *name, int namelen, void **lockspace, - uint32_t flags, int lvblen) +int dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) { int error = 0; @@ -625,7 +656,8 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace, if (error) goto out; - error = new_lockspace(name, namelen, lockspace, flags, lvblen); + error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg, + ops_result, lockspace); if (!error) ls_count++; if (error > 0) @@ -685,7 +717,7 @@ static int lockspace_busy(struct dlm_ls *ls, int force) static int release_lockspace(struct dlm_ls *ls, int force) { struct dlm_rsb *rsb; - struct list_head *head; + struct rb_node *n; int i, busy, rv; busy = lockspace_busy(ls, force); @@ -746,20 +778,15 @@ static int release_lockspace(struct dlm_ls *ls, int force) */ for (i = 0; i < ls->ls_rsbtbl_size; i++) { - head = &ls->ls_rsbtbl[i].list; - while (!list_empty(head)) { - rsb = list_entry(head->next, struct dlm_rsb, - res_hashchain); - - list_del(&rsb->res_hashchain); + while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].keep); dlm_free_rsb(rsb); } - head = &ls->ls_rsbtbl[i].toss; - while (!list_empty(head)) { - rsb = list_entry(head->next, struct dlm_rsb, - res_hashchain); - list_del(&rsb->res_hashchain); + while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].toss); dlm_free_rsb(rsb); } } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index b12532e553f8..862640a36d5c 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -19,6 +19,280 @@ #include "config.h" #include "lowcomms.h" +int dlm_slots_version(struct dlm_header *h) +{ + if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS) + return 0; + return 1; +} + +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb) +{ + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + + if (!dlm_slots_version(&rc->rc_header)) + return; + + memb->slot = le16_to_cpu(rf->rf_our_slot); + memb->generation = le32_to_cpu(rf->rf_generation); +} + +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct dlm_slot *slot; + struct rcom_slot *ro; + int i; + + ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + /* ls_slots array is sparse, but not rcom_slots */ + + for (i = 0; i < ls->ls_slots_size; i++) { + slot = &ls->ls_slots[i]; + if (!slot->nodeid) + continue; + ro->ro_nodeid = cpu_to_le32(slot->nodeid); + ro->ro_slot = cpu_to_le16(slot->slot); + ro++; + } +} + +#define SLOT_DEBUG_LINE 128 + +static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, + struct rcom_slot *ro0, struct dlm_slot *array, + int array_size) +{ + char line[SLOT_DEBUG_LINE]; + int len = SLOT_DEBUG_LINE - 1; + int pos = 0; + int ret, i; + + if (!dlm_config.ci_log_debug) + return; + + memset(line, 0, sizeof(line)); + + if (array) { + for (i = 0; i < array_size; i++) { + if (!array[i].nodeid) + continue; + + ret = snprintf(line + pos, len - pos, " %d:%d", + array[i].slot, array[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } else if (ro0) { + for (i = 0; i < num_slots; i++) { + ret = snprintf(line + pos, len - pos, " %d:%d", + ro0[i].ro_slot, ro0[i].ro_nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } + + log_debug(ls, "generation %u slots %d%s", gen, num_slots, line); +} + +int dlm_slots_copy_in(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_rcom *rc = ls->ls_recover_buf; + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + struct rcom_slot *ro0, *ro; + int our_nodeid = dlm_our_nodeid(); + int i, num_slots; + uint32_t gen; + + if (!dlm_slots_version(&rc->rc_header)) + return -1; + + gen = le32_to_cpu(rf->rf_generation); + if (gen <= ls->ls_generation) { + log_error(ls, "dlm_slots_copy_in gen %u old %u", + gen, ls->ls_generation); + } + ls->ls_generation = gen; + + num_slots = le16_to_cpu(rf->rf_num_slots); + if (!num_slots) + return -1; + + ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid); + ro->ro_slot = le16_to_cpu(ro->ro_slot); + } + + log_debug_slots(ls, gen, num_slots, ro0, NULL, 0); + + list_for_each_entry(memb, &ls->ls_nodes, list) { + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + if (ro->ro_nodeid != memb->nodeid) + continue; + memb->slot = ro->ro_slot; + memb->slot_prev = memb->slot; + break; + } + + if (memb->nodeid == our_nodeid) { + if (ls->ls_slot && ls->ls_slot != memb->slot) { + log_error(ls, "dlm_slots_copy_in our slot " + "changed %d %d", ls->ls_slot, + memb->slot); + return -1; + } + + if (!ls->ls_slot) + ls->ls_slot = memb->slot; + } + + if (!memb->slot) { + log_error(ls, "dlm_slots_copy_in nodeid %d no slot", + memb->nodeid); + return -1; + } + } + + return 0; +} + +/* for any nodes that do not support slots, we will not have set memb->slot + in wait_status_all(), so memb->slot will remain -1, and we will not + assign slots or set ls_num_slots here */ + +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out) +{ + struct dlm_member *memb; + struct dlm_slot *array; + int our_nodeid = dlm_our_nodeid(); + int array_size, max_slots, i; + int need = 0; + int max = 0; + int num = 0; + uint32_t gen = 0; + + /* our own memb struct will have slot -1 gen 0 */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == our_nodeid) { + memb->slot = ls->ls_slot; + memb->generation = ls->ls_generation; + break; + } + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->generation > gen) + gen = memb->generation; + + /* node doesn't support slots */ + + if (memb->slot == -1) + return -1; + + /* node needs a slot assigned */ + + if (!memb->slot) + need++; + + /* node has a slot assigned */ + + num++; + + if (!max || max < memb->slot) + max = memb->slot; + + /* sanity check, once slot is assigned it shouldn't change */ + + if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) { + log_error(ls, "nodeid %d slot changed %d %d", + memb->nodeid, memb->slot_prev, memb->slot); + return -1; + } + memb->slot_prev = memb->slot; + } + + array_size = max + need; + + array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS); + if (!array) + return -ENOMEM; + + num = 0; + + /* fill in slots (offsets) that are used */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!memb->slot) + continue; + + if (memb->slot > array_size) { + log_error(ls, "invalid slot number %d", memb->slot); + kfree(array); + return -1; + } + + array[memb->slot - 1].nodeid = memb->nodeid; + array[memb->slot - 1].slot = memb->slot; + num++; + } + + /* assign new slots from unused offsets */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->slot) + continue; + + for (i = 0; i < array_size; i++) { + if (array[i].nodeid) + continue; + + memb->slot = i + 1; + memb->slot_prev = memb->slot; + array[i].nodeid = memb->nodeid; + array[i].slot = memb->slot; + num++; + + if (!ls->ls_slot && memb->nodeid == our_nodeid) + ls->ls_slot = memb->slot; + break; + } + + if (!memb->slot) { + log_error(ls, "no free slot found"); + kfree(array); + return -1; + } + } + + gen++; + + log_debug_slots(ls, gen, num, NULL, array, array_size); + + max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - + sizeof(struct rcom_config)) / sizeof(struct rcom_slot); + + if (num > max_slots) { + log_error(ls, "num_slots %d exceeds max_slots %d", + num, max_slots); + kfree(array); + return -1; + } + + *gen_out = gen; + *slots_out = array; + *slots_size = array_size; + *num_slots = num; + return 0; +} + static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) { struct dlm_member *memb = NULL; @@ -43,59 +317,51 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) } } -static int dlm_add_member(struct dlm_ls *ls, int nodeid) +static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) { struct dlm_member *memb; - int w, error; + int error; memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); if (!memb) return -ENOMEM; - w = dlm_node_weight(ls->ls_name, nodeid); - if (w < 0) { - kfree(memb); - return w; - } - - error = dlm_lowcomms_connect_node(nodeid); + error = dlm_lowcomms_connect_node(node->nodeid); if (error < 0) { kfree(memb); return error; } - memb->nodeid = nodeid; - memb->weight = w; + memb->nodeid = node->nodeid; + memb->weight = node->weight; + memb->comm_seq = node->comm_seq; add_ordered_member(ls, memb); ls->ls_num_nodes++; return 0; } -static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) -{ - list_move(&memb->list, &ls->ls_nodes_gone); - ls->ls_num_nodes--; -} - -int dlm_is_member(struct dlm_ls *ls, int nodeid) +static struct dlm_member *find_memb(struct list_head *head, int nodeid) { struct dlm_member *memb; - list_for_each_entry(memb, &ls->ls_nodes, list) { + list_for_each_entry(memb, head, list) { if (memb->nodeid == nodeid) - return 1; + return memb; } + return NULL; +} + +int dlm_is_member(struct dlm_ls *ls, int nodeid) +{ + if (find_memb(&ls->ls_nodes, nodeid)) + return 1; return 0; } int dlm_is_removed(struct dlm_ls *ls, int nodeid) { - struct dlm_member *memb; - - list_for_each_entry(memb, &ls->ls_nodes_gone, list) { - if (memb->nodeid == nodeid) - return 1; - } + if (find_memb(&ls->ls_nodes_gone, nodeid)) + return 1; return 0; } @@ -176,7 +442,7 @@ static int ping_members(struct dlm_ls *ls) error = dlm_recovery_stopped(ls); if (error) break; - error = dlm_rcom_status(ls, memb->nodeid); + error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) break; } @@ -186,10 +452,88 @@ static int ping_members(struct dlm_ls *ls) return error; } +static void dlm_lsop_recover_prep(struct dlm_ls *ls) +{ + if (!ls->ls_ops || !ls->ls_ops->recover_prep) + return; + ls->ls_ops->recover_prep(ls->ls_ops_arg); +} + +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +{ + struct dlm_slot slot; + uint32_t seq; + int error; + + if (!ls->ls_ops || !ls->ls_ops->recover_slot) + return; + + /* if there is no comms connection with this node + or the present comms connection is newer + than the one when this member was added, then + we consider the node to have failed (versus + being removed due to dlm_release_lockspace) */ + + error = dlm_comm_seq(memb->nodeid, &seq); + + if (!error && seq == memb->comm_seq) + return; + + slot.nodeid = memb->nodeid; + slot.slot = memb->slot; + + ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot); +} + +void dlm_lsop_recover_done(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_slot *slots; + int i, num; + + if (!ls->ls_ops || !ls->ls_ops->recover_done) + return; + + num = ls->ls_num_nodes; + + slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL); + if (!slots) + return; + + i = 0; + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (i == num) { + log_error(ls, "dlm_lsop_recover_done bad num %d", num); + goto out; + } + slots[i].nodeid = memb->nodeid; + slots[i].slot = memb->slot; + i++; + } + + ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num, + ls->ls_slot, ls->ls_generation); + out: + kfree(slots); +} + +static struct dlm_config_node *find_config_node(struct dlm_recover *rv, + int nodeid) +{ + int i; + + for (i = 0; i < rv->nodes_count; i++) { + if (rv->nodes[i].nodeid == nodeid) + return &rv->nodes[i]; + } + return NULL; +} + int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) { struct dlm_member *memb, *safe; - int i, error, found, pos = 0, neg = 0, low = -1; + struct dlm_config_node *node; + int i, error, neg = 0, low = -1; /* previously removed members that we've not finished removing need to count as a negative change so the "neg" recovery steps will happen */ @@ -202,46 +546,32 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) /* move departed members from ls_nodes to ls_nodes_gone */ list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { - found = 0; - for (i = 0; i < rv->node_count; i++) { - if (memb->nodeid == rv->nodeids[i]) { - found = 1; - break; - } - } + node = find_config_node(rv, memb->nodeid); + if (node && !node->new) + continue; - if (!found) { - neg++; - dlm_remove_member(ls, memb); + if (!node) { log_debug(ls, "remove member %d", memb->nodeid); + } else { + /* removed and re-added */ + log_debug(ls, "remove member %d comm_seq %u %u", + memb->nodeid, memb->comm_seq, node->comm_seq); } - } - - /* Add an entry to ls_nodes_gone for members that were removed and - then added again, so that previous state for these nodes will be - cleared during recovery. */ - - for (i = 0; i < rv->new_count; i++) { - if (!dlm_is_member(ls, rv->new[i])) - continue; - log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); - memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); - if (!memb) - return -ENOMEM; - memb->nodeid = rv->new[i]; - list_add_tail(&memb->list, &ls->ls_nodes_gone); neg++; + list_move(&memb->list, &ls->ls_nodes_gone); + ls->ls_num_nodes--; + dlm_lsop_recover_slot(ls, memb); } /* add new members to ls_nodes */ - for (i = 0; i < rv->node_count; i++) { - if (dlm_is_member(ls, rv->nodeids[i])) + for (i = 0; i < rv->nodes_count; i++) { + node = &rv->nodes[i]; + if (dlm_is_member(ls, node->nodeid)) continue; - dlm_add_member(ls, rv->nodeids[i]); - pos++; - log_debug(ls, "add member %d", rv->nodeids[i]); + dlm_add_member(ls, node); + log_debug(ls, "add member %d", node->nodeid); } list_for_each_entry(memb, &ls->ls_nodes, list) { @@ -251,7 +581,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) ls->ls_low_nodeid = low; make_member_array(ls); - dlm_set_recover_status(ls, DLM_RS_NODES); *neg_out = neg; error = ping_members(ls); @@ -261,12 +590,8 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) ls->ls_members_result = error; complete(&ls->ls_members_done); } - if (error) - goto out; - error = dlm_recover_members_wait(ls); - out: - log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error); + log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } @@ -327,26 +652,35 @@ int dlm_ls_stop(struct dlm_ls *ls) */ dlm_recoverd_suspend(ls); + + spin_lock(&ls->ls_recover_lock); + kfree(ls->ls_slots); + ls->ls_slots = NULL; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; ls->ls_recover_status = 0; + spin_unlock(&ls->ls_recover_lock); + dlm_recoverd_resume(ls); if (!ls->ls_recover_begin) ls->ls_recover_begin = jiffies; + + dlm_lsop_recover_prep(ls); return 0; } int dlm_ls_start(struct dlm_ls *ls) { struct dlm_recover *rv = NULL, *rv_old; - int *ids = NULL, *new = NULL; - int error, ids_count = 0, new_count = 0; + struct dlm_config_node *nodes; + int error, count; rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); if (!rv) return -ENOMEM; - error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count, - &new, &new_count); + error = dlm_config_nodes(ls->ls_name, &nodes, &count); if (error < 0) goto fail; @@ -361,10 +695,8 @@ int dlm_ls_start(struct dlm_ls *ls) goto fail; } - rv->nodeids = ids; - rv->node_count = ids_count; - rv->new = new; - rv->new_count = new_count; + rv->nodes = nodes; + rv->nodes_count = count; rv->seq = ++ls->ls_recover_seq; rv_old = ls->ls_recover_args; ls->ls_recover_args = rv; @@ -372,9 +704,8 @@ int dlm_ls_start(struct dlm_ls *ls) if (rv_old) { log_error(ls, "unused recovery %llx %d", - (unsigned long long)rv_old->seq, rv_old->node_count); - kfree(rv_old->nodeids); - kfree(rv_old->new); + (unsigned long long)rv_old->seq, rv_old->nodes_count); + kfree(rv_old->nodes); kfree(rv_old); } @@ -383,8 +714,7 @@ int dlm_ls_start(struct dlm_ls *ls) fail: kfree(rv); - kfree(ids); - kfree(new); + kfree(nodes); return error; } diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 7a26fca1e0b5..3deb70661c69 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -20,6 +20,14 @@ void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); int dlm_is_member(struct dlm_ls *ls, int nodeid); +int dlm_slots_version(struct dlm_header *h); +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb); +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_slots_copy_in(struct dlm_ls *ls); +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out); +void dlm_lsop_recover_done(struct dlm_ls *ls); #endif /* __MEMBER_DOT_H__ */ diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f10a50f24e8f..ac5c616c9696 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -23,6 +23,7 @@ #include "memory.h" #include "lock.h" #include "util.h" +#include "member.h" static int rcom_response(struct dlm_ls *ls) @@ -72,20 +73,30 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, dlm_lowcomms_commit_buffer(mh); } +static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs, + uint32_t flags) +{ + rs->rs_flags = cpu_to_le32(flags); +} + /* When replying to a status request, a node also sends back its configuration values. The requesting node then checks that the remote node is configured the same way as itself. */ -static void make_config(struct dlm_ls *ls, struct rcom_config *rf) +static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf, + uint32_t num_slots) { rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); + + rf->rf_our_slot = cpu_to_le16(ls->ls_slot); + rf->rf_num_slots = cpu_to_le16(num_slots); + rf->rf_generation = cpu_to_le32(ls->ls_generation); } -static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; - size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { log_error(ls, "version mismatch: %x nodeid %d: %x", @@ -94,12 +105,6 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) return -EPROTO; } - if (rc->rc_header.h_length < conf_size) { - log_error(ls, "config too short: %d nodeid %d", - rc->rc_header.h_length, nodeid); - return -EPROTO; - } - if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", @@ -127,7 +132,18 @@ static void disallow_sync_reply(struct dlm_ls *ls) spin_unlock(&ls->ls_rcom_spin); } -int dlm_rcom_status(struct dlm_ls *ls, int nodeid) +/* + * low nodeid gathers one slot value at a time from each node. + * it sets need_slots=0, and saves rf_our_slot returned from each + * rcom_config. + * + * other nodes gather all slot values at once from the low nodeid. + * they set need_slots=1, and ignore the rf_our_slot returned from each + * rcom_config. they use the rf_num_slots returned from the low + * node's rcom_config. + */ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) { struct dlm_rcom *rc; struct dlm_mhandle *mh; @@ -141,10 +157,13 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) goto out; } - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, + sizeof(struct rcom_status), &rc, &mh); if (error) goto out; + set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); + allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); @@ -161,8 +180,11 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) /* we pretend the remote lockspace exists with 0 status */ log_debug(ls, "remote node %d not ready", nodeid); rc->rc_result = 0; - } else - error = check_config(ls, rc, nodeid); + error = 0; + } else { + error = check_rcom_config(ls, rc, nodeid); + } + /* the caller looks at rc_result for the remote recovery status */ out: return error; @@ -172,17 +194,60 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct dlm_mhandle *mh; - int error, nodeid = rc_in->rc_header.h_nodeid; + struct rcom_status *rs; + uint32_t status; + int nodeid = rc_in->rc_header.h_nodeid; + int len = sizeof(struct rcom_config); + int num_slots = 0; + int error; + + if (!dlm_slots_version(&rc_in->rc_header)) { + status = dlm_recover_status(ls); + goto do_create; + } + + rs = (struct rcom_status *)rc_in->rc_buf; + if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) { + status = dlm_recover_status(ls); + goto do_create; + } + + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + num_slots = ls->ls_num_slots; + spin_unlock(&ls->ls_recover_lock); + len += num_slots * sizeof(struct rcom_slot); + + do_create: error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, - sizeof(struct rcom_config), &rc, &mh); + len, &rc, &mh); if (error) return; + rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - rc->rc_result = dlm_recover_status(ls); - make_config(ls, (struct rcom_config *) rc->rc_buf); + rc->rc_result = status; + + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots); + + if (!num_slots) + goto do_send; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_num_slots != num_slots) { + spin_unlock(&ls->ls_recover_lock); + log_debug(ls, "receive_rcom_status num_slots %d to %d", + num_slots, ls->ls_num_slots); + rc->rc_result = 0; + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0); + goto do_send; + } + + dlm_slots_copy_out(ls, rc); + spin_unlock(&ls->ls_recover_lock); + do_send: send_rcom(ls, mh, rc); } diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index b09abd29ba38..206723ab744d 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -14,7 +14,7 @@ #ifndef __RCOM_DOT_H__ #define __RCOM_DOT_H__ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid); +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 14638235f7b2..34d5adf1fce7 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -85,14 +85,20 @@ uint32_t dlm_recover_status(struct dlm_ls *ls) return status; } +static void _set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + ls->ls_recover_status |= status; +} + void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) { spin_lock(&ls->ls_recover_lock); - ls->ls_recover_status |= status; + _set_recover_status(ls, status); spin_unlock(&ls->ls_recover_lock); } -static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) +static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, + int save_slots) { struct dlm_rcom *rc = ls->ls_recover_buf; struct dlm_member *memb; @@ -106,10 +112,13 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) goto out; } - error = dlm_rcom_status(ls, memb->nodeid); + error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) goto out; + if (save_slots) + dlm_slot_save(ls, rc, memb); + if (rc->rc_result & wait_status) break; if (delay < 1000) @@ -121,7 +130,8 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) return error; } -static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) +static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, + uint32_t status_flags) { struct dlm_rcom *rc = ls->ls_recover_buf; int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; @@ -132,7 +142,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) goto out; } - error = dlm_rcom_status(ls, nodeid); + error = dlm_rcom_status(ls, nodeid, status_flags); if (error) break; @@ -152,18 +162,56 @@ static int wait_status(struct dlm_ls *ls, uint32_t status) int error; if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, status); + error = wait_status_all(ls, status, 0); if (!error) dlm_set_recover_status(ls, status_all); } else - error = wait_status_low(ls, status_all); + error = wait_status_low(ls, status_all, 0); return error; } int dlm_recover_members_wait(struct dlm_ls *ls) { - return wait_status(ls, DLM_RS_NODES); + struct dlm_member *memb; + struct dlm_slot *slots; + int num_slots, slots_size; + int error, rv; + uint32_t gen; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + memb->slot = -1; + memb->generation = 0; + } + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, DLM_RS_NODES, 1); + if (error) + goto out; + + /* slots array is sparse, slots_size may be > num_slots */ + + rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen); + if (!rv) { + spin_lock(&ls->ls_recover_lock); + _set_recover_status(ls, DLM_RS_NODES_ALL); + ls->ls_num_slots = num_slots; + ls->ls_slots_size = slots_size; + ls->ls_slots = slots; + ls->ls_generation = gen; + spin_unlock(&ls->ls_recover_lock); + } else { + dlm_set_recover_status(ls, DLM_RS_NODES_ALL); + } + } else { + error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS); + if (error) + goto out; + + dlm_slots_copy_in(ls); + } + out: + return error; } int dlm_recover_directory_wait(struct dlm_ls *ls) @@ -542,8 +590,6 @@ int dlm_recover_locks(struct dlm_ls *ls) out: if (error) recover_list_clear(ls); - else - dlm_set_recover_status(ls, DLM_RS_LOCKS); return error; } @@ -715,6 +761,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls) int dlm_create_root_list(struct dlm_ls *ls) { + struct rb_node *n; struct dlm_rsb *r; int i, error = 0; @@ -727,7 +774,8 @@ int dlm_create_root_list(struct dlm_ls *ls) for (i = 0; i < ls->ls_rsbtbl_size; i++) { spin_lock(&ls->ls_rsbtbl[i].lock); - list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } @@ -741,7 +789,8 @@ int dlm_create_root_list(struct dlm_ls *ls) continue; } - list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } @@ -771,16 +820,18 @@ void dlm_release_root_list(struct dlm_ls *ls) void dlm_clear_toss_list(struct dlm_ls *ls) { - struct dlm_rsb *r, *safe; + struct rb_node *n, *next; + struct dlm_rsb *rsb; int i; for (i = 0; i < ls->ls_rsbtbl_size; i++) { spin_lock(&ls->ls_rsbtbl[i].lock); - list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, - res_hashchain) { - if (dlm_no_directory(ls) || !is_master(r)) { - list_del(&r->res_hashchain); - dlm_free_rsb(r); + for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) { + next = rb_next(n);; + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + if (dlm_no_directory(ls) || !is_master(rsb)) { + rb_erase(n, &ls->ls_rsbtbl[i].toss); + dlm_free_rsb(rsb); } } spin_unlock(&ls->ls_rsbtbl[i].lock); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 774da3cf92c6..3780caf7ae0c 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "recover %llx", (unsigned long long)rv->seq); + log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -76,14 +76,22 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Add or remove nodes from the lockspace's ls_nodes list. - * Also waits for all nodes to complete dlm_recover_members. */ error = dlm_recover_members(ls, rv, &neg); if (error) { - log_debug(ls, "recover_members failed %d", error); + log_debug(ls, "dlm_recover_members error %d", error); goto fail; } + + dlm_set_recover_status(ls, DLM_RS_NODES); + + error = dlm_recover_members_wait(ls); + if (error) { + log_debug(ls, "dlm_recover_members_wait error %d", error); + goto fail; + } + start = jiffies; /* @@ -93,17 +101,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory(ls); if (error) { - log_debug(ls, "recover_directory failed %d", error); + log_debug(ls, "dlm_recover_directory error %d", error); goto fail; } - /* - * Wait for all nodes to complete directory rebuild. - */ + dlm_set_recover_status(ls, DLM_RS_DIR); error = dlm_recover_directory_wait(ls); if (error) { - log_debug(ls, "recover_directory_wait failed %d", error); + log_debug(ls, "dlm_recover_directory_wait error %d", error); goto fail; } @@ -133,7 +139,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_masters(ls); if (error) { - log_debug(ls, "recover_masters failed %d", error); + log_debug(ls, "dlm_recover_masters error %d", error); goto fail; } @@ -143,13 +149,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks(ls); if (error) { - log_debug(ls, "recover_locks failed %d", error); + log_debug(ls, "dlm_recover_locks error %d", error); goto fail; } + dlm_set_recover_status(ls, DLM_RS_LOCKS); + error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "dlm_recover_locks_wait error %d", error); goto fail; } @@ -170,7 +178,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "dlm_recover_locks_wait error %d", error); goto fail; } } @@ -186,9 +194,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_purge_requestqueue(ls); dlm_set_recover_status(ls, DLM_RS_DONE); + error = dlm_recover_done_wait(ls); if (error) { - log_debug(ls, "recover_done_wait failed %d", error); + log_debug(ls, "dlm_recover_done_wait error %d", error); goto fail; } @@ -200,34 +209,35 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = enable_locking(ls, rv->seq); if (error) { - log_debug(ls, "enable_locking failed %d", error); + log_debug(ls, "enable_locking error %d", error); goto fail; } error = dlm_process_requestqueue(ls); if (error) { - log_debug(ls, "process_requestqueue failed %d", error); + log_debug(ls, "dlm_process_requestqueue error %d", error); goto fail; } error = dlm_recover_waiters_post(ls); if (error) { - log_debug(ls, "recover_waiters_post failed %d", error); + log_debug(ls, "dlm_recover_waiters_post error %d", error); goto fail; } dlm_grant_after_purge(ls); - log_debug(ls, "recover %llx done: %u ms", - (unsigned long long)rv->seq, + log_debug(ls, "dlm_recover %llx generation %u done: %u ms", + (unsigned long long)rv->seq, ls->ls_generation, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); + dlm_lsop_recover_done(ls); return 0; fail: dlm_release_root_list(ls); - log_debug(ls, "recover %llx error %d", + log_debug(ls, "dlm_recover %llx error %d", (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; @@ -250,8 +260,7 @@ static void do_ls_recovery(struct dlm_ls *ls) if (rv) { ls_recover(ls, rv); - kfree(rv->nodeids); - kfree(rv->new); + kfree(rv->nodes); kfree(rv); } } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index d8ea60756403..eb4ed9ba3098 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -392,8 +392,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = dlm_new_lockspace(params->name, strlen(params->name), - &lockspace, params->flags, DLM_USER_LVB_LEN); + error = dlm_new_lockspace(params->name, NULL, params->flags, + DLM_USER_LVB_LEN, NULL, NULL, NULL, + &lockspace); if (error) return error; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 828e750af23a..aabdfc38cf24 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -197,6 +197,12 @@ struct eventpoll { /* The user that created the eventpoll descriptor */ struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; }; /* Wait structure used by the poll hooks */ @@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -276,6 +291,12 @@ ctl_table epoll_table[] = { }; #endif /* CONFIG_SYSCTL */ +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an eventpoll file */ -static inline int is_file_epoll(struct file *f) -{ - return f->f_op == &eventpoll_fops; -} - /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are @@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. + */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + /* * Must be called with "mtx" held. */ @@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, */ ep_rbtree_insert(ep, epi); + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); @@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, return 0; +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + error_unregister: ep_unregister_pollwait(ep, epi); @@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) int error = 0; struct file *file = priv; struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, - ep_loop_check_proc, epi->ffd.file, - epi->ffd.file->private_data, current); + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); if (error != 0) break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); } } mutex_unlock(&ep->mtx); @@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); } /* @@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error; + int error, fd; struct eventpoll *ep = NULL; + struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); - if (error < 0) - ep_free(ep); - + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); return error; } @@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are - * better be handled here, than in more critical paths. + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. * - * We hold epmutex across the loop check and the insert in this case, in - * order to prevent two separate inserts from racing and each doing the - * insert "at the same time" such that ep_loop_check passes on both - * before either one does the insert, thereby creating a cycle. + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. */ - if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { mutex_lock(&epmutex); did_lock_epmutex = 1; - error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) - goto error_tgt_fput; } - + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } mutex_lock_nested(&ep->mtx, 0); @@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: - if (unlikely(did_lock_epmutex)) + if (did_lock_epmutex) mutex_unlock(&epmutex); fput(tfile); diff --git a/fs/exec.c b/fs/exec.c index 3f64b9f26e7d..aeb135c7ff5c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> + +#include <trace/events/task.h> #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully. diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 12ccacda44e0..f9e2cd8cf711 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,6 +23,8 @@ #include <trace/events/ext4.h> +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. */ -unsigned ext4_num_base_meta_clusters(struct super_block *sb, +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1554b15f91bc..513004fc3d84 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,6 +511,14 @@ struct ext4_new_group_data { __u32 free_blocks_count; }; +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + /* * Flags used by ext4_map_blocks() */ @@ -575,6 +583,7 @@ struct ext4_new_group_data { /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -957,12 +966,13 @@ struct ext4_inode_info { #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) -#define ext4_set_bit __test_and_set_bit_le +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le #define ext4_set_bit_atomic ext2_set_bit_atomic -#define ext4_clear_bit __test_and_clear_bit_le +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le #define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le -#define ext4_find_first_zero_bit find_first_zero_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le @@ -1397,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1409,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1790,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb, extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); -extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group); extern unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); @@ -1880,16 +1891,9 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_discard_partial_page_buffers(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length, int flags); -extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1924,6 +1928,7 @@ extern int ext4_group_add(struct super_block *sb, extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ extern void *ext4_kvmalloc(size_t size, gfp_t flags); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 841faf5fb785..74f23c292e1b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3280,6 +3280,9 @@ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t i, pg_lblk; pgoff_t index; + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; + /* reverse search wont work if fs block size is less than page size */ if (inode->i_blkbits < PAGE_CACHE_SHIFT) search_hint_reverse = 0; @@ -3452,8 +3455,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" - "block %llu, max_blocks %u, flags %d, allocated %u", + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); @@ -3624,7 +3627,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); ext4_lblk_t ex_cluster_start, ex_cluster_end; - ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); @@ -3635,7 +3638,6 @@ static int get_implied_cluster_alloc(struct super_block *sb, /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); - rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4637af036d9c..25d8c9781ad9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) fatal = ext4_journal_get_write_access(handle, bh2); } ext4_lock_group(sb, block_group); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; @@ -358,7 +358,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; + unsigned int freei, avefreei, grp_free; ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; @@ -477,8 +477,8 @@ fallback_retry: for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) { + grp_free = ext4_free_inodes_count(sb, desc); + if (desc && grp_free && grp_free >= avefreei) { *group = grp; return 0; } @@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb, */ down_read(&grp->alloc_sem); ext4_lock_group(sb, group); - if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; goto err_ret; @@ -885,8 +885,12 @@ got: if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index aa8efa6572d6..feaa82fe629d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -71,6 +71,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); /* * Test whether an inode is a fast symlink. @@ -2759,7 +2762,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (!io_end || !size) goto out; - ext_debug("ext4_end_io_dio(): io_end 0x%p" + ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); @@ -3160,7 +3163,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle, * * Returns zero on sucess or negative on failure. */ -int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, struct inode *inode, struct page *page, loff_t from, loff_t length, int flags) { @@ -3300,126 +3303,6 @@ next: return err; } -/* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) -{ - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; - ext4_lblk_t iblock; - struct inode *inode = mapping->host; - struct buffer_head *bh; - struct page *page; - int err = 0; - - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); - if (!page) - return -ENOMEM; - - blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; - - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - err = 0; - if (buffer_freed(bh)) { - BUFFER_TRACE(bh, "freed: skip"); - goto unlock; - } - - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto unlock; - } - } - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) - goto unlock; - } - - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto unlock; - } - - zero_user(page, offset, length); - - BUFFER_TRACE(bh, "zeroed end of block"); - - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); - -unlock: - unlock_page(page); - page_cache_release(page); - return err; -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) @@ -4646,9 +4529,19 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return 0; if (is_journal_aborted(journal)) return -EROFS; + /* We have to allocate physical blocks for delalloc blocks + * before flushing journal. otherwise delalloc blocks can not + * be allocated any more. even more truncate on delalloc blocks + * could trigger BUG by flushing delalloc blocks in journal. + * There is no delalloc block in non-journal data mode. + */ + if (val && test_opt(inode->i_sb, DELALLOC)) { + err = ext4_alloc_da_blocks(inode); + if (err < 0) + return err; + } jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); /* * OK, there are no updates running now, and all cached data is @@ -4660,8 +4553,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); - else + else { + jbd2_journal_flush(journal); ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e87a932b073b..6eee25591b81 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -18,6 +18,8 @@ #include "ext4_jbd2.h" #include "ext4.h" +#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; @@ -186,19 +188,22 @@ setversion_out: if (err) return err; - if (get_user(n_blocks_count, (__u32 __user *)arg)) - return -EFAULT; + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_extend_out; } err = mnt_want_write_file(filp); if (err) - return err; + goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { @@ -209,8 +214,8 @@ setversion_out: if (err == 0) err = err2; mnt_drop_write_file(filp); +group_extend_out: ext4_resize_end(sb); - return err; } @@ -251,8 +256,7 @@ setversion_out: err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); - if (me.moved_len > 0) - file_remove_suid(donor_filp); + mnt_drop_write(filp->f_path.mnt); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) @@ -271,19 +275,22 @@ mext_out: return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) - return -EFAULT; + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_add_out; } err = mnt_want_write_file(filp); if (err) - return err; + goto group_add_out; err = ext4_group_add(sb, &input); if (EXT4_SB(sb)->s_journal) { @@ -294,8 +301,8 @@ mext_out: if (err == 0) err = err2; mnt_drop_write_file(filp); +group_add_out: ext4_resize_end(sb); - return err; } @@ -335,6 +342,60 @@ mext_out: return err; } + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err = 0, err2 = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with meta_bg"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + if (n_blocks_count > MAX_32_NUM && + !EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "File system only supports 32-bit block numbers"); + return -EOPNOTSUPP; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); +resizefs_out: + ext4_resize_end(sb); + return err; + } + case FITRIM: { struct request_queue *q = bdev_get_queue(sb->s_bdev); @@ -433,6 +494,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case EXT4_IOC_MOVE_EXT: case FITRIM: + case EXT4_IOC_RESIZE_FS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2d8be8f28bf..cb990b21c698 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 996780ab4f4e..f9d948f0eb86 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -134,6 +134,172 @@ static int verify_group_input(struct super_block *sb, return err; } +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of + * @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. + */ +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +{ + struct ext4_new_flex_group_data *flex_gd; + + flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); + if (flex_gd == NULL) + goto out3; + + flex_gd->count = flexbg_size; + + flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * + flexbg_size, GFP_NOFS); + if (flex_gd->groups == NULL) + goto out2; + + flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); + if (flex_gd->bg_flags == NULL) + goto out1; + + return flex_gd; + +out1: + kfree(flex_gd->groups); +out2: + kfree(flex_gd); +out3: + return NULL; +} + +static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) +{ + kfree(flex_gd->bg_flags); + kfree(flex_gd->groups); + kfree(flex_gd); +} + +/* + * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps + * and inode tables for a flex group. + * + * This function is used by 64bit-resize. Note that this function allocates + * group tables from the 1st group of groups contained by @flexgd, which may + * be a partial of a flex group. + * + * @sb: super block of fs to which the groups belongs + */ +static void ext4_alloc_group_tables(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + int flexbg_size) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_fsblk_t start_blk; + ext4_fsblk_t last_blk; + ext4_group_t src_group; + ext4_group_t bb_index = 0; + ext4_group_t ib_index = 0; + ext4_group_t it_index = 0; + ext4_group_t group; + ext4_group_t last_group; + unsigned overhead; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + + src_group = group_data[0].group; + last_group = src_group + flex_gd->count - 1; + + BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != + (last_group & ~(flexbg_size - 1)))); +next_group: + group = group_data[0].group; + start_blk = ext4_group_first_block_no(sb, src_group); + last_blk = start_blk + group_data[src_group - group].blocks_count; + + overhead = ext4_bg_has_super(sb, src_group) ? + (1 + ext4_bg_num_gdb(sb, src_group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + + start_blk += overhead; + + BUG_ON(src_group >= group_data[0].group + flex_gd->count); + /* We collect contiguous blocks as much as possible. */ + src_group++; + for (; src_group <= last_group; src_group++) + if (!ext4_bg_has_super(sb, src_group)) + last_blk += group_data[src_group - group].blocks_count; + else + break; + + /* Allocate block bitmaps */ + for (; bb_index < flex_gd->count; bb_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[bb_index].block_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode bitmaps */ + for (; ib_index < flex_gd->count; ib_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[ib_index].inode_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode tables */ + for (; it_index < flex_gd->count; it_index++) { + if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + goto next_group; + group_data[it_index].inode_table = start_blk; + ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count -= + EXT4_SB(sb)->s_itb_per_group; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + + start_blk += EXT4_SB(sb)->s_itb_per_group; + } + + if (test_opt(sb, DEBUG)) { + int i; + group = group_data[0].group; + + printk(KERN_DEBUG "EXT4-fs: adding a flex group with " + "%d groups, flexbg size is %d:\n", flex_gd->count, + flexbg_size); + + for (i = 0; i < flex_gd->count; i++) { + printk(KERN_DEBUG "adding %s group %u: %u " + "blocks (%d free)\n", + ext4_bg_has_super(sb, group + i) ? "normal" : + "no-super", group + i, + group_data[i].blocks_count, + group_data[i].free_blocks_count); + } + } +} + static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { @@ -179,131 +345,250 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) } /* - * Set up the block and inode bitmaps, and the inode table for the new group. + * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * + * Helper function for ext4_setup_new_group_blocks() which set . + * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t block, ext4_group_t count) +{ + ext4_group_t count2; + + ext4_debug("mark blocks [%llu/%u] used\n", block, count); + for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + ext4_get_group_no_and_offset(sb, block, &group, NULL); + start = ext4_group_first_block_no(sb, group); + group -= flex_gd->groups[0].group; + + count2 = sb->s_blocksize * 8 - (block - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = extend_or_restart_transaction(handle, 1); + if (err) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (!bh) + return -EIO; + + err = ext4_journal_get_write_access(handle, bh); + if (err) + return err; + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, + block - start, count2); + ext4_set_bits(bh->b_data, block - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + return err; + brelse(bh); + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new groups. * This doesn't need to be part of the main transaction, since we are only * changing blocks outside the actual filesystem. We still do journaling to * ensure the recovery is correct in case of a failure just after resize. * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follow: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in blocks bitmaps for blocks taken by + * super block and GDT. + * 2. allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables. */ -static int setup_new_group_blocks(struct super_block *sb, - struct ext4_new_group_data *input) +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) { + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); - int reserved_gdb = ext4_bg_has_super(sb, input->group) ? - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; - unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); - struct buffer_head *bh; + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; handle_t *handle; - ext4_fsblk_t block; - ext4_grpblk_t bit; - int i; - int err = 0, err2; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); - if (IS_ERR(handle)) return PTR_ERR(handle); - BUG_ON(input->group != sbi->s_groups_count); + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; - /* Copy all of the GDT blocks into the backup in this group */ - for (i = 0, bit = 1, block = start + 1; - i < gdblocks; i++, block++, bit++) { - struct buffer_head *gdb; + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); - ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - err = extend_or_restart_transaction(handle, 1); - if (err) - goto exit_journal; + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + struct buffer_head *gdb; - gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, gdb))) { + ext4_debug("update backup group %#04llx\n", block); + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto out; + } + + err = ext4_journal_get_write_access(handle, gdb); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, + gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } brelse(gdb); - goto exit_journal; } - memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); - set_buffer_uptodate(gdb); - err = ext4_handle_dirty_metadata(handle, NULL, gdb); - if (unlikely(err)) { - brelse(gdb); - goto exit_journal; + + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; } - brelse(gdb); - } - /* Zero out all of the reserved backup group descriptor table blocks */ - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, - GFP_NOFS); - if (err) - goto exit_journal; + /* Initialize group tables of the grop @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; - err = extend_or_restart_transaction(handle, 2); - if (err) - goto exit_journal; + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; - bh = bclean(handle, sb, input->block_bitmap); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); - goto exit_journal; - } +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup group tables %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); - } + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; - ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, - input->block_bitmap - start); - ext4_set_bit(input->block_bitmap - start, bh->b_data); - ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, - input->inode_bitmap - start); - ext4_set_bit(input->inode_bitmap - start, bh->b_data); - - /* Zero out all of the inode table blocks */ - block = input->inode_table; - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); - if (err) - goto exit_bh; - ext4_set_bits(bh->b_data, input->inode_table - start, - sbi->s_itb_per_group); + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + if (ext4_bg_has_super(sb, group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + + 1); + } + ext4_mark_bitmap_end(group_data[i].blocks_count, + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; - ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_bh; + /* Initialize inode bitmap of the @group */ + block = group_data[i].inode_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); } - brelse(bh); - /* Mark unused entries in inode bitmap used */ - ext4_debug("clear inode bitmap %#04llx (+%llu)\n", - input->inode_bitmap, input->inode_bitmap - start); - if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; + bh = NULL; + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == (&group_data[i].block_bitmap)[j]) { + count += group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + count = group_table_count[j]; + start = group_data[i].block_bitmap; + block = start; + } + + if (count) { + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + } } - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) - ext4_std_error(sb, err); -exit_bh: +out: brelse(bh); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) + err2 = ext4_journal_stop(handle); + if (err2 && !err) err = err2; return err; @@ -351,10 +636,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, * groups in current filesystem that have BACKUPS, or -ve error code. */ static int verify_reserved_gdb(struct super_block *sb, + ext4_group_t end, struct buffer_head *primary) { const ext4_fsblk_t blk = primary->b_blocknr; - const ext4_group_t end = EXT4_SB(sb)->s_groups_count; unsigned three = 1; unsigned five = 5; unsigned seven = 7; @@ -429,7 +714,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (!gdb_bh) return -EIO; - gdbackups = verify_reserved_gdb(sb, gdb_bh); + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; goto exit_bh; @@ -592,7 +877,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EIO; goto exit_bh; } - if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { brelse(primary[res]); err = gdbackups; goto exit_bh; @@ -735,6 +1021,348 @@ exit_err: } } +/* + * ext4_add_new_descs() adds @count group descriptor of groups + * starting at @group + * + * @handle: journal handle + * @sb: super block + * @group: the group no. of the first group desc to be added + * @resize_inode: the resize inode + * @count: number of group descriptors to be added + */ +static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, + ext4_group_t group, struct inode *resize_inode, + ext4_group_t count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *gdb_bh; + int i, gdb_off, gdb_num, err = 0; + + for (i = 0; i < count; i++, group++) { + int reserved_gdb = ext4_bg_has_super(sb, group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + gdb_bh = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, gdb_bh); + + if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) + err = reserve_backup_gdb(handle, resize_inode, group); + } else + err = add_new_gdb(handle, resize_inode, group); + if (err) + break; + } + return err; +} + +/* + * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg + */ +static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *gdb_bh; + ext4_group_t group; + __u16 *bg_flags = flex_gd->bg_flags; + int i, gdb_off, gdb_num, err = 0; + + + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { + group = group_data->group; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). + */ + gdb_bh = sbi->s_group_desc[gdb_num]; + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + EXT4_B2C(sbi, group_data->free_blocks_count)); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} + +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. + * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += group_data[i].free_blocks_count; + } + + reserved_blocks = ext4_r_blocks_count(es) * 100; + do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count + * + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count += flex_gd->count; + + /* Update the reserved block counts only once the new group is + * active. */ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, free_blocks)); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, group_data[0].group); + atomic_add(EXT4_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &sbi->s_flex_groups[flex_group].free_inodes); + } + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. + */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + credit = flex_gd->count * 4 + reserved_gdb; + handle = ext4_journal_start_sb(sb, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != EXT4_SB(sb)->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_super(handle, sb); + +exit_journal: + err2 = ext4_journal_stop(handle); + if (!err) + err = err2; + + if (!err) { + int i; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + for (i = 0; i < flex_gd->count; i++, group++) { + struct buffer_head *gdb_bh; + int gdb_num; + gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, + gdb_bh->b_size); + } + } +exit: + return err; +} + +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count, + unsigned long flexbg_size) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t blocks_per_group; + unsigned long i; + + blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + last_group = group | (flexbg_size - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = blocks_per_group; + overhead = ext4_bg_has_super(sb, group + i) ? + (1 + ext4_bg_num_gdb(sb, group + i) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + group_data[i].free_blocks_count = blocks_per_group - overhead; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && + EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + /* We need to initialize block bitmap of last group. */ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != blocks_per_group - 1)) { + group_data[i - 1].blocks_count = last + 1; + group_data[i - 1].free_blocks_count -= blocks_per_group- + last - 1; + } + + return 1; +} + /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it @@ -750,16 +1378,15 @@ exit_err: */ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) { + struct ext4_new_flex_group_data flex_gd; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct buffer_head *primary = NULL; - struct ext4_group_desc *gdp; struct inode *inode = NULL; - handle_t *handle; int gdb_off, gdb_num; - int err, err2; + int err; + __u16 bg_flags = 0; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); @@ -798,175 +1425,69 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } - if ((err = verify_group_input(sb, input))) - goto exit_put; + err = verify_group_input(sb, input); + if (err) + goto out; - if ((err = setup_new_group_blocks(sb, input))) - goto exit_put; + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: + iput(inode); + return err; +} /* ext4_group_add */ - /* - * We will always be modifying at least the superblock and a GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. If we - * are adding a group with superblock/GDT backups we will also - * modify each of the reserved GDT dindirect blocks. +/* + * extend a group without checking assuming that checking has been done. + */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). */ - handle = ext4_journal_start_sb(sb, - ext4_bg_has_super(sb, input->group) ? - 3 + reserved_gdb : 4); + handle = ext4_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto exit_put; + ext4_warning(sb, "error %d on journal start", err); + return err; } - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) - goto exit_journal; - - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ - if (gdb_off) { - primary = sbi->s_group_desc[gdb_num]; - if ((err = ext4_journal_get_write_access(handle, primary))) - goto exit_journal; - - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { - err = reserve_backup_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - } - } else { - /* - * Note that we can access new group descriptor block safely - * only if add_new_gdb() succeeds. - */ - err = add_new_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - primary = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; } - /* - * OK, now we've set up the new group. Time to make it active. - * - * so we have to be safe wrt. concurrent accesses the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ - - /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)((char *)primary->b_data + - gdb_off * EXT4_DESC_SIZE(sb)); - - memset(gdp, 0, EXT4_DESC_SIZE(sb)); - ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ - ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ - ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); - ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); - gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); - - /* - * We can allocate memory for mb_alloc based on the new group - * descriptor - */ - err = ext4_mb_add_groupinfo(sb, input->group, gdp); + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); if (err) - goto exit_journal; - - /* - * Make the new blocks and inodes valid next. We do this before - * increasing the group count so that once the group is enabled, - * all of its blocks and inodes are already valid. - * - * We always allocate group-by-group, then block-by-block or - * inode-by-inode within a group, so enabling these - * blocks/inodes before the group is live won't actually let us - * allocate the new space yet. - */ - ext4_blocks_count_set(es, ext4_blocks_count(es) + - input->blocks_count); - le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); - - /* - * We need to protect s_groups_count against other CPUs seeing - * inconsistent state in the superblock. - * - * The precise rules we use are: - * - * * Writers must perform a smp_wmb() after updating all dependent - * data and before modifying the groups count - * - * * Readers must perform an smp_rmb() after reading the groups count - * and before reading any dependent data. - * - * NB. These rules can be relaxed when checking the group count - * while freeing data, as we can only allocate from a block - * group after serialising against the group count, and we can - * only then free after serialising in turn against that - * allocation. - */ - smp_wmb(); - - /* Update the global fs size fields */ - sbi->s_groups_count++; - - err = ext4_handle_dirty_metadata(handle, NULL, primary); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_journal; - } - - /* Update the reserved block counts only once the new group is - * active. */ - ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + - input->reserved_blocks); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, input->free_blocks_count)); - percpu_counter_add(&sbi->s_freeinodes_counter, - EXT4_INODES_PER_GROUP(sb)); - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && - sbi->s_log_groups_per_flex) { - ext4_group_t flex_group; - flex_group = ext4_flex_group(sbi, input->group); - atomic_add(EXT4_B2C(sbi, input->free_blocks_count), - &sbi->s_flex_groups[flex_group].free_clusters); - atomic_add(EXT4_INODES_PER_GROUP(sb), - &sbi->s_flex_groups[flex_group].free_inodes); - } - + goto errout; ext4_handle_dirty_super(handle, sb); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) err = err2; - if (!err && primary) { - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block)); - update_backups(sb, primary->b_blocknr, primary->b_data, - primary->b_size); } -exit_put: - iput(inode); return err; -} /* ext4_group_add */ +} /* * Extend the filesystem to the new number of blocks specified. This entry @@ -985,8 +1506,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - handle_t *handle; - int err, err2; + int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1042,42 +1562,119 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext4_free_blocks(). - */ - handle = ext4_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext4_warning(sb, "error %d on journal start", err); - goto exit_put; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + return err; +} /* ext4_group_extend */ + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks resides in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode; + ext4_fsblk_t o_blocks_count; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_grpblk_t offset; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + unsigned long desc_blocks; + int err = 0, flexbg_size = 1; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; } - if ((err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, "error %d on journal write access", err); - ext4_journal_stop(handle); - goto exit_put; + if (n_blocks_count == o_blocks_count) + /* Nothing need to do */ + return 0; + + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + + n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / + EXT4_DESC_PER_BLOCK(sb); + o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + desc_blocks = n_desc_blocks - o_desc_blocks; + + if (desc_blocks && + (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || + le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { + ext4_warning(sb, "No reserved GDT blocks, can't resize"); + return -EPERM; } - ext4_blocks_count_set(es, o_blocks_count + add); - ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - /* We add the blocks to the bitmap and set the group need init bit */ - err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); - ext4_handle_dirty_super(handle, sb); - ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - err2 = ext4_journal_stop(handle); - if (!err && err2) - err = err2; - if (err) - goto exit_put; + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + if (offset != 0) { + /* extend the last group */ + ext4_grpblk_t add; + add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + es->s_log_groups_per_flex) + flexbg_size = 1 << es->s_log_groups_per_flex; + + o_blocks_count = ext4_blocks_count(es); + if (o_blocks_count == n_blocks_count) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. + */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, + flexbg_size)) { + ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + + iput(resize_inode); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", - ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); -exit_put: + printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); return err; -} /* ext4_group_extend */ +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ed3ce82e2de4..502c61fd7392 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1095,7 +1095,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root) } if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_min_batch_time); + (unsigned) sbi->s_max_batch_time); } /* @@ -2005,17 +2005,16 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group_count; ext4_group_t flex_group; - int groups_per_flex = 0; + unsigned int groups_per_flex = 0; size_t size; int i; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - if (groups_per_flex < 2) { + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + @@ -3506,7 +3505,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index b60f9f81e33c..d2a200624af5 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -47,8 +47,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e2951506434d..f855916657ba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -20,6 +20,7 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/pagemap.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/writeback.h> @@ -29,6 +30,11 @@ #include "internal.h" /* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + +/* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { @@ -742,11 +748,17 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !over_bground_thresh(wb->bdi)) break; + /* + * Kupdate and background works are special and we want to + * include all inodes that need writing. Livelock avoidance is + * handled by these works yielding to any other work so we are + * safe. + */ if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - work->older_than_this = &oldest_jif; - } + } else if (work->for_background) + oldest_jif = jiffies; trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2aaf3eaaf13d..5f3368ab0fa9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_delete_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, + outarg.child, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RETRIEVE: return fuse_notify_retrieve(fc, size, cs); + case FUSE_NOTIFY_DELETE: + return fuse_notify_delete(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5ddd6ea8f839..206632887bb4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, - struct qstr *name) + u64 child_nodeid, struct qstr *name) { int err = -ENOTDIR; struct inode *parent; @@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_attr(parent); fuse_invalidate_entry(entry); + + if (child_nodeid != 0 && entry->d_inode) { + mutex_lock(&entry->d_inode->i_mutex); + if (get_node_id(entry->d_inode) != child_nodeid) { + err = -ENOENT; + goto badentry; + } + if (d_mountpoint(entry)) { + err = -EBUSY; + goto badentry; + } + if (S_ISDIR(entry->d_inode->i_mode)) { + shrink_dcache_parent(entry); + if (!simple_empty(entry)) { + err = -ENOTEMPTY; + goto badentry; + } + entry->d_inode->i_flags |= S_DEAD; + } + dont_mount(entry); + clear_nlink(entry->d_inode); + err = 0; + badentry: + mutex_unlock(&entry->d_inode->i_mutex); + if (!err) + d_delete(entry); + } else { + err = 0; + } dput(entry); - err = 0; unlock: mutex_unlock(&parent->i_mutex); @@ -1182,6 +1210,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, return fuse_fsync_common(file, start, end, datasync, 1); } +static long fuse_dir_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); +} + +static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, + FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); +} + static bool update_mtime(unsigned ivalid) { /* Always update if mtime is explicitly set */ @@ -1596,6 +1648,8 @@ static const struct file_operations fuse_dir_operations = { .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, + .unlocked_ioctl = fuse_dir_ioctl, + .compat_ioctl = fuse_dir_compat_ioctl, }; static const struct inode_operations fuse_common_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0c84100acd44..4a199fd93fbd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) loff_t retval; struct inode *inode = file->f_path.dentry->d_inode; - mutex_lock(&inode->i_mutex); - if (origin != SEEK_CUR && origin != SEEK_SET) { - retval = fuse_update_attributes(inode, NULL, file, NULL); - if (retval) - goto exit; - } + /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ + if (origin == SEEK_CUR || origin == SEEK_SET) + return generic_file_llseek(file, offset, origin); - switch (origin) { - case SEEK_END: - offset += i_size_read(inode); - break; - case SEEK_CUR: - if (offset == 0) { - retval = file->f_pos; - goto exit; - } - offset += file->f_pos; - break; - case SEEK_DATA: - if (offset >= i_size_read(inode)) { - retval = -ENXIO; - goto exit; - } - break; - case SEEK_HOLE: - if (offset >= i_size_read(inode)) { - retval = -ENXIO; - goto exit; - } - offset = i_size_read(inode); - break; - } - retval = -EINVAL; - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; - } -exit: + mutex_lock(&inode->i_mutex); + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (!retval) + retval = generic_file_llseek(file, offset, origin); mutex_unlock(&inode->i_mutex); + return retval; } @@ -1808,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); + pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; @@ -1958,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } EXPORT_SYMBOL_GPL(fuse_do_ioctl); -static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int flags) +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) { struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); @@ -1976,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, static long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return fuse_file_ioctl_common(file, cmd, arg, 0); + return fuse_ioctl_common(file, cmd, arg, 0); } static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); } /* diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1964da0257d9..572cefc78012 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, /** * File-system tells the kernel to invalidate parent attributes and * the dentry matching parent/name. + * + * If the child_nodeid is non-zero and: + * - matches the inode number for the dentry matching parent/name, + * - is not a mount point + * - is a file or oan empty directory + * then the dentry is unhashed (d_delete()). */ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, - struct qstr *name); + u64 child_nodeid, struct qstr *name); int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); @@ -765,6 +771,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags); unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 88e8a23d0026..376816fcd040 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1353,7 +1353,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) spin_lock(&gl->gl_spin); gl->gl_reply = ret; - if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2553b858a72e..307ac31df781 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -121,8 +121,11 @@ enum { struct lm_lockops { const char *lm_proto_name; - int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); - void (*lm_unmount) (struct gfs2_sbd *sdp); + int (*lm_mount) (struct gfs2_sbd *sdp, const char *table); + void (*lm_first_done) (struct gfs2_sbd *sdp); + void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result); + void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); void (*lm_put_lock) (struct gfs2_glock *gl); int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e1d3bb59945c..97742a7ea9cc 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -139,8 +139,45 @@ struct gfs2_bufdata { #define GDLM_STRNAME_BYTES 25 #define GDLM_LVB_SIZE 32 +/* + * ls_recover_flags: + * + * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been + * held by failed nodes whose journals need recovery. Those locks should + * only be used for journal recovery until the journal recovery is done. + * This is set by the dlm recover_prep callback and cleared by the + * gfs2_control thread when journal recovery is complete. To avoid + * races between recover_prep setting and gfs2_control clearing, recover_spin + * is held while changing this bit and reading/writing recover_block + * and recover_start. + * + * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. + * + * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing + * recovery of all journals before allowing other nodes to mount the fs. + * This is cleared when FIRST_MOUNT_DONE is set. + * + * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished + * recovery of all journals, and now allows other nodes to mount the fs. + * + * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared + * BLOCK_LOCKS for the first time. The gfs2_control thread should now + * control clearing BLOCK_LOCKS for further recoveries. + * + * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. + * + * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() + * and recover_done(), i.e. set while recover_block == recover_start. + */ + enum { DFL_BLOCK_LOCKS = 0, + DFL_NO_DLM_OPS = 1, + DFL_FIRST_MOUNT = 2, + DFL_FIRST_MOUNT_DONE = 3, + DFL_MOUNT_DONE = 4, + DFL_UNMOUNT = 5, + DFL_DLM_RECOVERY = 6, }; struct lm_lockname { @@ -392,6 +429,7 @@ struct gfs2_jdesc { #define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; + int jd_recover_error; }; struct gfs2_statfs_change_host { @@ -461,6 +499,7 @@ enum { SDF_NORECOVERY = 4, SDF_DEMOTE = 5, SDF_NOJOURNALID = 6, + SDF_RORECOVERY = 7, /* read only recovery */ }; #define GFS2_FSNAME_LEN 256 @@ -499,14 +538,26 @@ struct gfs2_sb_host { struct lm_lockstruct { int ls_jid; unsigned int ls_first; - unsigned int ls_first_done; unsigned int ls_nodir; const struct lm_lockops *ls_ops; - unsigned long ls_flags; dlm_lockspace_t *ls_dlm; - int ls_recover_jid_done; - int ls_recover_jid_status; + int ls_recover_jid_done; /* These two are deprecated, */ + int ls_recover_jid_status; /* used previously by gfs_controld */ + + struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ + struct dlm_lksb ls_control_lksb; /* control_lock */ + char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ + uint32_t ls_recover_start; /* gen in last recover_done cb */ + uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ + uint32_t ls_recover_size; /* size of recover_submit, recover_result */ + uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ + uint32_t *ls_recover_result; /* result of last jid recovery */ }; struct gfs2_sbd { @@ -544,6 +595,7 @@ struct gfs2_sbd { wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; + struct delayed_work sd_control_work; /* Inode Stuff */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 017960cf1d7a..a7d611b93f0f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -599,9 +599,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto fail_end_trans; - inc_nlink(&ip->i_inode); - if (S_ISDIR(ip->i_inode.i_mode)) - inc_nlink(&ip->i_inode); + set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 98c80d8c2a62..8944d1e32ab5 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. + * Copyright 2004-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -11,12 +11,15 @@ #include <linux/dlm.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/delay.h> #include <linux/gfs2_ondisk.h> #include "incore.h" #include "glock.h" #include "util.h" +#include "sys.h" +extern struct workqueue_struct *gfs2_control_wq; static void gdlm_ast(void *arg) { @@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl) dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); } -static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) +/* + * dlm/gfs2 recovery coordination using dlm_recover callbacks + * + * 1. dlm_controld sees lockspace members change + * 2. dlm_controld blocks dlm-kernel locking activity + * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep) + * 4. dlm_controld starts and finishes its own user level recovery + * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery + * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot) + * 7. dlm_recoverd does its own lock recovery + * 8. dlm_recoverd unblocks dlm-kernel locking activity + * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation) + * 10. gfs2_control updates control_lock lvb with new generation and jid bits + * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none) + * 12. gfs2_recover dequeues and recovers journals of failed nodes + * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result) + * 14. gfs2_control updates control_lock lvb jid bits for recovered journals + * 15. gfs2_control unblocks normal locking when all journals are recovered + * + * - failures during recovery + * + * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control + * clears BLOCK_LOCKS (step 15), e.g. another node fails while still + * recovering for a prior failure. gfs2_control needs a way to detect + * this so it can leave BLOCK_LOCKS set in step 15. This is managed using + * the recover_block and recover_start values. + * + * recover_done() provides a new lockspace generation number each time it + * is called (step 9). This generation number is saved as recover_start. + * When recover_prep() is called, it sets BLOCK_LOCKS and sets + * recover_block = recover_start. So, while recover_block is equal to + * recover_start, BLOCK_LOCKS should remain set. (recover_spin must + * be held around the BLOCK_LOCKS/recover_block/recover_start logic.) + * + * - more specific gfs2 steps in sequence above + * + * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start + * 6. recover_slot records any failed jids (maybe none) + * 9. recover_done sets recover_start = new generation number + * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids + * 12. gfs2_recover does journal recoveries for failed jids identified above + * 14. gfs2_control clears control_lock lvb bits for recovered jids + * 15. gfs2_control checks if recover_block == recover_start (step 3 occured + * again) then do nothing, otherwise if recover_start > recover_block + * then clear BLOCK_LOCKS. + * + * - parallel recovery steps across all nodes + * + * All nodes attempt to update the control_lock lvb with the new generation + * number and jid bits, but only the first to get the control_lock EX will + * do so; others will see that it's already done (lvb already contains new + * generation number.) + * + * . All nodes get the same recover_prep/recover_slot/recover_done callbacks + * . All nodes attempt to set control_lock lvb gen + bits for the new gen + * . One node gets control_lock first and writes the lvb, others see it's done + * . All nodes attempt to recover jids for which they see control_lock bits set + * . One node succeeds for a jid, and that one clears the jid bit in the lvb + * . All nodes will eventually see all lvb bits clear and unblock locks + * + * - is there a problem with clearing an lvb bit that should be set + * and missing a journal recovery? + * + * 1. jid fails + * 2. lvb bit set for step 1 + * 3. jid recovered for step 1 + * 4. jid taken again (new mount) + * 5. jid fails (for step 4) + * 6. lvb bit set for step 5 (will already be set) + * 7. lvb bit cleared for step 3 + * + * This is not a problem because the failure in step 5 does not + * require recovery, because the mount in step 4 could not have + * progressed far enough to unblock locks and access the fs. The + * control_mount() function waits for all recoveries to be complete + * for the latest lockspace generation before ever unblocking locks + * and returning. The mount in step 4 waits until the recovery in + * step 1 is done. + * + * - special case of first mounter: first node to mount the fs + * + * The first node to mount a gfs2 fs needs to check all the journals + * and recover any that need recovery before other nodes are allowed + * to mount the fs. (Others may begin mounting, but they must wait + * for the first mounter to be done before taking locks on the fs + * or accessing the fs.) This has two parts: + * + * 1. The mounted_lock tells a node it's the first to mount the fs. + * Each node holds the mounted_lock in PR while it's mounted. + * Each node tries to acquire the mounted_lock in EX when it mounts. + * If a node is granted the mounted_lock EX it means there are no + * other mounted nodes (no PR locks exist), and it is the first mounter. + * The mounted_lock is demoted to PR when first recovery is done, so + * others will fail to get an EX lock, but will get a PR lock. + * + * 2. The control_lock blocks others in control_mount() while the first + * mounter is doing first mount recovery of all journals. + * A mounting node needs to acquire control_lock in EX mode before + * it can proceed. The first mounter holds control_lock in EX while doing + * the first mount recovery, blocking mounts from other nodes, then demotes + * control_lock to NL when it's done (others_may_mount/first_done), + * allowing other nodes to continue mounting. + * + * first mounter: + * control_lock EX/NOQUEUE success + * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters) + * set first=1 + * do first mounter recovery + * mounted_lock EX->PR + * control_lock EX->NL, write lvb generation + * + * other mounter: + * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry) + * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR) + * mounted_lock PR/NOQUEUE success + * read lvb generation + * control_lock EX->NL + * set first=0 + * + * - mount during recovery + * + * If a node mounts while others are doing recovery (not first mounter), + * the mounting node will get its initial recover_done() callback without + * having seen any previous failures/callbacks. + * + * It must wait for all recoveries preceding its mount to be finished + * before it unblocks locks. It does this by repeating the "other mounter" + * steps above until the lvb generation number is >= its mount generation + * number (from initial recover_done) and all lvb bits are clear. + * + * - control_lock lvb format + * + * 4 bytes generation number: the latest dlm lockspace generation number + * from recover_done callback. Indicates the jid bitmap has been updated + * to reflect all slot failures through that generation. + * 4 bytes unused. + * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates + * that jid N needs recovery. + */ + +#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */ + +static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, + char *lvb_bits) +{ + uint32_t gen; + memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); + memcpy(&gen, lvb_bits, sizeof(uint32_t)); + *lvb_gen = le32_to_cpu(gen); +} + +static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, + char *lvb_bits) +{ + uint32_t gen; + memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); + gen = cpu_to_le32(lvb_gen); + memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); +} + +static int all_jid_bits_clear(char *lvb) +{ + int i; + for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) { + if (lvb[i]) + return 0; + } + return 1; +} + +static void sync_wait_cb(void *arg) +{ + struct lm_lockstruct *ls = arg; + complete(&ls->ls_sync_wait); +} + +static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - if (fsname == NULL) { - fs_info(sdp, "no fsname found\n"); - return -EINVAL; + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + if (error) { + fs_err(sdp, "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + fs_err(sdp, "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, + unsigned int num, struct dlm_lksb *lksb, char *name) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char strname[GDLM_STRNAME_BYTES]; + int error, status; + + memset(strname, 0, GDLM_STRNAME_BYTES); + snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); + + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + if (error) { + fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + +static int mounted_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK, + &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int control_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock"); +} + +static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK, + &ls->ls_control_lksb, "control_lock"); +} + +static void gfs2_control_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t block_gen, start_gen, lvb_gen, flags; + int recover_set = 0; + int write_lvb = 0; + int recover_size; + int i, error; + + spin_lock(&ls->ls_recover_spin); + /* + * No MOUNT_DONE means we're still mounting; control_mount() + * will set this flag, after which this thread will take over + * all further clearing of BLOCK_LOCKS. + * + * FIRST_MOUNT means this node is doing first mounter recovery, + * for which recovery control is handled by + * control_mount()/control_first_done(), not this thread. + */ + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + spin_unlock(&ls->ls_recover_spin); + + /* + * Equal block_gen and start_gen implies we are between + * recover_prep and recover_done callbacks, which means + * dlm recovery is in progress and dlm locking is blocked. + * There's no point trying to do any work until recover_done. + */ + + if (block_gen == start_gen) + return; + + /* + * Propagate recover_submit[] and recover_result[] to lvb: + * dlm_recoverd adds to recover_submit[] jids needing recovery + * gfs2_recover adds to recover_result[] journal recovery results + * + * set lvb bit for jids in recover_submit[] if the lvb has not + * yet been updated for the generation of the failure + * + * clear lvb bit for jids in recover_result[] if the result of + * the journal recovery is SUCCESS + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control lock EX error %d\n", error); + return; + } + + control_lvb_read(ls, &lvb_gen, lvb_bits); + + spin_lock(&ls->ls_recover_spin); + if (block_gen != ls->ls_recover_block || + start_gen != ls->ls_recover_start) { + fs_info(sdp, "recover generation %u block1 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + return; + } + + recover_size = ls->ls_recover_size; + + if (lvb_gen <= start_gen) { + /* + * Clear lvb bits for jids we've successfully recovered. + * Because all nodes attempt to recover failed journals, + * a journal can be recovered multiple times successfully + * in succession. Only the first will really do recovery, + * the others find it clean, but still report a successful + * recovery. So, another node may have already recovered + * the jid and cleared the lvb bit for it. + */ + for (i = 0; i < recover_size; i++) { + if (ls->ls_recover_result[i] != LM_RD_SUCCESS) + continue; + + ls->ls_recover_result[i] = 0; + + if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) + continue; + + __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + write_lvb = 1; + } + } + + if (lvb_gen == start_gen) { + /* + * Failed slots before start_gen are already set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < lvb_gen) + ls->ls_recover_submit[i] = 0; + } + } else if (lvb_gen < start_gen) { + /* + * Failed slots before start_gen are not yet set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < start_gen) { + ls->ls_recover_submit[i] = 0; + __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + } + } + /* even if there are no bits to set, we need to write the + latest generation to the lvb */ + write_lvb = 1; + } else { + /* + * we should be getting a recover_done() for lvb_gen soon + */ + } + spin_unlock(&ls->ls_recover_spin); + + if (write_lvb) { + control_lvb_write(ls, start_gen, lvb_bits); + flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; + } else { + flags = DLM_LKF_CONVERT; + } + + error = control_lock(sdp, DLM_LOCK_NL, flags); + if (error) { + fs_err(sdp, "control lock NL error %d\n", error); + return; + } + + /* + * Everyone will see jid bits set in the lvb, run gfs2_recover_set(), + * and clear a jid bit in the lvb if the recovery is a success. + * Eventually all journals will be recovered, all jid bits will + * be cleared in the lvb, and everyone will clear BLOCK_LOCKS. + */ + + for (i = 0; i < recover_size; i++) { + if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) { + fs_info(sdp, "recover generation %u jid %d\n", + start_gen, i); + gfs2_recover_set(sdp, i); + recover_set++; + } + } + if (recover_set) + return; + + /* + * No more jid bits set in lvb, all recovery is done, unblock locks + * (unless a new recover_prep callback has occured blocking locks + * again while working above) + */ + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_block == block_gen && + ls->ls_recover_start == start_gen) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "recover generation %u done\n", start_gen); + gfs2_glock_thaw(sdp); + } else { + fs_info(sdp, "recover generation %u block2 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + } +} + +static int control_mount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t start_gen, block_gen, mount_gen, lvb_gen; + int mounted_mode; + int retries = 0; + int error; + + memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE); + ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb; + init_completion(&ls->ls_sync_wait); + + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control_mount control_lock NL error %d\n", error); + return error; + } + + error = mounted_lock(sdp, DLM_LOCK_NL, 0); + if (error) { + fs_err(sdp, "control_mount mounted_lock NL error %d\n", error); + control_unlock(sdp); + return error; + } + mounted_mode = DLM_LOCK_NL; + +restart: + if (retries++ && signal_pending(current)) { + error = -EINTR; + goto fail; + } + + /* + * We always start with both locks in NL. control_lock is + * demoted to NL below so we don't need to do it here. + */ + + if (mounted_mode != DLM_LOCK_NL) { + error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + mounted_mode = DLM_LOCK_NL; + } + + /* + * Other nodes need to do some work in dlm recovery and gfs2_control + * before the recover_done and control_lock will be ready for us below. + * A delay here is not required but often avoids having to retry. + */ + + msleep_interruptible(500); + + /* + * Acquire control_lock in EX and mounted_lock in either EX or PR. + * control_lock lvb keeps track of any pending journal recoveries. + * mounted_lock indicates if any other nodes have the fs mounted. + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK); + if (error == -EAGAIN) { + goto restart; + } else if (error) { + fs_err(sdp, "control_mount control_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_EX; + goto locks_done; + } else if (error != -EAGAIN) { + fs_err(sdp, "control_mount mounted_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_PR; + goto locks_done; + } else { + /* not even -EAGAIN should happen here */ + fs_err(sdp, "control_mount mounted_lock PR error %d\n", error); + goto fail; + } + +locks_done: + /* + * If we got both locks above in EX, then we're the first mounter. + * If not, then we need to wait for the control_lock lvb to be + * updated by other mounted nodes to reflect our mount generation. + * + * In simple first mounter cases, first mounter will see zero lvb_gen, + * but in cases where all existing nodes leave/fail before mounting + * nodes finish control_mount, then all nodes will be mounting and + * lvb_gen will be non-zero. + */ + + control_lvb_read(ls, &lvb_gen, lvb_bits); + + if (lvb_gen == 0xFFFFFFFF) { + /* special value to force mount attempts to fail */ + fs_err(sdp, "control_mount control_lock disabled\n"); + error = -EINVAL; + goto fail; + } + + if (mounted_mode == DLM_LOCK_EX) { + /* first mounter, keep both EX while doing first recovery */ + spin_lock(&ls->ls_recover_spin); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "first mounter control generation %u\n", lvb_gen); + return 0; + } + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + + /* + * We are not first mounter, now we need to wait for the control_lock + * lvb generation to be >= the generation from our first recover_done + * and all lvb bits to be clear (no pending journal recoveries.) + */ + + if (!all_jid_bits_clear(lvb_bits)) { + /* journals need recovery, wait until all are clear */ + fs_info(sdp, "control_mount wait for journal recovery\n"); + goto restart; + } + + spin_lock(&ls->ls_recover_spin); + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + mount_gen = ls->ls_recover_mount; + + if (lvb_gen < mount_gen) { + /* wait for mounted nodes to update control_lock lvb to our + generation, which might include new recovery bits set */ + fs_info(sdp, "control_mount wait1 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (lvb_gen != start_gen) { + /* wait for mounted nodes to update control_lock lvb to the + latest recovery generation */ + fs_info(sdp, "control_mount wait2 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (block_gen == start_gen) { + /* dlm recovery in progress, wait for it to finish */ + fs_info(sdp, "control_mount wait3 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; } - error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL | - (ls->ls_nodir ? DLM_LSFL_NODIR : 0), - GDLM_LVB_SIZE); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + return 0; + +fail: + mounted_unlock(sdp); + control_unlock(sdp); + return error; +} + +static int dlm_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +static int control_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t start_gen, block_gen; + int error; + +restart: + spin_lock(&ls->ls_recover_spin); + start_gen = ls->ls_recover_start; + block_gen = ls->ls_recover_block; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) || + !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + /* sanity check, should not happen */ + fs_err(sdp, "control_first_done start %u block %u flags %lx\n", + start_gen, block_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + control_unlock(sdp); + return -1; + } + + if (start_gen == block_gen) { + /* + * Wait for the end of a dlm recovery cycle to switch from + * first mounter recovery. We can ignore any recover_slot + * callbacks between the recover_prep and next recover_done + * because we are still the first mounter and any failed nodes + * have not fully mounted, so they don't need recovery. + */ + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "control_first_done wait gen %u\n", start_gen); + + wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, + dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + + clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + + memset(lvb_bits, 0, sizeof(lvb_bits)); + control_lvb_write(ls, start_gen, lvb_bits); + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); + if (error) + fs_err(sdp, "control_first_done mounted PR error %d\n", error); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK); if (error) - printk(KERN_ERR "dlm_new_lockspace error %d", error); + fs_err(sdp, "control_first_done control NL error %d\n", error); return error; } +/* + * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC) + * to accomodate the largest slot number. (NB dlm slot numbers start at 1, + * gfs2 jids start at 0, so jid = slot - 1) + */ + +#define RECOVER_SIZE_INC 16 + +static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, + int num_slots) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t *submit = NULL; + uint32_t *result = NULL; + uint32_t old_size, new_size; + int i, max_jid; + + max_jid = 0; + for (i = 0; i < num_slots; i++) { + if (max_jid < slots[i].slot - 1) + max_jid = slots[i].slot - 1; + } + + old_size = ls->ls_recover_size; + + if (old_size >= max_jid + 1) + return 0; + + new_size = old_size + RECOVER_SIZE_INC; + + submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + if (!submit || !result) { + kfree(submit); + kfree(result); + return -ENOMEM; + } + + spin_lock(&ls->ls_recover_spin); + memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t)); + memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t)); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = submit; + ls->ls_recover_result = result; + ls->ls_recover_size = new_size; + spin_unlock(&ls->ls_recover_spin); + return 0; +} + +static void free_recover_size(struct lm_lockstruct *ls) +{ + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_recover_size = 0; +} + +/* dlm calls before it does lock recovery */ + +static void gdlm_recover_prep(void *arg) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_block = ls->ls_recover_start; + set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_prep has been completed on all lockspace members; + identifies slot/jid of failed member */ + +static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int jid = slot->slot - 1; + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recover_slot jid %d gen %u short size %d", + jid, ls->ls_recover_block, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + if (ls->ls_recover_submit[jid]) { + fs_info(sdp, "recover_slot jid %d gen %u prev %u", + jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); + } + ls->ls_recover_submit[jid] = ls->ls_recover_block; + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_slot and after it completes lock recovery */ + +static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, + int our_slot, uint32_t generation) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + /* ensure the ls jid arrays are large enough */ + set_recover_size(sdp, slots, num_slots); + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_start = generation; + + if (!ls->ls_recover_mount) { + ls->ls_recover_mount = generation; + ls->ls_jid = our_slot - 1; + } + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); + + clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); + spin_unlock(&ls->ls_recover_spin); +} + +/* gfs2_recover thread has a journal recovery result */ + +static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + /* don't care about the recovery of own journal during mount */ + if (jid == ls->ls_jid) + return; + + spin_lock(&ls->ls_recover_spin); + if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recovery_result jid %d short size %d", + jid, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + fs_info(sdp, "recover jid %d result %s\n", jid, + result == LM_RD_GAVEUP ? "busy" : "success"); + + ls->ls_recover_result[jid] = result; + + /* GAVEUP means another node is recovering the journal; delay our + next attempt to recover it, to give the other node a chance to + finish before trying again */ + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, + result == LM_RD_GAVEUP ? HZ : 0); + spin_unlock(&ls->ls_recover_spin); +} + +const struct dlm_lockspace_ops gdlm_lockspace_ops = { + .recover_prep = gdlm_recover_prep, + .recover_slot = gdlm_recover_slot, + .recover_done = gdlm_recover_done, +}; + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char cluster[GFS2_LOCKNAME_LEN]; + const char *fsname; + uint32_t flags; + int error, ops_result; + + /* + * initialize everything + */ + + INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + spin_lock_init(&ls->ls_recover_spin); + ls->ls_recover_flags = 0; + ls->ls_recover_mount = 0; + ls->ls_recover_start = 0; + ls->ls_recover_block = 0; + ls->ls_recover_size = 0; + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + + error = set_recover_size(sdp, NULL, 0); + if (error) + goto fail; + + /* + * prepare dlm_new_lockspace args + */ + + fsname = strchr(table, ':'); + if (!fsname) { + fs_info(sdp, "no fsname found\n"); + error = -EINVAL; + goto fail_free; + } + memset(cluster, 0, sizeof(cluster)); + memcpy(cluster, table, strlen(table) - strlen(fsname)); + fsname++; + + flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + if (ls->ls_nodir) + flags |= DLM_LSFL_NODIR; + + /* + * create/join lockspace + */ + + error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, + &gdlm_lockspace_ops, sdp, &ops_result, + &ls->ls_dlm); + if (error) { + fs_err(sdp, "dlm_new_lockspace error %d\n", error); + goto fail_free; + } + + if (ops_result < 0) { + /* + * dlm does not support ops callbacks, + * old dlm_controld/gfs_controld are used, try without ops. + */ + fs_info(sdp, "dlm lockspace ops not used\n"); + free_recover_size(ls); + set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags); + return 0; + } + + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) { + fs_err(sdp, "dlm lockspace ops disallow jid preset\n"); + error = -EINVAL; + goto fail_release; + } + + /* + * control_mount() uses control_lock to determine first mounter, + * and for later mounts, waits for any recoveries to be cleared. + */ + + error = control_mount(sdp); + if (error) { + fs_err(sdp, "mount control error %d\n", error); + goto fail_release; + } + + ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + return 0; + +fail_release: + dlm_release_lockspace(ls->ls_dlm, 2); +fail_free: + free_recover_size(ls); +fail: + return error; +} + +static void gdlm_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + error = control_first_done(sdp); + if (error) + fs_err(sdp, "mount first_done error %d\n", error); +} + static void gdlm_unmount(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + goto release; + + /* wait for gfs2_control_wq to be done with this mount */ + + spin_lock(&ls->ls_recover_spin); + set_bit(DFL_UNMOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + flush_delayed_work_sync(&sdp->sd_control_work); + + /* mounted_lock and control_lock will be purged in dlm recovery */ +release: if (ls->ls_dlm) { dlm_release_lockspace(ls->ls_dlm, 2); ls->ls_dlm = NULL; } + + free_recover_size(ls); } static const match_table_t dlm_tokens = { @@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = { const struct lm_lockops gfs2_dlm_ops = { .lm_proto_name = "lock_dlm", .lm_mount = gdlm_mount, + .lm_first_done = gdlm_first_done, + .lm_recovery_result = gdlm_recovery_result, .lm_unmount = gdlm_unmount, .lm_put_lock = gdlm_put_lock, .lm_lock = gdlm_lock, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c150298e2d8e..a8d9bcd0e19c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -28,6 +28,8 @@ #include "recovery.h" #include "dir.h" +struct workqueue_struct *gfs2_control_wq; + static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, .seeks = DEFAULT_SEEKS, @@ -146,12 +148,19 @@ static int __init init_gfs2_fs(void) if (!gfs_recovery_wq) goto fail_wq; + gfs2_control_wq = alloc_workqueue("gfs2_control", + WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!gfs2_control_wq) + goto fail_control; + gfs2_register_debugfs(); printk("GFS2 installed\n"); return 0; +fail_control: + destroy_workqueue(gfs_recovery_wq); fail_wq: unregister_filesystem(&gfs2meta_fs_type); fail_unregister: @@ -195,6 +204,7 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_control_wq); rcu_barrier(); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index fe72e79e6ff9..6aacf3f230a2 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp) { char *message = "FIRSTMOUNT=Done"; char *envp[] = { message, NULL }; - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ls->ls_first_done = 1; + + fs_info(sdp, "first mount done, others may mount\n"); + + if (sdp->sd_lockstruct.ls_ops->lm_first_done) + sdp->sd_lockstruct.ls_ops->lm_first_done(sdp); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } @@ -944,7 +948,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) struct gfs2_args *args = &sdp->sd_args; const char *proto = sdp->sd_proto_name; const char *table = sdp->sd_table_name; - const char *fsname; char *o, *options; int ret; @@ -1004,21 +1007,12 @@ hostdata_error: } } - if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); - else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, - sdp->sd_lockstruct.ls_jid); - - fsname = strchr(table, ':'); - if (fsname) - fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); complete_all(&sdp->sd_locking_init); return 0; } - ret = lm->lm_mount(sdp, fsname); + ret = lm->lm_mount(sdp, table); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); complete_all(&sdp->sd_locking_init); @@ -1084,7 +1078,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; - set_bit(SDF_NORECOVERY, &sdp->sd_flags); + set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; @@ -1124,6 +1118,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail; + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + gfs2_create_debugfs_file(sdp); error = gfs2_sys_fs_add(sdp); @@ -1160,6 +1156,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_sb; } + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + sdp->sd_table_name); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f2a02edcac8f..963b2d75200c 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, char env_status[20]; char *envp[] = { env_jid, env_status, NULL }; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid_done = jid; ls->ls_recover_jid_status = message; sprintf(env_jid, "JID=%d", jid); sprintf(env_status, "RECOVERY=%s", message == LM_RD_SUCCESS ? "Done" : "Failed"); kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); + + if (sdp->sd_lockstruct.ls_ops->lm_recovery_result) + sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } void gfs2_recover_func(struct work_struct *work) @@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; - if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { + ro = 1; + } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { @@ -577,6 +583,7 @@ fail_gunlock_j: fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: + jd->jd_recover_error = error; gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); @@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); - return 0; + return wait ? jd->jd_recover_error : 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 22234627f684..981bfa32121a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1108,9 +1108,9 @@ void gfs2_inplace_release(struct gfs2_inode *ip) { struct gfs2_blkreserv *rs = ip->i_res; - gfs2_blkrsv_put(ip); if (rs->rs_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + gfs2_blkrsv_put(ip); } /** diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 443cabcfcd23..d33172c291ba 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) ssize_t ret; int val = 0; - if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags)) val = 1; ret = sprintf(buf, "%d\n", val); return ret; @@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) val = simple_strtol(buf, NULL, 0); if (val == 1) - set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); else if (val == 0) { - clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); smp_mb__after_clear_bit(); gfs2_glock_thaw(sdp); } else { @@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) goto out; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto out; - sdp->sd_lockstruct.ls_first = first; - rv = 0; + sdp->sd_lockstruct.ls_first = first; + rv = 0; out: spin_unlock(&sdp->sd_jindex_spin); return rv ? rv : len; @@ -360,19 +360,14 @@ out: static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%d\n", ls->ls_first_done); + return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags)); } -static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) { - unsigned jid; struct gfs2_jdesc *jd; int rv; - rv = sscanf(buf, "%u", &jid); - if (rv != 1) - return -EINVAL; - rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) @@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) } out: spin_unlock(&sdp->sd_jindex_spin); + return rv; +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + rv = gfs2_recover_set(sdp, jid); + return rv ? rv : len; } diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h index e94560e836d7..79182d6ad6ac 100644 --- a/fs/gfs2/sys.h +++ b/fs/gfs2/sys.h @@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp); int gfs2_sys_init(void); void gfs2_sys_uninit(void); +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid); + #endif /* __SYS_DOT_H__ */ diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index edf0a801446b..427682ca9e48 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -499,9 +499,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); - hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, - sbi->hidden_dir); + if (!sbi->hidden_dir) { + mutex_unlock(&sbi->vh_mutex); + err = -ENOMEM; + goto out_put_root; + } + err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root, + &str, sbi->hidden_dir); mutex_unlock(&sbi->vh_mutex); + if (err) + goto out_put_hidden_dir; hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e425ad9d0490..1e85a7ac0217 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) } static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; diff --git a/fs/inode.c b/fs/inode.c index 87535753ab04..4fa4f0916af9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -776,6 +776,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) else __count_vm_events(PGINODESTEAL, reap); spin_unlock(&sb->s_inode_lru_lock); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; dispose_list(&freeable); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 68d704db787f..5069b8475150 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -430,6 +430,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD2: commit phase 1\n"); /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + jbd2_clear_buffer_revoked_flags(journal); + + /* * Switch to a new revoke table. */ jbd2_journal_switch_revoke_table(journal); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 69fd93588118..30b2867d6cc9 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flag clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffers in the next + * transaction which is going to be started. + */ +void jbd2_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd2_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd2_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd2_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a0e41a4c080e..35ae096bed5d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); + finish_wait(&journal->j_wait_updates, &wait); break; } - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); write_unlock(&journal->j_state_lock); schedule(); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 43926add945b..54cea8ad5a76 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -339,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) dprintk("%s enter. slotid %d seqid %d\n", __func__, args->csa_slotid, args->csa_sequenceid); - if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) return htonl(NFS4ERR_BADSLOT); slot = tbl->slots + args->csa_slotid; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 873bf00d51a2..277dfaf2e99a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -84,7 +84,7 @@ retry: /* * Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ -static int nfs4_disable_idmapping = 0; +static int nfs4_disable_idmapping = 1; /* * RPC cruft for NFS @@ -185,7 +185,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; #endif - cred = rpc_lookup_machine_cred(); + cred = rpc_lookup_machine_cred("*"); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; nfs_fscache_get_client_cookie(clp); @@ -250,6 +250,11 @@ static void pnfs_init_server(struct nfs_server *server) rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } +static void nfs4_destroy_server(struct nfs_server *server) +{ + nfs4_purge_state_owners(server); +} + #else static void nfs4_shutdown_client(struct nfs_client *clp) { @@ -1065,6 +1070,7 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); INIT_LIST_HEAD(&server->layouts); + INIT_LIST_HEAD(&server->state_owners_lru); atomic_set(&server->active, 0); @@ -1538,6 +1544,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, nfs_server_insert_lists(server); server->mount_time = jiffies; + server->destroy = nfs4_destroy_server; out: nfs_free_fattr(fattr); return error; @@ -1719,6 +1726,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, /* Copy data from the source */ server->nfs_client = source->nfs_client; + server->destroy = source->destroy; atomic_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 606ef0f20aed..c43a452f7da2 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -272,13 +272,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) datasync); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; mutex_lock(&inode->i_mutex); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); + if (status >= 0 && ret < 0) + status = ret; have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); if (have_error) ret = xchg(&ctx->error, 0); diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 47d1c6ff2d8e..2c05f1991e1e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -38,6 +38,89 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/nfs_idmap.h> +#include <linux/nfs_fs.h> + +/** + * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields + * @fattr: fully initialised struct nfs_fattr + * @owner_name: owner name string cache + * @group_name: group name string cache + */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name) +{ + fattr->owner_name = owner_name; + fattr->group_name = group_name; +} + +static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; + kfree(fattr->owner_name->data); +} + +static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; + kfree(fattr->group_name->data); +} + +static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *owner = fattr->owner_name; + __u32 uid; + + if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) + return false; + if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { + fattr->uid = uid; + fattr->valid |= NFS_ATTR_FATTR_OWNER; + } + return true; +} + +static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *group = fattr->group_name; + __u32 gid; + + if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) + return false; + if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { + fattr->gid = gid; + fattr->valid |= NFS_ATTR_FATTR_GROUP; + } + return true; +} + +/** + * nfs_fattr_free_names - free up the NFSv4 owner and group strings + * @fattr: a fully initialised nfs_fattr structure + */ +void nfs_fattr_free_names(struct nfs_fattr *fattr) +{ + if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) + nfs_fattr_free_owner_name(fattr); + if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) + nfs_fattr_free_group_name(fattr); +} + +/** + * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free + * @server: pointer to the filesystem nfs_server structure + * @fattr: a fully initialised nfs_fattr structure + * + * This helper maps the cached NFSv4 owner/group strings in fattr into + * their numeric uid/gid equivalents, and then frees the cached strings. + */ +void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) +{ + if (nfs_fattr_map_owner_name(server, fattr)) + nfs_fattr_free_owner_name(fattr); + if (nfs_fattr_map_group_name(server, fattr)) + nfs_fattr_free_group_name(fattr); +} static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 81db25e92e10..25c3bfad7953 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1020,6 +1020,8 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->valid = 0; fattr->time_start = jiffies; fattr->gencount = nfs_inc_attr_generation_counter(); + fattr->owner_name = NULL; + fattr->group_name = NULL; } struct nfs_fattr *nfs_alloc_fattr(void) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 3f4d95751d52..8102db9b926c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -307,6 +307,8 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head); +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_write_data *p); @@ -330,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, enum migrate_mode); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 693ae22f8731..4d7d0aedc101 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -94,6 +94,8 @@ struct nfs_unique_id { struct nfs4_state_owner { struct nfs_unique_id so_owner_id; struct nfs_server *so_server; + struct list_head so_lru; + unsigned long so_expires; struct rb_node so_server_node; struct rpc_cred *so_cred; /* Associated cred */ @@ -319,6 +321,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); extern void nfs4_put_state_owner(struct nfs4_state_owner *); +extern void nfs4_purge_state_owners(struct nfs_server *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, fmode_t); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index a62d36b9a99e..71ec08617e23 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -49,13 +49,14 @@ filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, loff_t offset) { u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; - u64 tmp; + u64 stripe_no; + u32 rem; offset -= flseg->pattern_offset; - tmp = offset; - do_div(tmp, stripe_width); + stripe_no = div_u64(offset, stripe_width); + div_u64_rem(offset, flseg->stripe_unit, &rem); - return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); + return stripe_no * flseg->stripe_unit + rem; } /* This function is used by the layout driver to calculate the diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dcda0ba7af60..75366dc89686 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -52,6 +52,7 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/module.h> +#include <linux/nfs_idmap.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/xattr.h> #include <linux/utsname.h> @@ -364,9 +365,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * Must be called while holding tbl->slot_tbl_lock */ static void -nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) +nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) { - int free_slotid = free_slot - tbl->slots; int slotid = free_slotid; BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); @@ -431,7 +431,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) } spin_lock(&tbl->slot_tbl_lock); - nfs4_free_slot(tbl, res->sr_slot); + nfs4_free_slot(tbl, res->sr_slot - tbl->slots); nfs4_check_drain_fc_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; @@ -554,13 +554,10 @@ int nfs41_setup_sequence(struct nfs4_session *session, spin_lock(&tbl->slot_tbl_lock); if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { - /* - * The state manager will wait until the slot table is empty. - * Schedule the reset thread - */ + /* The state manager will wait until the slot table is empty */ rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s Schedule Session Reset\n", __func__); + dprintk("%s session is draining\n", __func__); return -EAGAIN; } @@ -765,6 +762,8 @@ struct nfs4_opendata { struct nfs_openres o_res; struct nfs_open_confirmargs c_arg; struct nfs_open_confirmres c_res; + struct nfs4_string owner_name; + struct nfs4_string group_name; struct nfs_fattr f_attr; struct nfs_fattr dir_attr; struct dentry *dir; @@ -788,6 +787,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) p->o_res.server = p->o_arg.server; nfs_fattr_init(&p->f_attr); nfs_fattr_init(&p->dir_attr); + nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); } static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, @@ -819,6 +819,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.dir_bitmask = server->cache_consistency_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; if (flags & O_CREAT) { u32 *s; @@ -855,6 +856,7 @@ static void nfs4_opendata_free(struct kref *kref) dput(p->dir); dput(p->dentry); nfs_sb_deactive(sb); + nfs_fattr_free_names(&p->f_attr); kfree(p); } @@ -1579,6 +1581,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) if (status != 0 || !data->rpc_done) return status; + nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr); + nfs_refresh_inode(dir, o_res->dir_attr); if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { @@ -1611,6 +1615,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } + nfs_fattr_map_and_free_names(server, &data->f_attr); + if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); nfs_post_op_update_inode(dir, o_res->dir_attr); @@ -3431,19 +3437,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server) */ #define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) -static void buf_to_pages(const void *buf, size_t buflen, - struct page **pages, unsigned int *pgbase) -{ - const void *p = buf; - - *pgbase = offset_in_page(buf); - p -= *pgbase; - while (p < buf + buflen) { - *(pages++) = virt_to_page(p); - p += PAGE_CACHE_SIZE; - } -} - static int buf_to_pages_noslab(const void *buf, size_t buflen, struct page **pages, unsigned int *pgbase) { @@ -3540,9 +3533,19 @@ out: nfs4_set_cached_acl(inode, acl); } +/* + * The getxattr API returns the required buffer length when called with a + * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating + * the required buf. On a NULL buf, we send a page of data to the server + * guessing that the ACL request can be serviced by a page. If so, we cache + * up to the page of ACL data, and the 2nd call to getxattr is serviced by + * the cache. If not so, we throw away the page, and cache the required + * length. The next getxattr call will then produce another round trip to + * the server, this time with the input buf of the required size. + */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES]; + struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, @@ -3557,41 +3560,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - struct page *localpage = NULL; - int ret; + int ret = -ENOMEM, npages, i, acl_len = 0; - if (buflen < PAGE_SIZE) { - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. */ - localpage = alloc_page(GFP_KERNEL); - resp_buf = page_address(localpage); - if (localpage == NULL) - return -ENOMEM; - args.acl_pages[0] = localpage; - args.acl_pgbase = 0; - args.acl_len = PAGE_SIZE; - } else { - resp_buf = buf; - buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); + npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; + /* As long as we're doing a round trip to the server anyway, + * let's be prepared for a page of acl data. */ + if (npages == 0) + npages = 1; + + for (i = 0; i < npages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; } - ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); + if (npages > 1) { + /* for decoding across pages */ + args.acl_scratch = alloc_page(GFP_KERNEL); + if (!args.acl_scratch) + goto out_free; + } + args.acl_len = npages * PAGE_SIZE; + args.acl_pgbase = 0; + /* Let decode_getfacl know not to fail if the ACL data is larger than + * the page we send as a guess */ + if (buf == NULL) + res.acl_flags |= NFS4_ACL_LEN_REQUEST; + resp_buf = page_address(pages[0]); + + dprintk("%s buf %p buflen %ld npages %d args.acl_len %ld\n", + __func__, buf, buflen, npages, args.acl_len); + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), + &msg, &args.seq_args, &res.seq_res, 0); if (ret) goto out_free; - if (res.acl_len > args.acl_len) - nfs4_write_cached_acl(inode, NULL, res.acl_len); + + acl_len = res.acl_len - res.acl_data_offset; + if (acl_len > args.acl_len) + nfs4_write_cached_acl(inode, NULL, acl_len); else - nfs4_write_cached_acl(inode, resp_buf, res.acl_len); + nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset, + acl_len); if (buf) { ret = -ERANGE; - if (res.acl_len > buflen) + if (acl_len > buflen) goto out_free; - if (localpage) - memcpy(buf, resp_buf, res.acl_len); + _copy_from_pages(buf, pages, res.acl_data_offset, + res.acl_len); } - ret = res.acl_len; + ret = acl_len; out_free: - if (localpage) - __free_page(localpage); + for (i = 0; i < npages; i++) + if (pages[i]) + __free_page(pages[i]); + if (args.acl_scratch) + __free_page(args.acl_scratch); return ret; } @@ -3622,6 +3644,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) + /* -ENOENT is returned if there is no ACL or if there is an ACL + * but no cached acl data, just the acl length */ return ret; return nfs4_get_acl_uncached(inode, buf, buflen); } @@ -5022,23 +5046,6 @@ out: return ret; } -/* - * Reset the forechannel and backchannel slot tables - */ -static int nfs4_reset_slot_tables(struct nfs4_session *session) -{ - int status; - - status = nfs4_reset_slot_table(&session->fc_slot_table, - session->fc_attrs.max_reqs, 1); - if (status) - return status; - - status = nfs4_reset_slot_table(&session->bc_slot_table, - session->bc_attrs.max_reqs, 0); - return status; -} - /* Destroy the slot table */ static void nfs4_destroy_slot_tables(struct nfs4_session *session) { @@ -5084,29 +5091,35 @@ out: } /* - * Initialize the forechannel and backchannel tables + * Initialize or reset the forechannel and backchannel tables */ -static int nfs4_init_slot_tables(struct nfs4_session *session) +static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) { struct nfs4_slot_table *tbl; - int status = 0; + int status; - tbl = &session->fc_slot_table; + dprintk("--> %s\n", __func__); + /* Fore channel */ + tbl = &ses->fc_slot_table; if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, - session->fc_attrs.max_reqs, 1); + status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) /* -ENOMEM */ + return status; + } else { + status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1); if (status) return status; } - - tbl = &session->bc_slot_table; + /* Back channel */ + tbl = &ses->bc_slot_table; if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, - session->bc_attrs.max_reqs, 0); + status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0); if (status) - nfs4_destroy_slot_tables(session); - } - + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_destroy_slot_tables(ses); + } else + status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0); return status; } @@ -5294,13 +5307,9 @@ int nfs4_proc_create_session(struct nfs_client *clp) if (status) goto out; - /* Init and reset the fore channel */ - status = nfs4_init_slot_tables(session); - dprintk("slot table initialization returned %d\n", status); - if (status) - goto out; - status = nfs4_reset_slot_tables(session); - dprintk("slot table reset returned %d\n", status); + /* Init or reset the session slot tables */ + status = nfs4_setup_session_slot_tables(session); + dprintk("slot table setup returned %d\n", status); if (status) goto out; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6a7107ae6b72..a53f33b4ac3a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/bitops.h> +#include <linux/jiffies.h> #include "nfs4_fs.h" #include "callback.h" @@ -377,31 +378,24 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) { struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; - struct nfs4_state_owner *sp, *res = NULL; + struct nfs4_state_owner *sp; while (*p != NULL) { parent = *p; sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); - if (server < sp->so_server) { - p = &parent->rb_left; - continue; - } - if (server > sp->so_server) { - p = &parent->rb_right; - continue; - } if (cred < sp->so_cred) p = &parent->rb_left; else if (cred > sp->so_cred) p = &parent->rb_right; else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); atomic_inc(&sp->so_count); - res = sp; - break; + return sp; } } - return res; + return NULL; } static struct nfs4_state_owner * @@ -421,6 +415,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) else if (new->so_cred > sp->so_cred) p = &parent->rb_right; else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); atomic_inc(&sp->so_count); return sp; } @@ -462,6 +458,7 @@ nfs4_alloc_state_owner(void) spin_lock_init(&sp->so_sequence.lock); INIT_LIST_HEAD(&sp->so_sequence.list); atomic_set(&sp->so_count, 1); + INIT_LIST_HEAD(&sp->so_lru); return sp; } @@ -479,6 +476,38 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp) } } +static void nfs4_free_state_owner(struct nfs4_state_owner *sp) +{ + rpc_destroy_wait_queue(&sp->so_sequence.wait); + put_rpccred(sp->so_cred); + kfree(sp); +} + +static void nfs4_gc_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + unsigned long time_min, time_max; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + time_max = jiffies; + time_min = (long)time_max - (long)clp->cl_lease_time; + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + /* NB: LRU is sorted so that oldest is at the head */ + if (time_in_range(sp->so_expires, time_min, time_max)) + break; + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } + spin_unlock(&clp->cl_lock); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } +} + /** * nfs4_get_state_owner - Look up a state owner given a credential * @server: nfs_server to search @@ -496,10 +525,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, sp = nfs4_find_state_owner_locked(server, cred); spin_unlock(&clp->cl_lock); if (sp != NULL) - return sp; + goto out; new = nfs4_alloc_state_owner(); if (new == NULL) - return NULL; + goto out; new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); @@ -511,26 +540,58 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, rpc_destroy_wait_queue(&new->so_sequence.wait); kfree(new); } +out: + nfs4_gc_state_owners(server); return sp; } /** * nfs4_put_state_owner - Release a nfs4_state_owner * @sp: state owner data to release - * */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { - struct nfs_client *clp = sp->so_server->nfs_client; - struct rpc_cred *cred = sp->so_cred; + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - nfs4_remove_state_owner_locked(sp); + + if (!RB_EMPTY_NODE(&sp->so_server_node)) { + sp->so_expires = jiffies; + list_add_tail(&sp->so_lru, &server->state_owners_lru); + spin_unlock(&clp->cl_lock); + } else { + nfs4_remove_state_owner_locked(sp); + spin_unlock(&clp->cl_lock); + nfs4_free_state_owner(sp); + } +} + +/** + * nfs4_purge_state_owners - Release all cached state owners + * @server: nfs_server with cached state owners to release + * + * Called at umount time. Remaining state owners will be on + * the LRU with ref count of zero. + */ +void nfs4_purge_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } spin_unlock(&clp->cl_lock); - rpc_destroy_wait_queue(&sp->so_sequence.wait); - put_rpccred(cred); - kfree(sp); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } } static struct nfs4_state * @@ -1402,6 +1463,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov restart: rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + nfs4_purge_state_owners(server); spin_lock(&clp->cl_lock); for (pos = rb_first(&server->state_owners); pos != NULL; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e6161b213ed1..95e92e438407 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2298,7 +2298,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getfh(xdr, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_restorefh(xdr, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + encode_getfattr(xdr, args->dir_bitmask, &hdr); encode_nops(&hdr); } @@ -2517,11 +2517,13 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz + 1; encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len); + xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE); + encode_nops(&hdr); } @@ -3790,7 +3792,8 @@ out_overflow: } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *uid, int may_sleep) + const struct nfs_server *server, uint32_t *uid, + struct nfs4_string *owner_name) { uint32_t len; __be32 *p; @@ -3807,8 +3810,12 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; - if (!may_sleep) { - /* do nothing */ + if (owner_name != NULL) { + owner_name->data = kmemdup(p, len, GFP_NOWAIT); + if (owner_name->data != NULL) { + owner_name->len = len; + ret = NFS_ATTR_FATTR_OWNER_NAME; + } } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; @@ -3828,7 +3835,8 @@ out_overflow: } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *gid, int may_sleep) + const struct nfs_server *server, uint32_t *gid, + struct nfs4_string *group_name) { uint32_t len; __be32 *p; @@ -3845,8 +3853,12 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; - if (!may_sleep) { - /* do nothing */ + if (group_name != NULL) { + group_name->data = kmemdup(p, len, GFP_NOWAIT); + if (group_name->data != NULL) { + group_name->len = len; + ret = NFS_ATTR_FATTR_GROUP_NAME; + } } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; @@ -4283,7 +4295,7 @@ xdr_error: static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fattr *fattr, struct nfs_fh *fh, - const struct nfs_server *server, int may_sleep) + const struct nfs_server *server) { int status; umode_t fmode = 0; @@ -4350,12 +4362,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -4396,7 +4408,7 @@ xdr_error: } static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, - struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) + struct nfs_fh *fh, const struct nfs_server *server) { __be32 *savep; uint32_t attrlen, @@ -4415,7 +4427,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat if (status < 0) goto xdr_error; - status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server); if (status < 0) goto xdr_error; @@ -4426,9 +4438,9 @@ xdr_error: } static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, - const struct nfs_server *server, int may_sleep) + const struct nfs_server *server) { - return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); + return decode_getfattr_generic(xdr, fattr, NULL, server); } /* @@ -4957,17 +4969,18 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - size_t *acl_len) + struct nfs_getaclres *res) { - __be32 *savep; + __be32 *savep, *bm_p; uint32_t attrlen, bitmap[3] = {0}; struct kvec *iov = req->rq_rcv_buf.head; int status; - *acl_len = 0; + res->acl_len = 0; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto out; + bm_p = xdr->p; if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto out; if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) @@ -4979,18 +4992,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, size_t hdrlen; u32 recvd; + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + xdr->p = bm_p; + res->acl_data_offset = be32_to_cpup(bm_p) + 2; + res->acl_data_offset <<= 2; + /* We ignore &savep and don't do consistency checks on * the attr length. Let userspace figure it out.... */ hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; + attrlen += res->acl_data_offset; recvd = req->rq_rcv_buf.len - hdrlen; if (attrlen > recvd) { - dprintk("NFS: server cheating in getattr" - " acl reply: attrlen %u > recvd %u\n", + if (res->acl_flags & NFS4_ACL_LEN_REQUEST) { + /* getxattr interface called with a NULL buf */ + res->acl_len = attrlen; + goto out; + } + dprintk("NFS: acl reply: attrlen %u > recvd %u\n", attrlen, recvd); return -EINVAL; } xdr_read_pages(xdr, attrlen); - *acl_len = attrlen; + res->acl_len = attrlen; } else status = -EOPNOTSUPP; @@ -5696,8 +5721,7 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, status = decode_open_downgrade(xdr, res); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5723,8 +5747,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_access(xdr, res); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5753,8 +5776,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server - ,!RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5780,8 +5802,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, goto out; status = decode_getfh(xdr, res->fh); if (status == 0) - status = decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5807,8 +5828,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_remove(xdr, &res->cinfo); if (status) goto out; - decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_attr, res->server); out: return status; } @@ -5841,14 +5861,12 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; /* Current FH is target directory */ - if (decode_getfattr(xdr, res->new_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->new_fattr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->old_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->old_fattr, res->server); out: return status; } @@ -5884,14 +5902,12 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, * Note order: OP_LINK leaves the directory as the current * filehandle. */ - if (decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->dir_attr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5923,14 +5939,12 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - if (decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->fattr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->dir_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_fattr, res->server); out: return status; } @@ -5962,8 +5976,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6028,7 +6041,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(xdr, rqstp, &res->acl_len); + status = decode_getacl(xdr, rqstp, res); out: return status; @@ -6061,8 +6074,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. */ - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6093,13 +6105,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, goto out; if (decode_getfh(xdr, &res->fh) != 0) goto out; - if (decode_getfattr(xdr, res->f_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->f_attr, res->server) != 0) goto out; if (decode_restorefh(xdr) != 0) goto out; - decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_attr, res->server); out: return status; } @@ -6147,8 +6157,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, status = decode_open(xdr, res); if (status) goto out; - decode_getfattr(xdr, res->f_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->f_attr, res->server); out: return status; } @@ -6175,8 +6184,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, status = decode_setattr(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6356,8 +6364,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; if (res->fattr) - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); if (!status) status = res->count; out: @@ -6386,8 +6393,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; if (res->fattr) - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6546,8 +6552,7 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, status = decode_delegreturn(xdr); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6576,8 +6581,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, goto out; xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr(xdr, &res->fs_locations->fattr, - res->fs_locations->server, - !RPC_IS_ASYNC(req->rq_task)); + res->fs_locations->server); out: return status; } @@ -6826,8 +6830,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, status = decode_layoutcommit(xdr, rqstp, res); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6958,7 +6961,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, goto out_overflow; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, - entry->server, 1) < 0) + entry->server) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c807ab93140e..55d01280a609 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -551,7 +551,8 @@ static const struct nfs_pageio_ops objio_pg_write_ops = { static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", - .flags = PNFS_LAYOUTRET_ON_SETATTR, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 72074e3a04f9..b3c29039f5b8 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -254,6 +254,8 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) oir->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; + else + rdata->pnfs_error = status; objlayout_iodone(oir); /* must not use oir after this point */ @@ -334,6 +336,8 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) if (status >= 0) { wdata->res.count = status; wdata->verf.committed = oir->committed; + } else { + wdata->pnfs_error = status; } objlayout_iodone(oir); /* must not use oir after this point */ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8e672a2b2d69..17149a490065 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1166,6 +1166,33 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); +static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head) +{ + struct nfs_pageio_descriptor pgio; + LIST_HEAD(failed); + + /* Resend all requests through the MDS */ + nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE); + while (!list_empty(head)) { + struct nfs_page *req = nfs_list_entry(head->next); + + nfs_list_remove_request(req); + if (!nfs_pageio_add_request(&pgio, req)) + nfs_list_add_request(req, &failed); + } + nfs_pageio_complete(&pgio); + + if (!list_empty(&failed)) { + /* For some reason our attempt to resend pages. Mark the + * overall send request as having failed, and let + * nfs_writeback_release_full deal with the error. + */ + list_move(&failed, head); + return -EIO; + } + return 0; +} + /* * Called by non rpc-based layout drivers */ @@ -1175,9 +1202,17 @@ void pnfs_ld_write_done(struct nfs_write_data *data) pnfs_set_layoutcommit(data); data->mds_ops->rpc_call_done(&data->task, data); } else { - put_lseg(data->lseg); - data->lseg = NULL; dprintk("pnfs write error = %d\n", data->pnfs_error); + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + /* Don't lo_commit on error, Server will needs to + * preform a file recovery. + */ + clear_bit(NFS_INO_LAYOUTCOMMIT, + &NFS_I(data->inode)->flags); + pnfs_return_layout(data->inode); + } + data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages); } data->mds_ops->rpc_release(data); } @@ -1267,6 +1302,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) put_lseg(data->lseg); data->lseg = NULL; dprintk("pnfs write error = %d\n", data->pnfs_error); + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) + pnfs_return_layout(data->inode); nfs_pageio_init_read_mds(&pgio, data->inode); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1509530cb111..53d593a0a4f2 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -68,6 +68,7 @@ enum { enum layoutdriver_policy_flags { /* Should the pNFS client commit and return the layout upon a setattr */ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, + PNFS_LAYOUTRET_ON_ERROR = 1 << 1, }; struct nfs4_deviceid_node; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e463967aafb8..3dfa4f112c0a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -908,10 +908,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data->auth_flavor_len = 1; data->version = version; data->minorversion = 0; + security_init_mnt_opts(&data->lsm_opts); } return data; } +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ + if (data) { + kfree(data->client_address); + kfree(data->mount_server.hostname); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + kfree(data); + } +} + /* * Sanity-check a server address provided by the mount command. * @@ -2219,9 +2233,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) - goto out_free_fh; - - security_init_mnt_opts(&data->lsm_opts); + goto out; /* Validate the mount data */ error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); @@ -2233,8 +2245,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, #ifdef CONFIG_NFS_V4 if (data->version == 4) { mntroot = nfs4_try_mount(flags, dev_name, data); - kfree(data->client_address); - kfree(data->nfs_server.export_path); goto out; } #endif /* CONFIG_NFS_V4 */ @@ -2289,13 +2299,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; out: - kfree(data->nfs_server.hostname); - kfree(data->mount_server.hostname); - kfree(data->fscache_uniq); - security_free_mnt_opts(&data->lsm_opts); -out_free_fh: + nfs_free_parsed_mount_data(data); nfs_free_fhandle(mntfh); - kfree(data); return mntroot; out_err_nosb: @@ -2622,9 +2627,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) - goto out_free_fh; - - security_init_mnt_opts(&data->lsm_opts); + goto out; /* Get a volume representation */ server = nfs4_create_server(data, mntfh); @@ -2676,13 +2679,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; - security_free_mnt_opts(&data->lsm_opts); nfs_free_fhandle(mntfh); return mntroot; out: - security_free_mnt_opts(&data->lsm_opts); -out_free_fh: nfs_free_fhandle(mntfh); return ERR_PTR(error); @@ -2839,7 +2839,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type, data = nfs_alloc_parsed_mount_data(4); if (data == NULL) - goto out_free_data; + goto out; /* Validate the mount data */ error = nfs4_validate_mount_data(raw_data, data, dev_name); @@ -2853,12 +2853,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type, error = PTR_ERR(res); out: - kfree(data->client_address); - kfree(data->nfs_server.export_path); - kfree(data->nfs_server.hostname); - kfree(data->fscache_uniq); -out_free_data: - kfree(data); + nfs_free_parsed_mount_data(data); dprintk("<-- nfs4_mount() = %d%s\n", error, error != 0 ? " [error]" : ""); return res; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1dda78db6a73..834f0fe96f89 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1052,7 +1052,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = { .pg_doio = nfs_generic_pg_writepages, }; -static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, @@ -1166,13 +1166,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) static void nfs_writeback_release_full(void *calldata) { struct nfs_write_data *data = calldata; - int ret, status = data->task.tk_status; - struct nfs_pageio_descriptor pgio; - - if (data->pnfs_error) { - nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); - pgio.pg_recoalesce = 1; - } + int status = data->task.tk_status; /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { @@ -1188,11 +1182,6 @@ static void nfs_writeback_release_full(void *calldata) req->wb_bytes, (long long)req_offset(req)); - if (data->pnfs_error) { - dprintk(", pnfs error = %d\n", data->pnfs_error); - goto next; - } - if (status < 0) { nfs_set_pageerror(page); nfs_context_set_write_error(req->wb_context, status); @@ -1212,19 +1201,7 @@ remove_request: next: nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); - if (data->pnfs_error) { - lock_page(page); - nfs_pageio_cond_complete(&pgio, page->index); - ret = nfs_page_async_flush(&pgio, page, 0); - if (ret) { - nfs_set_pageerror(page); - dprintk("rewrite to MDS error = %d\n", ret); - } - unlock_page(page); - } } - if (data->pnfs_error) - nfs_pageio_complete(&pgio); nfs_writedata_release(calldata); } @@ -1711,7 +1688,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page) + struct page *page, enum migrate_mode mode) { /* * If PagePrivate is set, then the page is currently associated with @@ -1726,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7748d6a18d97..6f3ebb48b12f 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -718,7 +718,7 @@ int set_callback_cred(void) { if (callback_cred) return 0; - callback_cred = rpc_lookup_machine_cred(); + callback_cred = rpc_lookup_machine_cred("nfs"); if (!callback_cred) return -ENOMEM; return 0; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index a5ebe421195f..286edf1e231f 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -827,8 +827,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) goto out; } - rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name), - &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN); + rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, + NULL, NULL, NULL, &fsdlm); if (rc) { ocfs2_live_connection_drop(control); goto out; diff --git a/fs/pipe.c b/fs/pipe.c index f0e485d54e64..a932ced92a16 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; diff --git a/fs/proc/array.c b/fs/proc/array.c index 8c344f037bd0..9252ee3b71e3 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", pid_nr_ns(pid, ns), tcomm, state, @@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime), + (mm && permitted) ? mm->start_data : 0, + (mm && permitted) ? mm->end_data : 0, + (mm && permitted) ? mm->start_brk : 0); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda999f2..5485a5388ecb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,9 +83,11 @@ #include <linux/pid_namespace.h> #include <linux/fs_struct.h> #include <linux/slab.h> +#include <linux/flex_array.h> #ifdef CONFIG_HARDWALL #include <asm/hardwall.h> #endif +#include <trace/events/oom.h> #include "internal.h" /* NOTE: @@ -133,6 +135,8 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) +static int proc_fd_permission(struct inode *inode, int mask); + /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. @@ -165,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct inode *inode, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -182,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path) return result; } -static int proc_root_link(struct inode *inode, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -627,6 +631,52 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) return 0; } +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) +{ + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); +} + + +static int proc_pid_permission(struct inode *inode, int mask) +{ + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); + + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); +} + + + static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; @@ -1010,6 +1060,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1148,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. @@ -1453,13 +1505,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct inode *inode, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(inode); + task = get_proc_task(dentry->d_inode); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1489,7 +1541,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); out: return ERR_PTR(error); } @@ -1528,7 +1580,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; @@ -1609,6 +1661,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; struct task_struct *task; const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1617,6 +1670,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->gid = 0; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { cred = __task_cred(task); @@ -1820,9 +1881,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) return -ENOENT; } -static int proc_fd_link(struct inode *inode, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path) { - return proc_fd_info(inode, path, NULL); + return proc_fd_info(dentry->d_inode, path, NULL); } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) @@ -2043,6 +2104,355 @@ static const struct file_operations proc_fd_operations = { .llseek = default_llseek, }; +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EACCES; + goto out_notask; + } + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + mm = get_task_mm(task); + if (!mm) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + struct file *file; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static struct dentry * +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct file *file = ptr; + struct proc_inode *ei; + struct inode *inode; + + if (!file) + return ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + d_add(dentry, inode); + + return NULL; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-EACCES); + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); + if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_unlock; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + ino_t ino; + int ret; + + ret = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(inode); + if (!task) + goto out; + + ret = -EACCES; + if (lock_trace(task)) + goto out_put_task; + + ret = 0; + switch (filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + default: + { + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > filp->f_pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_unlock; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= filp->f_pos) + continue; + + get_file(vma->vm_file); + info.file = vma->vm_file; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + ret = proc_fill_cache(filp, dirent, filldir, + p->name, p->len, + proc_map_files_instantiate, + task, p->file); + if (ret) + break; + filp->f_pos++; + fput(p->file); + } + for (; i < nr_files; i++) { + /* + * In case of error don't forget + * to put rest of file refs. + */ + p = flex_array_get(fa, i); + fput(p->file); + } + if (fa) + flex_array_free(fa); + mmput(mm); + } + } + +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .readdir = proc_map_files_readdir, + .llseek = default_llseek, +}; + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). @@ -2658,6 +3068,9 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET @@ -2761,6 +3174,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) @@ -2964,6 +3378,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi proc_pid_instantiate, iter.task, NULL); } +static int fake_filldir(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned d_type) +{ + return 0; +} + /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -2971,6 +3391,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out_no_task; @@ -2992,8 +3413,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { + if (has_pid_permissions(ns, iter.task, 2)) + __filldir = filldir; + else + __filldir = fake_filldir; + filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { put_task_struct(iter.task); goto out; } @@ -3328,6 +3754,7 @@ static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 51a176622b8f..84fd3235a590 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> +#include <linux/pid_namespace.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> @@ -17,7 +18,9 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/sysctl.h> +#include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/mount.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -101,12 +104,27 @@ void __init proc_init_inodecache(void) init_once); } +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (pid->pid_gid) + seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + + return 0; +} + static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, }; static void __pde_users_dec(struct proc_dir_entry *pde) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7838e5cfec14..292577531ad1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde); int proc_fill_super(struct super_block *); struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +int proc_remount(struct super_block *sb, int *flags, char *data); /* * These are generic /proc routines that use the internal diff --git a/fs/proc/root.c b/fs/proc/root.c index 03102d978180..46a15d8a29ca 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,7 @@ #include <linux/bitops.h> #include <linux/mount.h> #include <linux/pid_namespace.h> +#include <linux/parser.h> #include "internal.h" @@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data) return err; } +enum { + Opt_gid, Opt_hidepid, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = option; + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; + return !proc_parse_options(data, pid); +} + static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -43,11 +101,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; + char *options; - if (flags & MS_KERNMOUNT) + if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; - else + options = NULL; + } else { ns = current->nsproxy->pid_ns; + options = data; + } sb = sget(fs_type, proc_test_super, proc_set_super, ns); if (IS_ERR(sb)) @@ -55,6 +117,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (!sb->s_root) { sb->s_flags = flags; + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index a945cd265228..70de42f09f1d 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1364,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); - /* Avoid lock recursion in fault case */ - reiserfs_write_unlock(sb); bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); - reiserfs_write_lock(sb); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index eb711060a6f2..c3cf54fd4de3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, char b[BDEVNAME_SIZE]; int ret; - /* - * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS - * dependency inversion warnings. - */ - reiserfs_write_unlock(sb); journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); - reiserfs_write_lock(sb); return 1; } INIT_LIST_HEAD(&journal->j_bitmap_nodes); @@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, - reiserfs_bmap_count(sb)); - reiserfs_write_lock(sb); - if (ret) + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb))) goto free_and_return; allocate_bitmap_nodes(sb); @@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - /* - * We need to unlock here to avoid creating the following - * dependency: - * reiserfs_lock -> sysfs_mutex - * Because the reiserfs mmap path creates the following dependency: - * mm->mmap -> reiserfs_lock, hence we have - * mm->mmap -> reiserfs_lock ->sysfs_mutex - * This would ends up in a circular dependency with sysfs readdir path - * which does sysfs_mutex -> mm->mmap_sem - * This is fine because the reiserfs lock is useless in mount path, - * at least until we call journal_begin. We keep it for paranoid - * reasons. - */ - reiserfs_write_unlock(sb); if (journal_init_dev(sb, journal, j_dev_name) != 0) { - reiserfs_write_lock(sb); reiserfs_warning(sb, "sh-462", "unable to initialize jornal device"); goto free_and_return; } - reiserfs_write_lock(sb); rs = SB_DISK_SUPER_BLOCK(sb); @@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&(journal->j_jlock), 0); - reiserfs_write_unlock(sb); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); - reiserfs_write_lock(sb); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; @@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name, init_journal_hash(sb); jl = journal->j_current_jl; + + /* + * get_list_bitmap() may call flush_commit_list() which + * requires the lock. Calling flush_commit_list() shouldn't happen + * this early but I like to be paranoid. + */ + reiserfs_write_lock(sb); jl->j_list_bitmap = get_list_bitmap(sb, jl); + reiserfs_write_unlock(sb); if (!jl->j_list_bitmap) { reiserfs_warning(sb, "journal-2005", "get_list_bitmap failed for journal list 0"); goto free_and_return; } - if (journal_read(sb) < 0) { + + /* + * Journal_read needs to be inspected in order to push down + * the lock further inside (or even remove it). + */ + reiserfs_write_lock(sb); + ret = journal_read(sb); + reiserfs_write_unlock(sb); + if (ret < 0) { reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); goto free_and_return; } reiserfs_mounted_fs_count++; - if (reiserfs_mounted_fs_count <= 1) { - reiserfs_write_unlock(sb); + if (reiserfs_mounted_fs_count <= 1) commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); - reiserfs_write_lock(sb); - } INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; @@ -2896,14 +2883,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } - /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; th->t_blocks_allocated += new_alloc ; return 0; } -/* this must be called inside a transaction, and requires the -** kernel_lock to be held +/* this must be called inside a transaction */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { @@ -2914,8 +2900,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) return; } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_allow_writes(struct super_block *s) { @@ -2924,8 +2909,7 @@ void reiserfs_allow_writes(struct super_block *s) wake_up(&journal->j_join_wait); } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_wait_on_write_block(struct super_block *s) { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1d42e707d5fa..e12d8b97cd4d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1519,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset) static int reread_meta_blocks(struct super_block *s) { ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); - reiserfs_write_unlock(s); wait_on_buffer(SB_BUFFER_WITH_SB(s)); - reiserfs_write_lock(s); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; @@ -1746,22 +1744,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) mutex_init(&REISERFS_SB(s)->lock); REISERFS_SB(s)->lock_depth = -1; - /* - * This function is called with the bkl, which also was the old - * locking used here. - * do_journal_begin() will soon check if we hold the lock (ie: was the - * bkl). This is likely because do_journal_begin() has several another - * callers because at this time, it doesn't seem to be necessary to - * protect against anything. - * Anyway, let's be conservative and lock for now. - */ - reiserfs_write_lock(s); - jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age, qf_names, &qfmt) == 0) { - goto error; + goto error_unlocked; } if (jdev_name && jdev_name[0]) { REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); @@ -1777,7 +1764,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (blocks) { SWARN(silent, s, "jmacd-7", "resize option for remount only"); - goto error; + goto error_unlocked; } /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ @@ -1787,7 +1774,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", reiserfs_bdevname(s)); - goto error; + goto error_unlocked; } rs = SB_DISK_SUPER_BLOCK(s); @@ -1803,7 +1790,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) "or increase size of your LVM partition"); SWARN(silent, s, "", "Or may be you forgot to " "reboot after fdisk when it told you to"); - goto error; + goto error_unlocked; } sbi->s_mount_state = SB_REISERFS_STATE(s); @@ -1811,8 +1798,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if ((errval = reiserfs_init_bitmap_cache(s))) { SWARN(silent, s, "jmacd-8", "unable to read bitmap"); - goto error; + goto error_unlocked; } + errval = -EINVAL; #ifdef CONFIG_REISERFS_CHECK SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); @@ -1835,24 +1823,26 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (reiserfs_barrier_flush(s)) { printk("reiserfs: using flush barriers\n"); } + // set_device_ro(s->s_dev, 1) ; if (journal_init(s, jdev_name, old_format, commit_max_age)) { SWARN(silent, s, "sh-2022", "unable to initialize journal space"); - goto error; + goto error_unlocked; } else { jinit_done = 1; /* once this is set, journal_release must be called ** if we error out of the mount */ } + if (reread_meta_blocks(s)) { SWARN(silent, s, "jmacd-9", "unable to reread meta blocks after journal init"); - goto error; + goto error_unlocked; } if (replay_only(s)) - goto error; + goto error_unlocked; if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { SWARN(silent, s, "clm-7000", @@ -1866,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { SWARN(silent, s, "jmacd-10", "get root inode failed"); - goto error; + goto error_unlocked; } + /* + * This path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + if (root_inode->i_state & I_NEW) { reiserfs_read_locked_inode(root_inode, &args); unlock_new_inode(root_inode); @@ -1995,12 +1995,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); error: - if (jinit_done) { /* kill the commit thread, free journal ram */ + reiserfs_write_unlock(s); + +error_unlocked: + /* kill the commit thread, free journal ram */ + if (jinit_done) { + reiserfs_write_lock(s); journal_release_error(NULL, s); + reiserfs_write_unlock(s); } - reiserfs_write_unlock(s); - reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) brelse(SB_BUFFER_WITH_SB(s)); diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index f744be98cd5a..af0b73802592 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, spin_lock(&cache->lock); while (1) { - for (i = 0; i < cache->entries; i++) - if (cache->entry[i].block == block) + for (i = cache->curr_blk, n = 0; n < cache->entries; n++) { + if (cache->entry[i].block == block) { + cache->curr_blk = i; break; + } + i = (i + 1) % cache->entries; + } - if (i == cache->entries) { + if (n == cache->entries) { /* * Block not in cache, if all cache entries are used * go to sleep waiting for one to become available. @@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, goto cleanup; } + cache->curr_blk = 0; cache->next_blk = 0; cache->unused = entries; cache->entries = entries; @@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer, u64 *block, int *offset, int length) { struct squashfs_sb_info *msblk = sb->s_fs_info; - int bytes, copied = length; + int bytes, res = length; struct squashfs_cache_entry *entry; TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); while (length) { entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); - if (entry->error) - return entry->error; - else if (*offset >= entry->length) - return -EIO; + if (entry->error) { + res = entry->error; + goto error; + } else if (*offset >= entry->length) { + res = -EIO; + goto error; + } bytes = squashfs_copy_data(buffer, entry, *offset, length); if (buffer) @@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer, squashfs_cache_put(entry); } - return copied; + return res; + +error: + squashfs_cache_put(entry); + return res; } diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index fd7b3b3bda13..81afbccfa843 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; - inode->i_blocks = ((inode->i_size - - le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; + inode->i_blocks = (inode->i_size - + le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 651f0b31d296..52934a22f296 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -28,6 +28,7 @@ struct squashfs_cache { char *name; int entries; + int curr_blk; int next_blk; int num_waiters; int unused; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index d0858c2d9a47..ecaa2f7bdb8f 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -290,7 +290,7 @@ handle_fragments: check_directory_table: /* Sanity check directory_table */ - if (msblk->directory_table >= next_table) { + if (msblk->directory_table > next_table) { err = -EINVAL; goto failed_mount; } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index b09ba2dd8b62..f922cbacdb96 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -38,9 +38,6 @@ DEFINE_SPINLOCK(dbg_lock); -static char dbg_key_buf0[128]; -static char dbg_key_buf1[128]; - static const char *get_key_fmt(int fmt) { switch (fmt) { @@ -103,8 +100,8 @@ static const char *get_dent_type(int type) } } -static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, - char *buffer) +const char *dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, int len) { char *p = buffer; int type = key_type(c, key); @@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { switch (type) { case UBIFS_INO_KEY: - sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key), - get_key_type(type)); + len -= snprintf(p, len, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); break; case UBIFS_DENT_KEY: case UBIFS_XENT_KEY: - sprintf(p, "(%lu, %s, %#08x)", - (unsigned long)key_inum(c, key), - get_key_type(type), key_hash(c, key)); + len -= snprintf(p, len, "(%lu, %s, %#08x)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_hash(c, key)); break; case UBIFS_DATA_KEY: - sprintf(p, "(%lu, %s, %u)", - (unsigned long)key_inum(c, key), - get_key_type(type), key_block(c, key)); + len -= snprintf(p, len, "(%lu, %s, %u)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_block(c, key)); break; case UBIFS_TRUN_KEY: - sprintf(p, "(%lu, %s)", - (unsigned long)key_inum(c, key), - get_key_type(type)); + len -= snprintf(p, len, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); break; default: - sprintf(p, "(bad key type: %#08x, %#08x)", - key->u32[0], key->u32[1]); + len -= snprintf(p, len, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); } } else - sprintf(p, "bad key format %d", c->key_fmt); -} - -const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key) -{ - /* dbg_lock must be held */ - sprintf_key(c, key, dbg_key_buf0); - return dbg_key_buf0; -} - -const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key) -{ - /* dbg_lock must be held */ - sprintf_key(c, key, dbg_key_buf1); - return dbg_key_buf1; + len -= snprintf(p, len, "bad key format %d", c->key_fmt); + ubifs_assert(len > 0); + return p; } const char *dbg_ntype(int type) @@ -319,6 +305,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int i, n; union ubifs_key key; const struct ubifs_ch *ch = node; + char key_buf[DBG_KEY_BUF_LEN]; if (dbg_is_tst_rcvry(c)) return; @@ -474,7 +461,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_ino_node *ino = node; key_read(c, &ino->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); printk(KERN_DEBUG "\tcreat_sqnum %llu\n", (unsigned long long)le64_to_cpu(ino->creat_sqnum)); printk(KERN_DEBUG "\tsize %llu\n", @@ -517,7 +505,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int nlen = le16_to_cpu(dent->nlen); key_read(c, &dent->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); printk(KERN_DEBUG "\tinum %llu\n", (unsigned long long)le64_to_cpu(dent->inum)); printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); @@ -541,7 +530,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; key_read(c, &dn->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); printk(KERN_DEBUG "\tsize %u\n", le32_to_cpu(dn->size)); printk(KERN_DEBUG "\tcompr_typ %d\n", @@ -582,7 +572,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) key_read(c, &br->key, &key); printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), - le32_to_cpu(br->len), DBGKEY(&key)); + le32_to_cpu(br->len), + dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); } break; } @@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c, { int n; const struct ubifs_zbranch *zbr; + char key_buf[DBG_KEY_BUF_LEN]; spin_lock(&dbg_lock); if (znode->parent) @@ -958,12 +951,16 @@ void dbg_dump_znode(const struct ubifs_info *c, printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " "%s\n", n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, - DBGKEY(&zbr->key)); + dbg_snprintf_key(c, &zbr->key, + key_buf, + DBG_KEY_BUF_LEN)); else printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " "%s\n", n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, - DBGKEY(&zbr->key)); + dbg_snprintf_key(c, &zbr->key, + key_buf, + DBG_KEY_BUF_LEN)); } spin_unlock(&dbg_lock); } @@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, int err, nlen1, nlen2, cmp; struct ubifs_dent_node *dent1, *dent2; union ubifs_key key; + char key_buf[DBG_KEY_BUF_LEN]; ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); @@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, key_read(c, &dent1->key, &key); if (keys_cmp(c, &zbr1->key, &key)) { dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, DBGKEY(&key)); + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); dbg_err("but it should have key %s according to tnc", - DBGKEY(&zbr1->key)); + dbg_snprintf_key(c, &zbr1->key, key_buf, + DBG_KEY_BUF_LEN)); dbg_dump_node(c, dent1); goto out_free; } @@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, key_read(c, &dent2->key, &key); if (keys_cmp(c, &zbr2->key, &key)) { dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, DBGKEY(&key)); + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); dbg_err("but it should have key %s according to tnc", - DBGKEY(&zbr2->key)); + dbg_snprintf_key(c, &zbr2->key, key_buf, + DBG_KEY_BUF_LEN)); dbg_dump_node(c, dent2); goto out_free; } @@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, dbg_err("2 xent/dent nodes with the same name"); else dbg_err("bad order of colliding key %s", - DBGKEY(&key)); + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); dbg_dump_node(c, dent1); diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 8d9c46810189..307ab1d23f75 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -169,40 +169,39 @@ struct ubifs_global_debug_info { spin_unlock(&dbg_lock); \ } while (0) -const char *dbg_key_str0(const struct ubifs_info *c, - const union ubifs_key *key); -const char *dbg_key_str1(const struct ubifs_info *c, - const union ubifs_key *key); - -/* - * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message - * macros. - */ -#define DBGKEY(key) dbg_key_str0(c, (key)) -#define DBGKEY1(key) dbg_key_str1(c, (key)) - -extern spinlock_t dbg_lock; - -#define ubifs_dbg_msg(type, fmt, ...) do { \ - spin_lock(&dbg_lock); \ - pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ +#define ubifs_dbg_msg(type, fmt, ...) \ + pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) + +#define DBG_KEY_BUF_LEN 32 +#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \ + char __tmp_key_buf[DBG_KEY_BUF_LEN]; \ + pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \ + dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \ } while (0) /* Just a debugging messages not related to any specific UBIFS subsystem */ -#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) +#define dbg_msg(fmt, ...) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) + /* General messages */ #define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) /* Additional journal messages */ #define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) +#define dbg_jnlk(key, fmt, ...) \ + ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__) /* Additional TNC messages */ #define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) +#define dbg_tnck(key, fmt, ...) \ + ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__) /* Additional lprops messages */ #define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) /* Additional LEB find messages */ #define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) /* Additional mount messages */ #define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) +#define dbg_mntk(key, fmt, ...) \ + ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__) /* Additional I/O messages */ #define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) /* Additional commit messages */ @@ -218,6 +217,7 @@ extern spinlock_t dbg_lock; /* Additional recovery messages */ #define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) +extern spinlock_t dbg_lock; extern struct ubifs_global_debug_info ubifs_dbg; static inline int dbg_is_chk_gen(const struct ubifs_info *c) @@ -258,6 +258,8 @@ const char *dbg_cstate(int cmt_state); const char *dbg_jhead(int jhead); const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); +const char *dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, int len); void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); void dbg_dump_node(const struct ubifs_info *c, const void *node); void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, @@ -368,6 +370,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; } static inline const char * dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key) { return ""; } +static inline const char * +dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, + int len) { return ""; } static inline void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) { return; } static inline void dbg_dump_node(const struct ubifs_info *c, diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index cef0460f4c54..2f438ab2e7a2 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); - dbg_jnl("ino %lu, blk %u, len %d, key %s", - (unsigned long)key_inum(c, key), key_block(c, key), len, - DBGKEY(key)); + dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", + (unsigned long)key_inum(c, key), key_block(c, key), len); ubifs_assert(len <= UBIFS_BLOCK_SIZE); data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); @@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, dn = (void *)trun + UBIFS_TRUN_NODE_SZ; blk = new_size >> UBIFS_BLOCK_SHIFT; data_key_init(c, &key, inum, blk); - dbg_jnl("last block key %s", DBGKEY(&key)); + dbg_jnlk(&key, "last block key "); err = ubifs_tnc_lookup(c, &key, dn); if (err == -ENOENT) dlen = 0; /* Not found (so it is a hole) */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 6189c74d97f0..66d59d0a1402 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1986,12 +1986,11 @@ again: if (path[h].in_tree) continue; - nnode = kmalloc(sz, GFP_NOFS); + nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS); if (!nnode) { err = -ENOMEM; goto out; } - memcpy(nnode, &path[h].nnode, sz); parent = nnode->parent; parent->nbranch[nnode->iip].nnode = nnode; path[h].ptr.nnode = nnode; @@ -2004,12 +2003,11 @@ again: const size_t sz = sizeof(struct ubifs_pnode); struct ubifs_nnode *parent; - pnode = kmalloc(sz, GFP_NOFS); + pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS); if (!pnode) { err = -ENOMEM; goto out; } - memcpy(pnode, &path[h].pnode, sz); parent = pnode->parent; parent->nbranch[pnode->iip].pnode = pnode; path[h].ptr.pnode = pnode; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index ccabaf1164b3..b007637f0406 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { int err; - dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, - r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); + dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ", + r->lnum, r->offs, r->len, r->deletion, r->sqnum); /* Set c->replay_sqnum to help deal with dangling branches. */ c->replay_sqnum = r->sqnum; @@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, { struct replay_entry *r; - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs); if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); @@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, struct replay_entry *r; char *nbuf; - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs); if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 066738647685..16ad84d8402f 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -344,12 +344,11 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, return err; } - lnc_node = kmalloc(zbr->len, GFP_NOFS); + lnc_node = kmemdup(node, zbr->len, GFP_NOFS); if (!lnc_node) /* We don't have to have the cache, so no error */ return 0; - memcpy(lnc_node, node, zbr->len); zbr->leaf = lnc_node; return 0; } @@ -506,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, { int ret; - dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); + dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs); ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, zbr->offs); @@ -520,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, ret = 0; } if (ret == 0 && c->replaying) - dbg_mnt("dangling branch LEB %d:%d len %d, key %s", - zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); + dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ", + zbr->lnum, zbr->offs, zbr->len); return ret; } @@ -996,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c, if (adding || !o_znode) return 0; - dbg_mnt("dangling match LEB %d:%d len %d %s", + dbg_mntk(key, "dangling match LEB %d:%d len %d key ", o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, - o_znode->zbranch[o_n].len, DBGKEY(key)); + o_znode->zbranch[o_n].len); *zn = o_znode; *n = o_n; return 1; @@ -1180,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; unsigned long time = get_seconds(); - dbg_tnc("search key %s", DBGKEY(key)); + dbg_tnck(key, "search key "); ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); znode = c->zroot.znode; @@ -1316,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; unsigned long time = get_seconds(); - dbg_tnc("search and dirty key %s", DBGKEY(key)); + dbg_tnck(key, "search and dirty key "); znode = c->zroot.znode; if (unlikely(!znode)) { @@ -1723,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf, if (!keys_eq(c, &zbr->key, &key1)) { ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_tnc("looked for key %s found node's key %s", - DBGKEY(&zbr->key), DBGKEY1(&key1)); + dbg_tnck(&zbr->key, "looked for key "); + dbg_tnck(&key1, "found node's key "); goto out_err; } @@ -1777,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) ubifs_err("failed to read from LEB %d:%d, error %d", lnum, offs, err); dbg_dump_stack(); - dbg_tnc("key %s", DBGKEY(&bu->key)); + dbg_tnck(&bu->key, "key "); return err; } @@ -1812,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, int found, n, err; struct ubifs_znode *znode; - dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); + dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1986,8 +1985,7 @@ again: zp = znode->parent; if (znode->child_cnt < c->fanout) { ubifs_assert(n != c->fanout); - dbg_tnc("inserted at %d level %d, key %s", n, znode->level, - DBGKEY(key)); + dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level); insert_zbranch(znode, zbr, n); @@ -2002,7 +2000,7 @@ again: * Unfortunately, @znode does not have more empty slots and we have to * split it. */ - dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); + dbg_tnck(key, "splitting level %d, key ", znode->level); if (znode->alt) /* @@ -2096,7 +2094,7 @@ do_split: } /* Insert new key and branch */ - dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); + dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level); insert_zbranch(zi, zbr, n); @@ -2172,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); + dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len); found = lookup_level0_dirty(c, key, &znode, &n); if (!found) { struct ubifs_zbranch zbr; @@ -2221,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, - old_offs, lnum, offs, len, DBGKEY(key)); + dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum, + old_offs, lnum, offs, len); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2304,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, - DBGKEY(key)); + dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", + lnum, offs, nm->len, nm->name); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2398,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) /* Delete without merge for now */ ubifs_assert(znode->level == 0); ubifs_assert(n >= 0 && n < c->fanout); - dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); + dbg_tnck(&znode->zbranch[n].key, "deleting key "); zbr = &znode->zbranch[n]; lnc_free(zbr); @@ -2508,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("key %s", DBGKEY(key)); + dbg_tnck(key, "key "); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2539,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); + dbg_tnck(key, "%.*s, key ", nm->len, nm->name); err = lookup_level0_dirty(c, key, &znode, &n); if (err < 0) goto out_unlock; @@ -2654,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, dbg_dump_znode(c, znode); goto out_unlock; } - dbg_tnc("removing %s", DBGKEY(key)); + dbg_tnck(key, "removing key "); } if (k) { for (i = n + 1 + k; i < znode->child_cnt; i++) @@ -2774,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, struct ubifs_zbranch *zbr; union ubifs_key *dkey; - dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); + dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)"); ubifs_assert(is_hash_key(c, key)); mutex_lock(&c->tnc_mutex); @@ -3333,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, out_dump: block = key_block(c, key); - ubifs_err("inode %lu has size %lld, but there are data at offset %lld " - "(data key %s)", (unsigned long)inode->i_ino, size, - ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); + ubifs_err("inode %lu has size %lld, but there are data at offset %lld", + (unsigned long)inode->i_ino, size, + ((loff_t)block) << UBIFS_BLOCK_SHIFT); mutex_unlock(&c->tnc_mutex); dbg_dump_inode(c, inode); dbg_dump_stack(); diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index b48db999903e..dc28fe6ec07a 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, case UBIFS_XENT_KEY: break; default: - dbg_msg("bad key type at slot %d: %s", i, - DBGKEY(&zbr->key)); + dbg_msg("bad key type at slot %d: %d", + i, key_type(c, &zbr->key)); err = 3; goto out_dump; } @@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, zbr->offs); if (err) { - dbg_tnc("key %s", DBGKEY(key)); + dbg_tnck(key, "key "); return err; } @@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, if (!keys_eq(c, key, &key1)) { ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_tnc("looked for key %s found node's key %s", - DBGKEY(key), DBGKEY1(&key1)); + dbg_tnck(key, "looked for key "); + dbg_tnck(&key1, "but found node's key "); dbg_dump_node(c, node); return -EINVAL; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index bf18f7a04544..85b272268754 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -138,12 +138,11 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui = ubifs_inode(inode); ui->xattr = 1; ui->flags |= UBIFS_XATTR_FL; - ui->data = kmalloc(size, GFP_NOFS); + ui->data = kmemdup(value, size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_free; } - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = size; @@ -204,12 +203,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, return err; kfree(ui->data); - ui->data = kmalloc(size, GFP_NOFS); + ui->data = kmemdup(value, size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_free; } - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = size; |